In [None]:
import requests
from bs4 import BeautifulSoup
import spacy
from transformers import pipeline

def scrape_website(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        text = ' '.join([p.text for p in soup.find_all('p')])
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return ""

def split_text(text, max_length=512):
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks

def detect_bias(text, categories, classifier):
    result = classifier(text, candidate_labels=categories)
    return result

def analyze_sentiment(text, sentiment_analyzer):
    result = sentiment_analyzer(text[:512])  # Truncate text to fit model's max length
    return result[0]['score'] if result[0]['label'] == 'POSITIVE' else -result[0]['score']

def grade_content(bias_results, sentiment_scores):
    # Normalize bias results to a 0-100 scale
    bias_score = 100 - (sum(bias_results) / len(bias_results) * 100)

    # Normalize sentiment scores to a 0-100 scale
    sentiment_score = (sum(sentiment_scores) / len(sentiment_scores) + 1) * 50

    # Calculate final score as an average of bias and sentiment scores
    final_score = (bias_score + sentiment_score) / 2
    return final_score


# Main script
url = 'https://en.wikipedia.org/wiki/Shah_Rukh_Khan'
content = scrape_website(url)

if content:
    # Load spaCy's English model
    nlp = spacy.load('en_core_web_sm')

    # Split the text into smaller chunks
    chunks = split_text(content)

    # Initialize transformers pipelines
    classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
    sentiment_analyzer = pipeline('sentiment-analysis')

    # Categories for bias detection
    categories = ["gender bias", "cultural bias", "political bias"]

    # Process each chunk
    bias_results = []
    sentiment_scores = []
    for chunk in nlp.pipe(chunks, batch_size=10):
        bias_result = detect_bias(chunk.text, categories, classifier)
        sentiment_score = analyze_sentiment(chunk.text, sentiment_analyzer)
        bias_results.extend(bias_result['scores'])
        sentiment_scores.append(sentiment_score)

    # Calculate final grade
    final_grade = grade_content(bias_results, sentiment_scores)
    print("Final Grade:", final_grade)
else:
    print("Failed to retrieve content from the URL.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Final Grade: 70.73998197447509


In [None]:
!pip install names


Collecting names
  Downloading names-0.3.0.tar.gz (789 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/789.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m788.5/789.1 kB[0m [31m25.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m789.1/789.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: names
  Building wheel for names (setup.py) ... [?25l[?25hdone
  Created wheel for names: filename=names-0.3.0-py3-none-any.whl size=803682 sha256=568297d8dd50027facc97180229328942e76d820bb6dedefd13cb49925712399
  Stored in directory: /root/.cache/pip/wheels/fc/9a/6f/78f4282bbcaa2d8c678b73c54c0bb1b7a04009f0d7cec79fce
Successfully built names
Installing collected packages: names
Successfully installed names-0.3.0


In [None]:
import requests
from bs4 import BeautifulSoup
import spacy
from transformers import pipeline
import names

def scrape_website(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        text = ' '.join([p.text for p in soup.find_all('p')])
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return ""

def split_text(text, max_length=512):
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks

def detect_bias(text, categories, classifier):
    result = classifier(text, candidate_labels=categories)
    return result

def calculate_bias_scores(bias_results, categories):
    category_scores = {category: 0 for category in categories}
    for result in bias_results:
        for i, category in enumerate(result['labels']):
            category_scores[category] += result['scores'][i]

    # Normalize scores to percentage
    total_scores = sum(category_scores.values())
    for category in category_scores:
        category_scores[category] = (category_scores[category] / total_scores) * 100

    return category_scores

def analyze_gender_bias(text, nlp):
    male_names = [names.get_full_name(gender='male').split()[0].lower() for _ in range(1000)]
    female_names = [names.get_full_name(gender='female').split()[0].lower() for _ in range(1000)]

    doc = nlp(text)
    male_count = sum(1 for ent in doc.ents if ent.label_ == 'PERSON' and ent.text.lower() in male_names)
    female_count = sum(1 for ent in doc.ents if ent.label_ == 'PERSON' and ent.text.lower() in female_names)
    return male_count, female_count

# Main script
url = 'https://en.wikipedia.org/wiki/Shah_Rukh_Khan'
content = scrape_website(url)

if content:
    # Load spaCy's English model
    nlp = spacy.load('en_core_web_sm')

    # Split the text into smaller chunks
    chunks = split_text(content)

    # Initialize transformers pipelines
    classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

    # Categories for bias detection
    categories = ["gender bias", "cultural bias", "political bias"]

    # Process each chunk
    bias_results = []
    male_bias_count = 0
    female_bias_count = 0
    for chunk in nlp.pipe(chunks, batch_size=10):
        bias_result = detect_bias(chunk.text, categories, classifier)
        bias_results.append(bias_result)

        # Analyze gender bias
        male_count, female_count = analyze_gender_bias(chunk.text, nlp)
        male_bias_count += male_count
        female_bias_count += female_count

    # Calculate bias scores
    bias_scores = calculate_bias_scores(bias_results, categories)

    # Add gender-specific bias information
    gender_bias_info = {
        "male_bias": (male_bias_count / (male_bias_count + female_bias_count)) * 100 if (male_bias_count + female_bias_count) > 0 else 0,
        "female_bias": (female_bias_count / (male_bias_count + female_bias_count)) * 100 if (male_bias_count + female_bias_count) > 0 else 0
    }

    print("Bias Scores:", bias_scores)
    print("Gender Bias Info:", gender_bias_info)
else:
    print("Failed to retrieve content from the URL.")




Bias Scores: {'gender bias': 54.18755920659215, 'cultural bias': 27.145511965022283, 'political bias': 18.666928828385558}
Gender Bias Info: {'male_bias': 100.0, 'female_bias': 0.0}


In [None]:
def detect_bias_per_sentence(text, categories, classifier):
    sentences = text.split('.')
    sentence_results = []
    for sentence in sentences:
        if sentence.strip():  # Skip empty sentences
            result = classifier(sentence, candidate_labels=categories)
            sentence_results.append((sentence, result))
    return sentence_results

# Example usage
text = "The CEO of the company is a strong and decisive leader. The company has a diverse workforce."
categories = ["gender bias", "cultural bias", "political bias"]
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

sentence_results = detect_bias_per_sentence(text, categories, classifier)
for sentence, result in sentence_results:
    print(f"Sentence: {sentence}")
    print(f"Result: {result}")


Sentence: The CEO of the company is a strong and decisive leader
Result: {'sequence': 'The CEO of the company is a strong and decisive leader', 'labels': ['cultural bias', 'gender bias', 'political bias'], 'scores': [0.4565277695655823, 0.3388703763484955, 0.20460179448127747]}
Sentence:  The company has a diverse workforce
Result: {'sequence': ' The company has a diverse workforce', 'labels': ['cultural bias', 'gender bias', 'political bias'], 'scores': [0.6081842184066772, 0.26761922240257263, 0.12419655174016953]}


# Code to Content Ratio

In [1]:
import requests
from bs4 import BeautifulSoup

def fetch_html(url):
    response = requests.get(url)
    return response.text

def extract_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    return text

def calculate_ratio(html, text):
    html_size = len(html)
    text_size = len(text)
    ratio = (text_size / html_size) * 100
    return ratio

url = 'https://bekushal.com'
html_content = fetch_html(url)
text_content = extract_text(html_content)
ratio = calculate_ratio(html_content, text_content)

print(f"Code to Content Ratio: {ratio:.2f}%")


Code to Content Ratio: 5.42%


# Content Format

In [3]:
import requests
from bs4 import BeautifulSoup

def fetch_html(url):
    response = requests.get(url)
    return response.text

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup

def check_content_formats(soup):
    formats = {
        'text': ['p', 'article', 'blog', 'whitepaper'],
        'visual': ['img', 'video', 'figure'],
        'interactive': ['form', 'input', 'button', 'quiz', 'calculator', 'survey']
    }

    results = {key: [] for key in formats.keys()}

    for format_type, tags in formats.items():
        for tag in tags:
            elements = soup.find_all(tag)
            if elements:
                results[format_type].append((tag, len(elements)))

    return results

url = 'https://bekushal.com'
html_content = fetch_html(url)
soup = parse_html(html_content)
content_formats = check_content_formats(soup)

for format_type, elements in content_formats.items():
    print(f"{format_type.capitalize()} formats:")
    for tag, count in elements:
        print(f"  - {tag}: {count} instances")


Text formats:
  - p: 6 instances
Visual formats:
  - img: 2 instances
Interactive formats:
  - button: 4 instances
