In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from fairlearn.postprocessing import ThresholdOptimizer
from fairlearn.reductions import DemographicParity, EqualizedOdds

In [2]:
import sys
print("Python executable:", sys.executable)

Python executable: /opt/anaconda3/envs/textmining/bin/python


In [4]:
dataframe = pd.read_csv("dataset_with_missing_values_duplicates_removed.csv")
dataframe.head()

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


In [5]:
X = dataframe["text"]
y = dataframe["label"]

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [7]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("First 50 encoded values:", y_encoded[:50])

First 50 encoded values: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

In [9]:
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_train, y_train)

In [10]:
y_predictions = random_forest_classifier.predict(X_test)

In [13]:
# Add this code after your label encoding step
print("Label mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"Original Label: '{label}' → Encoded: {i}")

Label mapping:
Original Label: 'Fake' → Encoded: 0
Original Label: 'Real' → Encoded: 1


In [None]:
# If demographic parity difference = 0, it means the positive predictions are distributed equally among different groups 
# Or the model makes predicitons without any disparity 
# If demographic parity difference = 1, suggesting that there is a substantial disparity in positve predictions between groups
# Which can raise fairness concern
def demographic_parity_difference(y_true, y_pred):
    """
    Calculate demographic parity difference using the true labels as groups.
    
    In the context of fake news detection:
    - Measures if the model predicts "real news" at similar rates regardless of whether news is actually real or fake
    - A value of 0 means perfect fairness (no disparity)
    - A higher value indicates disparity in "real news" prediction rates between groups
    
    Parameters:
    - y_true: True labels (0=Fake, 1=Real) - used to define the groups
    - y_pred: Predicted labels (0=Fake, 1=Real)
    
    Returns:
    - Absolute difference in "real news" prediction rates between groups
    """
    # Identify indices for samples that are actually fake news (y_true=0)
    # enumerate(y_true) creates pairs of (index, value) for each element in y_true
    # keep only the indices where the value equals 0 (Fake news)
    fake_news_indices = [i for i, y in enumerate(y_true) if y==0]  
    
    # Identify indices for samples that are actually real news (y_true=1)
    # This creates a list of indices where the true label is 1 (Real news)
    real_news_indices = [i for i, y in enumerate(y_true) if y==1]  
    
    # Calculate the rate at which the model predicts as class 1 (Real) for fake news
    # For each index in fake_news_indices:
    #   - Check if the model predicted it as Real (y_pred[i]==1)
    #   - Count how many times this happens (sum of 1's)
    #   - Divide by total number of fake news samples to get a proportion
    real_prediction_rate_for_fake_news = sum(1 for i in fake_news_indices if y_pred[i]==1) / len(fake_news_indices)
    
    # Calculate the rate at which the model predicts as class 1 (Real) for real news
    # Similar to above, but for indices in real_news_indices
    # This measures how often real news is correctly identified as real
    real_prediction_rate_for_real_news = sum(1 for i in real_news_indices if y_pred[i]==1) / len(real_news_indices)
    
    # Calculate the absolute difference between the two rates
    # abs() gets a positive value representing the magnitude of disparity
    # A value of 0 would mean the model predicts "real" at equal rates for both fake and real news
    # A higher value indicates potential bias in how the model makes predictions
    disparity = abs(real_prediction_rate_for_fake_news - real_prediction_rate_for_real_news)
    
    # Return the calculated disparity measure
    # This is our demographic parity difference value:
    # - 0 = perfectly fair (equal prediction rates across groups)
    # - 1 = maximum unfairness (complete disparity in prediction rates)
    return disparity


In [16]:
DemographicDisparityDifference = demographic_parity_difference(y_test, y_predictions)
print(f"Demographic Disparity in 'Real News' predictions between the groups of fake news and real news: {DemographicDisparityDifference:.4f}")

Demographic Disparity in 'Real News' predictions between the groups of fake news and real news: 0.3278
