In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('clean.csv')
df = df.drop(columns=['post', 'political_leaning'], axis=1)
df.head()

Unnamed: 0,author_ID,clean_post,nr_of_words,nr_of_characters,label
0,t2_kj8twzq5,dog pic profile longer hair spot unfortunately...,774,4761,0
1,t2_1404zt,skyscraper probably gon na mostly concrete mea...,750,4791,0
2,t2_4erzxbrz,intentionally wash as poseidon french kiss bus...,741,4894,0
3,t2_ccx3o6,struggle still jungle similar role support sin...,787,5261,0
4,t2_b19hyjws,kitlesini sevmezdim ama kral adammış genj adam...,974,6428,0


In [6]:
# Separate TF-IDF by label
tfidf_labelwise = {}
for label in [0, 1, 2]:  # Replace with your unique labels
    subset = df[df['label'] == label]['clean_post']
    tfidf_labelwise[label] = TfidfVectorizer(max_features=5000, min_df=5)
    tfidf_labelwise[label].fit(subset)

# Calculate global tfidf
global_tfidf = TfidfVectorizer(max_features=5000, min_df=5)
global_tfidf.fit(df['clean_post'])


In [8]:
# Compare specific terms
term = "government"
global_score = global_tfidf.idf_[global_tfidf.vocabulary_.get(term, -1)]
print(f"Global score for {term}: {global_score}")

for label in [0, 1, 2]:
    label_score = tfidf_labelwise[label].idf_[tfidf_labelwise[label].vocabulary_.get(term, -1)]
    print(f"Label-specific score for {term} (label {label}): {label_score}")

Global score for government: 1.963685335865595
Label-specific score for government (label 0): 2.012984769606194
Label-specific score for government (label 1): 2.1172507033347436
Label-specific score for government (label 2): 1.7891246292251326


In [9]:

# Function to get top N words by TF-IDF scores
def get_top_tfidf_words(tfidf_vectorizer, n=10):
    scores = tfidf_vectorizer.idf_
    vocab = tfidf_vectorizer.vocabulary_
    # Sort words by TF-IDF score in descending order
    sorted_words = sorted(vocab.items(), key=lambda x: scores[x[1]], reverse=True)
    return [(word, scores[idx]) for word, idx in sorted_words[:n]]

# Get top 10 words for the global TF-IDF
print("Top 10 words for global TF-IDF:")
global_top_words = get_top_tfidf_words(global_tfidf, n=10)
for word, score in global_top_words:
    print(f"{word}: {score}")

# Get top 10 words for each label-specific TF-IDF
for label in tfidf_labelwise:
    print(f"\nTop 10 words for label {label}:")
    label_top_words = get_top_tfidf_words(tfidf_labelwise[label], n=10)
    for word, score in label_top_words:
        print(f"{word}: {score}")


Top 10 words for global TF-IDF:
avacado: 9.31811875212325
nbsp: 8.081356124974322
að: 7.133316694785586
στην: 7.06099603320596
ότι: 7.06099603320596
thì: 7.0494352108048846
σε: 7.015533659129203
nao: 7.015533659129203
για: 7.004483822942618
θα: 6.993554752410428

Top 10 words for label 0:
nbsp: 8.219560696331122
xset: 7.851835916205804
bir: 7.2812910577381915
chakra: 6.89042474905118
من: 6.703213206963033
на: 6.679115655383972
sussy: 6.655585157973778
не: 6.655585157973778
inte: 6.63259563974908
méxico: 6.63259563974908

Top 10 words for label 1:
bordiga: 8.114200180673295
speer: 7.372262835943918
aizen: 6.9202777122008605
не: 6.89042474905118
rusland: 6.7791991139409555
nrf: 6.63259563974908
nao: 6.588143877178246
peta: 6.566637671957283
chakra: 6.5455842627594505
isso: 6.44649336011522

Top 10 words for label 2:
bạn: 8.470875124612029
avacado: 8.470875124612029
blimp: 8.337343731987506
gatling: 7.42105300011335
että: 6.9202777122008605
litecoin: 6.805867361023116
jeg: 6.6555851579737

In [10]:
# Prepare a set of all words in the vocabulary across all labels
all_vocab = set()
for label in tfidf_labelwise:
    all_vocab.update(tfidf_labelwise[label].vocabulary_.keys())

# Dictionary to store biased words for each label
biased_words = {label: [] for label in tfidf_labelwise.keys()}

# Analyze word scores
for word in all_vocab:
    scores = {}
    for label in tfidf_labelwise:
        idx = tfidf_labelwise[label].vocabulary_.get(word, -1)
        scores[label] = tfidf_labelwise[label].idf_[idx] if idx != -1 else 0
    
    # Compare scores for each label
    for label in scores:
        other_labels = [l for l in scores if l != label]
        other_scores = [scores[l] for l in other_labels]
        max_other_score = max(other_scores, default=0)
        
        # Define the bias threshold (e.g., significantly higher by at least 1.5)
        if scores[label] > max_other_score + 0.5:
            biased_words[label].append((word, scores[label], max_other_score))

# Output biased words for each label
for label, words in biased_words.items():
    print(f"\nBiased words for label {label}:")
    for word, label_score, max_other_score in words:
        print(f"  {word} - Label Score: {label_score}, Max Other Score: {max_other_score}")



Biased words for label 0:
  snack - Label Score: 5.1134803656082655, Max Other Score: 0
  criterion - Label Score: 5.257729974452812, Max Other Score: 0
  colorado - Label Score: 5.235001723375256, Max Other Score: 0
  messi - Label Score: 5.564755009747724, Max Other Score: 0
  armenian - Label Score: 5.811615087679249, Max Other Score: 0
  postal - Label Score: 5.916975603337076, Max Other Score: 0
  poop - Label Score: 5.000684871462921, Max Other Score: 0
  flaired - Label Score: 5.782056285437705, Max Other Score: 0
  eight - Label Score: 5.143785715103594, Max Other Score: 0
  koji - Label Score: 6.273650547275809, Max Other Score: 0
  assigned - Label Score: 5.15928990163956, Max Other Score: 0
  tak - Label Score: 6.112720180463171, Max Other Score: 0
  rental - Label Score: 5.118467907119304, Max Other Score: 0
  ako - Label Score: 6.1826787690700815, Max Other Score: 0
  hvis - Label Score: 5.9741340171770245, Max Other Score: 5.453940643407433
  omicron - Label Score: 5.263