# Find Political-Specific Words
## Analyzing words predominantly used in Political memes

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import re

# Read the cleaned training split into a DataFrame.
dataset_path = '/data/raw_text/train/train_dataset_cleaned.csv'
df = pd.read_csv(dataset_path)

# Sanity check: overall size, then how the rows split across the 'Label' values.
print("Dataset shape: {}".format(df.shape))
print("\nLabel distribution:")
print(df['Label'].value_counts())

Dataset shape: (330, 4)

Label distribution:
Label
Political       181
NonPolitical    149
Name: count, dtype: int64


In [10]:
# Pull the 'Processed_Text' column for each label in a single .loc selection,
# discarding rows whose processed text is missing.
political_text = df.loc[df['Label'] == 'Political', 'Processed_Text'].dropna()
nonpolitical_text = df.loc[df['Label'] == 'NonPolitical', 'Processed_Text'].dropna()

# Counts here are taken after dropna(), so they can be lower than the raw label counts.
print("Political memes:", len(political_text))
print("Non-Political memes:", len(nonpolitical_text))

# Function to extract words from text
def extract_words(text: str, min_length: int = 3) -> list:
    """Split ``text`` into lowercase, whitespace-delimited tokens and drop short ones.

    Args:
        text: The string to tokenize.
        min_length: Minimum token length, as measured by ``len``, for a token
            to be kept. Defaults to 3, i.e. tokens of one or two characters
            are discarded.

    Returns:
        A list of the surviving tokens in their original order. Duplicates are
        kept so the result can be fed straight into a frequency counter.
    """
    # str.split() with no separator splits on any run of whitespace and never
    # yields empty strings, so empty or blank input produces [].
    # Punctuation is NOT stripped: a token such as "now!" is kept as-is.
    # len() counts Unicode code points, so combining marks add to the length.
    return [token for token in text.lower().split() if len(token) >= min_length]

# Flatten the tokens of every meme into one list per class.
# Duplicates are kept; each entry is cast to str before tokenizing.
political_words = [
    token
    for meme in political_text
    for token in extract_words(str(meme))
]

nonpolitical_words = [
    token
    for meme in nonpolitical_text
    for token in extract_words(str(meme))
]

# Corpus size (all tokens) followed by vocabulary size (distinct tokens).
print("\nTotal words in Political memes:", len(political_words))
print("Total words in Non-Political memes:", len(nonpolitical_words))
print("Unique words in Political memes:", len(set(political_words)))
print("Unique words in Non-Political memes:", len(set(nonpolitical_words)))

Political memes: 176
Non-Political memes: 149

Total words in Political memes: 2453
Total words in Non-Political memes: 1965
Unique words in Political memes: 1803
Unique words in Non-Political memes: 1420


In [11]:
# Tally how many times each token occurs within each class.
political_word_counts = Counter(political_words)
nonpolitical_word_counts = Counter(nonpolitical_words)

# Show the 20 most frequent tokens per class, political first.
print("Top 20 most common words in Political memes:")
for token, freq in political_word_counts.most_common(20):
    print(token, freq, sep=": ")

# Blank line + rule between the two listings.
print()
print("=" * 50)
print("\nTop 20 most common words in Non-Political memes:")
for token, freq in nonpolitical_word_counts.most_common(20):
    print(token, freq, sep=": ")

Top 20 most common words in Political memes:
the: 22
যখন: 16
আমি: 14
আওয়ামী: 13
আমার: 11
you: 11
anwartvnews: 10
bangladesh: 10
nap: 10
শেখ: 9
লীগ: 9
দিতে: 8
থেকে: 8
জন্য: 8
and: 8
করে: 8
after: 7
স্বাধীন: 7
party: 7
national: 7


Top 20 most common words in Non-Political memes:
the: 40
for: 19
when: 17
you: 15
and: 15
যখন: 15
with: 11
স্বাধীন: 11
আমি: 10
করে: 10
pro: 9
vantages: 8
get: 8
but: 7
who: 7
আমার: 7
will: 7
that: 7
bash: 6
one: 6


In [12]:
# Rank words by how strongly they skew toward political memes.
#
# For every word that occurs often enough in political memes we record its raw
# occurrence counts in both classes and a smoothed count ratio
#     ratio = political_count / (nonpolitical_count + RATIO_SMOOTHING)
# and then sort the table by that ratio, highest first.
#
# NOTE: the ratio uses raw occurrence counts and is not normalized for the
# different corpus sizes of the two classes.

# A word must occur at least this many times in political memes to be listed.
MIN_POLITICAL_COUNT = 3
# Added to the non-political count so the ratio is defined when that count is 0.
RATIO_SMOOTHING = 1

# Column order of the result table. Passing it to the DataFrame constructor
# guarantees the columns exist even when no word passes the threshold, so the
# sort on 'ratio' below cannot fail with a KeyError on an empty table.
result_columns = [
    'word',
    'political_count',
    'nonpolitical_count',
    'ratio',
    'total_count',
    'political_frequency_%',
    'nonpolitical_frequency_%',
]

political_specific_words = []

# Number of memes (not words) in each class, after dropping missing text.
total_political = len(political_text)
total_nonpolitical = len(nonpolitical_text)

for word, pol_count in political_word_counts.items():
    # Skip rare words: too few occurrences to say anything about skew.
    if pol_count < MIN_POLITICAL_COUNT:
        continue

    nonpol_count = nonpolitical_word_counts.get(word, 0)

    ratio = pol_count / (nonpol_count + RATIO_SMOOTHING)

    # These "frequency" values divide total occurrences of the word by the
    # number of memes in the class, i.e. occurrences per 100 memes. They are
    # not the share of memes containing the word, nor the share of all tokens.
    pol_frequency = (pol_count / total_political) * 100
    # The guard also avoids 0 / 0 when there are no non-political memes at all.
    # 0.0 (not 0) keeps the column consistently floating-point.
    nonpol_frequency = (nonpol_count / total_nonpolitical) * 100 if nonpol_count > 0 else 0.0

    political_specific_words.append({
        'word': word,
        'political_count': pol_count,
        'nonpolitical_count': nonpol_count,
        'ratio': ratio,
        'total_count': pol_count + nonpol_count,
        'political_frequency_%': round(pol_frequency, 2),
        'nonpolitical_frequency_%': round(nonpol_frequency, 2)
    })

# Build the table once from the collected records and sort by ratio.
political_df = pd.DataFrame(political_specific_words, columns=result_columns)
political_df = political_df.sort_values('ratio', ascending=False)

print("Words predominantly used in Political memes (Top 50):")
print("="*100)
print(political_df.head(50).to_string(index=False))


Words predominantly used in Political memes (Top 50):
            word  political_count  nonpolitical_count  ratio  total_count  political_frequency_%  nonpolitical_frequency_%
             nap               10                   0   10.0           10                   5.68                      0.00
     anwartvnews               10                   0   10.0           10                   5.68                      0.00
             শেখ                9                   0    9.0            9                   5.11                      0.00
        national                7                   0    7.0            7                   3.98                      0.00
           party                7                   0    7.0            7                   3.98                      0.00
          আওয়ামী               13                   1    6.5           14                   7.39                      0.67
        বাংলাদেশ                6                   0    6.0            6            

In [13]:
# Two views of the ranked table:
#   * words that never occur in non-political memes, and
#   * words that do occur there but still have a ratio above 10.
# Because ratio = political_count / (nonpolitical_count + 1), a word with at
# least one non-political occurrence needs more than 20 political occurrences
# to clear the "> 10" cut, so the second view may well be empty.
divider = "=" * 80

print(f"\n{divider}")
print("Words appearing ONLY in Political memes (never in non-political):")
print(divider)
only_political = political_df.loc[political_df['nonpolitical_count'].eq(0)]
print(only_political.head(30).to_string(index=False))

print(f"\n{divider}")
print("Words appearing MOSTLY in Political memes (ratio > 10):")
print(divider)
mostly_political = political_df.loc[
    political_df['nonpolitical_count'].gt(0) & political_df['ratio'].gt(10)
]
print(mostly_political.head(30).to_string(index=False))


Words appearing ONLY in Political memes (never in non-political):
       word  political_count  nonpolitical_count  ratio  total_count  political_frequency_%  nonpolitical_frequency_%
        nap               10                   0   10.0           10                   5.68                       0.0
anwartvnews               10                   0   10.0           10                   5.68                       0.0
        শেখ                9                   0    9.0            9                   5.11                       0.0
   national                7                   0    7.0            7                   3.98                       0.0
      party                7                   0    7.0            7                   3.98                       0.0
   বাংলাদেশ                6                   0    6.0            6                   3.41                       0.0
    হাসিনার                6                   0    6.0            6                   3.41                

In [14]:
# Save the ranked table to CSV. The path lives in one variable so the
# confirmation message always names the file that was actually written
# (previously the message reported a different filename than the one saved).
output_csv = 'sample_political_specific_words.csv'
political_df.to_csv(output_csv, index=False)
print("\n" + "="*80)
print(f"Political-specific words saved to '{output_csv}'")

# Summary statistics
# Note: political_df only holds words that passed the minimum political-count
# filter, so "Total unique words analyzed" is the size of that filtered table,
# not the full political vocabulary.
print("\n" + "="*80)
print("SUMMARY:")
print("="*80)
print(f"Total unique words analyzed: {len(political_df)}")
print(f"Words appearing ONLY in political memes: {len(only_political)}")
print(f"Words appearing MOSTLY in political memes (ratio > 10): {len(mostly_political)}")
print(f"Words with high political usage (ratio > 5): {len(political_df[political_df['ratio'] > 5])}")


Political-specific words saved to 'political_specific_words.csv'

SUMMARY:
Total unique words analyzed: 128
Words appearing ONLY in political memes: 69
Words appearing MOSTLY in political memes (ratio > 10): 0
Words with high political usage (ratio > 5): 11
