In [None]:
import pandas as pd
df = pd.read_csv("/Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/tweets_with_labels_en_1_with_subjects.csv")
df.head()

In [None]:
df = pd.read_csv("/Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/tweets_with_subjects.csv")
df.head()

In [None]:
import pandas as pd

# 1. Load the datasets
file_path_1 = "/Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/tweets_with_labels_en_1_with_subjects.csv"
file_path_2 = "/Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/tweets_with_subjects.csv"

df1 = pd.read_csv(file_path_1)
df2 = pd.read_csv(file_path_2)

# 2. Identify the overlapping tweets
# This creates a boolean mask for rows in df1 where the tweet exists in df2
overlap_mask = df1['tweet'].isin(df2['tweet'])
overlapping_tweets_df = df1[overlap_mask]

# 3. Calculate statistics
count_df1 = len(df1)
count_df2 = len(df2)
overlap_count = len(overlapping_tweets_df)
unique_to_df1 = count_df1 - overlap_count
unique_to_df2 = count_df2 - overlap_count

# 4. Display results
print(f"--- Dataset Comparison ---")
print(f"Total tweets in File 1: {count_df1}")
print(f"Total tweets in File 2: {count_df2}")
print(f"Number of overlapping tweets: {overlap_count}")
print(f"Percentage overlap (relative to File 1): {(overlap_count/count_df1)*100:.2f}%")

# 5. Optional: View the overlapping content
print("\n--- Sample Overlapping Tweets ---")
print(overlapping_tweets_df['tweet'].head())

In [None]:
import pandas as pd

# 1. Load the dataset
file_path = "/Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/tweets_with_labels_en_1_with_subjects.csv"
df = pd.read_csv(file_path)

# 2. Pre-processing: Strip whitespaces
df['tweet'] = df['tweet'].str.strip()

# --- NEW SECTION: Analyze repeats by polarity ---

# Identify ALL rows that are part of a duplicate set (Author + Tweet)
# keep=False marks all occurrences as True so we can count them
all_duplicates_mask = df.duplicated(subset=['retweet_author', 'tweet'], keep=False)
duplicates_df = df[all_duplicates_mask]

# Count how many repeated tweets exist per label (Pro Ruling, Pro OPP, Neutral)
# We divide the count of rows by the total duplicates to see the distribution
repeat_stats = duplicates_df.groupby('tweet_label').size().reset_index(name='repeat_instance_count')

print("--- Repeat Analysis by Polarity ---")
if not repeat_stats.empty:
    print(repeat_stats)
else:
    print("No duplicates found.")

# --- DEDUPLICATION SECTION ---

# Remove duplicates (keeping only the first instance)
df_cleaned = df.drop_duplicates(subset=['retweet_author', 'tweet'], keep='first')

# 4. Final Report
initial_rows = len(df)
final_rows = len(df_cleaned)
duplicates_removed = initial_rows - final_rows

print(f"\n--- Deduplication Summary ---")
print(f"Original number of rows: {initial_rows}")
print(f"Rows after removing duplicates: {final_rows}")
print(f"Total duplicate entries removed: {duplicates_removed}")

# 5. Save the cleaned version
df_cleaned.to_csv("final_tweets_eng_deduplicated_by_author.csv", index=False)

In [None]:
import pandas as pd
df = pd.read_csv("/Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/final_tweets_eng_deduplicated_by_author.csv")
df.head()

## Split Tweets by Subject (1 row per keyword)

Each tweet has 3 subjects in a list. We'll explode this so each tweet becomes 3 rows - one for each subject.
The 'subjects' column will be renamed to 'keyword'.

In [2]:
import pandas as pd
import ast

# Load the deduplicated CSV
df = pd.read_csv("/Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/final_tweets_eng_deduplicated_by_author.csv")

print(f"Original DataFrame shape: {df.shape}")
print(f"Original columns: {df.columns.tolist()}")
df.head(3)

Original DataFrame shape: (2801717, 14)
Original columns: ['timestamp', 'tweet', 'retweet_author', 'original_author', 'retweet_lc', 'original_lc', 'retweet_party', 'year', 'side', 'polarity_avg', 'label_0_5', 'tweet_label', 'subjects', 'subjects_scored']


Unnamed: 0,timestamp,tweet,retweet_author,original_author,retweet_lc,original_lc,retweet_party,year,side,polarity_avg,label_0_5,tweet_label,subjects,subjects_scored
0,2020-06-18 11:59:57+00:00,PM ⁦@narendramodi⁩ to launch Garib Kalyan Rojg...,BJP4AnN,PMOIndia,bjp4ann,pmoindia,BJP,2020.0,ruling,0.967184,Pro Ruling,Pro Ruling,"['narendramodi', 'launch', 'livelihood']","[{'text': 'narendramodi', 'score': 0.4989}, {'..."
1,2020-05-20 18:15:42+00:00,A step towards economic resilience #aatmanirbh...,BJP4AnN,MOFPI_GOI,bjp4ann,mofpi_goi,BJP,2020.0,ruling,0.960328,Pro Ruling,Pro Ruling,"['resilience', '25000', 'food']","[{'text': 'resilience', 'score': 0.517}, {'tex..."
2,2020-05-20 18:14:57+00:00,.@MOFPI_GOI wholeheartedly welcomes the Cabine...,BJP4AnN,MOFPI_GOI,bjp4ann,mofpi_goi,BJP,2020.0,ruling,0.960328,Pro Ruling,Pro Ruling,"['mofpi_goi', 'cabinet', 'food']","[{'text': 'mofpi_goi', 'score': 0.4442}, {'tex..."


In [3]:
# The 'subjects' column is stored as a string representation of a list
# e.g., "['narendramodi', 'launch', 'livelihood']"
# We need to parse it into an actual Python list

def parse_subjects(subjects_str):
    """
    Parse the subjects string into a list.
    Handles various edge cases like NaN, empty strings, etc.
    """
    if pd.isna(subjects_str):
        return []
    if isinstance(subjects_str, list):
        return subjects_str
    try:
        # Try to parse as a Python literal (e.g., "['a', 'b', 'c']")
        parsed = ast.literal_eval(subjects_str)
        if isinstance(parsed, list):
            return parsed
        return [parsed]
    except (ValueError, SyntaxError):
        # If parsing fails, return empty list
        return []

# Parse the subjects column
df['subjects_list'] = df['subjects'].apply(parse_subjects)

# Check parsing results
print("Sample parsed subjects:")
for i in range(min(5, len(df))):
    print(f"  Original: {df['subjects'].iloc[i]}")
    print(f"  Parsed:   {df['subjects_list'].iloc[i]}")
    print()

Sample parsed subjects:
  Original: ['narendramodi', 'launch', 'livelihood']
  Parsed:   ['narendramodi', 'launch', 'livelihood']

  Original: ['resilience', '25000', 'food']
  Parsed:   ['resilience', '25000', 'food']

  Original: ['mofpi_goi', 'cabinet', 'food']
  Parsed:   ['mofpi_goi', 'cabinet', 'food']

  Original: ['coronavirus', 'russia', 'india']
  Parsed:   ['coronavirus', 'russia', 'india']

  Original: ['condolence', 'whatsapp', 'immediately']
  Parsed:   ['condolence', 'whatsapp', 'immediately']



In [4]:
# Count how many subjects each tweet has
df['num_subjects'] = df['subjects_list'].apply(len)
print("Distribution of number of subjects per tweet:")
print(df['num_subjects'].value_counts().sort_index())

Distribution of number of subjects per tweet:
num_subjects
0        185
1      10879
2      36999
3    2753654
Name: count, dtype: int64


In [5]:
# Explode the subjects_list column
# This creates one row per subject for each tweet
df_exploded = df.explode('subjects_list')

# Rename 'subjects_list' to 'keyword'
df_exploded = df_exploded.rename(columns={'subjects_list': 'keyword'})

# Drop the original 'subjects' and 'num_subjects' columns (no longer needed)
df_exploded = df_exploded.drop(columns=['subjects', 'num_subjects'])

# Reset index
df_exploded = df_exploded.reset_index(drop=True)

print(f"\nAfter exploding:")
print(f"  Original rows: {len(df)}")
print(f"  Exploded rows: {len(df_exploded)}")
print(f"  Expansion factor: {len(df_exploded) / len(df):.2f}x")


After exploding:
  Original rows: 2801717
  Exploded rows: 8346024
  Expansion factor: 2.98x


In [6]:
# Verify the result - show a sample tweet that was split into 3 rows
print("Sample of exploded data (same tweet with different keywords):")
print("=" * 80)

# Find a tweet that appears multiple times (with different keywords)
sample_tweet = df_exploded['tweet'].iloc[0]
sample_rows = df_exploded[df_exploded['tweet'] == sample_tweet][['tweet', 'keyword', 'tweet_label']]

print(f"Tweet (truncated): {sample_tweet[:100]}...")
print(f"\nThis tweet now has {len(sample_rows)} rows, one per keyword:")
for idx, row in sample_rows.iterrows():
    print(f"  - keyword: '{row['keyword']}'")

Sample of exploded data (same tweet with different keywords):
Tweet (truncated): PM ⁦@narendramodi⁩ to launch Garib Kalyan Rojgar Abhiyaan on 20th June to boost livelihood opportuni...

This tweet now has 348 rows, one per keyword:
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyword: 'narendramodi'
  - keyword: 'launch'
  - keyword: 'livelihood'
  - keyw

In [7]:
# View the exploded dataframe
print(f"\nNew DataFrame columns: {df_exploded.columns.tolist()}")
df_exploded.head(10)


New DataFrame columns: ['timestamp', 'tweet', 'retweet_author', 'original_author', 'retweet_lc', 'original_lc', 'retweet_party', 'year', 'side', 'polarity_avg', 'label_0_5', 'tweet_label', 'subjects_scored', 'keyword']


Unnamed: 0,timestamp,tweet,retweet_author,original_author,retweet_lc,original_lc,retweet_party,year,side,polarity_avg,label_0_5,tweet_label,subjects_scored,keyword
0,2020-06-18 11:59:57+00:00,PM ⁦@narendramodi⁩ to launch Garib Kalyan Rojg...,BJP4AnN,PMOIndia,bjp4ann,pmoindia,BJP,2020.0,ruling,0.967184,Pro Ruling,Pro Ruling,"[{'text': 'narendramodi', 'score': 0.4989}, {'...",narendramodi
1,2020-06-18 11:59:57+00:00,PM ⁦@narendramodi⁩ to launch Garib Kalyan Rojg...,BJP4AnN,PMOIndia,bjp4ann,pmoindia,BJP,2020.0,ruling,0.967184,Pro Ruling,Pro Ruling,"[{'text': 'narendramodi', 'score': 0.4989}, {'...",launch
2,2020-06-18 11:59:57+00:00,PM ⁦@narendramodi⁩ to launch Garib Kalyan Rojg...,BJP4AnN,PMOIndia,bjp4ann,pmoindia,BJP,2020.0,ruling,0.967184,Pro Ruling,Pro Ruling,"[{'text': 'narendramodi', 'score': 0.4989}, {'...",livelihood
3,2020-05-20 18:15:42+00:00,A step towards economic resilience #aatmanirbh...,BJP4AnN,MOFPI_GOI,bjp4ann,mofpi_goi,BJP,2020.0,ruling,0.960328,Pro Ruling,Pro Ruling,"[{'text': 'resilience', 'score': 0.517}, {'tex...",resilience
4,2020-05-20 18:15:42+00:00,A step towards economic resilience #aatmanirbh...,BJP4AnN,MOFPI_GOI,bjp4ann,mofpi_goi,BJP,2020.0,ruling,0.960328,Pro Ruling,Pro Ruling,"[{'text': 'resilience', 'score': 0.517}, {'tex...",25000
5,2020-05-20 18:15:42+00:00,A step towards economic resilience #aatmanirbh...,BJP4AnN,MOFPI_GOI,bjp4ann,mofpi_goi,BJP,2020.0,ruling,0.960328,Pro Ruling,Pro Ruling,"[{'text': 'resilience', 'score': 0.517}, {'tex...",food
6,2020-05-20 18:14:57+00:00,.@MOFPI_GOI wholeheartedly welcomes the Cabine...,BJP4AnN,MOFPI_GOI,bjp4ann,mofpi_goi,BJP,2020.0,ruling,0.960328,Pro Ruling,Pro Ruling,"[{'text': 'mofpi_goi', 'score': 0.4442}, {'tex...",mofpi_goi
7,2020-05-20 18:14:57+00:00,.@MOFPI_GOI wholeheartedly welcomes the Cabine...,BJP4AnN,MOFPI_GOI,bjp4ann,mofpi_goi,BJP,2020.0,ruling,0.960328,Pro Ruling,Pro Ruling,"[{'text': 'mofpi_goi', 'score': 0.4442}, {'tex...",cabinet
8,2020-05-20 18:14:57+00:00,.@MOFPI_GOI wholeheartedly welcomes the Cabine...,BJP4AnN,MOFPI_GOI,bjp4ann,mofpi_goi,BJP,2020.0,ruling,0.960328,Pro Ruling,Pro Ruling,"[{'text': 'mofpi_goi', 'score': 0.4442}, {'tex...",food
9,2020-11-27 08:43:45+00:00,Russia agrees to produce coronavirus vaccine S...,VartakKuldeep,timesofindia,vartakkuldeep,timesofindia,INC,2020.0,opposition,-0.120152,Neutral,Neutral,"[{'text': 'coronavirus', 'score': 0.4371}, {'t...",coronavirus


In [8]:
# Save the exploded dataframe
output_path = "/Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/tweets_exploded_by_keyword.csv"
df_exploded.to_csv(output_path, index=False)
print(f"✅ Saved exploded dataframe to: {output_path}")
print(f"   Total rows: {len(df_exploded):,}")

✅ Saved exploded dataframe to: /Users/ziv/Desktop/Partisan Discourse Documentation/codes/2_Keyword_Extraction/output/tweets_exploded_by_keyword.csv
   Total rows: 8,346,024
