# Deduplicate and Replace Repeated Tweets

This notebook:
1. For each annotation CSV, keeps only 1 instance of repeated tweets
2. Replaces removed duplicates with new tweets from the main dataset that match the same keyword
3. Saves deduplicated sheets to `deduplicated_sheets/` folder

In [1]:
import pandas as pd
import os
from pathlib import Path
import random

# Paths
CSV_SHEETS_DIR = 'csv_sheets'
OUTPUT_DIR = 'deduplicated_sheets'
MAIN_DATASET_PATH = '/Users/ziv/Desktop/Partisan Discourse Documentation/final_data/tweets_exploded_by_keyword.csv'

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Input folder: {CSV_SHEETS_DIR}")
print(f"Output folder: {OUTPUT_DIR}")
print(f"Main dataset: {MAIN_DATASET_PATH}")

Input folder: csv_sheets
Output folder: deduplicated_sheets
Main dataset: /Users/ziv/Desktop/Partisan Discourse Documentation/final_data/tweets_exploded_by_keyword.csv


In [2]:
# Load the main dataset with all tweets
print("Loading main dataset...")
main_df = pd.read_csv(MAIN_DATASET_PATH, low_memory=False)
print(f"Main dataset loaded: {len(main_df):,} rows")
print(f"Columns: {main_df.columns.tolist()}")
print(f"\nUnique keywords in main dataset: {main_df['keyword'].nunique():,}")
main_df.head(3)

Loading main dataset...
Main dataset loaded: 8,346,024 rows
Columns: ['timestamp', 'tweet', 'retweet_author', 'original_author', 'retweet_lc', 'original_lc', 'retweet_party', 'year', 'side', 'polarity_avg', 'label_0_5', 'tweet_label', 'subjects_scored', 'keyword']

Unique keywords in main dataset: 96,528


Unnamed: 0,timestamp,tweet,retweet_author,original_author,retweet_lc,original_lc,retweet_party,year,side,polarity_avg,label_0_5,tweet_label,subjects_scored,keyword
0,2020-06-18 11:59:57+00:00,PM ‚Å¶@narendramodi‚Å© to launch Garib Kalyan Rojg...,BJP4AnN,PMOIndia,bjp4ann,pmoindia,BJP,2020.0,ruling,0.967184,Pro Ruling,Pro Ruling,"[{'text': 'narendramodi', 'score': 0.4989}, {'...",narendramodi
1,2020-06-18 11:59:57+00:00,PM ‚Å¶@narendramodi‚Å© to launch Garib Kalyan Rojg...,BJP4AnN,PMOIndia,bjp4ann,pmoindia,BJP,2020.0,ruling,0.967184,Pro Ruling,Pro Ruling,"[{'text': 'narendramodi', 'score': 0.4989}, {'...",launch
2,2020-06-18 11:59:57+00:00,PM ‚Å¶@narendramodi‚Å© to launch Garib Kalyan Rojg...,BJP4AnN,PMOIndia,bjp4ann,pmoindia,BJP,2020.0,ruling,0.967184,Pro Ruling,Pro Ruling,"[{'text': 'narendramodi', 'score': 0.4989}, {'...",livelihood


In [3]:
# List all CSV files in the csv_sheets folder (excluding the summary file)
csv_files = [f for f in os.listdir(CSV_SHEETS_DIR) 
             if f.endswith('.csv') and not f.startswith('_')]

print(f"Found {len(csv_files)} CSV files to process:")
for f in csv_files:
    print(f"  - {f}")

Found 15 CSV files to process:
  - shaheen_bagh.csv
  - modi.csv
  - hindu.csv
  - hindutva.csv
  - congress.csv
  - muslim.csv
  - farm_laws.csv
  - kashmir.csv
  - caa.csv
  - farmers_protests.csv
  - kashmiri_pandits.csv
  - china.csv
  - rahulgandhi.csv
  - ram_mandir.csv
  - new_parliament.csv


In [4]:
# First, let's analyze the duplicates in each file
print("=" * 80)
print("DUPLICATE ANALYSIS")
print("=" * 80)

analysis_results = []

for csv_file in csv_files:
    csv_path = os.path.join(CSV_SHEETS_DIR, csv_file)
    df = pd.read_csv(csv_path)
    
    total_rows = len(df)
    unique_tweets = df['tweet'].nunique()
    duplicates_to_replace = total_rows - unique_tweets
    
    # Get the keyword for this sheet (from the 'keyword' or 'matched keyword' column)
    if 'matched keyword' in df.columns:
        keyword = df['matched keyword'].iloc[0] if len(df) > 0 else 'unknown'
    elif 'keyword' in df.columns:
        keyword = df['keyword'].iloc[0] if len(df) > 0 else 'unknown'
    else:
        keyword = csv_file.replace('.csv', '').replace('_', ' ')
    
    analysis_results.append({
        'file': csv_file,
        'keyword': keyword,
        'total_rows': total_rows,
        'unique_tweets': unique_tweets,
        'duplicates_to_replace': duplicates_to_replace
    })
    
    if duplicates_to_replace > 0:
        print(f"\nüìÑ {csv_file}")
        print(f"   Keyword: '{keyword}'")
        print(f"   Total rows: {total_rows}, Unique tweets: {unique_tweets}")
        print(f"   üîÑ Duplicates to replace: {duplicates_to_replace}")

# Show summary
analysis_df = pd.DataFrame(analysis_results)
total_replacements = analysis_df['duplicates_to_replace'].sum()
print(f"\n" + "=" * 80)
print(f"TOTAL DUPLICATES TO REPLACE: {total_replacements}")
print("=" * 80)

DUPLICATE ANALYSIS

üìÑ modi.csv
   Keyword: 'modi'
   Total rows: 150, Unique tweets: 146
   üîÑ Duplicates to replace: 4

üìÑ hindutva.csv
   Keyword: 'hindutva'
   Total rows: 120, Unique tweets: 116
   üîÑ Duplicates to replace: 4

üìÑ congress.csv
   Keyword: 'congress'
   Total rows: 120, Unique tweets: 118
   üîÑ Duplicates to replace: 2

üìÑ kashmir.csv
   Keyword: 'kashmir'
   Total rows: 120, Unique tweets: 116
   üîÑ Duplicates to replace: 4

üìÑ farmers_protests.csv
   Keyword: 'farmers protests'
   Total rows: 120, Unique tweets: 102
   üîÑ Duplicates to replace: 18

üìÑ kashmiri_pandits.csv
   Keyword: 'kashmiri pandits'
   Total rows: 120, Unique tweets: 94
   üîÑ Duplicates to replace: 26

üìÑ china.csv
   Keyword: 'china'
   Total rows: 120, Unique tweets: 119
   üîÑ Duplicates to replace: 1

üìÑ rahulgandhi.csv
   Keyword: 'rahulgandhi'
   Total rows: 120, Unique tweets: 113
   üîÑ Duplicates to replace: 7

TOTAL DUPLICATES TO REPLACE: 66


In [5]:
def process_sheet(csv_path, main_df, output_dir):
    """
    Process a single annotation CSV:
    1. Remove duplicate tweets (keep first occurrence)
    2. Replace removed duplicates with new tweets from main dataset (matching keyword)
    3. Save to output directory
    
    Returns: dict with processing statistics
    """
    # Load the annotation sheet
    sheet_df = pd.read_csv(csv_path)
    original_count = len(sheet_df)
    csv_file = os.path.basename(csv_path)
    
    # Get the keyword for this sheet
    if 'matched keyword' in sheet_df.columns:
        keyword = sheet_df['matched keyword'].iloc[0] if len(sheet_df) > 0 else None
    elif 'keyword' in sheet_df.columns:
        keyword = sheet_df['keyword'].iloc[0] if len(sheet_df) > 0 else None
    else:
        keyword = csv_file.replace('.csv', '').replace('_', ' ')
    
    # Step 1: Remove duplicates, keeping first occurrence
    deduped_df = sheet_df.drop_duplicates(subset=['tweet'], keep='first').copy()
    unique_count = len(deduped_df)
    duplicates_removed = original_count - unique_count
    
    if duplicates_removed == 0:
        # No duplicates, just save as is
        output_path = os.path.join(output_dir, csv_file)
        sheet_df.to_csv(output_path, index=False)
        return {
            'file': csv_file,
            'keyword': keyword,
            'original_count': original_count,
            'duplicates_removed': 0,
            'replacements_added': 0,
            'final_count': original_count,
            'status': 'No duplicates'
        }
    
    # Step 2: Get existing tweets to exclude from replacements
    existing_tweets = set(sheet_df['tweet'].dropna().unique())
    
    # Step 3: Find replacement tweets from main dataset with matching keyword
    # The main dataset has 'keyword' column
    matching_df = main_df[main_df['keyword'] == keyword].copy()
    
    # Exclude tweets already in the sheet
    available_df = matching_df[~matching_df['tweet'].isin(existing_tweets)].copy()
    
    # Step 4: Sample replacement tweets
    replacements_needed = duplicates_removed
    available_count = len(available_df)
    
    if available_count < replacements_needed:
        print(f"  ‚ö†Ô∏è Warning: Only {available_count} replacement tweets available for keyword '{keyword}', need {replacements_needed}")
        replacements_to_add = available_count
    else:
        replacements_to_add = replacements_needed
    
    if replacements_to_add > 0:
        # Sample random replacement tweets
        replacement_tweets = available_df.sample(n=replacements_to_add, random_state=42)
        
        # Create new rows in the annotation format
        new_rows = []
        for _, row in replacement_tweets.iterrows():
            new_row = {
                'source_row': '',  # Will be filled manually if needed
                'tweet': row['tweet'],
                'tweet_label': row.get('tweet_label', ''),
                'subjects': '',  # To be annotated
                'subjects_scored': row.get('subjects_scored', ''),
                'keyword': keyword,
                'label_norm': '',  # To be annotated
                'matched keyword': keyword,
                'stance ': '',  # To be annotated (note the space in column name)
                'stance reason': ''  # To be annotated
            }
            new_rows.append(new_row)
        
        # Add replacement rows to deduplicated dataframe
        replacements_df = pd.DataFrame(new_rows)
        final_df = pd.concat([deduped_df, replacements_df], ignore_index=True)
    else:
        final_df = deduped_df
    
    # Save to output directory
    output_path = os.path.join(output_dir, csv_file)
    final_df.to_csv(output_path, index=False)
    
    return {
        'file': csv_file,
        'keyword': keyword,
        'original_count': original_count,
        'duplicates_removed': duplicates_removed,
        'replacements_added': replacements_to_add,
        'final_count': len(final_df),
        'status': 'Processed'
    }

In [6]:
# Process all CSV files
print("=" * 80)
print("PROCESSING ALL SHEETS")
print("=" * 80)

results = []

for csv_file in csv_files:
    csv_path = os.path.join(CSV_SHEETS_DIR, csv_file)
    print(f"\nüìÑ Processing: {csv_file}")
    
    result = process_sheet(csv_path, main_df, OUTPUT_DIR)
    results.append(result)
    
    if result['duplicates_removed'] > 0:
        print(f"   ‚úì Removed {result['duplicates_removed']} duplicates")
        print(f"   ‚úì Added {result['replacements_added']} replacement tweets")
        print(f"   ‚úì Final count: {result['final_count']} rows")
    else:
        print(f"   ‚úì No duplicates found, copied as-is")

print("\n" + "=" * 80)
print("PROCESSING COMPLETE")
print("=" * 80)

PROCESSING ALL SHEETS

üìÑ Processing: shaheen_bagh.csv
   ‚úì No duplicates found, copied as-is

üìÑ Processing: modi.csv
   ‚úì Removed 4 duplicates
   ‚úì Added 4 replacement tweets
   ‚úì Final count: 150 rows

üìÑ Processing: hindu.csv
   ‚úì No duplicates found, copied as-is

üìÑ Processing: hindutva.csv
   ‚úì Removed 4 duplicates
   ‚úì Added 4 replacement tweets
   ‚úì Final count: 120 rows

üìÑ Processing: congress.csv
   ‚úì Removed 2 duplicates
   ‚úì Added 2 replacement tweets
   ‚úì Final count: 120 rows

üìÑ Processing: muslim.csv
   ‚úì No duplicates found, copied as-is

üìÑ Processing: farm_laws.csv
   ‚úì No duplicates found, copied as-is

üìÑ Processing: kashmir.csv
   ‚úì Removed 4 duplicates
   ‚úì Added 4 replacement tweets
   ‚úì Final count: 120 rows

üìÑ Processing: caa.csv
   ‚úì No duplicates found, copied as-is

üìÑ Processing: farmers_protests.csv
   ‚úì Removed 18 duplicates
   ‚úì Added 0 replacement tweets
   ‚úì Final count: 102 rows

üìÑ Pro

In [7]:
# Summary table
results_df = pd.DataFrame(results)
print("\nüìä SUMMARY TABLE:")
print(results_df[['file', 'keyword', 'original_count', 'duplicates_removed', 'replacements_added', 'final_count', 'status']].to_string(index=False))

# Totals
print(f"\n" + "=" * 80)
print(f"TOTALS:")
print(f"  Total files processed: {len(results_df)}")
print(f"  Total duplicates removed: {results_df['duplicates_removed'].sum()}")
print(f"  Total replacements added: {results_df['replacements_added'].sum()}")
print(f"\n‚úÖ All deduplicated sheets saved to: {OUTPUT_DIR}/")


üìä SUMMARY TABLE:
                file          keyword  original_count  duplicates_removed  replacements_added  final_count        status
    shaheen_bagh.csv     shaheen bagh             150                   0                   0          150 No duplicates
            modi.csv             modi             150                   4                   4          150     Processed
           hindu.csv            hindu             120                   0                   0          120 No duplicates
        hindutva.csv         hindutva             120                   4                   4          120     Processed
        congress.csv         congress             120                   2                   2          120     Processed
          muslim.csv           muslim             150                   0                   0          150 No duplicates
       farm_laws.csv        farm laws             150                   0                   0          150 No duplicates
         ka

In [8]:
# Verification: Check one of the processed files
print("\n" + "=" * 80)
print("VERIFICATION: Sample of processed file")
print("=" * 80)

# Check a file that had duplicates
files_with_dups = results_df[results_df['duplicates_removed'] > 0]['file'].tolist()
if files_with_dups:
    sample_file = files_with_dups[0]
    sample_path = os.path.join(OUTPUT_DIR, sample_file)
    sample_df = pd.read_csv(sample_path)
    
    print(f"\nFile: {sample_file}")
    print(f"Total rows: {len(sample_df)}")
    print(f"Unique tweets: {sample_df['tweet'].nunique()}")
    print(f"\nColumns: {sample_df.columns.tolist()}")
    
    # Show last few rows (the new replacements)
    print(f"\nLast 5 rows (should include replacement tweets):")
    print(sample_df.tail()[['tweet', 'keyword', 'stance ', 'stance reason']])
else:
    print("No files had duplicates to process.")


VERIFICATION: Sample of processed file

File: modi.csv
Total rows: 150
Unique tweets: 150

Columns: ['source_row', 'tweet', 'tweet_label', '_label_norm', 'keyword', 'subjects', 'subjects_scored', 'STANCE ', 'Reasoning ', 'label_norm', 'matched keyword', 'stance ', 'stance reason']

Last 5 rows (should include replacement tweets):
                                                 tweet keyword  stance   \
145  On the sidelines of the @g20org Rome Summit, P...    modi      NaN   
146           Thanku modi ji üôè https://t.co/4pWrUmHBcN    modi      NaN   
147   Modi ji ki Vikas Express https://t.co/U0kdI98oWZ    modi      NaN   
148     Wah Modi ji Wah ...ü§£ü§£ü§£ https://t.co/23QTJPP9SC    modi      NaN   
149     Short Story of Modi ji https://t.co/sik4aKk6gx    modi      NaN   

     stance reason  
145            NaN  
146            NaN  
147            NaN  
148            NaN  
149            NaN  


In [9]:
# Save the processing summary
summary_path = os.path.join(OUTPUT_DIR, '_processing_summary.csv')
results_df.to_csv(summary_path, index=False)
print(f"‚úÖ Processing summary saved to: {summary_path}")

‚úÖ Processing summary saved to: deduplicated_sheets/_processing_summary.csv
