# Keyword Tweet Extractor (Custom List)

This notebook extracts ALL tweets for a specific keyword list.

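Features:
- **Primary search**: exact match on the `keyword` column (case-insensitive)
- **Text search**: case-insensitive substring search on the tweet text after every non-alphanumeric character (spaces included) has been removed, so a keyword also matches inside hashtags and inside longer words
- **Both searches ALWAYS run** and their results are combined to extract the maximum number of tweets
- No duplicate tweets across all extractions: a tweet that matches several keywords is assigned to the first matching keyword in list order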

In [1]:
import pandas as pd
import re
import os
from pathlib import Path
from functools import reduce
import operator

# ============================================================================
# CONFIGURATION
# ============================================================================

# Data paths
# The input CSV defaults to the original cluster location. Set the
# TWEETS_CSV_PATH environment variable to run the notebook on another machine
# without editing this cell.
CSV_PATH = os.environ.get(
    "TWEETS_CSV_PATH",
    "/scratch/ziv_baretto/Research_X/Partisan-Discourse-on-X-English-/final_data/tweets_exploded_by_keyword.csv",
)
OUT_DIR  = Path("extracted_by_keyword")

# Seed for the per-keyword shuffle, so reruns give the same row order.
SEED = 42

# ============================================================================
# KEYWORD LIST - Your specified keywords
# ============================================================================
# Order matters: a tweet matching several keywords is claimed by the first
# keyword in this list that matches it.
KEYWORDS = [
    "ayodhya",
    "islamists",
    "balochistan",
    "sharia",
    "sangh",
    "ucc",
    "mahotsav",
    "caa",
    "aatmanirbhar",
    "unemployment",
    "inflation",
    "minorities",
    "hathras",
    "gdp",
    "msp",
    "suicides",
    "lynching",
    "spyware",
    "demonetisation",
    "democracy",
    "bhakts",
    "dictatorship",
    "ratetvdebate",
]

# A repeated keyword would run a second extraction that finds nothing new
# (its tweets are already claimed) and then overwrite the per-keyword output
# files written by the first pass, so reject duplicates early.
_normalized_keywords = [k.strip().lower() for k in KEYWORDS]
if len(set(_normalized_keywords)) != len(_normalized_keywords):
    _repeated_keywords = sorted(
        {k for k in _normalized_keywords if _normalized_keywords.count(k) > 1}
    )
    raise ValueError(f"KEYWORDS contains duplicate entries: {_repeated_keywords}")

# Column configuration
# Candidate names for the tweet-text column, tried in this order.
POSSIBLE_TWEET_COLS = ("tweet", "text", "full_text", "content", "body")
KEYWORD_COL = "keyword"
LABEL_COL = "tweet_label"
# Normalized labels that are kept; every other label is filtered out.
TARGETS = ["pro ruling", "pro opposition"]

print(f"CSV Path: {CSV_PATH}")
print(f"Output Dir: {OUT_DIR}")
print(f"Total keywords: {len(KEYWORDS)}")
print("Mode: EXTRACT ALL (primary + text search combined)")

CSV Path: /scratch/ziv_baretto/Research_X/Partisan-Discourse-on-X-English-/final_data/tweets_exploded_by_keyword.csv
Output Dir: extracted_by_keyword
Total keywords: 23
Mode: EXTRACT ALL (primary + text search combined)


In [2]:
# ---------- Helper functions ----------
def _norm_nospace(x):
    """Lowercase + drop all non-alphanumerics (incl. spaces). Case-insensitive."""
    if isinstance(x, pd.Series):
        return (
            x.fillna("")
             .astype(str)
             .str.lower()  # Case-insensitive: 'RAM' and 'ram' become 'ram'
             .str.replace(r"[^a-z0-9]+", "", regex=True)
        )
    return re.sub(r"[^a-z0-9]+", "", str(x).lower())

def _phrase_variants(s: str) -> list:
    """
    Support ' or ' and '|' as OR separators inside a keyword/phrase.
    Returns the ORIGINAL (lowercased/trimmed) variants.
    """
    raw = str(s).strip()
    parts = re.split(r"\s+or\s+|\|", raw, flags=re.IGNORECASE)
    parts = [p.strip().lower() for p in parts if p.strip()]
    return parts if parts else [raw.lower().strip()]

def _any_contains_norm(tw_norm_series: pd.Series, raw_phrase: str) -> pd.Series:
    """
    Build a boolean mask: tweet contains ANY normalized variant of raw_phrase.
    Case-insensitive text search.

    Parameters:
        tw_norm_series: tweet text already passed through _norm_nospace
            (lowercased, all non-alphanumerics removed).
        raw_phrase: keyword/phrase, optionally holding several alternatives
            separated by ' or ' or '|'.

    Returns:
        Boolean Series on tw_norm_series' index, True where the normalized
        text contains at least one normalized variant as a substring.

    Variants that normalize to an empty string (for example a phrase made
    only of punctuation or non-ASCII characters) are skipped: an empty
    needle is a substring of every string and would otherwise select every
    tweet. If no usable variant remains, the mask is all False.

    Because the text has no separators left, this is a pure substring test:
    a short keyword also matches inside longer words or across former word
    boundaries (e.g. 'ucc' is found inside 'success').
    """
    needles = []
    for variant in _phrase_variants(raw_phrase):
        needle = _norm_nospace(variant)
        if needle and needle not in needles:
            needles.append(needle)

    if not needles:
        return pd.Series(False, index=tw_norm_series.index)

    # Needles contain only [a-z0-9], so a literal (non-regex) search is exact
    # and needs no escaping; na=False keeps the mask strictly boolean.
    masks = [
        tw_norm_series.str.contains(needle, regex=False, na=False)
        for needle in needles
    ]
    return reduce(operator.or_, masks)

In [3]:
# ---------- Load & prep ----------
# Read the whole CSV in one pass, then pick the text column, make sure an id
# column exists, and collapse rows that share the same tweet text.
print("Loading CSV... (this may take a while for large files)")
df = pd.read_csv(CSV_PATH, low_memory=False)
print(f"Loaded {len(df):,} rows")
print(f"Columns: {df.columns.tolist()}")

# choose tweet column: first candidate name that is present in the file
_present_tweet_cols = [c for c in POSSIBLE_TWEET_COLS if c in df.columns]
tweet_col = _present_tweet_cols[0] if _present_tweet_cols else None
if tweet_col is None:
    raise ValueError(f"Couldn't find a tweet/text column. Tried: {POSSIBLE_TWEET_COLS}.")
print(f"Tweet column: {tweet_col}")

# stable id: reuse "source_row" when the file provides it, else the row index
id_col = "source_row"
if id_col not in df.columns:
    df[id_col] = df.index

# de-dup by tweet text (the first row for each distinct text is kept)
# NOTE(review): the input looks exploded by keyword (one row per tweet/keyword
# pair -- confirm). Keeping only the first row per tweet text drops that
# tweet's other `keyword` values before the keyword-column match runs later.
before_dedup = len(df)
df = df.drop_duplicates(subset=[tweet_col]).copy()
print(f"After dedup: {len(df):,} rows (removed {before_dedup - len(df):,} duplicates)")

Loading CSV... (this may take a while for large files)
Loaded 8,346,024 rows
Columns: ['timestamp', 'tweet', 'retweet_author', 'original_author', 'retweet_lc', 'original_lc', 'retweet_party', 'year', 'side', 'polarity_avg', 'label_0_5', 'tweet_label', 'subjects_scored', 'keyword']
Tweet column: tweet
After dedup: 1,079,099 rows (removed 7,266,925 duplicates)


In [4]:
# normalize labels to TARGETS
def normalize_label(x: str) -> str:
    """
    Map a raw label to 'pro ruling', 'pro opposition' or 'other'.

    Non-string input (None, NaN, numbers) yields 'other'. Strings are
    trimmed and lowercased, then tested against the patterns in order;
    the ruling pattern is tried first, so it wins when both would match.
    'pro' may be joined to the next word by any run of '-', '_' or
    whitespace (or nothing).
    """
    if not isinstance(x, str):
        return "other"

    cleaned = x.strip().lower()
    rules = (
        (r"\bpro[-_\s]*rul(?:ing)?\b", "pro ruling"),
        (r"\bpro[-_\s]*(opp|opposition)\b", "pro opposition"),
    )
    for pattern, label in rules:
        if re.search(pattern, cleaned):
            return label
    return "other"

# Validate both required columns before doing any work, so a schema problem
# surfaces as one clear error instead of a KeyError partway through the cell.
if LABEL_COL not in df.columns:
    raise ValueError(f"Column '{LABEL_COL}' not found. Available: {list(df.columns)[:25]}")
if KEYWORD_COL not in df.columns:
    raise ValueError(f"Column '{KEYWORD_COL}' not found. Available: {list(df.columns)[:25]}")

# Collapse the raw labels onto 'pro ruling' / 'pro opposition' / 'other'.
df["_label_norm"] = df[LABEL_COL].apply(normalize_label)
print("Label distribution (before filtering):")
print(df["_label_norm"].value_counts())

# Keep only rows whose normalized label is one of TARGETS.
df = df[df["_label_norm"].isin(TARGETS)].copy()
print(f"\nAfter filtering to TARGETS: {len(df):,} rows")

# lowercase keyword col for primary match
# (astype(str) turns a missing keyword into the literal string "nan")
df["_kw_lc"] = df[KEYWORD_COL].astype(str).str.strip().str.lower()

# normalized tweet text for text search (case-insensitive)
print("Normalizing tweet text for text search (case-insensitive)...")
tw_norm = _norm_nospace(df[tweet_col])
# tw_norm is later sliced with boolean masks built from df, so the two must
# share the same index.
assert tw_norm.index.equals(df.index), "tw_norm is not aligned with df"
print("Done.")

Label distribution (before filtering):
_label_norm
pro ruling        540330
pro opposition    335961
other             202808
Name: count, dtype: int64

After filtering to TARGETS: 876,291 rows
Normalizing tweet text for text search (case-insensitive)...
Done.


In [5]:
# Global set to track all used tweet IDs across keywords (no duplicates)
GLOBAL_USED_IDS = set()

def extract_all_for_keyword(kw_raw: str) -> tuple:
    """
    Collect every not-yet-claimed tweet that matches a keyword.

    Two pools are built from the rows whose id is not in GLOBAL_USED_IDS:
    1. PRIMARY: rows whose lowercased keyword column equals one of the
       keyword's variants.
    2. TEXT SEARCH: rows whose normalized tweet text contains one of the
       normalized variants (case-insensitive substring search).

    Both pools are always used. Text-search rows already present in the
    primary pool are dropped, the two pools are concatenated (primary
    first), shuffled with SEED, and the resulting ids are added to
    GLOBAL_USED_IDS so later keywords cannot claim them again. The keyword
    column of the result is overwritten with the first variant.

    Reads the notebook globals df, tw_norm, id_col, tweet_col, KEYWORD_COL,
    TARGETS and SEED.

    NOTE(review): the text search is a substring test on text with all
    separators removed, so short keywords can match inside unrelated words;
    the recorded output shows text-search hits far above primary hits for
    short keywords such as 'ucc' -- worth a precision check.

    Returns:
        (DataFrame of extracted tweets, stats dict). The stats dict has the
        keys 'total_extracted', 'from_primary', 'from_text_search' and
        'by_label' (count per label in TARGETS).

    Raises:
        AssertionError: if the result holds duplicate ids or duplicate
        tweet texts.
    """
    # variants for this bucket (handles 'or' and '|' separators)
    variants = _phrase_variants(kw_raw)

    # Restrict everything to rows no earlier keyword has taken.
    is_free = ~df[id_col].isin(GLOBAL_USED_IDS)
    free_rows = df[is_free]
    free_text = tw_norm[is_free]

    # Pool 1 -- keyword column equals any variant.
    in_primary = free_rows["_kw_lc"].isin(variants)
    primary_rows = free_rows[in_primary].copy()
    primary_count = len(primary_rows)

    # Pool 2 -- normalized text contains any normalized variant; rows that
    # pool 1 already holds are left out.
    text_rows = free_rows[_any_contains_norm(free_text, kw_raw)].copy()
    already_primary = text_rows[id_col].isin(set(primary_rows[id_col]))
    text_only_rows = text_rows[~already_primary]
    text_search_count = len(text_only_rows)

    # Primary rows first, then text-only rows; the id de-dup is a safety net.
    out_kw = (
        pd.concat([primary_rows, text_only_rows], axis=0)
        .drop_duplicates(subset=[id_col])
        .copy()
    )

    # Shuffle for variety (seeded, so the order is reproducible).
    if not out_kw.empty:
        out_kw = out_kw.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

    # Integrity checks
    n_rows = len(out_kw)
    assert out_kw[id_col].nunique() == n_rows, f"[{kw_raw}] duplicate IDs"
    assert out_kw[tweet_col].nunique() == n_rows, f"[{kw_raw}] duplicate tweets"

    # Claim these ids so later keywords skip them (in-place update of the
    # module-level set, hence no `global` statement is needed).
    GLOBAL_USED_IDS.update(set(out_kw[id_col]))

    # Overwrite keyword column with canonical keyword
    if variants:
        canonical = variants[0]
    else:
        canonical = str(kw_raw).strip().lower()
    out_kw[KEYWORD_COL] = canonical

    # Per-label counts over the final frame.
    label_values = out_kw["_label_norm"]
    stats = {
        "total_extracted": len(out_kw),
        "from_primary": primary_count,
        "from_text_search": text_search_count,
        "by_label": {label: int((label_values == label).sum()) for label in TARGETS},
    }

    return out_kw, stats

In [6]:
# --- Run extraction for all keywords ---
# For each keyword in list order: extract its tweets, remember the frame and
# its stats, and write one CSV plus one id list into OUT_DIR.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Reset global tracking so re-running this cell starts from a clean slate
GLOBAL_USED_IDS = set()

combined = []
reports = {}

print("=" * 80)
print("EXTRACTING ALL TWEETS BY KEYWORD (PRIMARY + TEXT SEARCH)")
print(f"Keywords: {KEYWORDS}")
print("=" * 80)

# Columns wanted in the per-keyword files; absent ones are skipped per frame.
_wanted_cols = [id_col, tweet_col, LABEL_COL, "_label_norm", KEYWORD_COL, "subjects_scored"]

for kw in KEYWORDS:
    out_kw, stat_kw = extract_all_for_keyword(kw)
    combined.append(out_kw)
    reports[kw] = stat_kw

    # write per-keyword files, named after the first variant of the keyword
    cols_out = [c for c in _wanted_cols if c in out_kw.columns]
    canonical_name = _phrase_variants(kw)[0].replace(" ", "_")
    out_csv = OUT_DIR / f"extracted_{canonical_name}.csv"
    out_ids = OUT_DIR / f"extracted_{canonical_name}_ids.txt"

    out_kw[cols_out].to_csv(out_csv, index=False)
    with out_ids.open("w", encoding="utf-8") as id_file:
        id_file.writelines(f"{v}\n" for v in out_kw[id_col].tolist())

    # Status with breakdown
    print(f"[OK] '{kw}'")
    print(f"     -> total: {stat_kw['total_extracted']}, primary: {stat_kw['from_primary']}, text_search: {stat_kw['from_text_search']}")
    print(f"     -> by label: {stat_kw['by_label']}")

print("\n" + "=" * 80)

EXTRACTING ALL TWEETS BY KEYWORD (PRIMARY + TEXT SEARCH)
Keywords: ['ayodhya', 'islamists', 'balochistan', 'sharia', 'sangh', 'ucc', 'mahotsav', 'caa', 'aatmanirbhar', 'unemployment', 'inflation', 'minorities', 'hathras', 'gdp', 'msp', 'suicides', 'lynching', 'spyware', 'demonetisation', 'democracy', 'bhakts', 'dictatorship', 'ratetvdebate']
[OK] 'ayodhya'
     -> total: 2447, primary: 1181, text_search: 1266
     -> by label: {'pro ruling': 1919, 'pro opposition': 528}
[OK] 'islamists'
     -> total: 1480, primary: 817, text_search: 663
     -> by label: {'pro ruling': 1462, 'pro opposition': 18}
[OK] 'balochistan'
     -> total: 650, primary: 431, text_search: 219
     -> by label: {'pro ruling': 638, 'pro opposition': 12}
[OK] 'sharia'
     -> total: 572, primary: 197, text_search: 375
     -> by label: {'pro ruling': 549, 'pro opposition': 23}
[OK] 'sangh'
     -> total: 4469, primary: 40, text_search: 4429
     -> by label: {'pro ruling': 1740, 'pro opposition': 2729}
[OK] 'ucc'
 

In [7]:
# Combined outputs
# Stack every per-keyword frame into one table and write it, plus a flat list
# of its ids, into OUT_DIR.
all_out = pd.concat(combined, axis=0).reset_index(drop=True) if combined else pd.DataFrame()
cols_out_all = [id_col, tweet_col, LABEL_COL, "_label_norm", KEYWORD_COL, "subjects_scored"]
cols_out_all = [c for c in cols_out_all if c in all_out.columns]

# With nothing extracted, all_out is a column-less frame; guard the id lookups
# so the cell still writes (empty) outputs instead of raising KeyError.
has_ids = id_col in all_out.columns

# Each tweet must appear under exactly one keyword.
if has_ids:
    assert all_out[id_col].is_unique, "duplicate tweet IDs across keywords"

total_rows = len(all_out)
all_csv = OUT_DIR / f"extracted_ALL_{len(KEYWORDS)}keywords_{total_rows}rows.csv"
all_ids = OUT_DIR / f"extracted_ALL_{len(KEYWORDS)}keywords_ids.txt"

all_out[cols_out_all].to_csv(all_csv, index=False)
combined_ids = all_out[id_col].tolist() if has_ids else []
with open(all_ids, "w", encoding="utf-8") as f:
    f.writelines(f"{v}\n" for v in combined_ids)

print(f"[OK] Combined: {all_csv} (rows={total_rows})")

[OK] Combined: extracted_by_keyword/extracted_ALL_23keywords_56741rows.csv (rows=56741)


In [8]:
# Summary report
# Print one row per keyword with its extraction counts, then save the same
# table as a CSV in OUT_DIR.
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)

print("\nMode: EXTRACT ALL (primary + text search combined)\n")

# Create summary table: flatten each keyword's stats dict into one record
summary_data = [
    {
        "keyword": keyword,
        "total": keyword_stats["total_extracted"],
        "primary": keyword_stats["from_primary"],
        "text_search": keyword_stats["from_text_search"],
        "pro_ruling": keyword_stats["by_label"].get("pro ruling", 0),
        "pro_opposition": keyword_stats["by_label"].get("pro opposition", 0),
    }
    for keyword, keyword_stats in reports.items()
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

print(f"\n✅ All files saved to: {OUT_DIR}/")
print(f"   Total unique tweets extracted: {total_rows:,}")
print(f"   Keywords processed: {len(KEYWORDS)}")

# Save summary to CSV
summary_csv = OUT_DIR / "extraction_summary.csv"
summary_df.to_csv(summary_csv, index=False)
print(f"   Summary saved to: {summary_csv}")


SUMMARY

Mode: EXTRACT ALL (primary + text search combined)

       keyword  total  primary  text_search  pro_ruling  pro_opposition
       ayodhya   2447     1181         1266        1919             528
     islamists   1480      817          663        1462              18
   balochistan    650      431          219         638              12
        sharia    572      197          375         549              23
         sangh   4469       40         4429        1740            2729
           ucc   7656       56         7600        5632            2024
      mahotsav   5356      298         5058        5264              92
           caa   4139      178         3961        2399            1740
  aatmanirbhar   5132       64         5068        5037              95
  unemployment   1609      684          925         278            1331
     inflation   1200      654          546         589             611
    minorities   1451      183         1268         754             697
  