# Tweet Extractor by Keywords

This notebook extracts tweets from the main dataset for:
1. Keywords from finetuning JSONs (auto-extracted)
2. Custom keywords you specify manually

Features:
- **Primary search**: exact, case-insensitive match on the `keyword` column
- **Fallback search**: substring search over normalized tweet text (lowercased, with all non-alphanumeric characters removed), triggered if the primary search yields < 300 tweets
- **Extract ALL tweets** for each keyword (no sampling limit)
- No duplicate tweets across all extractions

In [1]:
import pandas as pd
import re
import os
from pathlib import Path
from functools import reduce
import operator

# ============================================================================
# CONFIGURATION - Edit these values as needed
# ============================================================================

# Data paths - Updated for server
# NOTE(review): both paths are absolute and machine-specific; they must be
# edited before the notebook can run anywhere else.
# CSV_PATH is the dataset loaded with pd.read_csv.
# JSON_DIR is scanned for files named kyra_<keyword>_stance.json; the keyword
# is taken from the filename with underscores turned into spaces.
CSV_PATH = "/scratch/ziv_baretto/Research_X/Partisan-Discourse-on-X-English-/final_data/tweets_exploded_by_keyword.csv"
JSON_DIR = "/scratch/ziv_baretto/Research_X/Partisan-Discourse-on-X-English-/codes/4_finetuning/4_a_DataProcessing/data_formatting/jsons"
# Output directory, relative to the kernel's working directory. It is created
# (with parents) by the extraction cell before any file is written.
OUT_DIR  = Path("extracted_by_keyword")

# Minimum tweets threshold for primary search before using fallback
# If primary search yields fewer than this many tweets, fallback search is triggered
# (the comparison is strict: exactly MIN_TWEETS_THRESHOLD primary hits does NOT trigger it)
MIN_TWEETS_THRESHOLD = 300

# random_state used when shuffling each keyword's extracted rows
SEED = 42

# ============================================================================
# CUSTOM KEYWORDS - Add your own keywords here!
# Format: ["keyword1", "keyword2", ...]
# Examples:
#   CUSTOM_KEYWORDS = ["demonetization", "article 370", "nrc"]
# Each entry is lowercased and stripped before use. Inside one entry,
# " or " and "|" act as OR separators between alternative phrasings.
# Set to empty [] to skip custom keywords
# ============================================================================
CUSTOM_KEYWORDS = [
    # Add your custom keywords here:
    # "your keyword",
    # "another keyword",
]

# Set to True to include keywords from JSON files, False to skip them
USE_JSON_KEYWORDS = True

# ============================================================================
# Column configuration (usually don't need to change)
# ============================================================================
# Candidate names for the tweet-text column, tried in order; the first one
# present in the CSV is used.
POSSIBLE_TWEET_COLS = ("tweet", "text", "full_text", "content", "body")
# Column compared (lowercased) against each keyword in the primary search.
KEYWORD_COL = "keyword"
# Column holding the raw stance label, normalized to one of TARGETS or "other".
LABEL_COL = "tweet_label"
# Only rows whose normalized label is in TARGETS are kept for extraction.
TARGETS = ["pro ruling", "pro opposition"]

# Echo the effective configuration so the run is self-describing.
print(f"CSV Path: {CSV_PATH}")
print(f"JSON Dir: {JSON_DIR}")
print(f"Output Dir: {OUT_DIR}")
print(f"Min tweets threshold (for fallback): {MIN_TWEETS_THRESHOLD}")

CSV Path: /scratch/ziv_baretto/Research_X/Partisan-Discourse-on-X-English-/final_data/tweets_exploded_by_keyword.csv
JSON Dir: /scratch/ziv_baretto/Research_X/Partisan-Discourse-on-X-English-/codes/4_finetuning/4_a_DataProcessing/data_formatting/jsons
Output Dir: extracted_by_keyword
Min tweets threshold (for fallback): 300


In [2]:
# Assemble the final keyword list: JSON-derived keywords first, then custom ones

KEYWORDS = []

# 1. Keywords encoded in the finetuning JSON filenames (kyra_<keyword>_stance.json)
if not (USE_JSON_KEYWORDS and os.path.exists(JSON_DIR)):
    print("Skipping JSON keywords (USE_JSON_KEYWORDS=False or dir not found)")
else:
    json_files = [name for name in os.listdir(JSON_DIR) if name.endswith('.json')]
    print(f"Found {len(json_files)} JSON files:")

    for jf in sorted(json_files):
        match = re.match(r'kyra_(.+)_stance\.json', jf)
        if match is None:
            # Not a stance file; it is counted above but contributes no keyword
            continue
        kw = match.group(1).replace('_', ' ')
        KEYWORDS.append(kw)
        print(f"  [JSON] '{kw}'")

# 2. Manually configured keywords (stored lowercased/stripped, echoed as typed)
if CUSTOM_KEYWORDS:
    print(f"\nAdding {len(CUSTOM_KEYWORDS)} custom keywords:")
    for kw in CUSTOM_KEYWORDS:
        KEYWORDS.append(kw.lower().strip())
        print(f"  [CUSTOM] '{kw}'")

# Case-insensitive de-duplication; dict keys keep first-occurrence order
unique_keywords = list(dict.fromkeys(entry.lower().strip() for entry in KEYWORDS))
KEYWORDS = unique_keywords

print(f"\n{'='*60}")
print(f"TOTAL KEYWORDS TO EXTRACT: {len(KEYWORDS)}")
print(f"{'='*60}")
for kw in KEYWORDS:
    print(f"  - '{kw}'")

Found 15 JSON files:
  [JSON] 'caa'
  [JSON] 'china'
  [JSON] 'congress'
  [JSON] 'farm laws'
  [JSON] 'farmers protests'
  [JSON] 'hindu'
  [JSON] 'hindutva'
  [JSON] 'kashmir'
  [JSON] 'kashmiri pandits'
  [JSON] 'modi'
  [JSON] 'muslim'
  [JSON] 'new parliament'
  [JSON] 'rahulgandhi'
  [JSON] 'ram mandir'
  [JSON] 'shaheen bagh'

TOTAL KEYWORDS TO EXTRACT: 15
  - 'caa'
  - 'china'
  - 'congress'
  - 'farm laws'
  - 'farmers protests'
  - 'hindu'
  - 'hindutva'
  - 'kashmir'
  - 'kashmiri pandits'
  - 'modi'
  - 'muslim'
  - 'new parliament'
  - 'rahulgandhi'
  - 'ram mandir'
  - 'shaheen bagh'


In [3]:
# ---------- Helper functions ----------
def _norm_nospace(x):
    """Lowercase + drop all non-alphanumerics (incl. spaces). Case-insensitive."""
    if isinstance(x, pd.Series):
        return (
            x.fillna("")
             .astype(str)
             .str.lower()  # Case-insensitive: 'RAM' and 'ram' become 'ram'
             .str.replace(r"[^a-z0-9]+", "", regex=True)
        )
    return re.sub(r"[^a-z0-9]+", "", str(x).lower())

def _phrase_variants(s: str) -> list:
    """
    Support ' or ' and '|' as OR separators inside a keyword/phrase.
    Returns the ORIGINAL (lowercased/trimmed) variants.
    """
    raw = str(s).strip()
    parts = re.split(r"\s+or\s+|\|", raw, flags=re.IGNORECASE)
    parts = [p.strip().lower() for p in parts if p.strip()]
    return parts if parts else [raw.lower().strip()]

def _any_contains_norm(tw_norm_series: pd.Series, raw_phrase: str) -> pd.Series:
    """
    Build a boolean mask: tweet contains ANY normalized variant of raw_phrase.
    This is the FALLBACK search - case-insensitive text search.

    Parameters:
        tw_norm_series: tweet text already passed through _norm_nospace
            (lowercased, non-alphanumerics removed).
        raw_phrase: keyword/phrase; ' or ' and '|' separate alternatives.

    Returns:
        Boolean Series indexed like tw_norm_series. It is all False when no
        variant has any alphanumeric content left after normalization.

    Note: matching is a plain substring test on text with spaces removed, so
    a hit can span what were originally separate words.
    """
    needles = []
    for variant in _phrase_variants(raw_phrase):
        needle = _norm_nospace(variant)
        # An empty needle (e.g. a keyword made only of punctuation) is
        # contained in every string and would select the whole corpus.
        if needle and needle not in needles:
            needles.append(needle)

    if not needles:
        return pd.Series(False, index=tw_norm_series.index)

    # Needles are literal text, so literal matching replaces escape + regex.
    masks = [tw_norm_series.str.contains(needle, regex=False) for needle in needles]
    return reduce(operator.or_, masks)

In [4]:
# ---------- Load & prep ----------
print("Loading CSV... (this may take a while for large files)")
df = pd.read_csv(CSV_PATH, low_memory=False)
print(f"Loaded {len(df):,} rows")
print(f"Columns: {df.columns.tolist()}")

# Tweet-text column: the first candidate name that exists in the CSV
tweet_col = None
for candidate in POSSIBLE_TWEET_COLS:
    if candidate in df.columns:
        tweet_col = candidate
        break
if tweet_col is None:
    raise ValueError(f"Couldn't find a tweet/text column. Tried: {POSSIBLE_TWEET_COLS}.")
print(f"Tweet column: {tweet_col}")

# Stable row id: reuse "source_row" when the CSV has it, otherwise derive it
# from the row index as loaded
id_col = "source_row"
if id_col not in df.columns:
    df[id_col] = df.index

# De-duplicate on tweet text, keeping the first row seen for each text.
# NOTE(review): this runs before keyword matching. If the CSV holds one row
# per (tweet, keyword) pair - presumably, given its filename - only the first
# keyword of each tweet survives for the primary search. Confirm this is intended.
before_dedup = len(df)
df = df.drop_duplicates(subset=[tweet_col]).copy()
print(f"After dedup: {len(df):,} rows (removed {before_dedup - len(df):,} duplicates)")

Loading CSV... (this may take a while for large files)
Loaded 8,346,024 rows
Columns: ['timestamp', 'tweet', 'retweet_author', 'original_author', 'retweet_lc', 'original_lc', 'retweet_party', 'year', 'side', 'polarity_avg', 'label_0_5', 'tweet_label', 'subjects_scored', 'keyword']
Tweet column: tweet
After dedup: 1,079,099 rows (removed 7,266,925 duplicates)


In [5]:
# normalize labels to TARGETS
def normalize_label(x: str) -> str:
    """
    Map a raw stance label onto "pro ruling", "pro opposition" or "other".

    Non-string input (e.g. NaN) is "other". Matching is done on the trimmed,
    lowercased text; "pro" may be joined to "rul"/"ruling" or
    "opp"/"opposition" by any run of '-', '_' or whitespace, or by nothing.
    The ruling rule is checked first.
    """
    if not isinstance(x, str):
        return "other"

    cleaned = x.strip().lower()
    rules = (
        (r"\bpro[-_\s]*rul(?:ing)?\b", "pro ruling"),
        (r"\bpro[-_\s]*(opp|opposition)\b", "pro opposition"),
    )
    for pattern, label in rules:
        if re.search(pattern, cleaned):
            return label
    return "other"

# Collapse the raw labels into TARGETS / "other" and show the split
df["_label_norm"] = df[LABEL_COL].apply(normalize_label)
print(f"Label distribution (before filtering):")
print(df["_label_norm"].value_counts())

# Keep only the rows carrying one of the target labels
in_targets = df["_label_norm"].isin(TARGETS)
df = df.loc[in_targets].copy()
print(f"\nAfter filtering to TARGETS: {len(df):,} rows")

# The primary search needs the keyword column; fail loudly if it is absent
if KEYWORD_COL not in df.columns:
    raise ValueError(f"Column '{KEYWORD_COL}' not found. Available: {list(df.columns)[:25]}")

# Trimmed, lowercased keyword column used for the primary match
keyword_text = df[KEYWORD_COL].astype(str)
df["_kw_lc"] = keyword_text.str.strip().str.lower()

# Normalized tweet text for the fallback search; shares df's index so the
# same boolean masks can be applied to both
print("Normalizing tweet text for fallback search (case-insensitive)...")
tw_norm = _norm_nospace(df[tweet_col])
print("Done.")

Label distribution (before filtering):
_label_norm
pro ruling        540330
pro opposition    335961
other             202808
Name: count, dtype: int64

After filtering to TARGETS: 876,291 rows
Normalizing tweet text for fallback search (case-insensitive)...
Done.


In [6]:
# Global set to track all used tweet IDs across keywords (no duplicates)
GLOBAL_USED_IDS = set()

def extract_all_for_keyword(kw_raw: str) -> tuple:
    """
    Extract ALL not-yet-used tweets for a keyword.

    Search strategy:
    1. PRIMARY: lowercased keyword column equals any variant of kw_raw
    2. FALLBACK: substring search in the normalized tweet text
       (triggered if primary yields < MIN_TWEETS_THRESHOLD tweets);
       fallback hits are added to the primary hits, not substituted.

    Case-insensitivity: 'RAM' and 'ram' are treated as the same.

    Rows whose ID is already in GLOBAL_USED_IDS are excluded up front, and
    the IDs returned here are added to that set. A tweet therefore goes to
    the first keyword that claims it, so results depend on keyword order.

    Parameters:
        kw_raw: keyword/phrase; ' or ' and '|' separate alternatives.

    Returns:
        (DataFrame of extracted tweets, stats dict). The frame is shuffled
        with SEED and its keyword column is overwritten with the first
        variant of kw_raw. The stats dict has keys "total_extracted",
        "from_primary", "from_fallback", "used_fallback" and "by_label".

    Raises:
        AssertionError: if the result contains a repeated ID or tweet text.
    """
    # variants for this bucket (handles 'or' and '|' separators)
    kw_variants = _phrase_variants(kw_raw)

    # Exclude already-used tweets globally
    available_mask = ~df[id_col].isin(GLOBAL_USED_IDS)
    available_df = df[available_mask]
    available_tw_norm = tw_norm[available_mask]

    # PRIMARY pool = keyword column equals any variant (case-insensitive)
    pool_primary = available_df[available_df["_kw_lc"].isin(kw_variants)].copy()
    primary_count = len(pool_primary)

    use_fallback = primary_count < MIN_TWEETS_THRESHOLD
    if use_fallback:
        # FALLBACK pool = tweet text contains ANY normalized variant
        contains_any = _any_contains_norm(available_tw_norm, kw_raw)
        pool_fallback = available_df[contains_any].copy()

        # Keep only fallback rows the primary pool did not already supply
        primary_ids = set(pool_primary[id_col])
        pool_fallback_new = pool_fallback[~pool_fallback[id_col].isin(primary_ids)]

        out_kw = pd.concat([pool_primary, pool_fallback_new], axis=0)
        fallback_count = len(pool_fallback_new)
    else:
        out_kw = pool_primary
        fallback_count = 0

    # Remove duplicates within extraction (just to be safe)
    out_kw = out_kw.drop_duplicates(subset=[id_col]).copy()

    # Shuffle for variety (deterministic for a given SEED and row order)
    if not out_kw.empty:
        out_kw = out_kw.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

    # Integrity checks. duplicated() counts a missing value as a value, so a
    # single row with a NaN ID/tweet is not misreported as a duplicate
    # (nunique() skips NaN and would make such a row fail the check).
    assert not out_kw[id_col].duplicated().any(), f"[{kw_raw}] duplicate IDs"
    assert not out_kw[tweet_col].duplicated().any(), f"[{kw_raw}] duplicate tweets"

    # Record the IDs as used; updating in place needs no `global` rebinding
    GLOBAL_USED_IDS.update(out_kw[id_col])

    # Overwrite keyword column with canonical keyword
    canonical = kw_variants[0] if kw_variants else str(kw_raw).strip().lower()
    out_kw[KEYWORD_COL] = canonical

    # Per-label counts as plain ints so the dict prints cleanly
    by_label = {
        label: int((out_kw["_label_norm"] == label).sum())
        for label in TARGETS
    }
    stats = {
        "total_extracted": len(out_kw),
        "from_primary": primary_count,
        "from_fallback": fallback_count,
        "used_fallback": use_fallback,
        "by_label": by_label,
    }

    return out_kw, stats

In [7]:
# --- Run extraction for all keywords ---
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Fresh tracking set, so re-running this cell does not inherit IDs used by a
# previous run
GLOBAL_USED_IDS = set()

combined = []
reports = {}

print("=" * 80)
print("EXTRACTING ALL TWEETS BY KEYWORD")
print(f"Minimum threshold for fallback: {MIN_TWEETS_THRESHOLD} tweets")
print("=" * 80)

for kw in KEYWORDS:
    out_kw, stat_kw = extract_all_for_keyword(kw)

    combined.append(out_kw)
    reports[kw] = stat_kw

    # Per-keyword files: a CSV restricted to the output columns that exist,
    # plus a text file with one ID per line
    cols_out = [
        c
        for c in (id_col, tweet_col, LABEL_COL, "_label_norm", KEYWORD_COL, "subjects_scored")
        if c in out_kw.columns
    ]
    canonical_name = _phrase_variants(kw)[0].replace(" ", "_")
    out_csv = OUT_DIR / f"extracted_{canonical_name}.csv"
    out_ids = OUT_DIR / f"extracted_{canonical_name}_ids.txt"

    out_kw[cols_out].to_csv(out_csv, index=False)
    out_ids.write_text(
        "".join(f"{v}\n" for v in out_kw[id_col].tolist()),
        encoding="utf-8",
    )

    # Status with breakdown
    if stat_kw["used_fallback"]:
        fallback_status = "[FALLBACK USED]"
    else:
        fallback_status = "[PRIMARY ONLY]"
    print(f"[OK] '{kw}' {fallback_status}")
    print(f"     -> total: {stat_kw['total_extracted']}, primary: {stat_kw['from_primary']}, fallback: {stat_kw['from_fallback']}")
    print(f"     -> by label: {stat_kw['by_label']}")

print("\n" + "=" * 80)

EXTRACTING ALL TWEETS BY KEYWORD
Minimum threshold for fallback: 300 tweets
[OK] 'caa' [FALLBACK USED]
     -> total: 4297, primary: 184, fallback: 4113
     -> by label: {'pro ruling': 2528, 'pro opposition': 1769}
[OK] 'china' [PRIMARY ONLY]
     -> total: 5479, primary: 5479, fallback: 0
     -> by label: {'pro ruling': 2972, 'pro opposition': 2507}
[OK] 'congress' [PRIMARY ONLY]
     -> total: 8416, primary: 8416, fallback: 0
     -> by label: {'pro ruling': 3790, 'pro opposition': 4626}
[OK] 'farm laws' [FALLBACK USED]
     -> total: 3399, primary: 0, fallback: 3399
     -> by label: {'pro ruling': 1046, 'pro opposition': 2353}
[OK] 'farmers protests' [FALLBACK USED]
     -> total: 912, primary: 0, fallback: 912
     -> by label: {'pro ruling': 251, 'pro opposition': 661}
[OK] 'hindu' [PRIMARY ONLY]
     -> total: 6565, primary: 6565, fallback: 0
     -> by label: {'pro ruling': 5213, 'pro opposition': 1352}
[OK] 'hindutva' [PRIMARY ONLY]
     -> total: 1388, primary: 1388, fallba

In [8]:
# Combined outputs: every per-keyword extraction stacked into one frame
cols_wanted = [id_col, tweet_col, LABEL_COL, "_label_norm", KEYWORD_COL, "subjects_scored"]
if combined:
    all_out = pd.concat(combined, axis=0).reset_index(drop=True)
else:
    # No keywords were processed. A column-less frame would make the ID
    # lookup below raise KeyError, so give the empty frame the expected columns.
    all_out = pd.DataFrame(columns=cols_wanted)
cols_out_all = [c for c in cols_wanted if c in all_out.columns]

# The row count is part of the CSV filename, so it is fixed before writing
total_rows = len(all_out)
all_csv = OUT_DIR / f"extracted_ALL_{len(KEYWORDS)}keywords_{total_rows}rows.csv"
all_ids = OUT_DIR / f"extracted_ALL_{len(KEYWORDS)}keywords_ids.txt"

all_out[cols_out_all].to_csv(all_csv, index=False)
# One ID per line, in the same order as the CSV rows
with open(all_ids, "w", encoding="utf-8") as f:
    f.writelines(f"{v}\n" for v in all_out[id_col].tolist())

print(f"[OK] Combined: {all_csv} (rows={total_rows})")

[OK] Combined: extracted_by_keyword/extracted_ALL_15keywords_141311rows.csv (rows=141311)


In [9]:
# Summary report: one stanza per keyword, then overall totals
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)

print(f"\nMin tweets threshold for fallback: {MIN_TWEETS_THRESHOLD}")
print("(If primary search yields fewer tweets, fallback is triggered)\n")

for kw, stat in reports.items():
    if stat["used_fallback"]:
        fallback_status = "YES"
    else:
        fallback_status = "NO"
    # A single print with a newline separator emits the same lines as
    # separate print calls would
    print(
        f"  '{kw}':",
        f"      Total extracted: {stat['total_extracted']}",
        f"      From primary (keyword col): {stat['from_primary']}",
        f"      From fallback (text search, case-insensitive): {stat['from_fallback']}",
        f"      Fallback used: {fallback_status}",
        f"      By label: {stat['by_label']}",
        sep="\n",
    )
    print()

print(f"\n✅ All files saved to: {OUT_DIR}/")
print(f"   Total unique tweets extracted: {total_rows:,}")
print(f"   Keywords processed: {len(KEYWORDS)}")


SUMMARY

Min tweets threshold for fallback: 300
(If primary search yields fewer tweets, fallback is triggered)

  'caa':
      Total extracted: 4297
      From primary (keyword col): 184
      From fallback (text search, case-insensitive): 4113
      Fallback used: YES
      By label: {'pro ruling': 2528, 'pro opposition': 1769}

  'china':
      Total extracted: 5479
      From primary (keyword col): 5479
      From fallback (text search, case-insensitive): 0
      Fallback used: NO
      By label: {'pro ruling': 2972, 'pro opposition': 2507}

  'congress':
      Total extracted: 8416
      From primary (keyword col): 8416
      From fallback (text search, case-insensitive): 0
      Fallback used: NO
      By label: {'pro ruling': 3790, 'pro opposition': 4626}

  'farm laws':
      Total extracted: 3399
      From primary (keyword col): 0
      From fallback (text search, case-insensitive): 3399
      Fallback used: YES
      By label: {'pro ruling': 1046, 'pro opposition': 2353}

  