# 0. Fetch Tweets

In [None]:
from datetime import datetime, timedelta
import os
from datetime import timedelta
from datetime import datetime
import time
import datetime
from textblob import TextBlob
import pandas as pd
import tweepy

# Twitter API credentials. They are blank here and have to be filled in
# before any request is sent.
api_key = ""
api_secret = ""
access_token = ""
access_token_secret = ""
bearer_token = ""

# API v1.1 handle, authenticated with the consumer key pair plus access token.
auth = tweepy.OAuthHandler(consumer_key=api_key, consumer_secret=api_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# API v2 handle built from the same credentials, passed by keyword.
client = tweepy.Client(
    bearer_token=bearer_token,
    consumer_key=api_key,
    consumer_secret=api_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
    wait_on_rate_limit=True,
)


# Function to retrieve tweets for a specific time range with pagination

def get_tweets_for_date_range(client, start_date, end_date, max_tweets):
    """Collect up to ``max_tweets`` tweets posted between two datetimes.

    The fixed ME/CFS query (English, no retweets) is sent to
    ``client.search_all_tweets`` page by page, following ``next_token``
    until the limit is reached or the API reports no further page.

    Args:
        client: Object exposing ``search_all_tweets(**kwargs)`` whose result
            has ``data`` (list of tweets or None) and ``meta`` (dict).
        start_date: Window start; formatted with ``strftime`` as
            ``YYYY-MM-DDTHH:MM:SSZ``.
        end_date: Window end, formatted the same way.
        max_tweets: Maximum number of tweets to return.

    Returns:
        A list with at most ``max_tweets`` tweet objects (possibly empty).
    """
    # Convert dates to Twitter API-friendly string format
    start_time = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')

    total_tweets = []
    next_token = None

    while len(total_tweets) < max_tweets:
        remaining = max_tweets - len(total_tweets)
        # The endpoint only accepts page sizes between 10 and 500, so a small
        # remainder is requested as 10 and trimmed after the call.
        page_size = max(10, min(remaining, 500))

        response = client.search_all_tweets(
            query="(me/cfs OR Myalgic encephalomyelitis/chronic fatigue syndrome OR Myalgic encephalomyelitis OR chronic fatigue syndrome) lang:en -is:retweet",
            start_time=start_time,
            end_time=end_time,
            max_results=page_size,
            next_token=next_token
        )

        # `data` is None (not an empty list) when a page has no results.
        page = response.data or []
        total_tweets.extend(page[:remaining])

        meta = getattr(response, 'meta', None) or {}
        next_token = meta.get('next_token')

        # Stop when there is no further page or the limit has been reached.
        if not next_token or len(total_tweets) >= max_tweets:
            break

        # Pause between pages to stay clear of rate limits
        time.sleep(5)

    return total_tweets

# Main logic to fetch tweets for multiple date ranges


def fetch_tweets_for_date_ranges(client, date_ranges, max_tweets_per_month, output_directory):
    """Fetch tweets for every date range and write one CSV per range.

    Each range is walked in windows of 30 days (an approximation of one
    month); every window is fetched with ``get_tweets_for_date_range``.

    Args:
        client: Twitter client passed through to ``get_tweets_for_date_range``.
        date_ranges: Iterable of ``(start_date, end_date)`` datetime pairs.
        max_tweets_per_month: Tweet limit applied to each 30-day window.
        output_directory: Folder receiving ``tweets_<YYYYMM>_to_<YYYYMM>.csv``;
            it is created if missing.
    """
    # Create the folder up front so the first to_csv() cannot fail on it.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for start_date, end_date in date_ranges:
        current_date = start_date
        total_tweets = []

        while current_date < end_date:
            # Approximate one month, but never read past the end of the range.
            next_month = min(current_date + timedelta(days=30), end_date)

            tweets_for_month = get_tweets_for_date_range(client, current_date, next_month, max_tweets_per_month)
            total_tweets.extend(tweets_for_month)

            print(f"Month: {current_date.strftime('%Y-%m')}, Tweets Obtained: {len(tweets_for_month)}")

            current_date = next_month

        output_file = os.path.join(
            output_directory,
            f"tweets_{start_date.strftime('%Y%m')}_to_{end_date.strftime('%Y%m')}.csv",
        )
        # Explicit columns keep the header row even when nothing was fetched.
        df = pd.DataFrame(
            [{'Tweet ID': tweet.id, 'Text': tweet.text} for tweet in total_tweets],
            columns=['Tweet ID', 'Text'],
        )
        df.to_csv(output_file, index=False)
        print(f"Saved {len(total_tweets)} tweets to {output_file}")


# `datetime` has to name the class below. This cell's import list ends with a
# bare `import datetime`, which rebinds the name to the module and makes every
# `datetime(...)` call raise TypeError, so the class is re-imported here.
from datetime import datetime

# Define the date ranges as (start, end) pairs
all_date_ranges = [
    (datetime(2007, 3, 1), datetime(2009, 3, 1)),
    (datetime(2009, 4, 1), datetime(2011, 4, 1)),
    (datetime(2011, 5, 1), datetime(2013, 5, 1)),
    (datetime(2013, 6, 1), datetime(2015, 6, 1)),
    (datetime(2015, 7, 1), datetime(2017, 7, 1)),
    (datetime(2017, 8, 1), datetime(2019, 8, 1)),
    (datetime(2021, 1, 1), datetime(2021, 10, 27)),
    (datetime(2021, 10, 28), datetime(2024, 1, 30))
]
# NOTE(review): consecutive ranges leave gaps (e.g. 2009-03-01 to 2009-04-01,
# 2019-08-01 to 2021-01-01) — confirm this is intended.

# Specify parameters
max_tweets_per_month = 4100
output_directory = os.path.join("datasets", "fetched_tweets")

# Uncomment the line below to fetch tweets after setting up your Twitter client
# fetch_tweets_for_date_ranges(client, all_date_ranges, max_tweets_per_month, output_directory)

def process_and_clean_tweets(file_paths, output_directory):
    """Copy each CSV into ``output_directory`` under a ``cleaned_`` prefix.

    The file is loaded with pandas and written back without the index.
    No text transformation is applied here.

    Args:
        file_paths: Paths of the CSV files to process.
        output_directory: Destination folder; created if it does not exist.
    """
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for file_path in file_paths:
        # Load the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # basename() copes with both "/" and the platform's own separator.
        cleaned_file_name = f"cleaned_{os.path.basename(file_path)}"
        cleaned_csv_file_path = os.path.join(output_directory, cleaned_file_name)

        df.to_csv(cleaned_csv_file_path, index=False)
        print(f"Cleaned tweets saved to {cleaned_csv_file_path}")


# Fetched CSV files to process; all of them live in datasets/fetched_tweets.
file_paths = [
    os.path.join("datasets", "fetched_tweets", csv_name)
    for csv_name in (
        'tweets_october28_2021_to_january30_2024.csv',
        'tweets_jan_2010_to_dec11_2019.csv',
        'tweets_august_2017_to_august_2019.csv',
    )
]

# The prefixed copies are written to datasets/raw.
output_directory = os.path.join("datasets", "raw")

# Run the copy step for every listed file.
process_and_clean_tweets(file_paths, output_directory)

# 1. Clean Data

## 1.1. Unzip Datasets

In [None]:
import py7zr
import os 

# Folder that holds the 7z archive and receives its extracted contents.
# (Named `raw_dir` so the builtin `dir` is not shadowed for later cells.)
raw_dir = os.path.join("datasets", "raw")

with py7zr.SevenZipFile(os.path.join(raw_dir, "datasets.7z"), mode='r') as archive:
    archive.extractall(path=raw_dir)
    print(f"Files extracted to '{raw_dir}'")

## 1.2. Generate Cleaned Datasets

In [None]:
import re
import emoji
from transformers import RobertaTokenizer
from contractions import fix
import pandas as pd
import os
import string
import nltk
from nltk.corpus import stopwords


import html
from bs4 import BeautifulSoup

from nltk import pos_tag

from tqdm import tqdm
import swifter
import dask.dataframe as dd
from multiprocessing import cpu_count

# Enable tqdm's pandas integration.
tqdm.pandas()

# Worker count: every CPU core but one, and never fewer than one.
num_workers = max(1, cpu_count() - 1)

def preprocess(text):
    """Apply the cleaning steps shared by all text variants.

    Steps, in order: coerce to ``str``; decode HTML entities; parse with
    BeautifulSoup and keep only the text; replace a fixed list of literal
    entity names that may still be present; collapse whitespace; delete
    URLs; replace ``@mentions`` with ``<USER>``; delete ``#`` characters;
    expand contractions with ``contractions.fix``.

    Args:
        text: Raw tweet text (any object; it is converted with ``str``).

    Returns:
        The cleaned string.
    """
    text = html.unescape(str(text))
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Literal entity names still in the text after decoding, replaced in
    # this fixed order.
    entity_replacements = (
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&nbsp;", " "),
        ("&cent;", "¢"),
        ("&pound;", "£"),
        ("&yen;", "¥"),
        ("&euro;", "€"),
        ("&copy;", "©"),
        ("&reg;", "®"),
    )
    for entity, character in entity_replacements:
        text = text.replace(entity, character)

    # Non-breaking spaces become plain spaces, then all whitespace runs are
    # collapsed to a single space.
    text = text.replace('\u00A0', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    # Delete URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Replace user mentions with a placeholder token
    text = re.sub(r'@\w+', '<USER>', text)
    # Keep the hashtag word, drop the '#'
    text = re.sub(r'#', '', text)
    # Expand contractions
    return fix(text)
    
def roberta_process(text):
    """Return the text variant used for the RoBERTa model.

    Runs ``preprocess``, converts emojis to their text names with
    ``emoji.demojize`` and strips surrounding whitespace.
    """
    cleaned = preprocess(text)
    return emoji.demojize(cleaned).strip()

def lexicon_preprocess(text):
    """Return the text variant used for lexicon scoring.

    Takes the output of ``roberta_process`` and deletes every character
    that is neither a word character nor whitespace.
    """
    return re.sub(r'[^\w\s]', '', roberta_process(text))

def lda_process(text):
    """Return the text variant used for LDA topic modelling.

    Runs ``preprocess``, removes the ``<USER>`` placeholder, turns ``/``
    into a space, strips punctuation, lower-cases, deletes the condition's
    own names (the search terms), lemmatizes each alphanumeric token with a
    per-word POS tag and drops tokens found in the module-level
    ``stop_words`` set.

    Args:
        text: Raw tweet text.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    # First letter of the POS tag -> WordNet POS constant.
    pos_to_wordnet = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}

    def get_wordnet_pos(word):
        # Tag the single word on its own; unknown tags fall back to noun.
        first_letter = pos_tag([word])[0][1][0].upper()
        return pos_to_wordnet.get(first_letter, wordnet.NOUN)

    text = preprocess(text)
    text = re.sub('<USER>', '', text)
    # Slashes become spaces before the punctuation is removed.
    text = re.sub(r'[^\w\s]', '', text.replace('/', ' '))
    text = text.lower()

    # Names of the condition itself, removed as whole words.
    terms = (
        r"me",
        r"cfs",
        r"cf",
        r"me/cf",
        r"me/cfs",
        r"mecfs",
        r"mecf",
        r"myalgic encephalomyelitis/chronic fatigue syndrome",
        r"myalgic encephalomyelitis",
        r"chronic fatigue syndrome",
    )
    pattern = r"|".join(rf"\b{term}\b" for term in terms)
    text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    lemmatizer = WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in word_tokenize(text)
        if token.isalnum()
    )
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)
    
# File names of the three time-period datasets (read from datasets/raw below).
datasets = [
    "cleaned_tweets_jan_2010_to_dec11_2019.csv",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
    "cleaned_tweets_october28_2021_to_january30_2024.csv"
]

# NLTK data used by the cleaning functions. Newer NLTK releases load the
# tokenizer and tagger from 'punkt_tab' and 'averaged_perceptron_tagger_eng',
# older ones from 'punkt' and 'averaged_perceptron_tagger', so both sets are
# requested.
for resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# English stop words, looked up by lda_process.
stop_words = set(stopwords.words('english'))

# Next free tweet id; it keeps counting across all datasets.
global_id = 1

# Output column -> function that derives it from the raw 'Text' column.
cleaners = {
    'text': roberta_process,
    'text_lexicon': lexicon_preprocess,
    'text_lda': lda_process,
}

for d in datasets:
    # Load the raw dataset and discard rows that have no tweet text.
    df = pd.read_csv(os.path.join("datasets", "raw", d)).dropna(subset=['Text'])

    # Build the three cleaned variants of every tweet.
    for column, cleaner in cleaners.items():
        df[column] = df['Text'].swifter.allow_dask_on_strings(enable=True).apply(cleaner)

    # Keep only rows whose RoBERTa text is non-empty after stripping.
    df = df[df['text'].str.strip().astype(bool)]

    # Assign consecutive ids, continuing from the previous dataset.
    num_rows = len(df)
    df['id'] = range(global_id, global_id + num_rows)
    global_id += num_rows

    df = df[["id", "text", "text_lexicon", "text_lda"]]

    # Report the number of missing values in each text column.
    columns_to_check = ['text', 'text_lexicon', 'text_lda']
    for col in columns_to_check:
        if col not in df.columns:
            print(f"{col}: '{col}' column not found.")
            continue
        na_count = df[col].isna().sum()
        print(f"{col}: {na_count} NaN values found in '{col}' column.")

    # Write the cleaned dataset to datasets/<name>.
    df.to_csv(os.path.join("datasets", d), index=False)

# 2. Sentiment Analysis
## 2.1. RoBERTa

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline
import os
from torch.nn.functional import softmax
import re

# Folder the sentiment result CSVs are written to; created if it is missing.
os.makedirs("sentiment_results", exist_ok=True)

class FileDataset(Dataset):
    """Dataset view over a DataFrame with ``text`` and ``id`` columns.

    Items are addressed by row position (0 .. len-1), so the DataFrame may
    carry any index, not only the default ``RangeIndex``.
    """

    def __init__(self, df):
        super().__init__()
        self.df = df

    def __len__(self):
        """Return the number of rows."""
        return len(self.df.index)

    def __getitem__(self, i):
        """Return the ``text`` value of the row at position ``i``."""
        return self.df['text'].iloc[i]

    def getId(self, i):
        """Return the ``id`` value of the row at position ``i``."""
        return self.df['id'].iloc[i]
    
# Use the first GPU when CUDA is available, otherwise the CPU.
# (`torch.device(-1)` is rejected by PyTorch; -1 only means "CPU" when given
# to the transformers pipeline as a plain int, so name the device explicitly.)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# File names of the cleaned datasets under datasets/.
datasets = [
    "cleaned_tweets_jan_2010_to_dec11_2019.csv",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
    "cleaned_tweets_october28_2021_to_january30_2024.csv"
]

tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# top_k=None requests a score for every label; inputs are truncated to
# 512 tokens.
classifier = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    device=device,
    truncation=True,
    max_length=512,
    top_k=None
)

# Model output label -> result column holding its probability.
label_columns = {
    'LABEL_0': 'probability_negative',  # Assuming LABEL_0 is negative
    'LABEL_1': 'probability_neutral',   # Assuming LABEL_1 is neutral
    'LABEL_2': 'probability_positive',  # Assuming LABEL_2 is positive
}

for dataset in datasets:
    df = pd.read_csv(os.path.join("datasets", dataset))
    dat = FileDataset(df)

    # One score list per probability column, filled in row order.
    scores = {column: [] for column in label_columns.values()}

    predictions = classifier(dat, batch_size=32)
    for row_preds in tqdm(predictions, total=len(dat), disable=False):
        prob_dict = {pred['label']: pred['score'] for pred in row_preds}
        for label, column in label_columns.items():
            # A label missing from the prediction counts as probability 0.
            scores[column].append(prob_dict.get(label, 0.0))

    for column, values in scores.items():
        df[column] = values

    # The sentiment is the class whose probability column is largest.
    winning_column = df[['probability_negative', 'probability_neutral', 'probability_positive']].idxmax(axis=1)
    df['sentiment'] = winning_column.map({
        'probability_negative': 'negative',
        'probability_neutral': 'neutral',
        'probability_positive': 'positive'
    })

    # Write the scored dataset to sentiment_results/.
    df.to_csv(os.path.join("sentiment_results", f"result-roberta-{dataset}"), index=False)

## 2.2. Bing Liu Lexicon (Optional)

In [None]:
import pandas as pd
from textblob import TextBlob
import os
import re

# Read the tab-separated lexicon; the 'word' and 'sentiment' columns are used.
bing_lexicon = pd.read_csv("bing_liu_lexicon.txt", sep='\t')

# Split the words by polarity into sets for constant-time membership tests.
is_positive = bing_lexicon['sentiment'] == 'positive'
is_negative = bing_lexicon['sentiment'] == 'negative'
positive_words = set(bing_lexicon.loc[is_positive, 'word'])
negative_words = set(bing_lexicon.loc[is_negative, 'word'])


def calculate_bing_sentiment(text, positive=None, negative=None):
    """Calculates sentiment using the Bing Lexicon.

    Args:
        text: Text to score. Anything that is not a string (for example the
            NaN pandas produces for an empty CSV cell) is scored as 0.
        positive: Optional collection of positive words; defaults to the
            module-level ``positive_words``.
        negative: Optional collection of negative words; defaults to the
            module-level ``negative_words``.

    Returns:
        Number of positive words minus number of negative words, matched
        case-insensitively on whitespace-separated tokens.
    """
    if not isinstance(text, str):
        return 0
    if positive is None:
        positive = positive_words
    if negative is None:
        negative = negative_words

    # Lower-case each token once and reuse it for both lookups.
    words = [word.lower() for word in text.split()]
    pos_count = sum(1 for word in words if word in positive)
    neg_count = sum(1 for word in words if word in negative)
    return pos_count - neg_count


# File names of the cleaned datasets under datasets/.
datasets = [
    "cleaned_tweets_jan_2010_to_dec11_2019.csv",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
    "cleaned_tweets_october28_2021_to_january30_2024.csv"
]

# This cell may be run without the RoBERTa cell, so create the folder here too.
os.makedirs("sentiment_results", exist_ok=True)

for dataset in datasets:
    df = pd.read_csv(os.path.join("datasets", dataset))

    # Empty 'text_lexicon' cells come back from the CSV as NaN; score them as
    # empty text instead of letting the scorer fail on a float.
    lexicon_text = df['text_lexicon'].fillna('').astype(str)

    # Score each tweet, then map the sign of the score to a label.
    df['sentiment_score'] = lexicon_text.apply(calculate_bing_sentiment)
    df['sentiment'] = df['sentiment_score'].apply(
        lambda score: 'positive' if score > 0 else ('negative' if score < 0 else 'neutral')
    )
    # Save the updated DataFrame
    df.to_csv(os.path.join("sentiment_results", f"result-bing-{dataset}"), index=False)

# 3. Statistical Analysis

In [None]:
import pandas as pd
import os
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.contingency_tables import mcnemar
from sklearn.metrics import cohen_kappa_score
import itertools


# Folder the statistics tables are written to; created if it is missing.
os.makedirs("stat_results", exist_ok=True)

def _sentiment_summary_row(label, frame):
    """Build one table row: label, row count, and "n (pct)" per sentiment."""
    counts = frame['sentiment'].value_counts(normalize=False)
    shares = frame['sentiment'].value_counts(normalize=True)
    cells = [
        f"{counts.get(name, 0)} ({shares.get(name, 0) * 100:.1f})"
        for name in ('positive', 'neutral', 'negative')
    ]
    return [label, len(frame), *cells]


def calculate_sentiment_proportions(df, source):
    """Summarise sentiment counts per time period and test for differences.

    Prints the chi-squared p-value over all time periods and one p-value
    for every pair of time periods.

    Args:
        df: DataFrame with ``time_period`` and ``sentiment`` columns.
        source: Name of the sentiment source; accepted but not used.

    Returns:
        DataFrame with one row per time period (in order of first
        appearance) plus a final ``Total`` row; count cells are formatted
        as ``"n (percentage)"`` with one decimal.
    """
    timeframes = df['time_period'].unique()

    output_data = [
        _sentiment_summary_row(timeframe, df[df['time_period'] == timeframe])
        for timeframe in timeframes
    ]

    # Chi-squared test of sentiment distribution across all time periods
    contingency_table = pd.crosstab(df['time_period'], df['sentiment'])
    _, p_value, _, _ = chi2_contingency(contingency_table)
    print(f"Total Chi^2 P-value: {p_value}")

    for c1, c2 in itertools.combinations(timeframes, 2):
        # Take both crosstab inputs from the same filtered frame so the
        # result does not depend on index alignment.
        pair = df[df['time_period'].isin([c1, c2])]
        contingency_table = pd.crosstab(pair['time_period'], pair['sentiment'])
        _, p_value, _, _ = chi2_contingency(contingency_table)
        print(f"{c1} vs. {c2} Chi^2 P-value: {p_value}")

    output_data.append(_sentiment_summary_row('Total', df))

    return pd.DataFrame(output_data, columns=["Timeframe", "N Tweets", "N Positive (%)", "N Neutral (%)", "N Negative (%)"])


def _paired_agreement(merged):
    """Return (paired-test p-value, Cohen's kappa) for the two label columns."""
    from statsmodels.stats.contingency_tables import SquareTable

    # Put every label that occurs on both axes, in the same order, so the
    # table is square and cell [i, j] pairs the same labels as cell [j, i].
    labels = sorted(set(merged['sentiment_roberta']) | set(merged['sentiment_bing']))
    contingency = pd.crosstab(merged['sentiment_roberta'], merged['sentiment_bing'])
    contingency = contingency.reindex(index=labels, columns=labels, fill_value=0)

    if len(labels) < 2:
        # No disagreement is possible with fewer than two labels.
        p_value = float('nan')
    elif len(labels) == 2:
        p_value = mcnemar(contingency, exact=False).pvalue
    else:
        # mcnemar() only reads cells [0, 1] and [1, 0], so on a 3x3 table it
        # would ignore every pair involving the third label. Bowker's test of
        # symmetry is the extension of McNemar's test to k x k tables.
        p_value = SquareTable(contingency, shift_zeros=False).symmetry(method="bowker").pvalue

    kappa_score = cohen_kappa_score(merged['sentiment_roberta'], merged['sentiment_bing'])
    return p_value, kappa_score


def compare_sentiment(df_roberta, df_bing):
    """Compare RoBERTa and Bing labels per time period and overall.

    Rows of the two frames are paired on ``id``. For each time period found
    in ``df_roberta`` and for all data together, a paired test of the label
    contingency table and Cohen's kappa are computed.

    Args:
        df_roberta: DataFrame with ``id``, ``sentiment``, ``time_period``.
        df_bing: DataFrame with the same columns.

    Returns:
        DataFrame with columns ``Timeframe``, ``McNemar's P-value`` and
        ``Cohen's Kappa``; the last row is ``Overall``. With more than two
        sentiment labels the p-value comes from the McNemar-Bowker
        symmetry test.
    """
    results = []

    # Time-period-specific comparisons
    for timeframe in df_roberta['time_period'].unique():
        roberta_timeframe = df_roberta[df_roberta['time_period'] == timeframe]
        bing_timeframe = df_bing[df_bing['time_period'] == timeframe]

        merged = pd.merge(roberta_timeframe, bing_timeframe, on='id', suffixes=('_roberta', '_bing'))
        p_value, kappa_score = _paired_agreement(merged)
        results.append([timeframe, p_value, kappa_score])

    # Overall comparison
    merged = pd.merge(df_roberta, df_bing, on='id', suffixes=('_roberta', '_bing'))
    p_value, kappa_score = _paired_agreement(merged)
    results.append(['Overall', p_value, kappa_score])

    return pd.DataFrame(results, columns=['Timeframe', "McNemar's P-value", "Cohen's Kappa"])

# Datasets
datasets = [
    "cleaned_tweets_jan_2010_to_dec11_2019.csv",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
    "cleaned_tweets_october28_2021_to_january30_2024.csv"
]

# The sentiment cells write their "result-<model>-<dataset>" files here.
results_dir = "sentiment_results"

# Collect the per-dataset frames in fresh lists so that re-running this cell
# never appends to data left over from an earlier run.
roberta_frames = []
bing_frames = []

for dataset in datasets:
    df_roberta_path = os.path.join(results_dir, f"result-roberta-{dataset}")
    df_bing_path = os.path.join(results_dir, f"result-bing-{dataset}")

    # Load only the columns needed for the statistics
    df_roberta = pd.read_csv(df_roberta_path)[['id', 'sentiment']].copy()
    df_bing = pd.read_csv(df_bing_path)[['id', 'sentiment']].copy()

    # The dataset file name doubles as the time-period label
    df_roberta['time_period'] = dataset
    df_bing['time_period'] = dataset

    df_roberta['source'] = 'roberta'
    df_bing['source'] = 'bing'

    roberta_frames.append(df_roberta)
    bing_frames.append(df_bing)

df_r = pd.concat(roberta_frames, ignore_index=True)
df_b = pd.concat(bing_frames, ignore_index=True)

# Display the results
print("RoBERTa")
df_roberta_output = calculate_sentiment_proportions(df_r, "roberta")
display(df_roberta_output)
df_roberta_output.to_csv(os.path.join("stat_results", "roberta.csv"), index=False)

print("Bing")
df_bing_output = calculate_sentiment_proportions(df_b, "bing")
display(df_bing_output)
df_bing_output.to_csv(os.path.join("stat_results", "bing.csv"), index=False)

print("RoBERTa vs. Bing")
# Perform comparison by timeframe
timeframe_comparison = compare_sentiment(df_r, df_b)
display(timeframe_comparison)
timeframe_comparison.to_csv(os.path.join("stat_results", "roberta_vs_bing.csv"), index=False)

# 4. SHAP Explanation
## 4.1. Chunk Dataset
For parallel processing of SHAP values

In [None]:
import pandas as pd
import os

# File names of the cleaned datasets under datasets/.
datasets = [
    "cleaned_tweets_jan_2010_to_dec11_2019.csv",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
    "cleaned_tweets_october28_2021_to_january30_2024.csv"
]

# Maximum number of rows per chunk file.
chunk_size = 10000

# Running chunk number; it keeps counting across datasets and is read again
# by the SHAP cell.
i_chunk = 0

os.makedirs(os.path.join("datasets", "chunked"), exist_ok=True)

for dataset in datasets:
    df = pd.read_csv(os.path.join("datasets", dataset))

    # Step through the rows in strides of chunk_size; the last slice may be
    # shorter.
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]

        output_file = f"chunk_{i_chunk}_{dataset}"
        chunk.to_csv(os.path.join("datasets", "chunked", output_file), index=False)

        i_chunk += 1
        print(f"Saved: {output_file}")

## 4.2. Calculate SHAP Values in Chunks

In [None]:
import shapley
# Create the shap_results folder if it is missing (the combining cell reads
# its inputs from there).
os.makedirs("shap_results", exist_ok=True)
# Call the project-local shapley module once per chunk number; `i_chunk` is the
# counter left behind by the chunking cell.
# NOTE(review): the meaning of the second argument (0) is not visible from
# this file — confirm against shapley.main.
for i in range(i_chunk):
    shapley.main(i, 0)

## 4.3. Combining Chunked Results

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import csv
import json
import os
from multiprocessing import Pool, cpu_count

from get_shap_scores import get_shap_scores
from multiprocess import Pool, cpu_count
from tqdm import tqdm


def main(max_chunks=92):
    """Combine the per-chunk SHAP result files into one CSV.

    For every dataset and chunk number whose ``sv_probs_chunk_*.pkl`` file
    exists, ``get_shap_scores`` is run in a process pool once per label
    (0, 1, 2). All results are concatenated and written to
    ``shap_results/sv_probs_raw.csv``.

    Args:
        max_chunks: Chunk numbers ``0 .. max_chunks - 1`` are probed; chunk
            files that do not exist are skipped.
    """
    num_workers = max(cpu_count() - 1, 1)

    datasets = [
        "cleaned_tweets_jan_2010_to_dec11_2019.csv",
        "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
        "cleaned_tweets_october28_2021_to_january30_2024.csv"
    ]

    # One task per (existing chunk file, label).
    args = []
    for dataset in datasets:
        for chunk in range(max_chunks):
            fpath = os.path.join("shap_results", f"sv_probs_chunk_{chunk}_{dataset}.pkl")
            if not os.path.exists(fpath):
                continue

            # Only the id range of the chunk is needed here.
            ids = pd.read_csv(
                os.path.join("datasets", "chunked", f"chunk_{chunk}_{dataset}"),
                usecols=['id'],
            )['id']
            id_min = ids.min()
            id_max = ids.max()

            for lb in range(3):
                args.append((fpath, lb, dataset, id_min, id_max))

    with Pool(processes=num_workers) as pool:
        results = list(tqdm(pool.imap(get_shap_scores, args), total=len(args)))

    columns = ["dataset", "label", "feature", "value", "base_value", "id"]

    # Build one frame per result and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all earlier rows each time.
    frames = []
    for features, values, base_values, ids, label, dataset in tqdm(results):
        df_temp = pd.DataFrame({
            "feature": features,
            "value": values,
            "base_value": base_values,
            "id": ids
        })
        df_temp["label"] = label
        df_temp["dataset"] = dataset
        frames.append(df_temp)

    if frames:
        df = pd.concat(frames, ignore_index=True)[columns]
    else:
        df = pd.DataFrame(columns=columns)

    df.to_csv(os.path.join("shap_results", "sv_probs_raw.csv"), index=False)
    display(df.head())


if __name__ == "__main__":
    main()

## 4.4. Process SHAP Values and Generate Figures and Tables

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
from scipy import stats

# NOTE(review): this silences every warning for the rest of the session,
# including pandas and seaborn deprecation notices.
warnings.filterwarnings("ignore")

# Create folders: one sub-folder per figure format
os.makedirs("shap_figures", exist_ok=True)
formats = ["pdf", "png", "tiff"]
for f in formats:
    os.makedirs(os.path.join("shap_figures", f), exist_ok=True)

# Numeric label in the SHAP table -> sentiment name
label_mapping = {
    0: "Negative",
    1: "Neutral",
    2: "Positive"
}

# Dataset file name -> short time-period tag used in tables and file names
time_mapping = {
    "cleaned_tweets_jan_2010_to_dec11_2019.csv": "T1",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv": "T2",
    "cleaned_tweets_october28_2021_to_january30_2024.csv": "T3",
    "total": "All"
}

datasets = [
    "cleaned_tweets_jan_2010_to_dec11_2019.csv",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
    "cleaned_tweets_october28_2021_to_january30_2024.csv"
]

df_sv = pd.read_csv(os.path.join("shap_results", "sv_probs_raw.csv"))

# This will collect the CI table rows
ci_rows = []

# This will collect one metadata row per saved figure
output_rows = []
output_columns = [
    "Dataset", "Time Period", "Sentiment", "Occurrence Threshold",
    "N Features", "N Unique Features", "N Unique Features with >= X Occurrence",
    "Feature Type", "Figure Name"
]

for dataset in datasets + ["total"]:
    # "total" means all datasets together
    if dataset != "total":
        df_d = df_sv[df_sv["dataset"] == dataset].copy()
    else:
        df_d = df_sv.copy()

    # Normalise tokens: drop the "Ġ" marker, lower-case, strip whitespace
    df_d['feature'] = df_d['feature'].str.replace("Ġ", "", regex=False).str.lower().str.strip()

    for lb in range(3):
        df_d_lb = df_d[df_d["label"] == lb]
        feature_counts = df_d_lb['feature'].value_counts()

        for occurrence in [1, 100, 1000]:
            # Keep only features seen at least `occurrence` times
            df_d_lb_o = df_d_lb[df_d_lb['feature'].isin(feature_counts[feature_counts >= occurrence].index)]

            # Group stats for sorting
            grouped = df_d_lb_o.groupby('feature')['value']
            for ascending, feature_type in [(False, "Increase"), (True, "Decrease")]:
                # Top 15 by mean SHAP value: largest first for "Increase",
                # smallest first for "Decrease"
                stats_mean = grouped.mean().sort_values(ascending=ascending)
                top_features = stats_mean.head(15).index

                # Filter data for the top features. The explicit copy makes
                # the column added below belong to this frame only.
                tf_data = df_d_lb_o[df_d_lb_o['feature'].isin(top_features)].copy()

                # Compute per-feature statistics
                for feat in top_features:
                    vals = tf_data.loc[tf_data['feature'] == feat, 'value']
                    n = len(vals)
                    mean = vals.mean()
                    # With n == 1 the sample standard deviation is undefined,
                    # so the interval bounds come out as NaN.
                    sem = vals.std(ddof=1) / np.sqrt(n)
                    ci = sem * stats.t.ppf(0.975, df=n-1)  # 95% t-interval
                    ci_rows.append({
                        "Dataset": dataset,
                        "Time Period": time_mapping[dataset],
                        "Sentiment": label_mapping[lb],
                        "Occurrence": occurrence,
                        "Feature": feat,
                        "Mean_SHAP": mean,
                        "CI_Lower": mean - ci,
                        "CI_Upper": mean + ci,
                        "Feature_Type": feature_type
                    })

                # Prepare for plotting: label each bar with its sample size
                sample_sizes = tf_data.groupby('feature')['value'].count()
                tf_data['feature_with_n'] = tf_data['feature'].apply(
                    lambda x: f"{x} (n={sample_sizes[x]})"
                )
                stats_order = stats_mean.loc[top_features]
                # Red for a positive mean, blue otherwise
                colors = ['#ff5555' if v > 0 else '#55aaff' for v in stats_order]
                order = [f"{f} (n={sample_sizes[f]})" for f in stats_order.index]

                plt.figure(figsize=(8, 6))
                sns.barplot(
                    data=tf_data,
                    x='value',
                    y='feature_with_n',
                    palette=colors,
                    orient='h',
                    order=order,
                    err_kws={'linewidth': 1}
                )
                plt.xlabel('Mean SHAP Value')
                plt.ylabel('Token Feature')
                plt.tight_layout()

                fig_name = f"Mean_{time_mapping[dataset]}_{label_mapping[lb]}_{feature_type}_{occurrence}"
                for f in formats:
                    plt.savefig(
                        os.path.join("shap_figures", f, f"{fig_name}.{f}"),
                        dpi=300,
                        bbox_inches='tight'
                    )
                plt.close()

                # Record metadata
                output_rows.append({
                    "Dataset": dataset,
                    "Time Period": time_mapping[dataset],
                    "Sentiment": label_mapping[lb],
                    "Occurrence Threshold": occurrence,
                    "N Features": len(df_d),
                    "N Unique Features": len(df_d['feature'].unique()),
                    "N Unique Features with >= X Occurrence": len(df_d_lb_o['feature'].unique()),
                    "Feature Type": feature_type,
                    "Figure Name": fig_name
                })

# Save the figure description
output = pd.DataFrame(output_rows, columns=output_columns)
output.to_csv(os.path.join("shap_figures", "shap_figure_description.csv"), index=False)

# Build and save the CI table
ci_df = pd.DataFrame(ci_rows)
ci_df.to_csv(os.path.join("shap_figures", "shap_values_with_95CI.csv"), index=False)

# 5. LDA Topic Modelling
## 5.1. Generate Models

In [None]:
import tweepy
import gensim
from gensim import corpora
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
import pickle
from multiprocessing import cpu_count

# Worker count for LDA training: every CPU core but one, never fewer than one.
num_workers = max(cpu_count() - 1, 1)

# File names of the cleaned datasets under datasets/.
datasets = [
    "cleaned_tweets_jan_2010_to_dec11_2019.csv",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
    "cleaned_tweets_october28_2021_to_january30_2024.csv"
]

# Tokenizer data for word_tokenize: newer NLTK releases load 'punkt_tab',
# older ones 'punkt', so both are requested.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Folder for the pickled LDA models.
os.makedirs("lda_results", exist_ok=True)

def preprocess(text):
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``.

    NOTE(review): in a shared notebook namespace this definition replaces
    the cleaning function of the same name from section 1.2.
    """
    lowered = text.lower()
    return word_tokenize(lowered)


# Read every dataset once; "total" is all of them stacked together.
dataframes = [pd.read_csv(os.path.join("datasets", dataset)) for dataset in datasets]
df_all = pd.concat(dataframes, ignore_index=True)
frames_by_name = dict(zip(datasets, dataframes))
frames_by_name['total'] = df_all

for dataset in datasets + ['total']:
    df = frames_by_name[dataset]

    # Empty 'text_lda' cells are read back from CSV as NaN; treat them as
    # empty documents instead of letting str(NaN) add a literal "nan" token.
    lda_texts = df['text_lda'].fillna('').astype(str)
    tokenized_words = [preprocess(tweet) for tweet in lda_texts]

    # Create dictionary and bag-of-words corpus
    dictionary = corpora.Dictionary(tokenized_words)
    corpus = [dictionary.doc2bow(tweet) for tweet in tokenized_words]

    # Train LDA model
    num_topics = 6
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
        workers=num_workers
    )

    # Persist the fitted model; the figure cell loads it from this path.
    with open(os.path.join("lda_results", f"lda_model_{dataset}.pkl"), "wb") as f:
        pickle.dump(lda_model, f)

    # Display topics
    for idx, topic in lda_model.print_topics(num_words=5):
        print(f"Topic {idx}: {topic}")


## 5.2. Generate Figures and Tables

In [3]:
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import pandas as pd
import os

# Output folders: lda_figures/ plus one sub-folder per image format.
os.makedirs("lda_figures", exist_ok=True)
formats = ["pdf", "png", "tiff"]
for f in formats:
    os.makedirs(os.path.join("lda_figures", f), exist_ok=True)

# Names of the fitted models to plot; "total" is the model over all datasets.
datasets = [
    "cleaned_tweets_jan_2010_to_dec11_2019.csv",
    "cleaned_tweets_dec12_2019_to_oct27_2021.csv",
    "cleaned_tweets_october28_2021_to_january30_2024.csv",
    "total"
]

# One dict per (dataset, topic, word), written to CSV at the end.
rows = []

for dataset in datasets:
    # Load the pickled model written by the training cell.
    model_path = os.path.join("lda_results", f"lda_model_{dataset}.pkl")
    with open(model_path, "rb") as fp:
        lda_model = pickle.load(fp)

    # All topics with their 15 top words as (word, weight) pairs.
    topics = lda_model.show_topics(num_topics=-1, num_words=15, formatted=False)

    for idx, topic in topics:
        topic_number = idx + 1  # topics are reported 1-based
        words = [word for word, _ in topic]
        weights = [weight for _, weight in topic]

        rows.extend(
            {
                "Dataset": dataset,
                "Topic": topic_number,
                "Word": word,
                "Weight": weight
            }
            for word, weight in zip(words, weights)
        )

        # Horizontal bar chart of word weights for this topic.
        plt.figure(figsize=(8, 5))
        sns.barplot(x=weights, y=words, orient="h")
        plt.xlabel("Weight")
        plt.ylabel("Word")
        plt.title(f"{dataset} — Topic {topic_number}")
        plt.tight_layout()

        # Write the figure once per format.
        for fmt in formats:
            figure_path = os.path.join("lda_figures", fmt, f"{dataset}_topic{topic_number}.{fmt}")
            plt.savefig(figure_path, dpi=300, bbox_inches="tight")
        plt.close()

# Table of all topic words and weights.
lda_df = pd.DataFrame(rows)
lda_df.to_csv(os.path.join("lda_figures", "lda_topic_word_weights.csv"), index=False)