# In [None]:  (Jupyter cell marker left over from the notebook export)
# sentiment_full_pipeline.py
# Single-file implementation: data collection -> preprocessing -> VADER -> TF-IDF + LogReg -> BERT -> visualization -> save

import os
import re
import time
import json
import math
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP & ML
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Transformers
from transformers import pipeline

# Scraping & Wordcloud
import snscrape.modules.twitter as sntwitter
from wordcloud import WordCloud

# ------------------------
# Configuration
# ------------------------
# Search parameters; they are the default arguments of collect_tweets().
BRAND = "Tesla"                           # search term placed at the start of the query; change brand here
LANG = "en"                               # value for the query's `lang:` filter
SINCE = "2025-07-01"                      # value for the `since:` filter (inclusive)
UNTIL = "2025-08-01"                      # value for the `until:` filter (exclusive)
MAX_TWEETS = 1000                         # upper bound on the number of tweets to collect
OUTPUT_DIR = "sentiment_outputs"          # directory that receives every CSV, PNG, pickle and JSON artifact
RANDOM_STATE = 42                         # seed passed to train_test_split and LogisticRegression

# Created at import time so that every later save can rely on the directory existing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------------
# NLTK setup
# ------------------------
# Fetch the NLTK resources quietly, one after another, in a fixed order.
for _resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)
del _resource  # keep the loop variable out of the module namespace

# Shared by clean_text(): English stopword lookup set and a reusable lemmatizer.
STOPWORDS = {*stopwords.words('english')}
LEMMATIZER = WordNetLemmatizer()

# ------------------------
# 1. Data Collection (snscrape)
# ------------------------
def collect_tweets(query_brand=BRAND, lang=LANG, since=SINCE, until=UNTIL, max_tweets=MAX_TWEETS):
    """Scrape tweets matching a brand query with snscrape.

    Args:
        query_brand: Search term placed at the start of the query.
        lang: Value for the query's ``lang:`` filter.
        since: Value for the ``since:`` filter.
        until: Value for the ``until:`` filter.
        max_tweets: Maximum number of tweets to keep; zero or a negative
            value collects nothing.

    Returns:
        A DataFrame with the columns ``date``, ``tweet`` and ``username``,
        one row per collected tweet. The columns exist even when no tweet
        was collected.
    """
    from itertools import islice

    query = f"{query_brand} lang:{lang} since:{since} until:{until}"
    print(f"Collecting up to {max_tweets} tweets with query: {query}")

    # islice stops pulling from the scraper as soon as the quota is reached,
    # so no extra item is requested just to discover that we are done.
    # It rejects negative stops, hence the clamp.
    limit = max(int(max_tweets), 0)
    items = sntwitter.TwitterSearchScraper(query).get_items()

    tweets = []
    for tweet in islice(items, limit):
        # Newer snscrape releases expose the text as `rawContent` and warn
        # on the old `content` name; older releases only have `content`.
        text = getattr(tweet, "rawContent", None)
        if text is None:
            text = tweet.content
        tweets.append({
            "date": tweet.date,
            "tweet": text,
            "username": tweet.user.username,
        })

    # Explicit columns keep the schema stable for an empty result.
    df = pd.DataFrame(tweets, columns=["date", "tweet", "username"])
    print(f"Collected {len(df)} tweets.")
    return df

# ------------------------
# 2. Preprocessing
# ------------------------
def clean_text(text):
    """Normalize one tweet into a space-joined string of lemmatized tokens.

    Non-string input yields an empty string. Otherwise URLs and @mentions
    are dropped, the '#' sign is removed (the hashtag word itself is kept),
    every character that is not an ASCII letter or whitespace becomes a
    space, and the text is lower-cased. Tokens that are stopwords or a
    single character long are discarded; the rest are lemmatized.
    """
    if not isinstance(text, str):
        return ""

    # Applied in this order; URLs must go before '@' and '#' are handled.
    substitutions = (
        (r"http\S+", ""),
        (r"@\w+", ""),
        (r"#", ""),
        (r"[^A-Za-z\s]", " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    words = cleaned.lower().split()
    kept = (word for word in words if len(word) > 1 and word not in STOPWORDS)
    return " ".join(LEMMATIZER.lemmatize(word) for word in kept)

# ------------------------
# 3. VADER Sentiment
# ------------------------
def apply_vader(df, text_col='clean_tweet'):
    """Score each row with VADER and attach a three-way sentiment label.

    Two columns are added to ``df`` in place: ``vader_compound`` (the
    compound score returned by VADER) and ``vader_label`` ('Positive',
    'Neutral' or 'Negative').

    Args:
        df: DataFrame holding the text to score.
        text_col: Name of the column whose text is scored.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    analyzer = SentimentIntensityAnalyzer()

    # Missing or non-string cells would break polarity_scores; treat them
    # as empty text.
    texts = df[text_col].fillna("").astype(str)
    df['vader_compound'] = texts.apply(lambda t: analyzer.polarity_scores(t)['compound'])

    def to_label(score):
        # Thresholds follow the VADER documentation: >= 0.05 is positive,
        # <= -0.05 is negative, everything in between is neutral.
        if score >= 0.05:
            return 'Positive'
        if score <= -0.05:
            return 'Negative'
        return 'Neutral'

    df['vader_label'] = df['vader_compound'].apply(to_label)
    return df

# ------------------------
# 4. TF-IDF + Logistic Regression (using VADER labels as pseudo-labels)
# ------------------------
def train_tfidf_logreg(df, text_col='clean_tweet', label_col='vader_label'):
    """Train a TF-IDF + logistic-regression classifier and evaluate it.

    The data is split 80/20 first; the vectorizer is then fitted on the
    training texts only, so the held-out accuracy is not inflated by
    vocabulary or IDF statistics learned from the test texts.

    Side effects: prints the evaluation, pickles the vectorizer and the
    model into OUTPUT_DIR, and writes the held-out texts with their true
    and predicted labels to ``logreg_test_predictions.csv``.

    Args:
        df: DataFrame holding the texts and labels.
        text_col: Column with the text to vectorize; missing values are
            treated as empty strings.
        label_col: Column with the target labels.

    Returns:
        A ``(model, vectorizer, accuracy)`` tuple, where ``accuracy`` is
        measured on the held-out 20%.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when the training
            labels contain a single class or no vocabulary can be built.
    """
    import pickle

    # Prepare data
    texts = df[text_col].fillna("").values
    labels = df[label_col].values

    # A stratified split needs at least two members per class; otherwise
    # fall back to a plain random split instead of failing.
    _, class_counts = np.unique(labels, return_counts=True)
    can_stratify = class_counts.size > 0 and class_counts.min() >= 2
    train_texts, test_texts, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=labels if can_stratify else None,
    )

    # Fit on the training portion only, then reuse the fitted vocabulary.
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    model = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    print("Training LogisticRegression on TF-IDF features...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print("TF-IDF + Logistic Regression results:")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # save model artifacts
    with open(os.path.join(OUTPUT_DIR, 'tfidf_vectorizer.pkl'), 'wb') as f:
        pickle.dump(vectorizer, f)
    with open(os.path.join(OUTPUT_DIR, 'logreg_model.pkl'), 'wb') as f:
        pickle.dump(model, f)

    # Save test results: the raw held-out texts next to true/predicted labels.
    test_df = pd.DataFrame({
        "text": test_texts,
        "true": y_test,
        "pred": y_pred,
    })
    test_df.to_csv(os.path.join(OUTPUT_DIR, 'logreg_test_predictions.csv'), index=False)

    return model, vectorizer, acc

# ------------------------
# 5. BERT Sentiment (transformers pipeline)
# ------------------------
def apply_bert(df, raw_text_col='tweet', batch_size=16):
    """Label each row with the transformers "sentiment-analysis" pipeline.

    Two columns are added to ``df`` in place: ``bert_label`` (the
    pipeline's label, title-cased) and ``bert_score`` (the pipeline's
    score, or None when the result carries none).

    Args:
        df: DataFrame holding the texts.
        raw_text_col: Column with the text to classify; missing values are
            replaced by empty strings.
        batch_size: Number of texts handed to the pipeline per call.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    print("Loading BERT sentiment pipeline (this will download the model)...")
    classifier = pipeline("sentiment-analysis", truncation=True)

    documents = df[raw_text_col].fillna("").tolist()
    print("Classifying with BERT (may take time depending on CPU/GPU)...")

    predictions = []
    for start in tqdm(range(0, len(documents), batch_size)):
        predictions.extend(classifier(documents[start:start + batch_size]))

    # Title-case the labels so that e.g. 'POSITIVE' becomes 'Positive'.
    df['bert_label'] = [pred['label'].title() for pred in predictions]
    df['bert_score'] = [pred.get('score', None) for pred in predictions]
    return df

# ------------------------
# 6. Visualization utilities
# ------------------------
def plot_sentiment_distribution(df, label_col, title, filename):
    """Save a bar chart of label counts to OUTPUT_DIR.

    Args:
        df: DataFrame holding the labels.
        label_col: Column whose values are counted.
        title: Title drawn above the chart.
        filename: Name of the image file created inside OUTPUT_DIR.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    # Fixed bar order: Positive, Neutral, Negative.
    sns.countplot(x=label_col, data=df, order=['Positive', 'Neutral', 'Negative'], ax=ax)
    ax.set_title(title)
    ax.set_xlabel('')
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, filename), dpi=150)
    plt.close(fig)

def plot_wordcloud_from_texts(texts, filename, max_words=150):
    """Render a word cloud from the given texts and save it to OUTPUT_DIR.

    When the joined text yields no words to draw, no image is written and
    a message is printed instead of aborting the caller.

    Args:
        texts: Iterable of strings; they are joined with single spaces.
        filename: Name of the image file created inside OUTPUT_DIR.
        max_words: Maximum number of words shown in the cloud.
    """
    text = " ".join(texts)
    cloud = WordCloud(width=1200, height=600, background_color='white', max_words=max_words)
    try:
        # generate() raises ValueError when no word is left to plot
        # (empty input, or only words the word cloud itself filters out).
        cloud.generate(text)
    except ValueError as exc:
        print(f"Skipping word cloud {filename}: {exc}")
        return

    plt.figure(figsize=(12,6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, filename), dpi=150)
    plt.close()

def plot_time_series_sentiment(df, date_col='date', label_col='vader_label', filename='sentiment_timeseries.png'):
    """Save a line chart of per-day label counts to OUTPUT_DIR.

    The input DataFrame is left untouched; the work happens on a copy.

    Args:
        df: DataFrame holding timestamps and labels.
        date_col: Column with values that pandas can parse as datetimes.
        label_col: Column with the sentiment labels.
        filename: Name of the image file created inside OUTPUT_DIR.
    """
    frame = df.copy()
    frame[date_col] = pd.to_datetime(frame[date_col])
    frame['date_only'] = frame[date_col].dt.date

    # One row per calendar day, one column per label; missing combinations
    # and labels that never occur are filled with 0.
    counts = (
        frame.groupby(['date_only', label_col])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=['Positive', 'Neutral', 'Negative'], fill_value=0)
    )

    ax = counts.plot(kind='line', figsize=(10, 5), marker='o')
    ax.set_title('Daily sentiment counts')
    ax.set_xlabel('Date')
    ax.set_ylabel('Count')
    ax.figure.tight_layout()
    ax.figure.savefig(os.path.join(OUTPUT_DIR, filename), dpi=150)
    plt.close(ax.figure)

# ------------------------
# 7. Main pipeline
# ------------------------
def main():
    """Run the whole pipeline end to end and write every artifact to OUTPUT_DIR.

    Steps: collect tweets, clean them, label with VADER, train and apply the
    TF-IDF + logistic-regression model on the VADER pseudo-labels, classify
    with the transformers pipeline, draw the plots, print agreement figures
    and save the final table and a JSON summary. Returns None; stops early,
    after printing a message, when no tweets were collected.
    """
    # 1. collect
    df = collect_tweets()
    if df.empty:
        print("No tweets collected. Exiting.")
        return

    # save raw
    df.to_csv(os.path.join(OUTPUT_DIR, 'raw_tweets.csv'), index=False)

    # 2. preprocess
    print("Cleaning tweets...")
    df['clean_tweet'] = df['tweet'].apply(clean_text)
    df.to_csv(os.path.join(OUTPUT_DIR, 'cleaned_tweets.csv'), index=False)

    # basic info
    print("Sample cleaned tweets:")
    print(df['clean_tweet'].head(5).to_list())

    # 3. VADER
    # Score the raw tweet, not the cleaned one: VADER's rules use negations,
    # punctuation, capitalisation and emoticons, and clean_text() removes all
    # of them (NLTK's English stopwords include words such as "not" and "no").
    df = apply_vader(df, text_col='tweet')
    df.to_csv(os.path.join(OUTPUT_DIR, 'with_vader.csv'), index=False)

    # 4. TF-IDF + Logistic Regression
    model, vectorizer, tfidf_acc = train_tfidf_logreg(df)
    # store a prediction column using logreg for all data
    X_all = vectorizer.transform(df['clean_tweet'].fillna("").tolist())
    df['logreg_pred'] = model.predict(X_all)

    # 5. BERT classification
    df = apply_bert(df)
    df.to_csv(os.path.join(OUTPUT_DIR, 'with_all_models.csv'), index=False)

    # 6. Visualizations
    plot_sentiment_distribution(df, 'vader_label', 'VADER Sentiment Distribution', 'vader_distribution.png')
    plot_sentiment_distribution(df, 'logreg_pred', 'LogReg (TF-IDF) Sentiment Distribution', 'logreg_distribution.png')
    plot_sentiment_distribution(df, 'bert_label', 'BERT Sentiment Distribution', 'bert_distribution.png')

    # wordclouds for positive & negative (by VADER); tweets whose cleaned
    # text is empty contribute no words and are left out
    positive_texts = [t for t in df[df['vader_label']=='Positive']['clean_tweet'].tolist() if t]
    negative_texts = [t for t in df[df['vader_label']=='Negative']['clean_tweet'].tolist() if t]
    if positive_texts:
        plot_wordcloud_from_texts(positive_texts, 'wordcloud_positive_vader.png')
    if negative_texts:
        plot_wordcloud_from_texts(negative_texts, 'wordcloud_negative_vader.png')

    # time series (VADER)
    plot_time_series_sentiment(df, label_col='vader_label', filename='vader_timeseries.png')

    # 7. Simple evaluation: compare VADER vs LogReg vs BERT
    # Agreement is measured against the VADER pseudo-label. LogReg was
    # trained on those labels; BERT is independent of them.
    # NOTE(review): if the BERT pipeline never emits 'Neutral', its agreement
    # is capped by the share of non-neutral VADER labels -- confirm.
    print("\nAgreement/Evaluation summary (reference = VADER pseudo-label):")
    for col in ['logreg_pred', 'bert_label']:
        # drop NaNs
        valid = df[~df[col].isna()]
        agree = (valid[col].values == valid['vader_label'].values).mean()
        # ':.0%' rounds; int(agree*100) truncated (0.29 was printed as 28%)
        print(f"{col} agreement with VADER: {agree:.4f} ({agree:.0%})")

    # Confusion table example (vader vs bert), normalised per VADER row
    cm = pd.crosstab(df['vader_label'], df['bert_label'], rownames=['VADER'], colnames=['BERT'], normalize='index')
    cm.to_csv(os.path.join(OUTPUT_DIR, 'confusion_vader_bert.csv'))

    # Save final dataframe
    df.to_csv(os.path.join(OUTPUT_DIR, 'final_results.csv'), index=False)
    # Save summary JSON
    summary = {
        "brand": BRAND,
        "since": SINCE,
        "until": UNTIL,
        "n_tweets": len(df),
        "tfidf_logreg_accuracy_est": tfidf_acc
    }
    with open(os.path.join(OUTPUT_DIR, 'summary.json'), 'w') as f:
        json.dump(summary, f, indent=2, default=str)

    # List the directory last so the final CSV and the summary are included.
    print("\nSaved outputs to", OUTPUT_DIR)
    print("Top-level files:")
    for fname in os.listdir(OUTPUT_DIR):
        print(" -", fname)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
