In [1]:
!pip install -q spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.



# Research Questions & Scope
To understand public sentiment and discussion trends around electric vehicles (EVs), this analysis addresses the following questions:

1.  **Volume & Growth**: How has interest in EVs changed over time (monthly and quarterly)?
2.  **Community Hubs**: Which subreddits drive most of the discussion (r/electricvehicles, r/cars, r/TeslaMotors)?
3.  **Engagement**: How does post volume compare with comment volume?
4.  **Topic Evolution**: Which topics dominate the discussion in each year?
5.  **Spread & Sentiment**: How is sentiment distributed, and how widely do scores, comment counts, and text lengths vary?

**Subreddits Analyzed**:
-   `r/electricvehicles`
-   `r/cars`
-   `r/TeslaMotors`


In [3]:
#!/usr/bin/env python3
"""
preprocess_and_eda_plots_enhanced.py
"""
import os
import re
import sys
import json
from pathlib import Path
from collections import Counter
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NLP Imports
try:
    import spacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False

try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    VADER_AVAILABLE = True
except ImportError:
    VADER_AVAILABLE = False

import gensim
from gensim import corpora, models
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

# ---------------------------
# Path Configuration
# ---------------------------
# Attempt to locate data relative to this notebook/script
# Assuming structure:
#  root/
#    CSS-datafiles/outputs/ (data here)
#    CSS-preprocessed data/ (this notebook here)

# Input location: the first candidate that exists wins; /content is the
# unconditional last resort when neither relative directory is present.
DATA_DIR = next(
    (
        candidate
        for candidate in (Path("../CSS-datafiles/outputs"), Path("outputs"))
        if candidate.exists()
    ),
    Path("/content"),
)

POSTS_FILE, COMMENTS_FILE = (
    DATA_DIR / csv_name for csv_name in ("posts_final.csv", "comments_final.csv")
)

# Output locations. Creating the nested plots directory with parents=True
# also creates OUTPUT_DIR itself.
OUTPUT_DIR = Path("processed_outputs")
PLOTS_DIR = OUTPUT_DIR / "plots"
EDA_OUTPUT = OUTPUT_DIR / "eda_summary.csv"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------
# Helpers
# ---------------------------
URL_REGEX = re.compile(r"http\S+|www\.\S+")
# Named (&amp;), decimal (&#39;) and hexadecimal (&#x200B;) HTML entities.
HTML_ENTITY = re.compile(r"&(?:#\d+|#[xX][0-9A-Fa-f]+|\w+);")
# The look-behind stops ordinary slash-separated words ("either/or",
# "you/me") from being mistaken for r/<subreddit> or u/<user> mentions.
SUBREDDIT_RE = re.compile(r"(?<![A-Za-z0-9_])r\/[A-Za-z0-9_]+")
USER_RE = re.compile(r"(?<![A-Za-z0-9_])u\/[A-Za-z0-9_\-]+")
# Markdown link "[label](target)"; group 1 is the visible label.
MARKDOWN_LINK_RE = re.compile(r"\[([^\]]+)\]\([^)]+\)")

def clean_text(text):
    """Normalise one raw Reddit text into lowercase, ASCII, punctuation-free words.

    Args:
        text: Raw title/body text; missing values (``None``/NaN) are allowed.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    text = str(text)
    text = text.replace("\n", " ").replace("\r", " ")
    # Unwrap markdown links BEFORE stripping URLs: URL_REGEX consumes the
    # closing ")" together with the URL, after which the link pattern would
    # either not match or swallow everything up to the next ")" in the text.
    text = MARKDOWN_LINK_RE.sub(r"\1", text)
    text = URL_REGEX.sub(" ", text)
    text = HTML_ENTITY.sub(" ", text)
    text = SUBREDDIT_RE.sub(" ", text)
    text = USER_RE.sub(" ", text)
    # Drop every non-ASCII character (emoji, accents, typographic quotes).
    text = text.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

def prepare_nlp():
    """Build the tokenizer used for preprocessing.

    Prefers spaCy's ``en_core_web_sm`` (lemmatised tokens) and falls back to
    NLTK (Porter-stemmed tokens) when spaCy is not installed or the model
    cannot be loaded.

    Returns:
        A ``(proc, backend)`` tuple where ``proc(text)`` returns a list of
        lowercase alphabetic, non-stop-word tokens longer than two characters
        and ``backend`` is ``"spaCy"`` or ``"NLTK"``.
    """
    if SPACY_AVAILABLE:
        try:
            nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        except Exception as exc:
            # Still best-effort, but report why the preferred backend was
            # dropped instead of switching tokenizers silently.
            print(f"[WARN] Could not load spaCy model ({exc}); falling back to NLTK.")
        else:
            print("[INFO] Using spaCy for preprocessing.")
            def proc(text):
                doc = nlp(text)
                return [tok.lemma_.lower() for tok in doc if tok.is_alpha and not tok.is_stop and len(tok)>2]
            return proc, "spaCy"

    # Fallback NLTK. Recent NLTK releases load word_tokenize's model from
    # "punkt_tab", older ones from "punkt", so make sure both are present.
    for resource, pkg in (
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
        ("corpora/stopwords", "stopwords"),
    ):
        try:
            nltk.data.find(resource)
        except LookupError:
            nltk.download(pkg, quiet=True)

    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    print("[INFO] Using NLTK for preprocessing.")
    def proc(text):
        toks = word_tokenize(text)
        toks = [t.lower() for t in toks if t.isalpha() and len(t)>2 and t.lower() not in stop_words]
        toks = [stemmer.stem(t) for t in toks]
        return toks
    return proc, "NLTK"

def top_n_tokens(token_lists, n=20):
    """Return the ``n`` most frequent tokens over all lists as ``(token, count)`` pairs."""
    tally = Counter(token for tokens in token_lists for token in tokens)
    return tally.most_common(n)

def ensure_vader():
    """Return a VADER ``SentimentIntensityAnalyzer``, or ``None`` when unavailable.

    A warning is printed in the ``None`` case so that the missing sentiment
    columns downstream are not a silent surprise.
    """
    if not VADER_AVAILABLE:
        print("[WARN] vaderSentiment is not installed; sentiment scoring will be skipped.")
        return None
    return SentimentIntensityAnalyzer()

def read_or_empty(path):
    """Read a CSV into a DataFrame, returning an empty frame when there is no data.

    Args:
        path: Location of the CSV file, as a ``Path`` or a string.

    Returns:
        The parsed DataFrame, or an empty ``pd.DataFrame()`` (after printing a
        warning) when the file is missing or contains no data at all.
    """
    path = Path(path)
    if not path.exists():
        print(f"[WARN] {path} not found.")
        return pd.DataFrame()
    try:
        return pd.read_csv(path, low_memory=False)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it like a missing one.
        print(f"[WARN] {path} is empty.")
        return pd.DataFrame()

def save_bar(top_list, title, fname, top_n=30):
    """Save a bar chart of the first ``top_n`` ``(token, count)`` pairs to ``fname``.

    Does nothing when ``top_list`` is empty.
    """
    if not top_list:
        return
    labels, heights = zip(*top_list[:top_n])
    positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(positions, heights)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(fname)
    plt.close(fig)

def save_hist(data, title, fname, xlabel="", bins=50):
    """Save a histogram of the non-missing values of ``data`` to ``fname``.

    ``xlabel`` is applied only when it is non-empty.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(data.dropna(), bins=bins)
    ax.set_title(title)
    if xlabel:
        ax.set_xlabel(xlabel)
    fig.tight_layout()
    fig.savefig(fname)
    plt.close(fig)

# ---------------------------
# Enhanced Plotting Functions
# ---------------------------
def plot_posts_vs_comments_bar(n_posts, n_comments):
    """Save a two-bar chart comparing the number of posts and comments."""
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.bar(["Posts", "Comments"], [n_posts, n_comments], color=["skyblue", "orange"])
    ax.set_title("Number of Posts vs Comments")
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(PLOTS_DIR / "posts_vs_comments_sidebyside.png")
    plt.close(fig)
    print("[SAVED] posts_vs_comments_sidebyside.png")

def plot_spread(df, name):
    """Save one box plot per tracked numeric column that exists in ``df``.

    Args:
        df: Frame to summarise; columns that are absent are skipped.
        name: Label used in plot titles and output file names.

    A failure on one column is reported and does not stop the others. The
    figure is always closed, including when plotting fails.
    """
    numerics = ["score", "num_comments", "token_count", "compound"]
    for col in numerics:
        if col not in df.columns:
            continue
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            # Outliers are hidden so the box itself stays readable.
            df.boxplot(column=[col], ax=ax, showfliers=False)
            ax.set_title(f"Spread of {col} in {name}")
            fig.tight_layout()
            fig.savefig(PLOTS_DIR / f"{name}_{col}_spread_boxplot.png")
        except Exception as e:
            print(f"[WARN] Failed boxplot {col}: {e}")
        finally:
            # Closing here rather than in the try body avoids leaking a
            # figure every time a column cannot be plotted.
            plt.close(fig)

def plot_temporal_analysis(df, name):
    """Save monthly (line) and quarterly (bar) volume plots based on ``created_utc``.

    Args:
        df: Frame with a ``created_utc`` column. Numeric values are read as
            epoch seconds; anything else is parsed as date strings.
        name: Label used in plot titles and output file names.

    Returns early without plotting when the frame is empty, the column is
    missing, the timestamps cannot be parsed, or fewer than 10 valid
    timestamps remain.
    """
    if df.empty or 'created_utc' not in df.columns:
        return

    # Handle timestamp
    try:
        raw = df['created_utc']
        if pd.api.types.is_numeric_dtype(raw):
            dt = pd.to_datetime(raw, unit='s', errors='coerce')
        else:
            dt = pd.to_datetime(raw, errors='coerce')
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C still
        # interrupts, and report why the plots are being skipped.
        print(f"[WARN] Could not parse created_utc for {name}: {exc}")
        return

    dt = dt.dropna()
    if len(dt) < 10:
        return

    # Monthly
    m_counts = dt.dt.to_period("M").value_counts().sort_index()
    if not m_counts.empty:
        fig, ax = plt.subplots(figsize=(12, 5))
        m_counts.plot(kind='line', marker='o', ax=ax)
        ax.set_title(f"{name} Volume (Monthly)")
        ax.set_ylabel("Count")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        fig.savefig(PLOTS_DIR / f"{name}_monthly_trend.png")
        plt.close(fig)

    # Quarterly
    q_counts = dt.dt.to_period("Q").value_counts().sort_index()
    if not q_counts.empty:
        fig, ax = plt.subplots(figsize=(10, 5))
        q_counts.plot(kind='bar', color='teal', ax=ax)
        ax.set_title(f"{name} Volume (Quarterly)")
        ax.set_ylabel("Count")
        fig.tight_layout()
        fig.savefig(PLOTS_DIR / f"{name}_quarterly_trend.png")
        plt.close(fig)

    print(f"[SAVED] Temporal plots for {name}")

def run_yearly_lda(df, text_col, name, num_topics=5):
    """Fit one LDA model per calendar year and save the top words of each topic.

    Args:
        df: Frame with ``text_col`` and a ``created_utc`` column. Numeric
            timestamps are read as epoch seconds; anything else is parsed as
            date strings.
        text_col: Name of the column holding the document text.
        name: Label used in messages and in the output file name.
        num_topics: Number of topics per yearly model.

    Years with fewer than 10 documents, or with no vocabulary left after
    frequency filtering, are skipped. Results are written to
    ``OUTPUT_DIR / f"{name}_yearly_topics.csv"`` when at least one year
    succeeds.
    """
    print(f"Starting Yearly LDA for {name}...")
    if df.empty or text_col not in df.columns:
        return
    if 'created_utc' not in df.columns:
        print(f"[WARN] No created_utc column in {name}; skipping yearly LDA.")
        return

    # Ensure year
    try:
        raw = df['created_utc']
        if pd.api.types.is_numeric_dtype(raw):
            dt = pd.to_datetime(raw, unit='s', errors='coerce')
        else:
            dt = pd.to_datetime(raw, errors='coerce')
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C still interrupts.
        print(f"[WARN] Could not parse created_utc for {name}: {exc}")
        return

    df = df.copy()
    df['year'] = dt.dt.year
    df = df.dropna(subset=['year'])

    # The tokenizer below needs NLTK data. Nothing else in this file fetches
    # it when preprocessing ran on spaCy, so make sure it is present here.
    for resource, pkg in (
        ("corpora/stopwords", "stopwords"),
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
    ):
        try:
            nltk.data.find(resource)
        except LookupError:
            nltk.download(pkg, quiet=True)

    # Tokenizer for LDA
    stop_words = set(stopwords.words('english'))
    def fast_tok(text):
        return [w.lower() for w in word_tokenize(str(text)) if w.isalpha() and w.lower() not in stop_words and len(w)>2]

    year_results = []
    for y in sorted(df['year'].unique()):
        sub = df[df['year'] == y]
        if len(sub) < 10:
            continue

        # Missing texts become empty documents instead of the token "nan".
        docs = sub[text_col].fillna("").astype(str).tolist()
        tokens = [fast_tok(d) for d in docs]

        dictionary = corpora.Dictionary(tokens)
        # Filter extremes: less restricted since yearly slices are smaller
        dictionary.filter_extremes(no_below=2, no_above=0.7)
        if len(dictionary) == 0:
            print(f" - {int(y)} skipped (no vocabulary left after filtering)")
            continue
        corpus = [dictionary.doc2bow(t) for t in tokens]

        try:
            lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=5, random_state=42)
            topics_desc = []
            for t in range(num_topics):
                words = ", ".join([w for w, p in lda.show_topic(t, topn=5)])
                topics_desc.append(f"T{t}[{words}]")

            year_results.append({
                "year": int(y),
                "doc_count": len(sub),
                "topics": " | ".join(topics_desc)
            })
            print(f" - {int(y)} done ({len(sub)} docs)")
        except Exception as e:
            print(f"Error LDA year {y}: {e}")

    if year_results:
        out_f = OUTPUT_DIR / f"{name}_yearly_topics.csv"
        pd.DataFrame(year_results).to_csv(out_f, index=False)
        print(f"[SAVED] {out_f}")
    else:
        print("No yearly results generated.")
# ---------------------------
# Main Routine
# ---------------------------
def main():
    """Run the end-to-end preprocessing and EDA pipeline.

    Steps: load the posts/comments CSVs, build the ``clean_text``, ``tokens``
    and ``token_count`` columns, add VADER sentiment columns when an analyzer
    is available, write the plots, run the yearly LDA on posts, and save the
    enriched frames under ``OUTPUT_DIR``.
    """
    print(f"Loading data from {DATA_DIR}...")
    posts = read_or_empty(POSTS_FILE)
    comments = read_or_empty(COMMENTS_FILE)

    # 1. Cleaning & Tokenization
    token_proc_fn, _ = prepare_nlp()

    def process_df(df, label):
        """Add clean_text (when absent), tokens and token_count columns to ``df``."""
        if df.empty:
            return df
        if "clean_text" in df.columns:
            # A clean_text column that came from the CSV holds NaN for blank
            # rows; the tokenizers only accept strings, so normalise first.
            df["clean_text"] = df["clean_text"].fillna("").astype(str)
        elif "title" in df.columns and "selftext" in df.columns:
            combined = df["title"].fillna("").astype(str) + " " + df["selftext"].fillna("").astype(str)
            df["clean_text"] = combined.map(clean_text)
        elif "body" in df.columns:
            df["clean_text"] = df["body"].fillna("").map(clean_text)
        else:
            df["clean_text"] = ""

        # Tokenize
        print(f"[INFO] Tokenizing {len(df)} {label}...")
        df["tokens"] = df["clean_text"].map(token_proc_fn)
        df["token_count"] = df["tokens"].map(len)
        return df

    posts = process_df(posts, "posts")
    comments = process_df(comments, "comments")

    # 2. Sentiment
    sia = ensure_vader()

    def add_sentiment(df):
        """Add neg/neu/pos/compound columns unless a compound column already exists."""
        if df.empty or sia is None:
            return df
        # check if already there
        if "compound" in df.columns:
            return df
        print("Computing sentiment...")
        scores = df["clean_text"].astype(str).map(sia.polarity_scores)
        for key in ("neg", "neu", "pos", "compound"):
            # Bind key as a default so each lambda reads its own score.
            df[key] = scores.map(lambda s, k=key: s[k])
        return df

    posts = add_sentiment(posts)
    comments = add_sentiment(comments)

    # 3. Basic Plots (Token freq, etc - reusing logic concisely)
    if not posts.empty:
        save_bar(top_n_tokens(posts["tokens"]), "Top Terms (Posts)", PLOTS_DIR/"posts_top_terms.png")
        save_hist(posts["token_count"], "Token Count (Posts)", PLOTS_DIR/"posts_len_hist.png")
        if "subreddit" in posts.columns:
            vc = posts["subreddit"].value_counts().head(20)
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.bar(vc.index, vc.values)
            ax.tick_params(axis="x", labelrotation=90)
            ax.set_title("Top Subreddits")
            fig.tight_layout()
            fig.savefig(PLOTS_DIR/"subreddit_dist.png")
            plt.close(fig)

    # 4. ENHANCED ANALYSIS CALLS
    print("Running enhanced analysis...")

    # Posts vs Comments
    plot_posts_vs_comments_bar(len(posts), len(comments))

    # Temporal
    plot_temporal_analysis(posts, "posts")
    plot_temporal_analysis(comments, "comments")

    # Spread
    plot_spread(posts, "posts")
    plot_spread(comments, "comments")

    # Yearly LDA
    run_yearly_lda(posts, "clean_text", "posts")

    # Save processed CSVs
    if not posts.empty:
        posts.to_csv(OUTPUT_DIR/"posts_enhanced.csv", index=False)
    if not comments.empty:
        comments.to_csv(OUTPUT_DIR/"comments_enhanced.csv", index=False)

    print("\nAll Done. Results in", OUTPUT_DIR)

if __name__ == "__main__":
    main()


[INFO] Using spaCy for preprocessing.
[INFO] Tokenizer: spaCy
[SAVED] /content/posts_cleaned.csv (1281 rows)
[SAVED] /content/comments_cleaned.csv (194152 rows)
[INFO] Installing vaderSentiment...
[SAVED] /content/posts_cleaned_with_sent.csv
[SAVED] /content/comments_cleaned_with_sent.csv
[SAVED] EDA summary to /content/eda_summary.csv
[SAVED] posts_top_tokens.png
[SAVED] comments_top_tokens.png
[SAVED] posts_tokencount_hist.png
[SAVED] posts_charlen_hist.png
[SAVED] comments_tokencount_hist.png
[SAVED] comments_charlen_hist.png
[SAVED] posts_subreddit_dist.png
[SAVED] posts_compound_hist.png
[SAVED] comments_compound_hist.png
[SAVED] posts_timeseries_monthly.png
[SAVED] comments_timeseries_monthly.png
[SAVED] posts_token_vs_compound.png
[SAVED] comments_token_vs_compound.png

Done. Files written to: /content
 - comments_cleaned.csv
 - comments_cleaned_with_sent.csv
 - comments_final.csv
 - comments_token_freq.csv
 - eda_summary.csv
 - posts_cleaned.csv
 - posts_cleaned_with_sent.csv
 