## Imports

In [None]:
# reddit
import praw

# nlp
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# graph
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import networkx as nx


# config
from secrets_config import RedditSecretsConfig

# system
from datetime import datetime, timezone
from collections import defaultdict, Counter
from pytz import UTC
import pandas as pd
import unicodedata
import emoji
import csv
import ast
import re

import site
site.addsitedir("NLP")


import re
import emoji
import unicodedata
import spacy

: 

## CONFIG Class

In [9]:
class CONFIG:
    """Static configuration shared by the fetching, processing and graph code.

    All attributes are class-level constants; the class is used as a
    namespace and is never instantiated in this file.
    """

    # People whose names are used as Reddit search keywords and as graph entities.
    INDIVIDUALS = [
        "Elon Musk",           # CEO of Tesla. Founder, product architect, and central to all decisions at Tesla.
        "Jeff Bezos",          # Amazon founder. Business and space rival to Musk (Blue Origin vs. SpaceX); often compared to Musk.
        "Larry Page",          # Google co-founder. Longtime friend of Musk; Google/Waymo competes with Tesla in autonomous driving.
        "Donald Trump",        # U.S. President. Influences public opinion and policy; has made comments on Tesla and Musk.
        "Mark Zuckerberg",     # Meta CEO. Competes with Musk in social media; has made comments on Tesla and Musk.
        "Joe Biden",           # U.S. President. Initially dismissive of Tesla in EV discussions; later acknowledged its EV leadership.
        "Cathie Wood",         # CEO of ARK Invest. Major Tesla bull and investor; forecasts extremely high valuations for Tesla.
        "Jim Cramer",          # CNBC host. Publicly flip-flopped on Tesla; currently supportive but controversial in Tesla circles.
        "Chamath Palihapitiya",# VC and SPAC investor. Public Tesla bull and Musk supporter; promoted Tesla on media.
        "Michael Burry",       # Famed for The Big Short. Publicly shorted Tesla; skeptical of valuation.
        "Gavin Newsom",        # Governor of California. Tesla's home state; has made comments on Tesla and Musk.
        "Alexandria Ocasio-Cortez", # U.S. Congresswoman. Criticized Musk and Tesla on social issues; represents a younger, progressive demographic.
        "Pete Buttigieg",      # U.S. Secretary of Transportation. Has commented on Tesla's role in EV adoption and infrastructure.
        "Bernie Sanders",      # U.S. Senator. Criticized Musk for wealth and influence; represents a progressive viewpoint on wealth inequality.
    ]

    # Subreddits searched for every individual.
    SUBREDDITS = [
        'TeslaMotors',         # Main Tesla discussion hub
        'TeslaInvestorsClub',  # Tesla investment focused
        'wallstreetbets',      # Retail trading community
        'investing',           # General investment discussions
        'electricvehicles',    # General EV discussions
        'technology',          # General tech discussions
        'politics',            # U.S. political discussions
        'RealTesla',           # Critical Tesla perspectives
        'elonmusk'             # Elon Musk specific
    ]

    # Canonical name -> alternative spellings/nicknames treated as a mention
    # of that person. Keys match the entries of INDIVIDUALS.
    ALIASES = {
        "Elon Musk": ["Elon", "Musk", "ElonMusk", "SpaceX", "X.com", "Tesla CEO"],
        "Jeff Bezos": ["Bezos", "Jeff", "Amazon founder", "Blue Origin", "JB"],
        "Mark Zuckerberg": ["Zuck", "Zuckerberg", "Meta CEO", "Facebook"],
        "Larry Page": ["Larry", "Google co-founder", "Alphabet"],
        "Donald Trump": ["Trump", "Donald", "POTUS 45", "45th President", "The Donald"],
        "Joe Biden": ["Biden", "President Biden", "Joe"],
        "Cathie Wood": ["Cathie", "ARK", "ARK Invest", "Cathie W", "ARKK"],
        "Jim Cramer": ["Cramer", "Mad Money", "Jim", "CNBC host"],
        "Chamath Palihapitiya": ["Chamath", "Chamath P", "Social Capital", "SPAC King"],
        "Michael Burry": ["Burry", "The Big Short", "Dr. Burry", "Scion Capital"],
        "Gavin Newsom": ["Newsom", "Governor Newsom", "CA Governor"],
        "Alexandria Ocasio-Cortez": ["AOC", "Ocasio-Cortez", "Congresswoman AOC"],
        "Pete Buttigieg": ["Buttigieg", "Mayor Pete", "Transportation Secretary"],
        "Bernie Sanders": ["Bernie", "Senator Sanders", "Sanders"],
    }

    # Company names that are tracked as additional graph nodes when mentioned.
    COMPARATIVE_COMPANIES = [
        "Rivian", "NIO", "Lucid", "BYD", "Ford", "GM", "Apple", "Meta", "Palantir"
    ]

    # Period label -> (start date, end date) as ISO "YYYY-MM-DD" strings.
    TIME_PERIODS = {
        "<2020": ("2010-01-01", "2019-12-31"),
        "2020-2021": ("2020-01-01", "2021-12-31"),
        "2022-2023": ("2022-01-01", "2023-12-31"),
        "2024-2025": ("2024-01-01", "2025-12-31"),
    }

    POST_LIMIT = 50              # max search results requested per (subreddit, keyword)
    COMMENT_LIMIT = 20           # max top-level comments kept per post
    SENTIMENT_THRESHOLD = 0.03   # |compound score| at or above this is labelled positive/negative

## Reddit Client Class

In [10]:
class RedditClient:
    """Fetch Reddit posts with PRAW and cache them as one CSV per individual.

    The CSV cache lives in ``posts_data/<individual_name>.csv`` and is read
    back by :meth:`get_all_posts`.
    """

    def __init__(self):
        self.reddit = praw.Reddit(
            client_id=RedditSecretsConfig.client_id,
            client_secret=RedditSecretsConfig.client_secret,
            user_agent=RedditSecretsConfig.user_agent
        )
        self.all_posts = []

    @staticmethod
    def _csv_path(individual):
        """Return the cache file path used for *individual*."""
        return f"posts_data/{individual.lower().replace(' ', '_')}.csv"

    def _fetch_top_posts(self, subreddit_name, keyword, post_limit=CONFIG.POST_LIMIT):
        """Search *subreddit_name* for *keyword* and return the top posts as dicts.

        Each dict has the keys ``title``, ``created_utc`` (timezone-aware UTC
        datetime), ``score``, ``num_comments`` and ``comments``.
        """
        subreddit = self.reddit.subreddit(subreddit_name)
        return [
            {
                'title': post.title,
                'created_utc': datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
                'score': post.score,
                'num_comments': post.num_comments,
                'comments': self._fetch_top_comments(post, CONFIG.COMMENT_LIMIT),
            }
            for post in subreddit.search(keyword, limit=post_limit, sort='top')
        ]

    def _fetch_top_comments(self, post, comment_limit):
        """Return up to *comment_limit* top-level comments of *post* as dicts.

        Deleted/removed comments are dropped after the limit is applied, so
        fewer than *comment_limit* entries may be returned.
        """
        # limit=0 discards the "load more comments" placeholders instead of
        # resolving them with extra requests.
        post.comments.replace_more(limit=0)
        top_comments = post.comments[:comment_limit]
        return [
            {
                'author': str(comment.author),
                'body': comment.body,
                'score': comment.score
            }
            for comment in top_comments if comment.body not in ['[deleted]', '[removed]']
        ]

    def fetch_all_posts(self):
        """Fetch posts for every configured individual and write the CSV cache.

        All subreddits' results for an individual are collected before the
        file is written, so the cache holds every subreddit rather than only
        the last one searched.
        """
        import os  # local import: the notebook imports os only in a later cell

        os.makedirs("posts_data", exist_ok=True)
        for individual in CONFIG.INDIVIDUALS:
            print(f"Fetching posts for {individual}...")
            individual_posts = []
            for subreddit in CONFIG.SUBREDDITS:
                individual_posts.extend(self._fetch_top_posts(subreddit, individual))
            self.all_posts.extend(individual_posts)
            pd.DataFrame(individual_posts).to_csv(self._csv_path(individual), index=False)

    def get_all_posts(self):
        """Return all posts of all individuals, read from the saved local files.

        Raises:
            FileNotFoundError: if an individual's cache file does not exist.
        """
        to_return = []
        for individual in CONFIG.INDIVIDUALS:
            try:
                posts = pd.read_csv(self._csv_path(individual))
            except pd.errors.EmptyDataError:
                # An individual with no search hits produces an empty file.
                continue
            to_return.extend(posts.to_dict(orient='records'))
        for post in to_return:
            # The comments column holds the repr of a list of dicts;
            # literal_eval parses it without executing arbitrary code.
            post['comments'] = ast.literal_eval(post['comments'])
        return to_return

## Processor Class

In [11]:
# Module-level spaCy pipeline, loaded once and reused by Processor.normalize_text.
nlp = spacy.load("en_core_web_sm")

class Processor:
    """Normalize post/comment text, score its sentiment and assign time periods."""

    def __init__(self):
        self.sia = SentimentIntensityAnalyzer()

    def _analyze_sentiment(self, text):
        """Return ``(compound_score, label)`` for *text*.

        The label is ``'positive'``/``'negative'`` when the compound score is
        at least ``CONFIG.SENTIMENT_THRESHOLD`` away from zero, else ``'neutral'``.
        """
        score = self.sia.polarity_scores(text)['compound']
        if score >= CONFIG.SENTIMENT_THRESHOLD:
            sentiment = 'positive'
        elif score <= -CONFIG.SENTIMENT_THRESHOLD:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'
        return score, sentiment

    def normalize_text(self, text):
        """Return a cleaned, ASCII-only, lemmatized version of *text*.

        Non-string input (e.g. NaN from an empty CSV cell) yields ``""``.
        """
        if not isinstance(text, str):
            return ""
        text = emoji.demojize(text, delimiters=("", ""))            # emoji -> its textual name
        text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '', text)      # e-mail addresses
        text = re.sub(r'https?://\S+|www\.\S+', '', text)           # URLs
        # Decompose accented characters, then drop whatever is not ASCII.
        text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
        text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)            # control characters
        text = re.sub(r'[^\w\s\.,!?\'"\-]', '', text)               # keep words and basic punctuation
        text = re.sub(r'\s+', ' ', text).strip()
        doc = nlp(text)
        return ' '.join(token.lemma_ for token in doc)

    def _assign_post_periods(self, posts, periods=None):
        """Return a deep copy of *posts* where each post has a ``'periods'`` list.

        Args:
            posts: list of post dicts with a ``'created_utc'`` value that
                ``pd.to_datetime`` can parse.
            periods: mapping ``label -> (start, end)`` of date strings;
                defaults to ``CONFIG.TIME_PERIODS``.

        A date-only end bound is inclusive of that whole day. Naive
        timestamps are interpreted as UTC. Each post receives at most one
        period (the first that matches). The input list and its dicts are
        left untouched.
        """
        import copy  # local import: not part of the notebook's import cell

        if periods is None:
            periods = CONFIG.TIME_PERIODS

        # Parse the bounds once instead of once per post.
        bounds = []
        for period_name, (start_str, end_str) in periods.items():
            start = pd.to_datetime(start_str).tz_localize("UTC")
            end = pd.to_datetime(end_str).tz_localize("UTC")
            if end == end.normalize():
                # Date-only end: extend to the last nanosecond of that day so
                # posts made during the final day are not dropped.
                end = end + pd.Timedelta(days=1) - pd.Timedelta(nanoseconds=1)
            bounds.append((period_name, start, end))

        copied = copy.deepcopy(posts)
        for post in copied:
            post['periods'] = []
            created_utc = pd.to_datetime(post['created_utc'])
            if created_utc.tzinfo is None:
                created_utc = created_utc.tz_localize("UTC")
            for period_name, start, end in bounds:
                if start <= created_utc <= end:
                    post['periods'].append(period_name)
                    break
        return copied

    def process_posts(self, posts):
        """Return copies of *posts* with periods, normalized text and sentiment.

        Titles and comment bodies are replaced by their normalized form and
        get ``sentiment_score`` / ``sentiment_label`` entries.
        """
        with_periods = self._assign_post_periods(posts)
        for post in with_periods:
            post['title'] = self.normalize_text(post['title'])
            post['sentiment_score'], post['sentiment_label'] = self._analyze_sentiment(post['title'])
            for comment in post['comments']:
                comment['body'] = self.normalize_text(comment['body'])
                comment['sentiment_score'], comment['sentiment_label'] = self._analyze_sentiment(comment['body'])
        return with_periods

## Graph Class

In [12]:
class GraphMaker:
    """Build and analyse a co-mention graph of individuals and companies.

    Nodes are entities mentioned together in the same title or comment.
    Each edge carries the summed Reddit score (``weight``) and the list of
    sentiment scores (``sentiments``) of the texts that produced it.
    """

    def __init__(self, given_period: str = None):
        self.graph = nx.Graph()
        # dict.fromkeys removes the duplicate "Elon Musk" (also listed in
        # CONFIG.INDIVIDUALS) while keeping the original order.
        self.entities = list(dict.fromkeys(["Elon Musk"] + CONFIG.INDIVIDUALS))
        self.period = given_period
        self.min_width = 0.5
        self.max_width = 10.0
        self.custom_cmap = LinearSegmentedColormap.from_list(
            "custom_red_gray_green",
            ["#f80509", "#deb603", "#0bc746"],
            N=256
        )
        self.node_sentiment = defaultdict(list)
        self.competitor_mentions = defaultdict(int)
        # One whole-word pattern per entity/company, compiled once.
        self._entity_patterns = {
            entity: self._compile_alias_pattern([entity] + CONFIG.ALIASES.get(entity, []))
            for entity in self.entities
        }
        self._competitor_patterns = {
            comp: self._compile_alias_pattern([comp])
            for comp in CONFIG.COMPARATIVE_COMPANIES
        }

    @staticmethod
    def _compile_alias_pattern(names):
        """Compile a pattern matching any of *names* as whole words in lowercased text."""
        alternatives = "|".join(
            re.escape(name.lower()) for name in sorted(names, key=len, reverse=True)
        )
        # Look-arounds instead of \b so names that start or end with
        # punctuation still match, while "ark" inside "market" does not.
        return re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")

    def _find_mentions(self, text):
        """Return the sorted list of entities/companies mentioned in *text*.

        Matching is case-insensitive and whole-word. Each company hit also
        increments ``self.competitor_mentions``.
        """
        if not isinstance(text, str) or not text:
            return []
        lowered = text.lower()
        mentions = {
            entity for entity, pattern in self._entity_patterns.items()
            if pattern.search(lowered)
        }
        for comp, pattern in self._competitor_patterns.items():
            if pattern.search(lowered):
                mentions.add(comp)
                self.competitor_mentions[comp] += 1
        # Sorted so the graph is built in a reproducible order.
        return sorted(mentions)

    def _record_text(self, text, score, sentiment_score):
        """Add the edges and node sentiments produced by one title or comment."""
        mentions = self._find_mentions(text)
        self._add_edges(mentions, score, sentiment_score)
        for mention in mentions:
            self.node_sentiment[mention].append(sentiment_score)

    def build_graph(self, posts):
        """Populate the graph from processed *posts* (titles and comments)."""
        for post in posts:
            self._record_text(post['title'], post['score'], post['sentiment_score'])
            for comment in post['comments']:
                self._record_text(comment['body'], comment['score'], comment['sentiment_score'])

    def _add_edges(self, mentions, score, sentiment_score):
        """Connect every pair in *mentions*, accumulating weight and sentiment."""
        for i, a in enumerate(mentions):
            for b in mentions[i + 1:]:
                if self.graph.has_edge(a, b):
                    self.graph[a][b]['weight'] += score
                    self.graph[a][b]['sentiments'].append(sentiment_score)
                else:
                    self.graph.add_edge(a, b, weight=score, sentiments=[sentiment_score])

    def finalize_graph(self):
        """Compute the drawing attributes ``normalized_weight`` and ``color`` per edge.

        Weights are min-max scaled into ``[min_width, max_width]``; when all
        weights are equal the midpoint is used.
        """
        weights = [data['weight'] for _, _, data in self.graph.edges(data=True)]
        min_w = min(weights) if weights else 1
        max_w = max(weights) if weights else 1

        for u, v, data in self.graph.edges(data=True):
            if max_w != min_w:
                norm_weight = self.min_width + (data['weight'] - min_w) / (max_w - min_w) * (self.max_width - self.min_width)
            else:
                norm_weight = (self.max_width + self.min_width) / 2
            avg_sentiment = sum(data['sentiments']) / len(data['sentiments'])
            data['normalized_weight'] = norm_weight
            data['color'] = self._sentiment_to_color(avg_sentiment)

    def _sentiment_to_color(self, score):
        """Map a sentiment in ``[-1, 1]`` to an RGBA tuple of the custom colormap."""
        norm = mcolors.Normalize(vmin=-1.0, vmax=1.0)
        return self.custom_cmap(norm(score))

    def _get_node_sentiment_color(self, node):
        """Return the color of *node*'s mean sentiment (neutral when unknown)."""
        scores = self.node_sentiment.get(node)
        if scores:
            return self._sentiment_to_color(sum(scores) / len(scores))
        return self._sentiment_to_color(0.0)

    def print_neighborhood(self):
        """Print every node's neighbors with edge weight and mean sentiment."""
        for node in self.graph.nodes:
            print(f"Neighbors of {node}:")
            for neighbor in self.graph.neighbors(node):
                edge = self.graph[node][neighbor]
                weight = edge['weight']
                sentiment = sum(edge['sentiments']) / len(edge['sentiments'])
                print(f"  {neighbor}: weight={weight}, sentiment={sentiment}")
            print("\n")

    def save_graph_info(self):
        """Write a text report about the graph to ``graphs_analysis/<period>.txt``.

        A graph built without a period is saved as ``all.txt``.
        """
        import os  # local import: the notebook imports os only in a later cell

        def top_n(dictionary, n=3):
            return sorted(dictionary.items(), key=lambda x: x[1], reverse=True)[:n]

        period_label = self.period if self.period is not None else "all"
        os.makedirs("graphs_analysis", exist_ok=True)
        path = f"graphs_analysis/{period_label.lower().replace(' ', '_')}.txt"
        # Explicit UTF-8: the report contains emoji, which the platform
        # default encoding (e.g. cp1252) cannot represent.
        with open(path, 'w', encoding='utf-8') as f:
            f.write("📊 Graph Overview:\n")
            f.write("🔹 Basic Stats:\n")
            f.write(f"  Nodes: {len(self.graph.nodes)}\n")
            f.write(f"  Edges: {len(self.graph.edges)}\n")
            f.write(f"  Is Connected: {self.is_graph_connected()}\n")
            f.write(f"  Density: {self.get_graph_density():.4f}\n")
            f.write(f"  Diameter: {self.get_graph_diameter()}\n")
            f.write(f"  Average Degree: {self.get_graph_average_degree():.2f}\n")
            f.write(f"  Avg Clustering Coefficient: {self.get_graph_average_clustering():.4f}\n")
            f.write(f"  Avg Shortest Path Length: {self.get_graph_average_shortest_path_length():.2f}\n")
            f.write(f"  Avg Node Sentiment: {self.get_graph_average_node_sentiment():.2f}\n")

            f.write("\n🔹 Structural Analysis:\n")
            cut_vertices = self.get_cut_vertexes()
            bridges = self.get_bridges()
            f.write(f"  Cut Vertices (Articulation Points): {cut_vertices if cut_vertices else 'None'}\n")
            f.write(f"  Bridges (Critical Edges): {bridges if bridges else 'None'}\n")

            f.write("\n🔹 Community Detection:\n")
            # Computed once and reused below; Girvan-Newman is expensive.
            communities = self.get_communities()
            f.write(f"  Number of Communities: {len(communities)}\n")
            community_sizes = [len(c) for c in communities]
            f.write(f"  Community Sizes: {community_sizes}\n")

            f.write("\n🔹 Centrality Measures (Top 3 Nodes per Metric):\n")
            centrality = self.get_centrality_measures()
            for metric, values in centrality.items():
                top_nodes = top_n(values)
                top_str = ", ".join(f"{node} ({score:.2f})" for node, score in top_nodes)
                f.write(f"  {metric.capitalize()}: {top_str}\n")

            f.write("\n🔹 Node Sentiments:\n")
            for node in self.graph.nodes:
                scores = self.node_sentiment.get(node)
                if scores:
                    avg_sentiment = sum(scores) / len(scores)
                    f.write(f"  {node}: {avg_sentiment:.3f} (from {len(scores)} mentions)\n")
                else:
                    f.write(f"  {node}: No sentiment data\n")

            community_structure = {i: list(c) for i, c in enumerate(communities)}
            f.write(f"\n🔹 Structure Analysis:\n {community_structure}")

    def get_cut_vertexes(self):
        """Return the articulation points of the graph."""
        return list(nx.articulation_points(self.graph))

    def get_bridges(self):
        """Return the bridge edges of the graph."""
        return list(nx.bridges(self.graph))

    def get_communities(self):
        """Return the first Girvan-Newman partition as a list of node sets."""
        return list(next(nx.algorithms.community.girvan_newman(self.graph)))

    def get_community_structure(self, verbose=True):
        """Return ``{community_index: [members]}`` for the first partition.

        ``verbose`` is accepted for backward compatibility; it does not
        affect the returned value.
        """
        return {i: list(community) for i, community in enumerate(self.get_communities())}

    def get_centrality_measures(self):
        """Return degree, betweenness and closeness centrality per node."""
        return {
            "degree": nx.degree_centrality(self.graph),
            "betweenness": nx.betweenness_centrality(self.graph),
            "closeness": nx.closeness_centrality(self.graph)
        }

    def get_graph_diameter(self):
        """Return the longest shortest path found in the graph.

        For a disconnected graph this is the largest diameter among its
        components; an empty graph yields 0.
        """
        if not self.graph.nodes:
            return 0
        if nx.is_connected(self.graph):
            return nx.diameter(self.graph)
        # dict() accepts both the iterator-of-pairs and mapping return forms.
        all_lengths = dict(nx.all_pairs_shortest_path_length(self.graph))
        return max(max(lengths.values()) for lengths in all_lengths.values())

    def get_graph_density(self):
        """Return the edge density of the graph."""
        return nx.density(self.graph)

    def get_graph_average_clustering(self):
        """Return the average clustering coefficient (0.0 for an empty graph)."""
        if not self.graph.nodes:
            return 0.0
        return nx.average_clustering(self.graph)

    def get_graph_average_shortest_path_length(self):
        """Return the average shortest path length, or ``inf`` if not connected."""
        if self.is_graph_connected():
            return nx.average_shortest_path_length(self.graph)
        return float('inf')

    def get_graph_average_degree(self):
        """Return the mean node degree (0 for an empty graph)."""
        return sum(dict(self.graph.degree()).values()) / len(self.graph.nodes) if self.graph.nodes else 0

    def get_graph_average_node_sentiment(self):
        """Return the mean of the per-node mean sentiments (0 when none exist)."""
        sentiments = [sum(scores) / len(scores) for scores in self.node_sentiment.values() if scores]
        return sum(sentiments) / len(sentiments) if sentiments else 0

    def is_graph_connected(self):
        """Return whether the graph is connected (``False`` for an empty graph)."""
        # nx.is_connected raises on a graph without nodes.
        if not self.graph.nodes:
            return False
        return nx.is_connected(self.graph)

    def get_edge_sentiment_analysis(self, sentiment_thresholds=(-0.1, 0.1)):
        """Return one summary dict per edge, strongest mean sentiment first.

        Args:
            sentiment_thresholds: ``(negative_below, positive_above)`` cut-offs
                used for ``relation_type``.
        """
        edge_insights = []
        for u, v, data in self.graph.edges(data=True):
            sentiments = data['sentiments']
            avg_sentiment = sum(sentiments) / len(sentiments) if sentiments else 0
            relation = (
                "Negative" if avg_sentiment < sentiment_thresholds[0]
                else "Positive" if avg_sentiment > sentiment_thresholds[1]
                else "Neutral"
            )
            edge_insights.append({
                "node_1": u,
                "node_2": v,
                "average_sentiment": round(avg_sentiment, 3),
                "relation_type": relation,
                "num_mentions": len(sentiments),
                "total_weight": data['weight']
            })
        return sorted(edge_insights, key=lambda x: abs(x["average_sentiment"]), reverse=True)

    def export_edge_sentiment_analysis(self, filepath="edge_sentiment_analysis.csv"):
        """Write the edge sentiment analysis to *filepath* as CSV.

        A graph without edges produces a file containing only the header.
        """
        fieldnames = [
            "node_1", "node_2", "average_sentiment",
            "relation_type", "num_mentions", "total_weight",
        ]
        analysis = self.get_edge_sentiment_analysis()
        with open(filepath, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(analysis)

    def visualize(self):
        """Draw the graph in concentric shells, colored by sentiment.

        ``finalize_graph`` must have been called first so that every edge
        has its ``color`` and ``normalized_weight`` attributes.
        """
        fig, ax = plt.subplots(figsize=(18, 14))

        central_node = ["Elon Musk"]
        influencer_nodes = [n for n in self.graph.nodes if n in CONFIG.INDIVIDUALS and n != "Elon Musk"]
        competitor_nodes = [n for n in self.graph.nodes if n in CONFIG.COMPARATIVE_COMPANIES]
        classified = set(central_node + influencer_nodes + competitor_nodes)
        other_nodes = [n for n in self.graph.nodes if n not in classified]

        shells = [central_node, influencer_nodes, competitor_nodes + other_nodes]
        pos = nx.shell_layout(self.graph, shells)

        edge_colors = [data['color'] for _, _, data in self.graph.edges(data=True)]
        edge_weights = [data['normalized_weight'] for _, _, data in self.graph.edges(data=True)]
        node_colors = [self._get_node_sentiment_color(node) for node in self.graph.nodes]

        nx.draw_networkx_nodes(self.graph, pos, node_color=node_colors, node_size=1300, alpha=0.9, ax=ax)
        nx.draw_networkx_labels(self.graph, pos, font_size=10, ax=ax)
        nx.draw_networkx_edges(self.graph, pos, edge_color=edge_colors, width=edge_weights, alpha=0.6, ax=ax)

        # Stand-alone mappable so the colorbar shows the sentiment scale.
        sm = plt.cm.ScalarMappable(cmap=self.custom_cmap, norm=plt.Normalize(vmin=-1, vmax=1))
        sm.set_array([])
        cbar = plt.colorbar(sm, ax=ax, shrink=0.8, aspect=20)
        cbar.set_label('Sentiment Score', rotation=270, labelpad=15)

        ax.set_title(f"Elon Musk & Influencers for period {self.period} (Node colors based on sentiment)", fontsize=16)
        ax.axis('off')
        plt.tight_layout()
        plt.show()


## Text Analysis

In [None]:
import re
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from wordcloud import WordCloud
from nltk.corpus import stopwords
from gensim import corpora, models
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


class TextAnalysis:
    """Word-level frequency, sentiment and topic analysis over processed posts.

    Args:
        posts: processed post dicts (``title``, ``sentiment_score``,
            ``comments`` with ``body``/``sentiment_score``).
        period: optional label used in titles and summaries.
        aliases: mapping ``canonical name -> list of aliases``; every alias
            (and the canonical name itself) is rewritten to one token such as
            ``elon_musk`` before tokenizing.
        min_word_freq: minimum count for a word to appear in filtered results.
        max_words: maximum number of words in the word cloud.
        ngram_range: ``(min_n, max_n)`` n-gram sizes produced by the tokenizer.
    """

    def __init__(self, posts, period=None, aliases=None, min_word_freq=4, max_words=200, ngram_range=(1, 1)):
        self.posts = posts
        self.period = period
        self.min_word_freq = min_word_freq
        self.max_words = max_words
        self.ngram_range = ngram_range
        self.averaged_scores = {}
        self.word_counts = Counter()
        self.word_contexts = defaultdict(list)
        self.aliases = aliases or {}
        self.stop_words = stopwords.words('english')

        self.alias_lookup = {}
        for canonical, alias_list in self.aliases.items():
            for alias in alias_list:
                self.alias_lookup[alias.lower()] = canonical.lower()

        self._alias_tokens, self._alias_pattern = self._build_alias_index(self.aliases)
        self._color_mapper = None  # created lazily by _color_func

    @staticmethod
    def _build_alias_index(aliases):
        """Return ``(replacement_by_name, compiled_pattern)`` for *aliases*.

        ``replacement_by_name`` maps each lowercased alias and canonical name
        to its underscore-joined canonical token. The pattern matches any of
        them as a whole word, longest first; it is ``None`` when there is
        nothing to match.
        """
        tokens = {}
        for canonical, alias_list in aliases.items():
            canonical_token = canonical.lower().replace(" ", "_")
            for alias in alias_list:
                if alias:
                    tokens[alias.lower()] = canonical_token
        # Full names are added last and never override an explicit alias.
        for canonical in aliases:
            tokens.setdefault(canonical.lower(), canonical.lower().replace(" ", "_"))
        if not tokens:
            return tokens, None
        # Longest first so "elon musk" wins over "elon" and "musk".
        alternatives = "|".join(re.escape(name) for name in sorted(tokens, key=len, reverse=True))
        return tokens, re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")

    def _color_func(self, word, **kwargs):
        """Return the ``rgb(...)`` color of *word*'s averaged sentiment.

        Uses the GraphMaker colormap when available and falls back to a
        plain green/red/gray scheme otherwise.
        """
        score = self.averaged_scores.get(word.lower(), 0)
        try:
            mapper = getattr(self, "_color_mapper", None)
            if mapper is None:
                # Build the color mapper once instead of once per word.
                mapper = GraphMaker(self.period)._sentiment_to_color
                self._color_mapper = mapper
            rgba = mapper(score)
            r, g, b, _ = [int(c * 255) for c in rgba]
            return f"rgb({r}, {g}, {b})"
        except Exception:
            # Best-effort fallback, e.g. when GraphMaker is not defined.
            if score > 0.1:
                intensity = min(int(255 * abs(score)), 255)
                return f"rgb(0, {intensity}, 0)"
            elif score < -0.1:
                intensity = min(int(255 * abs(score)), 255)
                return f"rgb({intensity}, 0, 0)"
            else:
                return "rgb(128, 128, 128)"

    def _replace_aliases(self, text):
        """Return lowercased *text* with every alias replaced by its canonical token.

        The replacement is a single pass, so a full name such as
        "Elon Musk" becomes one ``elon_musk`` token rather than two.
        """
        if not text or not isinstance(text, str):
            return ""
        text_lower = text.lower()
        if self._alias_pattern is None:
            return text_lower
        return self._alias_pattern.sub(lambda m: self._alias_tokens[m.group(0)], text_lower)

    def _tokenize(self, text):
        """Return the n-gram tokens of *text* (stop words and short words removed)."""
        if not text:
            return []

        stop_words = set(self.stop_words)
        tokens = [
            token for token in re.findall(r'\b[a-z_]+\b', text.lower())
            if token not in stop_words and len(token) > 2
        ]

        ngram_tokens = []
        for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
            ngrams = zip(*[tokens[i:] for i in range(n)])
            ngram_tokens.extend('_'.join(ngram) for ngram in ngrams)

        return ngram_tokens

    def _accumulate_text(self, raw_text, sentiment, word_scores, seen_contexts):
        """Update counts, per-word sentiment samples and contexts from one text."""
        tokens = self._tokenize(self._replace_aliases(raw_text))
        for word, occurrences in Counter(tokens).items():
            self.word_counts[word] += occurrences
            word_scores[word].append(sentiment)
            key = (word, raw_text)
            if key not in seen_contexts:
                self.word_contexts[word].append((raw_text, sentiment))
                seen_contexts.add(key)

    def process_all_text(self):
        """Compute word counts, contexts and averaged sentiment for all posts.

        Previous results are discarded first, so calling this repeatedly
        does not inflate the counts.
        """
        self.word_counts = Counter()
        self.word_contexts = defaultdict(list)
        word_scores = defaultdict(list)
        seen_contexts = set()

        for post in self.posts:
            if post.get('title'):
                self._accumulate_text(post['title'], post.get('sentiment_score', 0), word_scores, seen_contexts)

            for comment in post.get('comments', []):
                if comment.get('body'):
                    self._accumulate_text(comment['body'], comment.get('sentiment_score', 0), word_scores, seen_contexts)

        self.averaged_scores = {
            word: np.mean(scores) for word, scores in word_scores.items()
        }

    def explain_word_sentiment(self, word, limit=3):
        """Print up to *limit* example contexts of *word* and its averaged sentiment."""
        print(f"\nExplanation for word '{word}'")
        contexts = self.word_contexts.get(word, [])
        if not contexts:
            print("  No context found.")
            return

        shown = contexts[:limit]
        for i, (text, score) in enumerate(shown):
            print(f"  [{i+1}] Sentiment: {score:.3f} | Text: {text[:100]}...")

        avg = self.averaged_scores.get(word, 0)
        # The average covers every text containing the word, not only the
        # examples printed above.
        print(f"  ➤ Averaged Sentiment: {avg:.3f} ({len(shown)} of {len(contexts)} distinct contexts shown)")

    def get_filtered_word_frequencies(self):
        """Return ``{word: count}`` for words seen at least ``min_word_freq`` times."""
        return {
            word: count for word, count in self.word_counts.items()
            if count >= self.min_word_freq
        }

    def get_most_common_words(self, n=20):
        """Return the *n* most frequent filtered words as ``(word, count)`` pairs."""
        filtered_counts = self.get_filtered_word_frequencies()
        return Counter(filtered_counts).most_common(n)

    def get_sentiment_extremes(self, n=10):
        """Return the *n* lowest- and highest-sentiment frequent words.

        Both lists are in ascending sentiment order; they overlap when
        fewer than ``2 * n`` words are eligible.
        """
        if n <= 0:
            # A [-0:] slice would return the whole list.
            return {'most_negative': [], 'most_positive': []}

        eligible_words = {
            word: score for word, score in self.averaged_scores.items()
            if self.word_counts[word] >= self.min_word_freq
        }

        sorted_by_sentiment = sorted(eligible_words.items(), key=lambda x: x[1])
        return {
            'most_negative': sorted_by_sentiment[:n],
            'most_positive': sorted_by_sentiment[-n:]
        }

    def generate_word_cloud(self, save_path=None, words=None, word_list=None):
        """Show (and optionally save) a word cloud colored by sentiment.

        Args:
            save_path: file path for the image, or ``None`` to skip saving.
            words: explicit ``{word: frequency}`` mapping.
            word_list: list of words whose frequencies are counted; takes
                precedence over *words*.
        """
        if word_list:
            words = Counter(word_list)
        elif words is None:
            filtered_frequencies = self.get_filtered_word_frequencies()
            words = dict(Counter(filtered_frequencies).most_common(self.max_words))

        if not words:
            print("No words available to generate the word cloud.")
            return

        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=self.max_words,
            relative_scaling=0.5,
            colormap='viridis'
        ).generate_from_frequencies(words)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud.recolor(color_func=self._color_func), interpolation='bilinear')
        plt.axis("off")
        title = "Word Cloud Colored by Sentiment"
        if self.period:
            title += f" for {self.period}"
        plt.title(title, fontsize=16, pad=20)

        if save_path:
            plt.savefig(save_path, bbox_inches='tight', dpi=300)
        plt.show()

    def _collect_token_documents(self):
        """Return one non-empty token list per title and comment body."""
        texts = []
        for post in self.posts:
            if post.get('title'):
                title_tokens = self._tokenize(self._replace_aliases(post['title']))
                if title_tokens:
                    texts.append(title_tokens)

            for comment in post.get('comments', []):
                if comment.get('body'):
                    body_tokens = self._tokenize(self._replace_aliases(comment['body']))
                    if body_tokens:
                        texts.append(body_tokens)
        return texts

    def run_lsa(self, num_topics=5):
        """Fit an LSI topic model and print its topics and coherence.

        Returns:
            A dict with ``model``, ``dictionary``, ``corpus`` and ``topics``,
            or ``None`` when no usable documents remain after filtering.
        """
        texts = self._collect_token_documents()

        dictionary = corpora.Dictionary(texts)
        dictionary.filter_extremes(no_below=2, no_above=0.8)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Filtering can empty the vocabulary and leave only empty
        # bag-of-words entries, which the model cannot be fitted on.
        if len(dictionary) == 0 or not any(corpus):
            print("No valid documents for LSA after filtering.")
            return None

        lsa = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
        topics = lsa.print_topics(num_topics=num_topics, num_words=8)

        print(f"\nLSA Topics (num_topics={num_topics}):")
        print("=" * 50)
        for i, topic in enumerate(topics):
            print(f"Topic {i}: {topic[1]}")

        try:
            coherence_model = CoherenceModel(
                model=lsa, texts=texts, dictionary=dictionary, coherence='c_v'
            )
            coherence_score = coherence_model.get_coherence()
            print(f"\nCoherence Score: {coherence_score:.4f}")
        except ImportError:
            print("\nInstall gensim[complete] for coherence scoring.")

        return {
            'model': lsa,
            'dictionary': dictionary,
            'corpus': corpus,
            'topics': topics
        }

    def get_summary_stats(self):
        """Return corpus-level counts and sentiment statistics as a dict."""
        scores = list(self.averaged_scores.values())

        return {
            'total_posts': len(self.posts),
            'total_comments': sum(len(post.get('comments', [])) for post in self.posts),
            'total_words': sum(self.word_counts.values()),
            'unique_words': len(self.word_counts),
            'words_above_threshold': len(self.get_filtered_word_frequencies()),
            'average_sentiment': np.mean(scores) if scores else 0,
            'sentiment_std': np.std(scores) if scores else 0
        }

    def get_individual_sentiments(self):
        """Return per-individual mention statistics, most positive first."""
        results = []
        for canonical in self.aliases:
            token = canonical.lower().replace(" ", "_")
            if token in self.word_counts:
                contexts = self.word_contexts[token]
                results.append({
                    'name': canonical,
                    'token': token,
                    'mention_count': self.word_counts[token],
                    'average_sentiment': self.averaged_scores.get(token, 0),
                    'sentiment_std_dev': np.std([s for _, s in contexts]) if contexts else 0
                })

        results.sort(key=lambda x: x['average_sentiment'], reverse=True)
        return results

    def print_analysis_summary(self):
        """Print summary statistics, common words and sentiment extremes."""
        stats = self.get_summary_stats()
        extremes = self.get_sentiment_extremes(5)
        common_words = self.get_most_common_words(10)

        print("TEXT ANALYSIS SUMMARY")
        print("=" * 50)
        print(f"Period: {self.period or 'All time'}")
        print(f"Total Posts: {stats['total_posts']}")
        print(f"Total Comments: {stats['total_comments']}")
        print(f"Total Words: {stats['total_words']:,}")
        print(f"Unique Words: {stats['unique_words']:,}")
        print(f"Words Above Threshold of {self.min_word_freq}: {stats['words_above_threshold']:,}")
        print(f"Average Sentiment: {stats['average_sentiment']:.3f}")
        print(f"Sentiment Std Dev: {stats['sentiment_std']:.3f}")

        print("\nMOST COMMON WORDS:")
        for word, count in common_words:
            sentiment = self.averaged_scores.get(word, 0)
            print(f"  {word}: {count} times (sentiment: {sentiment:.3f})")

        print("\nMOST POSITIVE WORDS:")
        for i, (word, sentiment) in enumerate(extremes['most_positive']):
            count = self.word_counts[word]
            print(f"  {word}: {sentiment:.3f} ({count} times)")
            if i < 2:
                self.explain_word_sentiment(word)

        print("\nMOST NEGATIVE WORDS:")
        for i, (word, sentiment) in enumerate(reversed(extremes['most_negative'])):
            count = self.word_counts[word]
            print(f"  {word}: {sentiment:.3f} ({count} times)")
            if i < 2:
                self.explain_word_sentiment(word)

    def get_top_tfidf_words(self, top_n=20):
        """Return the *top_n* terms by mean TF-IDF as ``(term, score)`` pairs.

        Returns an empty list when there are no documents or when no term
        survives tokenization.
        """
        documents = []

        for post in self.posts:
            if post.get("title"):
                documents.append(self._replace_aliases(post['title']))
            for comment in post.get('comments', []):
                if comment.get("body"):
                    documents.append(self._replace_aliases(comment['body']))

        if not documents:
            return []

        vectorizer = TfidfVectorizer(
            stop_words=self.stop_words,
            ngram_range=self.ngram_range,
            token_pattern=r'\b[a-z_]{3,}\b',
            lowercase=True
        )
        try:
            tfidf_matrix = vectorizer.fit_transform(documents)
        except ValueError:
            # Raised when the vocabulary is empty (only stop words/short tokens).
            return []
        mean_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
        tfidf_scores = dict(zip(vectorizer.get_feature_names_out(), mean_scores))
        return sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

## Main

In [None]:
import os
import csv
from collections import defaultdict

def _write_csv(path, header, rows):
    """Write a header row followed by ``rows`` to ``path`` as a CSV file."""
    # newline='' leaves line endings to the csv module; the explicit encoding
    # keeps the output independent of the platform's default codec.
    with open(path, "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


# Ensure output directories exist
os.makedirs("outputs", exist_ok=True)
os.makedirs("graphs_analysis", exist_ok=True)

client = RedditClient()
processor = Processor()

# Uncomment to fetch new posts
# client.fetch_all_posts()
all_posts = client.get_all_posts()
processed_posts = processor.process_posts(all_posts)

# Bucket posts by period; a post is listed under every label in post['periods'].
posts_by_period = defaultdict(list)
for post in processed_posts:
    for period in post['periods']:
        posts_by_period[period].append(post)

period_graphs = {}                       # period -> GraphMaker built for it
sentiment_over_time = defaultdict(dict)  # entity name -> {period: average sentiment}

for period, posts in posts_by_period.items():
    print(f"\n=== Processing period: {period} | Posts: {len(posts)} ===")

    # Build and analyze graph
    gm = GraphMaker(period)
    gm.build_graph(posts)
    gm.finalize_graph()
    period_graphs[period] = gm
    # gm.save_graph_info()
    gm.export_edge_sentiment_analysis(f"graphs_analysis/sentiment_edges_{period}.csv")

    # Text analysis
    text_analysis = TextAnalysis(
        posts=posts,
        period=period,
        aliases=CONFIG.ALIASES,
        ngram_range=(1, 2)  # use unigrams and bigrams
    )

    text_analysis.process_all_text()
    individual_stats = text_analysis.get_individual_sentiments()
    text_analysis.generate_word_cloud()

    # Save summary stats
    summary = text_analysis.get_summary_stats()
    _write_csv(
        f"outputs/summary_stats_{period}.csv",
        ["Metric", "Value"],
        [[key, value] for key, value in summary.items()],
    )

    # Save most common words
    common_words = text_analysis.get_most_common_words(n=50)
    _write_csv(
        f"outputs/common_words_{period}.csv",
        ["Word", "Count", "Average Sentiment"],
        [
            [word, count, round(text_analysis.averaged_scores.get(word, 0), 4)]
            for word, count in common_words
        ],
    )

    # Save sentiment extremes (one file per key of the extremes dict)
    extremes = text_analysis.get_sentiment_extremes(n=10)
    for sentiment_type, word_list in extremes.items():
        _write_csv(
            f"outputs/{sentiment_type}_{period}.csv",
            ["Word", "Average Sentiment", "Count"],
            [
                [word, round(sentiment, 4), text_analysis.word_counts[word]]
                for word, sentiment in word_list
            ],
        )

    # Save top TF-IDF n-grams
    tfidf_words = text_analysis.get_top_tfidf_words(top_n=50)
    _write_csv(
        f"outputs/tfidf_words_{period}.csv",
        ["Word", "TF-IDF Score"],
        [(word, round(score, 5)) for word, score in tfidf_words],
    )

    # Save individual entity sentiment stats.
    # DictWriter is kept here so each stat dict is mapped onto the fixed columns.
    with open(f"outputs/entity_sentiments_{period}.csv", "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "name", "token", "mention_count", "average_sentiment", "sentiment_std_dev"
        ])
        writer.writeheader()
        writer.writerows(individual_stats)

    # Store sentiment by entity over time
    for stat in individual_stats:
        sentiment_over_time[stat['name']][period] = stat['average_sentiment']

    # Visualize graph
    gm.visualize()

# Save overall sentiment evolution: one row per entity, one column per period.
with open("outputs/sentiment_over_time.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    # NOTE(review): plain string sort; confirm it yields the intended
    # chronological column order for the period labels in use.
    all_periods = sorted(posts_by_period.keys())
    header = ["Entity"] + all_periods
    writer.writerow(header)
    for entity, sentiments in sentiment_over_time.items():
        # Periods in which the entity has no stat are written as 0.
        row = [entity] + [round(sentiments.get(period, 0), 4) for period in all_periods]
        writer.writerow(row)

## Musk - Pete connection throughout the years

In [173]:
def find_co_mentions(posts, person1, person2, person3=None, aliases=None, period=None):
    """
    Collect the titles and comment bodies in which all given persons co-occur.

    A person counts as mentioned when any of their tokens appears as a
    substring of the lower-cased text. The tokens are the lower-cased name
    with spaces replaced by underscores, plus each alias normalised the
    same way.

    Args:
        posts: Iterable of post dicts with optional "title", "comments"
            (a list of dicts with an optional "body") and "periods" keys.
        person1: First name that must be mentioned.
        person2: Second name that must be mentioned.
        person3: Optional third name that must be mentioned as well.
        aliases: Optional mapping of name -> list of alternative spellings.
        period: Optional period label. When given, only posts whose
            "periods" collection contains this label are searched; posts
            without period information are skipped.

    Returns:
        A list of the original (not lower-cased) matching texts. For each
        post the title, if it matches, comes before its matching comments.
    """
    alias_map = aliases or {}

    def tokens_for(person):
        # Canonical name first, then its aliases, all normalised identically.
        variants = [person] + list(alias_map.get(person, []))
        return [variant.lower().replace(" ", "_") for variant in variants]

    token_groups = [tokens_for(person1), tokens_for(person2)]
    if person3:
        token_groups.append(tokens_for(person3))

    def mentions_everyone(text):
        lowered = text.lower()
        return all(any(token in lowered for token in group) for group in token_groups)

    matches = []
    for post in posts:
        # Honour the requested period instead of silently ignoring it.
        if period is not None and period not in (post.get("periods") or ()):
            continue

        # `or ""` also covers a key that is present but set to None.
        title = post.get("title") or ""
        if mentions_everyone(title):
            matches.append(title)

        for comment in post.get("comments") or []:
            body = comment.get("body") or ""
            if mentions_everyone(body):
                matches.append(body)

    return matches

In [None]:
# Load spaCy's small English pipeline; this cell takes its default stop-word set.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): this rebinds `stopwords`, which the imports bind to the NLTK
# corpus reader (`from nltk.corpus import stopwords`). Which object is live
# depends on cell execution order -- confirm the shadowing is intended.
stopwords = nlp.Defaults.stop_words

def get_sentiment_scores(text_list: list[str], top_n=10, min_word_freq=3):
    """
    Score individual words with VADER and return the highest-scoring ones.

    Each text is lower-cased and split into alphabetic tokens of three or
    more letters; tokens found in the module-level ``stopwords`` are dropped.
    A word contributes one compound score per text it appears in, and its
    total occurrence count is accumulated across all texts.

    Args:
        text_list: Texts to analyse; empty or ``None`` entries are skipped.
        top_n: Maximum number of (word, score) pairs to return.
        min_word_freq: Minimum total occurrences a word needs to be ranked.

    Returns:
        A list of ``(word, mean_compound_score)`` tuples in descending score
        order. Words with equal scores keep the order in which they were
        first seen.
    """
    sia = SentimentIntensityAnalyzer()

    word_scores = defaultdict(list)
    word_counts = Counter()
    polarity_cache = {}  # word -> compound score, so each word is scored once

    for text in text_list:
        if not text:
            continue
        tokens = re.findall(r'\b[a-z]{3,}\b', text.lower())
        # Counter gives per-text counts in one pass and iterates in
        # first-occurrence order. A set would make the order of tied scores
        # depend on the hash seed.
        per_text_counts = Counter(w for w in tokens if w not in stopwords)

        for word, occurrences in per_text_counts.items():
            if word not in polarity_cache:
                polarity_cache[word] = sia.polarity_scores(word)['compound']
            word_scores[word].append(polarity_cache[word])
            word_counts[word] += occurrences

    averaged_scores = {
        word: np.mean(scores)
        for word, scores in word_scores.items()
        if word_counts[word] >= min_word_freq
    }

    sorted_positive = sorted(averaged_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return sorted_positive

# Co-mention text groups, one per pair of people and period label.
jim_elon_2022_2023 = find_co_mentions(processed_posts, "Jim Cramer", "Elon Musk", aliases=CONFIG.ALIASES, period="2022-2023")
biden_senders_before_2020 = find_co_mentions(processed_posts, "Joe Biden", "Bernie Sanders", aliases=CONFIG.ALIASES, period="<2020")
trump_sanders_before_2020 = find_co_mentions(processed_posts, "Donald Trump", "Bernie Sanders", aliases=CONFIG.ALIASES, period="<2020")
trump_biden_before_2020 = find_co_mentions(processed_posts, "Donald Trump", "Joe Biden", aliases=CONFIG.ALIASES, period="<2020")

# `threesome` below needs these two groups. They are also assigned in a later
# cell, but computing them here keeps this cell runnable on a fresh kernel
# instead of failing with NameError.
trump_pete_20_21 = find_co_mentions(processed_posts, "Donald Trump", "Pete Buttigieg", aliases=CONFIG.ALIASES, period="2020-2021")
trump_pete_24_25 = find_co_mentions(processed_posts, "Donald Trump", "Pete Buttigieg", aliases=CONFIG.ALIASES, period="2024-2025")

threesome = [biden_senders_before_2020, trump_sanders_before_2020, trump_biden_before_2020, trump_pete_20_21, trump_pete_24_25] # 20 sentences

def get_top_tfidf_words(documents: list[str], top_n=20):
    """
    Rank the terms of ``documents`` by their mean TF-IDF weight.

    Only unigrams of three or more lower-case letters/underscores are
    considered. The module-level ``stopwords`` plus a fixed list of person
    names are excluded from the vocabulary.

    Args:
        documents: Raw text documents.
        top_n: Maximum number of (term, score) pairs to return.

    Returns:
        A list of ``(term, mean_tfidf)`` tuples in descending score order,
        or an empty list when ``documents`` is empty.
    """
    # Same guard as the TextAnalysis method of this name: fitting the
    # vectorizer on an empty corpus raises instead of returning nothing.
    if not documents:
        return []

    vectorizer = TfidfVectorizer(
        stop_words=list(stopwords) + ["elon", "musk", "jim", "cramer", "biden", "sanders", "trump", "bernie"],
        ngram_range=(1,1),
        token_pattern=r'\b[a-z_]{3,}\b',
        lowercase=True
    )
    tfidf_matrix = vectorizer.fit_transform(documents)
    # Column-wise mean: the average weight of each term over all documents.
    mean_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
    tfidf_scores = dict(zip(vectorizer.get_feature_names_out(), mean_scores))
    top_tfidf = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return top_tfidf

# Collapse every co-mention group into a single space-separated document
threesome_docs = list(map(" ".join, threesome))

# Rank terms across the per-group documents and print the result
print(get_top_tfidf_words(threesome_docs))


In [None]:
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Only run these once to download NLTK data.
# Newer NLTK releases look up the 'punkt_tab' resource for word_tokenize,
# older ones 'punkt'; requesting both keeps tokenisation working on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Stopwords and punctuation setup
stop_words = set(stopwords.words('english'))
# Translation table that deletes every ASCII punctuation character.
punct_table = str.maketrans('', '', string.punctuation)

# Preprocessing function
def preprocess_texts(raw_texts):
    """
    Turn raw documents into lists of filtered tokens.

    Each document is lower-cased, stripped of punctuation via ``punct_table``
    and split with ``word_tokenize``. A token is kept only if it is not in
    ``stop_words``, is longer than three characters and is not in ``names``.
    Documents left without any token are dropped from the result.

    Args:
        raw_texts: Iterable of document strings.

    Returns:
        A list of token lists, one per document that kept at least one token.
    """
    def is_informative(tok):
        return not (tok in stop_words or len(tok) <= 3 or tok in names)

    token_lists = (
        [tok for tok in word_tokenize(doc.lower().translate(punct_table)) if is_informative(tok)]
        for doc in raw_texts
    )
    return [kept for kept in token_lists if kept]

# LDA function
def run_lda(texts, num_topics=2):
    """
    Fit a gensim LDA model on tokenised documents and print its topics.

    Args:
        texts: List of token lists, one per document.
        num_topics: Number of topics to fit.

    Returns:
        A dict with the keys 'model', 'dictionary', 'corpus' and 'topics',
        or ``None`` when no document retains any term after filtering.
    """
    dictionary = corpora.Dictionary(texts)
    # Keep terms present in at least 2 documents and in at most 80% of them.
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # doc2bow yields an empty list for a document with no surviving terms, so
    # a non-empty `corpus` can still contain nothing to train on. Checking the
    # bags themselves also covers the empty-input case.
    if not any(corpus):
        print("No valid documents for LDA after filtering.")
        return None

    # Fixed random_state keeps the fitted topics reproducible between runs.
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
    topics = lda.print_topics(num_topics=num_topics, num_words=8)

    print(f"\nLDA Topics (num_topics={num_topics}):")
    print("=" * 50)
    for i, topic in enumerate(topics):
        print(f"Topic {i}: {topic[1]}")

    # Coherence is informational only; a missing optional dependency must not
    # discard the fitted model.
    try:
        coherence_model = CoherenceModel(
            model=lda, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_score = coherence_model.get_coherence()
        print(f"\nCoherence Score: {coherence_score:.4f}")
    except ImportError:
        print("\nInstall gensim[complete] for coherence scoring.")

    return {
        'model': lda,
        'dictionary': dictionary,
        'corpus': corpus,
        'topics': topics
    }




# Texts co-mentioning Donald Trump and Pete Buttigieg, requested for two period labels.
trump_pete_20_21 = find_co_mentions(processed_posts, "Donald Trump", "Pete Buttigieg", aliases=CONFIG.ALIASES, period="2020-2021")
trump_pete_24_25 = find_co_mentions(processed_posts, "Donald Trump", "Pete Buttigieg", aliases=CONFIG.ALIASES, period="2024-2025")

# Disabled LDA run on the 2020-2021 group:
# texts_20_21 = preprocess_texts(trump_pete_20_21)
# lda_output = run_lda(texts_20_21, num_topics=2)
# 2024-2025 (presumably a placeholder for the same run on trump_pete_24_25 -- TODO confirm)


In [None]:
import matplotlib.pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer

def histogram_of_words_sentiment(words: list[str]):
    """
    Plot and return how many words are positive, neutral or negative.

    Each word is scored on its own with VADER. A compound score at or above
    ``CONFIG.SENTIMENT_THRESHOLD`` counts as positive, at or below its
    negation as negative, and anything in between as neutral.

    Args:
        words: Words to classify.

    Returns:
        A dict with the keys "positive", "neutral" and "negative" mapped to
        their counts. A bar chart of the same counts is shown as a side effect.
    """
    analyzer = SentimentIntensityAnalyzer()
    tally = {"positive": 0, "neutral": 0, "negative": 0}

    for term in words:
        compound = analyzer.polarity_scores(term)['compound']
        if compound >= CONFIG.SENTIMENT_THRESHOLD:
            bucket = "positive"
        elif compound <= -CONFIG.SENTIMENT_THRESHOLD:
            bucket = "negative"
        else:
            bucket = "neutral"
        tally[bucket] += 1

    # Bar colours follow the dict's key order: positive, neutral, negative.
    plt.bar(list(tally), list(tally.values()), color=['green', 'gray', 'red'])
    plt.title("Sentiment Distribution of Words")
    plt.xlabel("Sentiment")
    plt.ylabel("Number of Words")
    plt.show()

    return tally


# Show the histogram and print the returned category counts.
# NOTE(review): `words` is not assigned in this cell -- confirm it is defined
# by an earlier cell before this one runs.
print(histogram_of_words_sentiment(words))

In [None]:
# One document per post: its title followed by all of its comment bodies.
# The .get()/`or` fallbacks tolerate a missing or None title, body or comment
# list, matching the guarded access used when building the TF-IDF corpus.
all_sentences = [
    (post.get("title") or "")
    + " "
    + " ".join((comment.get("body") or "") for comment in (post.get("comments") or []))
    for post in processed_posts
]
processed_sentences = preprocess_texts(all_sentences)
lda_output = run_lda(processed_sentences, num_topics=2)
# run_lda returns None when nothing survives filtering, so this may print None.
print(lda_output)