In [None]:
!pip install praw
import praw
import config

# Authenticated PRAW client shared by every scraping cell below.
# All credentials are read from the local `config` module, so no secret is
# written into the notebook itself.
# NOTE(review): passing username/password presumably selects Reddit's
# password ("script" app) flow — confirm against the app registered for CLIENT_ID.
reddit = praw.Reddit(
    client_id=config.CLIENT_ID,
    client_secret=config.CLIENT_SECRET,
    user_agent=config.USER_AGENT,
    username=config.USERNAME,
    password=config.PASSWORD
)

In [None]:
# Smoke test for the client: print the titles of the first ten entries of
# r/philosophy's "hot" listing.
subreddit = reddit.subreddit("philosophy")
for post in subreddit.hot(limit=10):
    print(post.title)

In [None]:
!pip install pandas
import pandas as pd
import time

def fetch_posts(subreddit_name, limit=500, time_filter="month", pause=0.2, client=None):
    """Collect the top submissions of one subreddit into a DataFrame.

    Stickied and locked submissions are skipped.

    Args:
        subreddit_name: Subreddit name without the ``r/`` prefix.
        limit: Maximum number of submissions requested from the listing.
        time_filter: Window passed to ``subreddit.top`` (default ``"month"``).
        pause: Seconds to sleep after each submission read from the listing;
            ``0`` disables the pause.
        client: Object exposing ``subreddit(name)``; defaults to the
            notebook-level ``reddit`` client.

    Returns:
        DataFrame with one row per kept submission. The column set is fixed
        (see ``columns``) even when no submission is kept, so concatenating
        and saving results always yields the same schema.
    """
    columns = [
        "subreddit", "id", "title", "text", "author",
        "score", "created_utc", "num_comments", "url",
    ]
    if client is None:
        client = reddit  # module-level PRAW client created earlier in the notebook

    listing = client.subreddit(subreddit_name).top(limit=limit, time_filter=time_filter)

    posts = []
    for post in listing:
        if not post.stickied and not post.locked:
            posts.append({
                "subreddit": subreddit_name,
                "id": post.id,
                "title": post.title,
                "text": post.selftext,
                # A falsy author (e.g. None) is recorded under a placeholder.
                "author": str(post.author) if post.author else "[deleted]",
                "score": post.score,
                "created_utc": post.created_utc,
                "num_comments": post.num_comments,
                "url": post.url
            })
        if pause:
            time.sleep(pause)

    return pd.DataFrame(posts, columns=columns)

In [None]:
# Subreddits to compare.
subs = ["philosophy", "askphilosophy", "sociology", "stoicism", "TrueAskReddit"]

# Collect the per-subreddit frames and concatenate once at the end instead of
# growing `all_posts` inside the loop (which re-copies every earlier row on each
# iteration and starts from an empty frame).
frames = []
for sub in subs:
    print(f"Fetching from r/{sub}...")
    df = fetch_posts(sub, limit=500)
    frames.append(df)

all_posts = pd.concat(frames, ignore_index=True)

# Save to CSV
all_posts.to_csv("reddit_philosophy_data.csv", index=False)
print("Saved all posts to reddit_philosophy_data.csv")

In [None]:
# Reload the scraped posts from the CSV written above; `df` is the working
# frame that all later cells extend.
# NOTE(review): with read_csv's default NA handling, empty self-text presumably
# comes back as NaN (clean_text maps null input to "") — confirm.
df = pd.read_csv("reddit_philosophy_data.csv")
print(df.shape)
df.sample(5)

In [None]:
import re

def clean_text(text):
    """Normalise a post title or body for bag-of-words analysis.

    Steps, in order: remove Markdown links, remove bare URLs, strip every
    character that is neither a word character nor whitespace, collapse runs
    of whitespace, trim and lower-case.

    Args:
        text: Raw text. Null values (``None``/``NaN``) are accepted; other
            non-string values are converted with ``str``.

    Returns:
        The cleaned, lower-case string (``""`` for null input).
    """
    if pd.isnull(text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Markdown links are removed BEFORE bare URLs: the URL pattern would
    # otherwise consume the link's closing ")" and leave "[label](" behind,
    # after which the link pattern either fails or swallows everything up to
    # the next ")" in the text.
    text = re.sub(r"\[.*?\]\(.*?\)", "", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()

# Cleaned copies of the title and body, plus `full_text`: cleaned title and
# cleaned body joined by a single space.
for raw_col, clean_col in (("title", "clean_title"), ("text", "clean_text")):
    df[clean_col] = df[raw_col].apply(clean_text)

df["full_text"] = df["clean_title"] + " " + df["clean_text"]

In [None]:
# Spot-check five random rows of the cleaned columns.
df[["subreddit", "clean_title", "clean_text"]].sample(5)

In [None]:
!python3 -m spacy download en_core_web_sm

In [None]:
!pip install scikit-learn
!pip install spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_relevant_words(text):
    """Return the lemmatised content words of ``text`` as one string.

    A token is kept when spaCy tags it as a noun, proper noun, adjective or
    verb, it consists of alphabetic characters only, and it is not a stop
    word. Kept lemmas are lower-cased and joined with single spaces.
    Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""

    wanted_pos = {"NOUN", "PROPN", "ADJ", "VERB"}
    lemmas = []
    for tok in nlp(text):
        if tok.pos_ not in wanted_pos:
            continue
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_.lower())
    return " ".join(lemmas)

# Lemmatised content words per post (runs the spaCy pipeline over every row).
df["content_words"] = df["full_text"].apply(extract_relevant_words)

# One "document" per subreddit: the content words of all its posts joined together.
subreddit_docs = (
    df.groupby("subreddit")["content_words"]
    .apply(" ".join)
    .to_dict()
)

# At most 50 terms; a term must occur in at least 2 subreddit documents and in
# no more than 80% of them.
vectorizer = TfidfVectorizer(max_features=50, min_df=2, max_df=0.8)
tfidf_matrix = vectorizer.fit_transform(subreddit_docs.values())

# Rows = subreddits, columns = terms.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=subreddit_docs.keys(),
    columns=vectorizer.get_feature_names_out(),
)

import matplotlib.pyplot as plt
import seaborn as sns

def _plot_tfidf_heatmap(frame, title, save_path=None):
    """Draw a keyword-by-subreddit TF-IDF heatmap.

    Args:
        frame: DataFrame with subreddits as rows and terms as columns.
        title: Figure title.
        save_path: When given, the figure is also written there at 300 dpi.
    """
    plt.figure(figsize=(12, 8))
    # Transposed so keywords run down the y-axis and subreddits along the x-axis.
    sns.heatmap(frame.T, cmap="magma", annot=True)
    plt.title(title)
    plt.xlabel("Subreddit")
    plt.ylabel("Keyword")
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.show()


# All 50 selected terms.
_plot_tfidf_heatmap(tfidf_df, "Refined TF-IDF Keyword Strength by Subreddit")

# Generic terms removed from the second heatmap.
noise_terms = {"thank", "start", "bring", "happen", "use", "thing", "stuff", "someone", "everyone", "anyone"}

tfidf_df_cleaned = tfidf_df.drop(columns=[col for col in tfidf_df.columns if col in noise_terms])

_plot_tfidf_heatmap(
    tfidf_df_cleaned,
    "Filtered TF-IDF: Discourse-Significant Terms Only",
    save_path="tfidf_discourse_heatmap.png",
)

In [None]:
!pip install spacy
from collections import Counter
import itertools
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_concepts(text):
    """List the lower-cased lemmas of the nouns and proper nouns in ``text``.

    Stop words and tokens that are not purely alphabetic are skipped.
    Duplicates are kept, in document order. Non-string input yields ``[]``.
    """
    if not isinstance(text, str):
        return []

    concept_pos = {"NOUN", "PROPN"}
    found = []
    for tok in nlp(text):
        keep = tok.pos_ in concept_pos and not tok.is_stop and tok.is_alpha
        if keep:
            found.append(tok.lemma_.lower())
    return found

# Noun / proper-noun lemmas of each post, stored as a list per row.
# NOTE(review): this parses `full_text` with spaCy again; the earlier
# `content_words` step already ran the same pipeline over the same column.
df["concepts"] = df["full_text"].apply(extract_concepts)

In [None]:
from collections import defaultdict

# For every unordered pair of concepts: the number of posts in which both appear.
co_occurrence = defaultdict(int)

for concepts in df["concepts"]:
    # De-duplicate within the post and sort, so each pair has a single
    # canonical (a, b) key with a < b.
    ordered_terms = sorted(set(concepts))
    for pair in itertools.combinations(ordered_terms, 2):
        co_occurrence[pair] += 1

In [None]:
!pip install networkx
import networkx as nx

# Undirected concept graph: one edge per pair that co-occurs in at least
# 3 posts, with the co-occurrence count stored as the edge "weight".
G = nx.Graph()
G.add_weighted_edges_from(
    (term1, term2, weight)
    for (term1, term2), weight in co_occurrence.items()
    if weight >= 3
)

In [None]:
# Degree of a concept = number of distinct concepts it is linked to in G.
degree_dict = dict(G.degree())
degrees = degree_dict.values()
print("Max degree:", max(degrees))
print("Min degree:", min(degrees))

# Keep the 50 highest-degree concepts and the edges among them.
top_nodes = sorted(degree_dict.items(), key=lambda item: item[1], reverse=True)[:50]
nodes_to_keep = [node for node, _ in top_nodes]
G_filtered = G.subgraph(nodes_to_keep).copy()

print("Filtered nodes:", G_filtered.number_of_nodes())

In [None]:
# Keep only the edges of the top-50 subgraph whose weight reaches the
# threshold; nodes without any such edge are not carried over.
min_edge_weight = 35

G_strong = nx.Graph()
G_strong.add_edges_from(
    (u, v, data)
    for u, v, data in G_filtered.edges(data=True)
    if data["weight"] >= min_edge_weight
)

print("Nodes after filtering:", G_strong.number_of_nodes())
print("Edges after filtering:", G_strong.number_of_edges())

In [None]:
import matplotlib.pyplot as plt

# spring_layout starts from random positions; without a fixed seed every run
# draws (and saves) a different picture of the same graph.
LAYOUT_SEED = 42

plt.figure(figsize=(18, 14))
pos = nx.spring_layout(G_strong, k=0.9, iterations=150, seed=LAYOUT_SEED)

# Nodes, edges and labels are drawn separately so each can be styled on its own.
nx.draw_networkx_nodes(G_strong, pos, node_size=500, node_color="orange", alpha=0.85)
nx.draw_networkx_edges(G_strong, pos, width=1.5, alpha=0.4)
nx.draw_networkx_labels(G_strong, pos, font_size=11)

plt.title("Core Concept Co-Occurrence Network", fontsize=16)
plt.axis("off")
plt.tight_layout()
plt.savefig("concept_network.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
!pip install nltk

import nltk
# Download the 'vader_lexicon' resource needed by the analyzer created below.
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
# One analyzer instance, reused for every post.
sia = SentimentIntensityAnalyzer()

In [None]:
# VADER "compound" score per post.
# NOTE(review): `full_text` has already been lower-cased and stripped of
# punctuation by clean_text; VADER is usually applied to raw text because it
# uses capitalisation/punctuation cues — confirm that scoring the cleaned
# text is intended.
df["sentiment"] = df["full_text"].apply(lambda x: sia.polarity_scores(x)["compound"])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the compound sentiment score within each subreddit.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="subreddit", y="sentiment", ax=ax)
ax.set_title("Sentiment Distribution by Subreddit")
ax.set_xlabel("Subreddit")
ax.set_ylabel("Compound Sentiment Score")
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig("sentiment_dist.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
import zipfile

# Unpack the NRC lexicon archive (path is relative to the working directory)
# into ./nrc_emotion_lexicon.
with zipfile.ZipFile("NRC-Emotion-Lexicon.zip", "r") as zip_ref:
    zip_ref.extractall("nrc_emotion_lexicon")

In [None]:
import os

# Walk the extracted tree and report where the word-level lexicon file is.
for root, dirs, files in os.walk("nrc_emotion_lexicon"):
    matches = [name for name in files if "Wordlevel-v0.92.txt" in name]
    for file in matches:
        full_path = os.path.join(root, file)
        print("✅ FOUND:", full_path)

In [None]:
# word -> set of emotion labels, built from the rows whose third field is 1.
emotion_lexicon = {}

lexicon_path = "nrc_emotion_lexicon/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
with open(lexicon_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) != 3:
            continue  # not a word<TAB>emotion<TAB>flag row
        word, emotion, score = parts
        if int(score) == 1:
            emotion_lexicon.setdefault(word, set()).add(emotion)

# Peek at the first ten entries.
print(list(emotion_lexicon.items())[:10])

In [None]:
from collections import Counter

def get_emotions(text):
    """Tally lexicon emotion labels over the words of ``text``.

    The text is lower-cased and split on whitespace; every word found in
    ``emotion_lexicon`` adds one to each of its labels (repeated words count
    each time). Non-string input yields an empty ``Counter``.
    """
    if not isinstance(text, str):
        return Counter()

    tally = Counter()
    for word in text.lower().split():
        if word in emotion_lexicon:
            tally.update(emotion_lexicon[word])
    return tally

# One Counter of emotion labels per post, computed from the cleaned full text.
df["emotion_counts"] = df["full_text"].apply(get_emotions)

In [None]:
# One column per emotion label (0 where a post has none of that label), plus
# the post's subreddit for grouping.
emotion_df = (
    df["emotion_counts"]
    .apply(pd.Series)
    .fillna(0)
    .assign(subreddit=df["subreddit"])
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean per-post count of each emotion label, by subreddit.
emotion_means = emotion_df.groupby("subreddit").mean()

fig, ax = plt.subplots(figsize=(12, 6))
# Transposed: emotions down the y-axis, subreddits along the x-axis.
sns.heatmap(emotion_means.T, cmap="coolwarm", annot=True, ax=ax)
ax.set_title("Average Emotion Score per Subreddit")
ax.set_xlabel("Subreddit")
ax.set_ylabel("Emotion")
fig.tight_layout()
fig.savefig("indepth_sentiment_dist.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Lower-case single-word names looked up by match_known_names.
# Matching is by exact word, so entries that are also ordinary words
# (e.g. "mill") will match non-name uses as well.
known_names = {
    "nietzsche", "kant", "plato", "socrates", "aristotle", "foucault", 
    "camus", "locke", "hume", "spinoza", "marx", "zizek", "aurelius", 
    "descartes", "heidegger", "sartre", "chomsky", "rawls", "mill", "hobbes"
}

In [None]:
from collections import Counter

def match_known_names(text, names=None):
    """Return the known thinkers mentioned in ``text``.

    Matching is by whole, lower-cased, whitespace-separated word.

    Args:
        text: Text to scan; non-string input yields ``[]``.
        names: Optional collection of lower-case names to look for; defaults
            to the notebook-level ``known_names``.

    Returns:
        Alphabetically sorted list of the matched names, each at most once.
    """
    if not isinstance(text, str):
        return []
    if names is None:
        names = known_names
    words = set(text.lower().split())
    # Sorted so the result never depends on set iteration order, which is not
    # stable between interpreter runs for strings.
    return sorted(set(names) & words)

# Names matched in each post, then corpus-wide totals (a post contributes at
# most one count per name).
df["figures_cited"] = df["full_text"].apply(match_known_names)
name_counts = Counter(
    name
    for cited in df["figures_cited"]
    for name in cited
)
top_cited = name_counts.most_common(20)

for name, count in top_cited:
    print(f"{name.title()}: {count}")

In [None]:
# Reference emotion profile per thinker: emotion label -> weight.
# Profiles are sparse (each lists only some of the ten labels) and all weights
# lie between 0.1 and 0.8.
# "zizek" and "mill" appear in known_names but have no profile here, so
# compare_to_figure_profile returns None for them.
# NOTE(review): the notebook does not show where these weights come from —
# document their provenance before interpreting the distances built on them.
figure_emotion_profiles = {
    "aurelius":   {"trust": 0.8, "joy": 0.4, "sadness": 0.1, "fear": 0.1, "positive": 0.6, "negative": 0.1},
    "kant":       {"trust": 0.7, "joy": 0.1, "fear": 0.2, "anticipation": 0.4, "positive": 0.3},
    "plato":      {"trust": 0.5, "anticipation": 0.3, "joy": 0.2},
    "nietzsche":  {"anger": 0.4, "fear": 0.5, "sadness": 0.3, "negative": 0.6},
    "aristotle":  {"trust": 0.6, "joy": 0.3, "positive": 0.5},
    "socrates":   {"trust": 0.7, "joy": 0.2, "surprise": 0.2},
    "camus":      {"sadness": 0.5, "disgust": 0.3, "negative": 0.6},
    "descartes":  {"trust": 0.6, "anticipation": 0.4},
    "marx":       {"anger": 0.4, "disgust": 0.3, "sadness": 0.3, "negative": 0.5},
    "spinoza":    {"joy": 0.4, "trust": 0.5, "positive": 0.5},
    "hume":       {"trust": 0.6, "joy": 0.3, "positive": 0.4},
    "sartre":     {"sadness": 0.4, "fear": 0.3, "negative": 0.5},
    "heidegger":  {"fear": 0.5, "disgust": 0.3, "negative": 0.4},
    "foucault":   {"fear": 0.4, "disgust": 0.3, "sadness": 0.2, "negative": 0.5},
    "hobbes":     {"fear": 0.5, "anger": 0.4, "negative": 0.5},
    "rawls":      {"trust": 0.6, "anticipation": 0.3, "positive": 0.4},
    "locke":      {"trust": 0.6, "joy": 0.2, "positive": 0.3},
    "chomsky":    {"anger": 0.3, "trust": 0.5, "fear": 0.2},
}

In [None]:
# Expand the per-post emotion Counters into one numeric column per label.
# Labels a post never triggers are filled with 0, as in the earlier
# emotion-heatmap cell; left as NaN they would make the group means below
# average only over the posts that contain the label.
emotion_df = df["emotion_counts"].apply(pd.Series).fillna(0)

# Drop emotion columns left over from a previous run of this cell before
# attaching the new ones, so re-running it does not create duplicate columns.
df = pd.concat(
    [df.drop(columns=emotion_df.columns, errors="ignore"), emotion_df],
    axis=1,
)

In [None]:
emotion_cols = [
    "positive", "negative", "anger", "fear", "joy",
    "sadness", "trust", "anticipation", "disgust", "surprise",
]

# Emotion columns plus the two keys needed for grouping.
df_emotions = df[emotion_cols].assign(
    figures_cited=df["figures_cited"],
    subreddit=df["subreddit"],
)

# One row per (post, cited figure); posts citing nobody explode to NaN and are dropped.
df_figure_mention = (
    df_emotions
    .explode("figures_cited")
    .dropna(subset=["figures_cited"])
)

In [None]:
from scipy.spatial.distance import euclidean
import numpy as np

def compare_to_figure_profile(row):
    """Euclidean distance between a group's emotions and the cited figure's profile.

    ``row`` must come from a frame whose index is (subreddit, figure), so that
    ``row.name[1]`` is the figure. Only the emotions present both in the row
    and in the figure's profile enter the distance.

    Returns:
        The distance, or ``None`` when the figure has no profile, no emotion
        is shared, or any shared value in the row is NaN.
    """
    figure = row.name[1]
    if figure not in figure_emotion_profiles:
        return None
    profile = figure_emotion_profiles[figure]

    common = set(row.index) & set(profile.keys())
    if not common:
        return None

    # Build both vectors in the same pass so their components line up.
    observed, reference = [], []
    for emotion in common:
        observed.append(row[emotion])
        reference.append(profile[emotion])

    if any(pd.isna(value) for value in observed):
        return None
    return euclidean(observed, reference)

# Mean of every emotion column over the posts of each (subreddit, cited figure) pair.
sub_cite_emotions = df_figure_mention.groupby(["subreddit", "figures_cited"])[emotion_cols].mean()

# Distance of each pair's mean emotions to the figure's reference profile
# (None where compare_to_figure_profile cannot compute one).
sub_cite_emotions["emotion_distance"] = sub_cite_emotions.apply(compare_to_figure_profile, axis=1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Flatten the (subreddit, figure) index, drop pairs without a distance and
# order the rows from largest to smallest distance.
plot_df = (
    sub_cite_emotions
    .reset_index()
    .dropna(subset=["emotion_distance"])
    .sort_values("emotion_distance", ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=plot_df, x="figures_cited", y="emotion_distance", hue="subreddit", ax=ax)

ax.set_title("Emotional Distance Between Subreddits and Cited Figures")
ax.set_xlabel("Cited Figure")
ax.set_ylabel("Euclidean Distance in Emotion Profile")
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig("Cited-figures-subreddit.png", dpi=300, bbox_inches="tight")
plt.show()

# Second name for the same frame, used by the inspection cell that follows.
distance_df = plot_df

In [None]:
# Which subreddits have at least one distance, and how many (subreddit, figure)
# rows each contributes.
print(distance_df["subreddit"].unique())
print(distance_df["subreddit"].value_counts())

In [None]:
import pandas as pd
import numpy as np

In [None]:
emotion_cols = [
    "positive", "negative", "anger", "fear", "joy",
    "sadness", "trust", "anticipation", "disgust", "surprise",
]

# 1. Average emotion intensity per subreddit
emotion_means = df.groupby("subreddit")[emotion_cols].mean()

# 2. TF-IDF uniqueness: number of terms whose weight exceeds 0.1 in each subreddit
tfidf_uniqueness = tfidf_df.gt(0.1).sum(axis=1)

# 3. Citation frequency per subreddit (one count per figure named in a post)
cited_per_post = df["figures_cited"].explode()
citation_counts = cited_per_post.groupby(df["subreddit"]).count()

# 4. Mean emotional distance to cited figures
mean_emotion_distance = plot_df.groupby("subreddit")["emotion_distance"].mean()

In [None]:
# One row per subreddit: the three scalar metrics followed by the mean emotion
# columns; anything missing for a subreddit becomes 0.
subreddit_summary = (
    pd.DataFrame({
        "tfidf_uniqueness": tfidf_uniqueness,
        "citation_count": citation_counts,
        "mean_emotion_distance": mean_emotion_distance
    })
    .join(emotion_means)
    .fillna(0)
)

In [None]:
# Sanity check: available metric columns and the values for one subreddit.
print(subreddit_summary.columns)
print(subreddit_summary.loc["philosophy"])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every summary column across subreddits, keeping the original
# row and column labels, so the metrics share one scale on the radar charts.
scaler = StandardScaler()
normalized_summary = pd.DataFrame(
    scaler.fit_transform(subreddit_summary),
    columns=subreddit_summary.columns,
    index=subreddit_summary.index
)

In [None]:
# Preview the standardised summary.
normalized_summary.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_radar(data, subreddit, save=False):
    """Draw a radar (polar) chart of one subreddit's row in ``data``.

    Args:
        data: DataFrame indexed by subreddit; every column becomes one spoke.
        subreddit: Index label of the row to plot.
        save: When true, also write the figure to ``radar_<subreddit>.png``.

    Note:
        A later cell of this notebook defines another ``plot_radar``; once
        that cell has run, it replaces this definition.
    """
    # Uses the `plt` / `np` imported at the top of this cell; the function-local
    # re-imports were redundant.
    labels = data.columns.tolist()
    values = data.loc[subreddit].values

    # Repeat the first value (and angle) at the end so the outline closes.
    values = np.concatenate((values, [values[0]]))
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.plot(angles, values, linewidth=2)
    ax.fill(angles, values, alpha=0.25)

    # One tick per metric; the duplicated closing angle gets no label.
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels, fontsize=10)
    ax.set_yticklabels([])  # hide the radial scale
    ax.set_title(f"Discourse Profile: {subreddit}", fontsize=14)

    plt.tight_layout()

    if save:
        filename = f"radar_{subreddit}.png"
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        print(f"✅ Saved: {filename}")

    plt.show()

In [None]:
# Single-subreddit check of the radar chart (also saved to disk).
plot_radar(normalized_summary, "askphilosophy", save=True)

In [None]:
# Draw and save one radar chart per subreddit.
for sub in normalized_summary.index:
    plot_radar(normalized_summary, sub, save=True)

In [None]:
# Up to five highest-weighted TF-IDF terms per subreddit (zero-weight terms excluded).
top_keywords = {}

for sub in tfidf_df.index:
    weights = tfidf_df.loc[sub]
    ranked = weights[weights > 0].sort_values(ascending=False)
    top_keywords[sub] = ranked.head(5).index.tolist()

In [None]:
def plot_radar(data, subreddit, top_keywords_dict=None, save=True):
    """Draw a radar chart for one subreddit with its top TF-IDF terms as a footer.

    Args:
        data: DataFrame indexed by subreddit; every column becomes one spoke.
        subreddit: Index label of the row to plot.
        top_keywords_dict: Mapping subreddit -> list of terms shown under the
            chart. Defaults to the notebook-level ``top_keywords``, looked up
            at call time so a rebuilt ``top_keywords`` is picked up without
            re-running this cell.
        save: When true (the default) the figure is written to
            ``radar_<subreddit>.png``. Accepting this keyword keeps calls
            written for the earlier ``plot_radar(..., save=True)`` definition
            working after this cell has run.
    """
    if top_keywords_dict is None:
        top_keywords_dict = top_keywords

    labels = data.columns.tolist()
    values = data.loc[subreddit].values.tolist()
    values += values[:1]  # repeat the first point so the outline closes

    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

    ax.plot(angles, values, linewidth=2)
    ax.fill(angles, values, alpha=0.25)

    # One tick per metric; the duplicated closing angle gets no label.
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    ax.set_title(f"Discourse Profile: {subreddit}", y=1.1)

    # Keywords go at the bottom, inside the figure bounds.
    keywords = ", ".join(top_keywords_dict.get(subreddit, []))
    fig.text(0.5, 0.02, f"Top terms: {keywords}", ha="center", fontsize=10)

    # tight_layout is deliberately not called here: it clipped the footer text.

    if save:
        plt.savefig(f"radar_{subreddit}.png", dpi=300, bbox_inches="tight")
    plt.show()

In [None]:
# Redraw every subreddit's radar chart, now with its top terms underneath.
for sub in normalized_summary.index:
    plot_radar(normalized_summary, sub, top_keywords)

In [None]:
# 1. Emotional intensity = per-subreddit standard deviation of each emotion
#    column, averaged over the ten emotions
emotion_cols = ["positive", "negative", "anger", "fear", "joy", "sadness", "trust", "anticipation", "disgust", "surprise"]
subreddit_summary["emotion_intensity"] = df[emotion_cols].groupby(df["subreddit"]).std().mean(axis=1)

# 2. Emotional temperature (mean positive - mean negative)
polarity_means = df.groupby("subreddit")[["positive", "negative"]].mean()
subreddit_summary["emotional_temp"] = polarity_means["positive"] - polarity_means["negative"]

# 3. Min-max scale the columns used downstream.
#    Note: this rebinds `scaler`, which earlier held the StandardScaler.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaled_cols = ["tfidf_uniqueness", "citation_count", "mean_emotion_distance", "emotion_intensity", "emotional_temp"]
normalized = pd.DataFrame(
    scaler.fit_transform(subreddit_summary[scaled_cols]),
    columns=[f"{col}_scaled" for col in scaled_cols],
    index=subreddit_summary.index
)

# 4. Merge normalized back in. Any *_scaled columns left from a previous run of
#    this cell are dropped first, so re-running does not duplicate them.
subreddit_summary = pd.concat(
    [subreddit_summary.drop(columns=normalized.columns, errors="ignore"), normalized],
    axis=1,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Sample DataFrame (replace with your real values)
# NOTE(review): every coordinate, size and colour below is a hard-coded
# literal. Nothing in this cell reads subreddit_summary or its *_scaled
# columns, so the compass figure does not reflect the computed metrics until
# these values are replaced.
# Columns: x / y = bubble position, size = marker area passed to scatter(s=...),
# color = value mapped through the "coolwarm" colormap.
compass_df = pd.DataFrame({
    "subreddit": ["stoicism", "askphilosophy", "philosophy", "TrueAskReddit", "sociology"],
    "x": [0.1, 0.35, 0.1, 1.0, 0.3],
    "y": [1.0, 0.48, 0.0, 0.8, 0.44],
    "size": [300, 500, 100, 600, 400],
    "color": [0.7, 0.8, 0.2, 1.0, 0.6]
})

# Four-quadrant "compass" scatter of the subreddits in compass_df.
plt.figure(figsize=(10, 8))
ax = plt.gca()

# Quadrant background colors
colors = {
    "top_left": "#c7d5e0",
    "top_right": "#faebd7",
    "bottom_left": "#dcdcdc",
    "bottom_right": "#f2f2f2"
}

# Draw quadrant rectangles
# The y-range (first two arguments) is in data units; xmin/xmax are fractions of
# the axes width. With the symmetric limits set further down (-0.05 .. 1.05) the
# 0.5 fraction coincides with x = 0.5. The bands cover y in [0, 1] only, so the
# small margins outside that range stay unshaded.
ax.axhspan(0.5, 1, xmin=0, xmax=0.5, facecolor=colors["top_left"], alpha=0.6, zorder=0)
ax.axhspan(0.5, 1, xmin=0.5, xmax=1, facecolor=colors["top_right"], alpha=0.6, zorder=0)
ax.axhspan(0, 0.5, xmin=0, xmax=0.5, facecolor=colors["bottom_left"], alpha=0.6, zorder=0)
ax.axhspan(0, 0.5, xmin=0.5, xmax=1, facecolor=colors["bottom_right"], alpha=0.6, zorder=0)

# Plot subreddit bubbles
scatter = ax.scatter(
    compass_df["x"], compass_df["y"],
    s=compass_df["size"],
    c=compass_df["color"],
    cmap="coolwarm",
    alpha=0.9,
    edgecolors='black'
)

# Subreddit name labels, placed slightly above each bubble
for i, row in compass_df.iterrows():
    ax.text(
        row["x"], row["y"] + 0.05,
        row["subreddit"],
        ha="center", va="bottom",
        fontsize=10, color="black", weight="bold"
    )

# Add quadrant labels with semi-transparent background
def label_quadrant(x, y, text):
    # Writes `text` centred at data position (x, y) on the module-level `ax`.
    ax.text(
        x, y, text,
        ha="center", va="center",
        fontsize=10, color="black", weight="bold",
        bbox=dict(facecolor="white", alpha=0.7, edgecolor="none", boxstyle="round,pad=0.3")
    )

label_quadrant(0.25, 0.75, "Detached / Reflective")
label_quadrant(0.75, 0.75, "Expressive / Experiential")
label_quadrant(0.25, 0.25, "Detached / Cold")
label_quadrant(0.75, 0.25, "Ideological / Abstract")

# Axis dividers
ax.axvline(0.5, color="gray", linestyle="--", linewidth=1)
ax.axhline(0.5, color="gray", linestyle="--", linewidth=1)
# A little padding so bubbles at 0 or 1 are not cut off by the frame.
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(-0.05, 1.05)

# Axis labels & title
ax.set_title("🧭 The Discourse Compass: Mapping Reddit Philosophy Subcultures", fontsize=14, weight="bold", pad=20)
ax.set_xlabel("Conceptual Anchoring (Closer to Cited Thinkers →)", fontsize=11)
ax.set_ylabel("Emotional Expressiveness (Muted → Intense)", fontsize=11)

# Add color legend
# The colorbar is built from a separate mappable fixed to the 0..1 range rather
# than from `scatter`.
norm = plt.Normalize(0, 1)
sm = plt.cm.ScalarMappable(cmap="coolwarm", norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label("Emotional Temperature", fontsize=10)

# Final layout
plt.tight_layout()
plt.savefig("discourse_compass_final_refined.png", dpi=300)
plt.show()