#### Import Libraries and setup HTTP connection

In [1]:
import requests
import pandas as pd
import time
import tqdm
import networkx as nx
import plotly.graph_objects as go
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

client = ''
secret = ''
auth = requests.auth.HTTPBasicAuth(client, secret)

In [2]:
data = {
    'grant_type': 'password',
    'username': '',
    'password': ''
}

headers = {'User-Agent': 'MyAPI/0.0.1'}
res = requests.post('https://www.reddit.com/api/v1/access_token', auth=auth, data=data, headers=headers)
token = res.json()['access_token']  
headers = {**headers, **{'Authorization': f'bearer {token}'}}

#### Functions to retrieve reddit posts and comments

In [178]:
def fetch_posts(subreddit, headers, limit=100, max_posts=1000):
    posts = []
    after = None
    while len(posts) < max_posts:
        url = f'https://oauth.reddit.com/r/{subreddit}?limit={limit}'
        if after:
            url += f'&after={after}'
        
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            json_response = response.json()
            posts.extend(json_response['data']['children'])
            after = json_response['data'].get('after')
            if not after:
                break  # No more posts to fetch
        else:
            print(f"Failed to fetch posts: {response.status_code}")
            break
    
    return posts

def post_ids(posts):
    return [post['data']['id'] for post in posts]

def fetch_comments(subreddit,post_id, headers, limit=100, max_comments=1000):
    comments = []
    after = None
    while len(comments) < max_comments:
        url = f'https://oauth.reddit.com/r/{subreddit}/comments/{post_id}?limit={limit}'
        if after:
            url += f'&after={after}'
        
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            json_response = response.json()
            comments.extend(json_response[1]['data']['children'])
            after = json_response[1]['data'].get('after')
            if not after:
                break  # No more comments to fetch
        else:
            print(f"Failed to fetch comments: {response.status_code}")
            time.sleep(1)
            break
    
    return comments

#### Function to combine posts and comments, for given subreddits

In [179]:
# final code to get post, poster_name, comments and commentor_name to a dataframe. post and comments in 'content' , user in 'user' and a flag for post or comment in 'type'

def posts_to_df(subreddit, headers, limit=100, max_posts=1000):

    df = pd.DataFrame(columns=['content', 'user', 'type','subreddit','post_id'])

    posts = fetch_posts(subreddit, headers, limit=limit, max_posts=max_posts)

    for post in posts:
        new_row = pd.DataFrame({
        'content': [post['data']['title']],
        'user': [post['data']['author']],
        'type': ['post'],
        'subreddit': [subreddit],
        'post_id': [post['data']['id']]
        })
        df = pd.concat([df, new_row], ignore_index=True)

        comments = fetch_comments(subreddit,post['data']['id'], headers, limit=limit, max_comments=max_posts)
        
        for comment in comments:
            
            if 'body' not in comment['data']:
                continue
            
            new_row = pd.DataFrame({
            'content': [comment['data']['body']],
            'user': [comment['data']['author']],
            'type': ['comment'],
            'subreddit': [subreddit],
            'post_id': [post['data']['id']]
            })  
            df = pd.concat([df, new_row], ignore_index=True)

    return df


#### Load data

In [249]:
subreddits = ['democrats', 'republican','conservative','liberal']

posts_df = pd.DataFrame(columns=['content', 'user', 'type','subreddit','post_id'])

for subreddit in tqdm.tqdm(subreddits):
    posts_df_ = posts_to_df(subreddit, headers, limit=100, max_posts=5000)
    time.sleep(1000)
    posts_df = pd.concat([posts_df, posts_df_], ignore_index=True)

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [1:28:01<00:00, 1320.44s/it]


#### Skip data download, and load from local

In [1]:
#posts_df.to_csv('reddit_posts.csv')
#posts_df = pd.read_csv('reddit_posts.csv')

In [2]:
posts_df_['content'] = posts_df_['content'].str.lower()
posts_df_t = posts_df_[(posts_df_['content'].str.contains('trump'))&~(posts_df_['content'].str.contains('biden'))]
posts_df_b = posts_df_[(posts_df_['content'].str.contains('biden'))&~(posts_df_['content'].str.contains('trump'))]

posts_df_ = pd.concat([posts_df_t,posts_df_b])

posts_df = posts_df_[posts_df_['type']=='post'].groupby('subreddit').apply(lambda x: x.sample(100,random_state=1))
posts_df.reset_index(drop=True,inplace=True)
def about_trump_or_biden(text):
    if 'trump' in text:
        return 'trump'
    elif 'biden' in text:
        return 'biden'

posts_df['about_trump_or_biden'] = posts_df['content'].apply(about_trump_or_biden)

# before topic modelling remove stop words and punctuation, and custom words
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stop_words.update(['donald','joe','trump','biden','http','https','www','com','republican','democrat','republicans','democrats','president','election','vote','voting','voted','voter','votes','voters'])
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = word_tokenize(text)
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    text = ' '.join(text)
    return text

posts_df['cleaned_content'] = posts_df['content'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/uditdhand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/uditdhand/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Sentiment Analysis

In [None]:
import pandas as pd
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def analyze_sentiment(text):
    try:
        result = sentiment_pipeline(text)
        return result[0]['label'], result[0]['score']
    except Exception as e:
        return "ERROR", None

posts_df[['sentiment', 'score']] = posts_df.apply(lambda row: analyze_sentiment(row['content']), axis=1, result_type='expand')

# count of positive and negative sentiments for trump and biden and for  each subreddit 

counts = posts_df.groupby(['subreddit','about_trump_or_biden','sentiment']).size().unstack(fill_value=0)
counts['TOTAL'] = counts['NEGATIVE'] + counts['POSITIVE']
counts

#### Topic Modeling

In [None]:
from bertopic import BERTopic

model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = model.fit_transform(posts_df['cleaned_content'])

# Create a DataFrame to hold the topic and probability information
topics_df = pd.DataFrame({
    'topic': topics,
    'probability': [max(prob) if prob.size > 0 else None for prob in probs]
})

# You can now inspect the topics_df DataFrame
print(topics_df.head())

topics_df['topic_name'] = topics_df['topic'].apply(lambda x: model.get_topic_info(x)['Name'])
posts_df['topic'] = topics_df['topic_name']

In [None]:
# count of topic wise posts , sentiment wise, trump or biden wise, subreddit wise

counts = posts_df.groupby(['subreddit','about_trump_or_biden','sentiment','topic']).size().unstack(fill_value=0)
counts

#### Create network

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

avg_sentiment = posts_df.groupby(['topic', 'user'])['score'].mean().reset_index()

edges = [(row['user'], row['topic'], row['score']) for index, row in avg_sentiment.iterrows()]

user_subreddit = posts_df[['user', 'subreddit']].drop_duplicates().set_index('user')['subreddit'].to_dict()
topic_nodes = avg_sentiment['topic'].unique()
user_nodes = avg_sentiment['user'].unique()

color_map = {'republican': 'red', 'democrats': 'blue', 'liberal': 'blue', 'conservative': 'red'}
node_color_map = {**{topic: 'yellow' for topic in topic_nodes}, **color_map}  # Assuming topics have a default color, e.g., 'yellow'
all_nodes = list(topic_nodes) + list(user_nodes)
node_colors = [node_color_map[user_subreddit.get(user, 'default')] if user in user_nodes else 'yellow' for user in all_nodes]
B = nx.Graph()
B.add_nodes_from(topic_nodes, bipartite=0)
B.add_nodes_from(user_nodes, bipartite=1)
B.add_weighted_edges_from(edges)

pos = nx.spring_layout(B)

nx.draw(B, pos, with_labels=False, node_color=node_colors, edge_color='black', node_size=50, font_size=3, edge_cmap=plt.cm.Blues, width=0.3, alpha=0.7)

# label only topics
topic_nodes = {node: node for node in topic_nodes}
nx.draw_networkx_labels(B, pos, labels=topic_nodes, font_size=10)
plt.show()


#### Network metrics

In [None]:
# centrality measures of the topics:

degree_centrality = nx.degree_centrality(B)
closeness_centrality = nx.closeness_centrality(B)
betweenness_centrality = nx.betweenness_centrality(B)

centrality_measures = pd.DataFrame({
    'topic': list(degree_centrality.keys()),
    'degree_centrality': list(degree_centrality.values()),
    'closeness_centrality': list(closeness_centrality.values()),
    'betweenness_centrality': list(betweenness_centrality.values())
})

centrality_measures.sort_values(by='betweenness_centrality', ascending=False).head(10)

#### Community detection and affiliation

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import community as community_louvain  # This library is often called python-louvain

# Assuming you have your posts_df DataFrame from the previous context
avg_sentiment = posts_df.groupby(['topic', 'user'])['score'].mean().reset_index()

edges = [(row['user'], row['topic'], row['score']) for index, row in avg_sentiment.iterrows()]

user_subreddit = posts_df[['user', 'subreddit']].drop_duplicates().set_index('user')['subreddit'].to_dict()
topic_nodes = avg_sentiment['topic'].unique()
user_nodes = avg_sentiment['user'].unique()

B = nx.Graph()
B.add_nodes_from(topic_nodes, bipartite=0)
B.add_nodes_from(user_nodes, bipartite=1)
B.add_weighted_edges_from(edges)

# Detecting communities
partition = community_louvain.best_partition(B)

# Printing users in each community
for community_id in set(partition.values()):
    print(f"Community {community_id}:")
    members = [node for node, community in partition.items() if community == community_id and node in user_nodes]
    print(members)

# now find affiliation of each community with trump or biden

community_affiliation = {}
for community_id in set(partition.values()):
    members = [node for node, community in partition.items() if community == community_id and node in user_nodes]
    community_affiliation[community_id] = posts_df[posts_df['user'].isin(members)]['subreddit'].value_counts().idxmax()

community_affiliation

In [None]:
# what are the differences between 3  'liberal' communities that got detected?

communities = [community_id for community_id, affiliation in community_affiliation.items() if affiliation=='republican']

for community_id in communities:
    members = [node for node, community in partition.items() if community == community_id and node in user_nodes]
    print(f"Community {community_id}:")
    print(posts_df[posts_df['user'].isin(members)]['topic'].value_counts().head(5))
    print("\n")