In [2]:
import pandas as pd
import numpy as np

In [25]:
# Dataset I created with ~60,000 randomly selected comments.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column's dtype is inferred from all of its values.
data = pd.read_csv('reddit_comments_random.csv', low_memory=False)

# Show (rows, columns) of the loaded frame as the cell output
data.shape

(58696, 22)

In [26]:
# Restrict the analysis to subreddits with at least 30 comments in the sample.

# Non-null value counts per subreddit, ordered by the `author` count (largest first)
top_subs_df = (
    data
    .groupby('subreddit')
    .count()
    .sort_values(by='author', ascending=False)
)
top_subs = top_subs_df.loc[top_subs_df['author'] >= 30].index

# Subreddit names, most-commented first
top_subs

Index(['AskReddit', 'politics', 'nfl', 'The_Donald', 'pokemon', 'news',
       'fantasyfootball', 'nba', 'pics', 'SquaredCircle',
       ...
       'BitcoinMarkets', 'Portland', 'Advice', 'NASCAR', 'pokemongo',
       'ImGoingToHellForThis', 'CrappyDesign', 'Eyebleach',
       'mildlyinfuriating', 'blogsnark'],
      dtype='object', name='subreddit', length=321)

In [27]:
# All we need is author and subreddit, limited to the popular subreddits.
# Rows and columns are selected in a single .loc call and the new column is
# added with .assign(), which returns a new frame. This avoids assigning into
# the result of chained [] slicing, which raises SettingWithCopyWarning.
data = (
    data.loc[data.subreddit.isin(top_subs), ['author', 'subreddit']]
    # Constant "commented" marker so it can serve as the pivot-table values
    .assign(count=1)
)

data.shape

(35628, 3)

In [28]:
# Build the author x subreddit table: one row per author, one (lower-cased)
# column per subreddit; author/subreddit pairs with no comment become 0.

data = (
    data
    .pivot_table(index='author', columns='subreddit', values='count')
    .rename(columns=str.lower)
    .fillna(0)
)

In [43]:
def correlated_subreddits(subreddit, threshold=.02, frame=None):
    '''
    Calculates the correlation (overlap) in the userbase of different subreddits.
    Sorts by absolute value to find dissimilar subs as well.

    Parameters
    ----------
    subreddit : str
        Column of the table to correlate every other column against. An exact
        column match is used when present; otherwise the first column whose
        name matches case-insensitively is used (so 'AskReddit' finds
        'askreddit').
    threshold : float, default .02
        Only rows whose absolute correlation is strictly greater than this
        value are returned.
    frame : pandas.DataFrame, optional
        Table to analyse. Defaults to the notebook-level `data` frame.

    Returns
    -------
    pandas.DataFrame
        Single column named 'correlation', indexed by the table's column
        labels, ordered by descending absolute correlation.

    Raises
    ------
    KeyError
        If no column matches `subreddit`, even ignoring case.
    '''
    if frame is None:
        # Fall back to the notebook's global table (original behaviour)
        frame = data

    if subreddit not in frame.columns:
        wanted = str(subreddit).lower()
        matches = [col for col in frame.columns if str(col).lower() == wanted]
        if not matches:
            raise KeyError(subreddit)
        subreddit = matches[0]

    # Name the column explicitly instead of relying on the implicit label 0
    corrs = pd.DataFrame({'correlation': frame.corrwith(frame[subreddit])})

    # Temporary helper column so strong negative correlations rank alongside
    # strong positive ones
    corrs['absol'] = corrs['correlation'].abs()
    corrs = corrs.sort_values('absol', ascending=False).drop('absol', axis=1)

    # NaN correlations compare False here, so they are dropped as well
    return corrs[corrs['correlation'].abs() > threshold]

In [38]:
# List the ten most-commented subreddits, one name per line
for sub_name in top_subs[:10]:
    print(sub_name)

AskReddit
politics
nfl
The_Donald
pokemon
news
fantasyfootball
nba
pics
SquaredCircle


In [39]:
# Subreddits whose |correlation| with askreddit exceeds the default threshold (0.02);
# the name is lower-case because the pivot table's columns were lower-cased
correlated_subreddits('askreddit')

Unnamed: 0_level_0,correlation
subreddit,Unnamed: 1_level_1
askreddit,1.0
politics,-0.052858
nfl,-0.051712
pokemon,-0.043229
fantasyfootball,-0.042865
the_donald,-0.042322
news,-0.039676
nba,-0.037932
leagueoflegends,-0.033001
gaming,-0.031625


People in AskReddit are less likely to be gamers or politically active. They also tend to like sports less.

In [40]:
# Subreddits whose |correlation| with politics exceeds the default threshold (0.02)
correlated_subreddits('politics')

Unnamed: 0_level_0,correlation
subreddit,Unnamed: 1_level_1
politics,1.0
askreddit,-0.052858
nfl,-0.032401
pokemon,-0.029808
nba,-0.024717
fantasyfootball,-0.023669
the_donald,-0.023257
the_mueller,0.022409
squaredcircle,-0.021383
leagueoflegends,-0.020215


Again, we see that people who comment in r/politics tend to like football and video games less. They also tend to lean liberal, preferring Robert Mueller (the_mueller is positively correlated) over Donald Trump (the_donald is negatively correlated).

In [41]:
# Subreddits whose |correlation| with nfl exceeds the default threshold (0.02)
correlated_subreddits('nfl')

Unnamed: 0_level_0,correlation
subreddit,Unnamed: 1_level_1
nfl,1.0
askreddit,-0.051712
politics,-0.032401
denverbroncos,0.030685
patriots,0.02974
pokemon,-0.025803
the_donald,-0.025633
pics,-0.020857
news,-0.020417


People who like football tend to stick to other sports subs. They're fairly apolitical but don't really care for Trump either. So what are the characteristics of The_Donald (t_d) posters?

In [45]:
# Threshold lowered from the default 0.02 to 0.015 so weaker correlations are listed too
correlated_subreddits('the_donald', threshold=.015)

Unnamed: 0_level_0,correlation
subreddit,Unnamed: 1_level_1
the_donald,1.0
askreddit,-0.042322
nfl,-0.025633
politics,-0.023257
pokemon,-0.02136
news,-0.01931
nba,-0.018492
fantasyfootball,-0.0179
photoshopbattles,0.017627


They don't like sports or the other news/political subreddits. The only thing they *do* like is photoshop battles. (Keep in mind these overlaps are tiny.)