In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

# Data Initialization and Pre-processing for caching purposes
First Let's load the data and word embeddings. Note the original paper only used data from 2017 and top 10k subreddits. \
For sake of time we shall use data from From a downsampled data set from 2019 - 2021 which has top subreddits \
Additionally since some columns are mostly blank (e.g: self text in posts), we will only be using the columns that are useful  \
Pre-process the GS-scores and store them in a csv for later use, similarly pre-process sentiments for later user \
We do this so that we can load posts and comments without loading in text which is very memory intensive \
For the GS-scores, to be accurate we utilize as much data as we can and it can be very costly to recalculate them \
Some other files such as scores, embedding meta data, and embedding vectors are from the CSSLab github (see README) \
Note: most files are >50Mb and can't be included in the repo, I'll include a seperate google drive link for most of them




### Loading ALL posts and comments with text data

In [3]:
#Import the data and filter accordingly
import pandas as pd
import numpy as np
import random
# #read posts data
post_fields = ['id','author', 'created_utc','score','title','subreddit']
posts = pd.read_csv('text_submissions.csv',skipinitialspace=True, usecols=post_fields)
# read comment data
comment_fields = ['id','author','subreddit','link_id','body']
comments = pd.read_csv('text_comments.csv',skipinitialspace=True, usecols=comment_fields)

  posts = pd.read_csv('text_submissions.csv',skipinitialspace=True, usecols=post_fields)
  comments = pd.read_csv('text_comments.csv',skipinitialspace=True, usecols=comment_fields)


In [4]:
#filter posts with no authors
posts = posts[(posts['author'] != '[deleted]')]
#parse time data
posts['created_utc'] = pd.to_numeric(posts['created_utc'], errors='coerce')
posts['time'] = pd.to_datetime(posts['created_utc'],utc=True,unit='s')
#drop rows with na
posts = posts.dropna()
(posts['time'].dt.year).value_counts()

# comments['created_utc'] = pd.to_numeric(comments['created_utc'], errors='coerce')
# comments['time'] = pd.to_datetime(comments['created_utc'],utc=True,unit='s')
comments = comments[comments['author'] != '[deleted]']
comments = comments.dropna()

In [17]:
comments = comments[comments['body'] != '[removed]']
comments = comments[comments['body'] != '[deleted]']

In [18]:
posts = posts[posts['title'] != '[removed]']
posts = posts[posts['title'] != '[deleted]']

## Initial GS-Score calculations and save them to a csv

Import the preexisting word2vec embeddings from social dimensions github \
Calculate GS-Scores as outlined in the paper and save them to a csv \
It is very time demanding to work with all the comment / post data, we rather just use all valid comments once to generate the gs-scores per user and save them for later use

In [19]:
import pandas as pd
import numpy as np
meta = pd.read_csv('embedding-metadata.tsv', sep='\t', header=None)
meta.columns = meta.iloc[0]
meta = meta.reindex(meta.index.drop(0))
meta.set_index(meta.columns[0], inplace=True)
#note all vectors are normalized
vectors = pd.read_csv('embedding-vectors.tsv', sep='\t', header=None)
vectors.set_index(meta.index, inplace=True)
vectors = vectors.divide(np.linalg.norm(vectors.values, axis=1), axis='rows')

Filter subreddits to those which we can use with the embeddings

In [20]:
posts = posts[posts['subreddit'].isin(meta.index.to_list())]
comments = comments[comments['subreddit'].isin(meta.index.to_list())]

In [21]:
comments.shape[0]

30811298

In [22]:
print(comments.shape[0])
print(posts.shape[0])

30811298
2033336


In [None]:
#code is inspired by implementation by: https://github.com/ptuls/movielens-diversity-metric
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# center according to the paper is average of community vectors
def compute_center(vectors, subreddits):
    center = np.zeros(len(vectors.columns))
    weight = 0
    for subreddit in subreddits:
        try:
            subreddit_vec = vectors.loc[subreddit]
            center += subreddit_vec
            weight += 1
        except KeyError:
            print('Subreddit '+subreddit+' not in embedding')
            continue
    return center / weight

# the score computation is sum of cosine similarities divided by number of unique communities contributed to
def compute_score(vectors,subreddits,center):
    score = 0
    weight = 0
    for subreddit in subreddits:
        try:
            subreddit_vec = vectors.loc[subreddit]
            score +=  cosine_similarity(subreddit_vec, center)
            weight += 1
        except KeyError:
            print('Subreddit '+subreddit+' not in embedding')
            continue
    return score / weight

def generalist_specialist_score(vectors,subreddits):
    #reduce our overhead
    if(len(subreddits)<=1 or len(set(subreddits))==1):
        return 1.0
    center = compute_center(vectors,subreddits)
    score = compute_score(vectors,subreddits,center)
    return score

Use community embeddings to generate scores of users 

In [134]:
usergroupings = comments.groupby('author')['subreddit'].apply(list).reset_index(name='subreddits_used')
usergroupings['gs_scores'] = usergroupings['subreddits_used'].apply(lambda x: generalist_specialist_score(vectors, x))
usergroupings['number_of_engagements']=usergroupings['subreddits_used'].str.len()
usergroupings[['author','gs_scores','number_of_engagements']].to_csv('gs_scores_of_users.csv')

GS-scores using post statistics only 

In [None]:
usergroupings = posts.groupby('author')['subreddit'].apply(list).reset_index(name='subreddits_used')
usergroupings['gs_scores'] = usergroupings['subreddits_used'].apply(lambda x: generalist_specialist_score(vectors, x))
usergroupings['number_of_engagements']=usergroupings['subreddits_used'].str.len()
usergroupings[['author','gs_scores','number_of_engagements']].to_csv('gs_scores_of_creators.csv')

Utilize NLTK to conduct sentiment analysis on comments / submissions \
This is very time consuming do this when you have more time

In [23]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Wasimroks\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [24]:
#Test on small dataset for now
post_title_sentiment = posts["title"].apply(lambda x: pd.Series(sia.polarity_scores(x)))
post_title_sentiment['id']=posts['id']
post_title_sentiment.to_csv('post_title_sentiments.csv')

  post_title_sentiment = posts["title"].apply(lambda x: pd.Series(sia.polarity_scores(x)))


Do an analysis on the comment sentiments (However only the comments that are linked to a post) \
This will allow us to visualize average user reception per post

In [25]:

comments_on_posts = comments[comments['link_id'].isin(posts['id'].to_list())]

In [26]:
#Test on small dataset for now
post_comment_sentiment = comments_on_posts['body'].apply(lambda x: pd.Series(sia.polarity_scores(x)))
post_comment_sentiment[['id','author','link_id','subreddit']]=comments_on_posts[['id','author','link_id','subreddit']]
post_title_sentiment.to_csv('post_comment_sentiments.csv')

  post_comment_sentiment = comments_on_posts['body'].apply(lambda x: pd.Series(sia.polarity_scores(x)))


## Load Libraries and minimum required data after preprocessing
After preprocessing all the text and GS-scores, we can use only a few relevant fields to save us time

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
#load posts

#load relevant comments

# GS-Score and Sentiment relationship
Investigate and study relationship between sentiment and GS-scores \
Utilize Vader to study basic emotions such as positive, negative, neutral \
Visualize as line graph of GS-score vs each sentiment \
Breakdown Average community sentiment along with communities as a Grid \
Hypothesis to test: Can GS-scores of communities contribute to their emotions and strenght? \
                    Are specialists more likely to be enthusiastic, since they are more picky?

# GS-Score and submission statistics
Compare submissions and comments of each user \
Utilize the ratio of submissions vs comments in a time frame to see if the user is a actively creating \
Categorize users as active creators \
Investigate the communities the user likes to submit in, is it similar to the ones they comment in (Ones used to make gs scores) \
Train a model with and without GS-scores to see if active contributors can be identified \
Compare Elite posters (top 5% posts) with community GS-scores, the original paper suggested elite commenters are generalists, is it also true for posters? \
Hypothesis to test: Can GS-Score be a good indicator of active contributer (likes to create submissions)?

User GS-Score's relation Post Submission Rate and Avg Rating
User GS-Score and Sentiment of their comments
Community GS-Score's and Top Post submitters [Elite Posters]

## GS-scores of Elite Posters 

## Use social dimensions to categorize community wide specialization


Create social dimenstions of each community \
Get community GS scores of posters \
Visualize embeddings via social dimensions, and cluster using kmeans \
find optimum number of clusters using elbow-method \
Are certain social clusters of a higher GS-score?