# Measure whether language norms were expressed implicitly in community guidelines


- Using a labelled dataset of comments + posts (excluding any banned (?), stickied (?), or moderator posts), train a text classifier to predict the subreddit a post or comment came from.

- Measure of implicit language style of interface: accuracy of trained classifier on interface text of subreddit

- Classification can be *(subreddit vs. rest of dataset)* , *(subreddit vs. rest of interfaces)* or *multilabel classification* with all subreddits. -- Start with *(subreddit vs. rest of dataset)*

- Can also explore two types of interface text — (a) the public description, including guidelines and rules, and (b) the moderator and stickied posts — as well as the two of them together.

- Use SoPa as the classifier and manually inspect the learned patterns.


In [1]:
%load_ext dotenv
%dotenv

import numpy as np
import csv as csv
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import math
import json
from scipy import stats
from datetime import datetime
import sklearn

from nltk import pos_tag
from nltk.util import pad_sequence
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from termcolor import colored

import praw
import requests
import json
import os

# Reddit API credentials are read from environment variables so that no secret
# is stored in the notebook (presumably populated from a local .env file by the
# %dotenv magic at the top of this cell — confirm).
client_id = os.environ.get("client_id")
client_secret = os.environ.get("client_secret")
user_agent = os.environ.get("user_agent")

reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=user_agent)

### Test subreddits
### Test some subreddits randomly taken from the identity paper
tested_subs = ['science', 'politics', 'Economics', 'depression', 'Cooking', 'pics', 'Naruto', 'BabyBumps']

# One record per subreddit: its name, both description fields and its id.
rows = []
for s in tested_subs:
    sub = reddit.subreddit(s)
    # Creation time of the subreddit; fromtimestamp() without a tz argument
    # renders the value in the local time zone of the machine.
    print(datetime.fromtimestamp(sub.created_utc))
    # The 't5_' prefix makes the id comparable with the `subreddit_id` column
    # of the comment/post dumps (e.g. 't5_2qh0u').
    rows.append({'subreddit': s , 'descr':sub.description, 'public_descr':sub.public_description, 'id':'t5_'+sub.id})

df_sub = pd.DataFrame(rows)

# Full interface text = description followed by public description.
# A newline separator keeps the last word of one field from being fused with
# the first word of the other when the text is tokenized later.
df_sub['full_descr'] = df_sub['descr'] + '\n' + df_sub['public_descr']

# make a dict for labels: subreddit id -> integer class label, numbered in the
# order of `tested_subs`
tested_sub_ids = list(df_sub.id)
subreddit_label_dict = {s:i for i, s in enumerate(tested_sub_ids)}

2006-10-18 06:54:26
2007-08-05 22:16:39
2008-01-24 20:27:02
2008-12-31 17:17:38
2008-01-25 09:45:21
2008-01-24 16:31:09
2009-03-12 19:55:58
2010-11-27 10:56:18


In [11]:
# Look at the raw description markdown of one example subreddit.
roastme_sub = reddit.subreddit('RoastMe')
roastme_sub.description

'***\n\n[**RULES OF THE ROAST:**](https://www.reddit.com/r/RoastMe/about/rules/)\n\n**Roastees:**\n\n * You must provide at least one high-quality picture of the roastee holding a handwritten sign with the text /r/RoastMe - Phones and Snapchat text overlay is **NOT** allowed!\n\n * Photoshopped/edited posts are **NOT** allowed. This **includes** Snapchat filters such as the digitally-added flowers or dog ears. Posting multiple edited photos will result in a ban.\n\n * Your post will not appear until after it has been approved by a moderator. Please send [this message](https://www.reddit.com/message/compose?to=/r/roastme&subject=Post Not Approved&message=Hello, I don\'t believe my post has been approved yet. Could you please approve my post? Add link to post here: (Make sure you link to the reddit post, not the imgur!\\)) if it takes longer than 1 hour.\n\n * The minimum posting age is **16** years old, your post will be rejected if you look younger.\n\n * By submitting a Roast, you agr

In [18]:
# Preview the first 100 rows of the first comment shard.
# NOTE: `df_comments_1` is created by the "Get comments/posts" cell further
# down in this notebook, so on a fresh kernel that cell has to be run first.
df_comments_1.head(100)

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
0,The CM Storm is $87 at Vuugo. \n\nYou can also...,,,,AndCockGoesTheGun,,,1453075080,t5_2tesr,t3_41g1z5,t1_cz22mhp,3,1454558115,0,0,cz236bo,bapcsalescanada,3,,
1,"Did you use ICF, SL, or any of the similar pro...",,,,SBushwi1,,,1453075082,t5_2s9bg,t3_41fj1c,t3_41fj1c,1,1454558115,0,0,cz236cy,gainit,1,,
2,you can easily 2-star if you quad-quake somewh...,,,,JRMHCNSK,Maxed TH9,,1453075087,t5_396f1,t3_41dsv8,t3_41dsv8,1,1454558116,0,0,cz236gw,HWYA,1,,TH9
3,Too bad he won't be able to press any of the b...,,,,jello_fever,,,1453075089,t5_2qh03,t3_41fojp,t1_cz20foy,623,1454558117,0,0,cz236io,gaming,623,,
4,I could not help but notice the same thing.,,,,psillyness,,,1453075090,t5_2qh1i,t3_41dkav,t1_cz21zc3,7,1454558117,0,0,cz236j7,AskReddit,7,,
5,peace be with you,,,,Froswald,,,1453075091,t5_33zyg,t3_41ejg6,t1_cz22ev6,8,1454558117,0,0,cz236jt,joinsquad,8,,
6,[deleted],,,,[deleted],,,1453075093,t5_2s30g,t3_41fj00,t3_41fj00,2,1454558118,0,0,cz236lr,AskMen,2,,
7,yes i do. if i love animals enough to have a s...,,,,bonniebubblegum,,,1453075093,t5_2te20,t3_3z6iv6,t1_cz1n44e,1,1454558118,0,0,cz236lw,lobster,1,,
8,"The IA is a cool car, actually good incentives...",,,,tjallday,,,1453075094,t5_2vhkv,t3_41g53h,t3_41g53h,1,1454558118,0,0,cz236mh,askcarsales,1,,
9,If youre a hoe and you know it The Clap,,,,jarphynator,,,1453075096,t5_33x33,t3_41fh35,t3_41fh35,9,1454558119,0,0,cz236ns,BlackPeopleTwitter,9,,


In [48]:
### Get comments/posts
# Loads three shards each of the January 2016 comment and post dumps, keeps only
# rows from the tested subreddits, and drops deleted/removed content,
# AutoModerator comments and moderator-distinguished content.

df_comments_1 = pd.read_csv('data/reddit_comments_2016_01/comments_reddit_comments_2016_01_000000000000.csv')
df_comments_2 = pd.read_csv('data/reddit_comments_2016_01/comments_reddit_comments_2016_01_000000000001.csv')
df_comments_3 = pd.read_csv('data/reddit_comments_2016_01/comments_reddit_comments_2016_01_000000000002.csv')

# group them all together
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
# Like append, it keeps each shard's own row index, so index values repeat
# across shards.
print(len(df_comments_1), len(df_comments_2), len(df_comments_3))
df_comments = pd.concat([df_comments_1, df_comments_2, df_comments_3])
print(len(df_comments))

df_posts_1 = pd.read_csv('data/reddit_posts_2016_01/posts_reddit_posts_2016_01_000000000000.csv')
df_posts_2 = pd.read_csv('data/reddit_posts_2016_01/posts_reddit_posts_2016_01_000000000001.csv')
df_posts_3 = pd.read_csv('data/reddit_posts_2016_01/posts_reddit_posts_2016_01_000000000002.csv')

# group them all together
print(len(df_posts_1), len(df_posts_2), len(df_posts_3))
df_posts = pd.concat([df_posts_1, df_posts_2, df_posts_3])
print(len(df_posts))


### Get only the ones in the current tested subreddits
# Vectorised membership test; selects the same rows as a row-wise apply but
# without calling a Python lambda once per row.
df_comments = df_comments[df_comments['subreddit_id'].isin(tested_sub_ids)]
df_posts = df_posts[df_posts['subreddit_id'].isin(tested_sub_ids)]
print('comments:', len(df_comments), ' and ', len(df_posts), ' posts in', tested_subs)


### Remove all deleted and moderator posts
# Posts are filtered on their selftext, comments on their author. Rows with a
# missing value in the tested column are kept (NaN is not in either list).
df_posts_no_removed = df_posts[~df_posts['selftext'].isin(['[deleted]', '[removed]'])]
df_comments_no_removed = df_comments[~df_comments['author'].isin(['[deleted]', 'AutoModerator'])]

# `!=` is True for missing values, so content that is not distinguished at all
# is kept and only rows marked 'moderator' are dropped.
df_posts_no_removed_mod = df_posts_no_removed[df_posts_no_removed['distinguished'] != 'moderator']
df_comments_no_removed_mod = df_comments_no_removed[df_comments_no_removed['distinguished'] != 'moderator']

print('cleaned comments:', len(df_comments_no_removed_mod), ' and ', len(df_posts_no_removed_mod), ' cleaned posts in', tested_subs)



428076 431016 431583
1290675


  interactivity=interactivity, compiler=compiler, result=result)


112632 112984 112810
338426
comments: 33874  and  4098  posts in ['science', 'politics', 'Economics', 'depression', 'Cooking', 'pics', 'Naruto', 'BabyBumps']
cleaned comments: 30552  and  3062  cleaned posts in ['science', 'politics', 'Economics', 'depression', 'Cooking', 'pics', 'Naruto', 'BabyBumps']


In [51]:
### Make a smaller df of comments and posts connected to a subreddit
# df_comments_no_removed_mod[['body', 'subreddit_id', 'subreddit']]
# df_posts_no_removed_mod[['title', 'selftext', 'subreddit_id', 'subreddit']]

In [53]:
# Keep only the text and subreddit columns needed for classification.
# The explicit .copy() gives an independent frame, so adding the `label` column
# below neither triggers SettingWithCopyWarning nor risks writing to a
# temporary slice of `df_comments_no_removed_mod`.
df_comments_no_removed_mod_labelled = df_comments_no_removed_mod[['body', 'subreddit', 'subreddit_id']].copy()

# Integer class label looked up from the subreddit id. An id missing from
# `subreddit_label_dict` maps to NaN and makes astype(int) raise rather than
# silently producing a wrong label.
df_comments_no_removed_mod_labelled['label'] = df_comments_no_removed_mod_labelled['subreddit_id'].map(subreddit_label_dict).astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,body,subreddit,subreddit_id,label
34,&gt;Wait until you take a look at the hordes o...,politics,t5_2cneq,1
135,Cut it in half again and you have potato wedge...,pics,t5_2qh0u,5
142,In the medical field that's like THE worst thi...,pics,t5_2qh0u,5
197,I'm Canadian... Nice pics of a warm summer rac...,pics,t5_2qh0u,5
208,Lady in back must not agree.,pics,t5_2qh0u,5
272,"I like Ron and Rand Paul, now I support Bernie...",politics,t5_2cneq,1
300,You like Herr’s Buffalo Blue Cheese Flavored C...,pics,t5_2qh0u,5
439,&gt; more practical and safe options when it c...,politics,t5_2cneq,1
519,Thank you for this information it really helps...,BabyBumps,t5_2s7cl,7
578,OK cool but that all seems basically pointless...,science,t5_mouw,0


In [56]:
### split and save
# Imported explicitly here: a bare `import sklearn` does not guarantee that the
# `model_selection` submodule has been loaded, so `sklearn.model_selection`
# can raise AttributeError on a fresh kernel.
from sklearn.model_selection import train_test_split

# 80/20 train/dev split of the labelled comments; fixed seed for reproducibility.
comment_train, comment_dev = train_test_split(df_comments_no_removed_mod_labelled, test_size=.2, random_state=2239)

# header=False is passed explicitly: newer pandas versions write a header row
# for Series.to_csv by default, which would add a spurious "body" / "label"
# first line to the data and label files.
# NOTE(review): to_csv still quotes bodies containing commas, quotes or
# newlines, so one comment may span several physical lines — confirm that the
# downstream reader of these files handles that.
comment_train.body.to_csv('data/classifier_data_sopa/train.data', index=False, header=False)
comment_train.label.to_csv('data/classifier_data_sopa/train.label', index=False, header=False)

comment_dev.body.to_csv('data/classifier_data_sopa/dev.data', index=False, header=False)
comment_dev.label.to_csv('data/classifier_data_sopa/dev.label', index=False, header=False)

In [55]:
# Per-subreddit comment counts, first for the training split and then for the
# dev split, to check how the classes are distributed in each.
for split_frame in (comment_train, comment_dev):
    class_counts = split_frame['subreddit'].value_counts()
    print(class_counts)

politics      11864
pics           9283
BabyBumps       973
science         866
depression      453
Cooking         404
Economics       323
Naruto          275
Name: subreddit, dtype: int64
politics      2925
pics          2328
BabyBumps      241
science        235
Cooking        124
depression     118
Economics       89
Naruto          51
Name: subreddit, dtype: int64


In [4]:
# List the columns available in the post and comment frames.
for tag, frame in (('posts:', df_posts), ('comments:', df_comments)):
    print(tag, frame.columns)


posts: Index(['created_utc', 'subreddit', 'author', 'domain', 'url', 'num_comments',
       'score', 'ups', 'downs', 'title', 'selftext', 'saved', 'id',
       'from_kind', 'gilded', 'from', 'stickied', 'retrieved_on', 'over_18',
       'thumbnail', 'subreddit_id', 'hide_score', 'link_flair_css_class',
       'author_flair_css_class', 'archived', 'is_self', 'from_id', 'permalink',
       'name', 'author_flair_text', 'quarantine', 'link_flair_text',
       'distinguished'],
      dtype='object')
comments: Index(['body', 'score_hidden', 'archived', 'name', 'author',
       'author_flair_text', 'downs', 'created_utc', 'subreddit_id', 'link_id',
       'parent_id', 'score', 'retrieved_on', 'controversiality', 'gilded',
       'id', 'subreddit', 'ups', 'distinguished', 'author_flair_css_class'],
      dtype='object')
