In [1]:
import os
import json
import pandas as pd
import numpy as np

In [2]:
# Location of the 2018 dump; the loop below parses it as one JSON record per line.
datafile = os.path.join('..', 'dat', '2018')

# Placeholder body that marks a reply as unusable.
DELETED_BODY = '[deleted]'

# Records that have at least one usable reply; each keeps its original fields,
# with the 'reply' list swapped for a single 'reply_text' string.
record_dicts = []

# The encoding is pinned so decoding does not depend on the platform default
# (assumes the dump is UTF-8, the standard JSON interchange encoding).
with open(datafile, 'r', encoding='utf-8') as f:
    # Iterate the handle directly so the file is streamed line by line rather
    # than loaded into memory in one go by readlines().
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing in json.loads.
            continue
        record = json.loads(line)
        replies_by_time = sorted(record['reply'], key=lambda x: x['created_utc'])
        # Earliest reply whose body is non-empty and not the deleted placeholder.
        earliest_reply_text = next(
            (reply['body'] for reply in replies_by_time
             if reply['body'] and reply['body'] != DELETED_BODY),
            None,
        )
        if earliest_reply_text:
            record.pop('reply')
            record['reply_text'] = earliest_reply_text
            record_dicts.append(record)

In [5]:
# Build the working frame from the parsed records, discard rows whose own body
# is the '[deleted]' placeholder, then cast the numeric columns to int64.
int_columns = ['score', 'controversiality', 'gilded', 'created_utc']
all_records = pd.DataFrame(record_dicts)
has_body = all_records['body'] != '[deleted]'
reddit_df = all_records.loc[has_body].astype({col: np.int64 for col in int_columns})

In [17]:
# Row subsets: comments with a positive 'gilded' count, and comments with a
# positive 'controversiality' value.
is_gilded = reddit_df['gilded'] > 0
is_controversial = reddit_df['controversiality'] > 0
gold = reddit_df.loc[is_gilded]
contro = reddit_df.loc[is_controversial]

Unnamed: 0,author,author_flair_css_class,author_flair_text,body,controversiality,created_utc,gender,gilded,id,link_id,parent_id,reply_text,score,subreddit
0,be-concerned,sun,15F,if this isn’t accurate then idk what is,0,1542407736,female,0,e9ufsur,t3_9xpaoc,e9ufsur,"Yeah, this is definitely me. Then again, I cou...",60,infj
1,be-concerned,sun,15F,"i’m in highschool right now. two years ago, my...",0,1545673571,female,0,ecgumma,t3_a940fq,ecgumma,Identify a common interest you have and maybe ...,5,infj
2,INFJen,clover,27F,[It’s me.](https://imgur.com/a/WDavpAM),0,1525914365,female,0,dyqcclj,t3_8hz688,dyqcclj,Aphrodite? Is that you?,3,infj
3,INFJen,clover,27F,u/User_Simulator u/INFJen,0,1528055310,female,0,e01ztf1,t3_8o8shk,e01ztf1,+/u/User_Simulator /u/INFJen,2,infj
4,INFJen,clover,27F,"It's okay, you too are unique. I'm unique, you...",0,1522538912,female,0,dwlp1jv,t3_88m25u,dwlp1jv,[Are we?](https://www.wikihow.com/Stop-Taking-...,8,infj
5,INFJen,clover,27F,ENTPS are\n\nthe most perfect pair for the\n\n...,0,1522538809,female,0,dwloygj,t3_88k3ov,dwloygj,Ninja Haiku,7,infj
6,INFJen,clover,27F,I highly disagree with the idea that an INFJ i...,0,1534551506,female,0,e4duyx9,t3_985e66,e4duyx9,I am hesitating between continuing in academia...,31,infj
7,INFJen,clover,27F,Hi everyone! We’re glad there is going to be a...,0,1540761531,female,0,e8miv30,t3_9s6r87,e8miv30,10000% yes this! \n\nThank you Jen!,1,infj
8,INFJen,clover,27F,**I'm going to add some not so scary suggestio...,0,1538519950,female,0,e722a0q,t3_9kvbx6,e722a0q,"Fuck Yea, Stephen King!!! The Tommyknockers, a...",7,infj
9,INFJen,clover,27F,For those participating who didn’t get an emai...,0,1540733998,female,0,e8lnz44,t3_9s2om3,e8lnz44,"Sorry if i jumped the gun a bit, i'm just a bi...",1,infj


In [16]:
# Encode each subreddit name as an integer category code and store it on the
# frame as 'sub_id'.
subreddit_categories = pd.Categorical(reddit_df['subreddit'])
reddit_df['sub_id'] = subreddit_categories.codes

# Modelling view: comment text, its score (as int64) and the subreddit code.
model_columns = ['body', 'score', 'sub_id']
data = reddit_df[model_columns].astype({'score': np.int64})

# Raw arrays handed to the vectoriser (text) and the regressor (labels).
text, labels = data['body'].values, data['score'].values

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
# NLTK stopword file ids are lowercase; the capitalised "English" only resolves
# on case-insensitive filesystems and fails elsewhere.
# Note: stoplist is not passed to the vectoriser below, which uses
# scikit-learn's own built-in 'english' stop-word list.
stoplist = stopwords.words("english")

# Unigram + bigram counts. With float thresholds, max_df/min_df are document
# proportions: terms in more than 90% or fewer than 2% of documents are dropped.
vec = CountVectorizer(max_df=0.9, min_df=0.02, stop_words='english', ngram_range=(1,2))
features = vec.fit_transform(text)
features.shape

(422206, 195)

In [33]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error as mse

# Shuffle the row indices (global NumPy RNG, unseeded) and use all of them as
# the training subsample.
n_docs = features.shape[0]
docs = np.arange(n_docs)
np.random.shuffle(docs)
subsample = docs
tr_features, tr_labels = features[subsample, :], labels[subsample]
print(tr_features.shape)

# Fit a linear regression and score it on the same rows it was fitted to,
# so the printed value is a training-set MSE.
model = LinearRegression()
train_pred = model.fit(tr_features, tr_labels).predict(tr_features)
print(mse(tr_labels, train_pred))

# Eyeball true scores against predictions.
for values in (tr_labels, train_pred):
    print(values)

(422206, 3504)
27031.81146485864
[ 3 12 10 ...  2 10  5]
[ 7.03625916 15.30755675 35.62633923 ... 19.38465038 37.32604458
 24.48792292]


In [40]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error as mse

# Shuffle the row indices (global NumPy RNG, unseeded) and use all of them as
# the training subsample.
docs = np.arange(features.shape[0])
np.random.shuffle(docs)
subsample = docs

# Collapse every row to a single feature: the sum of its term counts.
# Summing a scipy sparse matrix along axis=1 returns an np.matrix, which recent
# scikit-learn releases refuse as estimator input, so convert to a plain
# ndarray. The reshape keeps the (n_samples, 1) layout even if the sum comes
# back one-dimensional (as it does for sparse *arrays*).
row_totals = features[subsample, :].sum(axis=1)
tr_features = np.asarray(row_totals).reshape(-1, 1)
tr_labels = labels[subsample]
print(tr_features.shape)

# Fit a one-feature linear regression and report the training-set MSE.
model = LinearRegression()
model.fit(tr_features, tr_labels)
train_pred = model.predict(tr_features)
print(mse(tr_labels, train_pred))

(422206, 1)
27468.657373347


In [38]:
# Root-mean-squared error of the most recent predictions against their labels.
train_mse = mse(tr_labels, train_pred)
np.sqrt(train_mse)

165.73671100075265

In [9]:
# Flair-text patterns: a digit followed by the gender letter (with or without
# a space), or a slash followed by it.
# Raw strings keep `\d` and `\/` as regex escapes; in a normal string literal
# they are invalid escape sequences and trigger a SyntaxWarning on Python 3.12+.
MALE_FLAIR = r'\d M|\dM|\/M'
FEMALE_FLAIR = r'\d F|\dF|\/F'

reddit_df['op_gender_visible'] = True
reddit_df['op_gender'] = ''

# First pass: the flair CSS class names the gender directly.
reddit_df.loc[reddit_df.author_flair_css_class.isin(['female']), 'op_gender'] = 'F'
reddit_df.loc[reddit_df.author_flair_css_class.isin(['male']), 'op_gender'] = 'M'

# Second pass: match the flair text. na=False makes missing flair count as
# "no match", so each negated mask is simply the inverse of its positive mask.
male = reddit_df.author_flair_text.str.contains(MALE_FLAIR, regex=True, na=False)
female = reddit_df.author_flair_text.str.contains(FEMALE_FLAIR, regex=True, na=False)
notmale = ~male
notfemale = ~female
notnull = reddit_df.author_flair_text.notnull()

# Only label rows whose flair matches exactly one gender; flair matching both
# patterns is left as-is.
reddit_df.loc[notnull & female & notmale, 'op_gender'] = 'F'
reddit_df.loc[notnull & male & notfemale, 'op_gender'] = 'M'

# Keep only the rows for which a gender was assigned.
reddit_df = reddit_df[reddit_df.op_gender != '']

In [43]:
# Display the column labels currently present on reddit_df.
reddit_df.columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'body',
       'controversiality', 'created_utc', 'gilded', 'id', 'link_id',
       'parent_id', 'reply', 'score', 'subreddit', 'op_gender_visible',
       'op_gender'],
      dtype='object')