# Building a classifier to see how easy it is to identify r/science based on posts and comments

### Will use posts/comments seperately
### One vs. all classification with r/science

In [1]:
import re
import csv
import json
import string
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.metrics import f1_score

import spacy
import numpy as np
import pandas as pd
from scipy import stats


import os 
os.chdir('/homes/gws/taugust/Projects/ARK/community_guidelines/')

all_posts_dir = 'data/cleaned/full_real_subs_cleaned_posts_2018_short.csv'
all_comments_dir = 'data/cleaned/full_real_subs_cleaned_comments_2018_short.csv'

In [2]:
# define a quick and simple tokenizer
# (FWIW: I'm pretty sure I created this for something else, it's not perfect but ...
# ... the point is to remove punctuation somewhat sensibly, lower case, and split)

punct_chars = list(set(string.punctuation) - set("'"))
punct_chars.sort()
punctuation = ''.join(punct_chars)
replace = re.compile('[%s]' % re.escape(punctuation))

def text_to_tokens(text, lower=True, ngram=None):
    # replace underscores with spaces
    text = re.sub(r'_', ' ', text)
    # break off single quotes at the ends of words (e.g. 'test' -> test)
    text = re.sub(r'\s\'', ' ', text)
    text = re.sub(r'\'\s', ' ', text)
    # remove periods (e.g. U.S. -> US)
    text = re.sub(r'\.', '', text)
    # replace all other punctuation (except single quotes) with spaces (e.g. T-rex -> t rex)
    text = replace.sub(' ', text)
    # remove single quotes (e.g. don't -> dont)
    text = re.sub(r'\'', '', text)
    # replace all whitespace with a single space
    text = re.sub(r'\s', ' ', text)
    # strip off spaces on either end
    text = text.strip()    
    if lower:
        text = text.lower()
    split_text = text.split()
    if ngram is None:
        return split_text
    else:
        return [tuple(split_text[i:i+ngram]) for i in range(len(split_text)-ngram+1)]
    
# convert list for bigrams to tuple
def convert_to_tuple(line, cols):
    for col in cols:
        line[col] = [tuple(bigram) for bigram in line[col]]
    return line
        

In [4]:
# build pipeline for classifier
text_clf_pipeline = Pipeline([
     ('tfidf', TfidfVectorizer(ngram_range=(1, 2, ), stop_words=None, smooth_idf=False, max_features=1000)),
     ('clf', SGDClassifier()),
 ])

parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
              'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
              'tfidf__stop_words': ('english', None),
              'tfidf__smooth_idf': (True, False),
              'tfidf__norm': ('l1', 'l2', None),
              }


## Posts

In [39]:
# read in all data from json file that the frequency tests use
with open('data/cleaned/all_posts_2018.jsonlist') as f:
    posts = f.readlines()
posts = [json.loads(line) for line in posts] 



In [40]:
posts_df = pd.DataFrame.from_records(posts)

In [41]:
posts_df['full_text'] = posts_df['title'] + ' ' + posts_df['selftext'] 
print(len(posts_df))

115051


In [42]:
posts_df['is_science'] = posts_df['subreddit'] == 'science'
print(posts_df['is_science'].value_counts())

False    92894
True     22157
Name: is_science, dtype: int64


In [44]:
posts_df_sampled_not_science = posts_df[~posts_df['is_science']].sample(len(posts_df[posts_df['is_science']]), random_state=42)

In [45]:
posts_sampled_balanced = posts_df_sampled_not_science.append(posts_df[posts_df['is_science']])

In [47]:
# save a balanced subset of the data to train and test on
posts_sampled_balanced.to_csv('data/cleaned/posts_sampled_balanced.csv', index=False, quoting=csv.QUOTE_ALL, escapechar='\\')

In [48]:
posts_sampled_balanced= pd.read_csv('data/cleaned/posts_sampled_balanced.csv', quoting=csv.QUOTE_ALL, escapechar='\\')

In [49]:

# do a train/test split
X_train_posts, X_test_posts, y_train_posts, y_test_posts = train_test_split(posts_sampled_balanced['full_text'], posts_sampled_balanced['is_science'], test_size=0.33, random_state=42)

In [50]:
len(X_train_posts), len(X_test_posts), len(y_train_posts), len(y_test_posts)

(29690, 14624, 29690, 14624)

In [52]:
post_clf = text_clf_pipeline.fit(X_train_posts, y_train_posts)

In [53]:
pred_post = post_clf.predict(X_test_posts)

In [54]:
f1_score(y_test_posts, pred_post, average='micro')  

0.8354759299781181

In [25]:
np.mean(pred_post == y_test_posts)   

0.8364332603938731

In [3]:
best_post_est = joblib.load('best_post_est.pkl')
post_grid_results = joblib.load('post_grid_results.pkl')

## Comments 

In [18]:
with open('all_comments_2018_unparsed.jsonlist') as f:
    comments = f.readlines() 
comments = [json.loads(line) for line in comments] 



KeyboardInterrupt: 

In [19]:
import os

os.chdir('/homes/gws/taugust/Projects/ARK/community_guidelines')

%run Reddit/SRILM_building_funcs.ipynb

/homes/gws/taugust/Projects/ARK/community_guidelines


In [20]:
# Getting entire comment set fro
subs = ['funny', 'science', 'news', 'politics', 'pics', 'AskReddit', 'AskHistorians', 'EverythingScience', 'Futurology', 'TrueReddit', 'dataisbeautiful', 'askscience']

df_comment_list = []

for s in subs:     
    df_comments, df_author_counts_train = import_csvs(s, path='data/cleaned/train/2018/', ext='_train_2018.csv', comment_pre_path='data/cleaned/sub_comments/', comment_ext='_comments_2018.csv')
    df_comment_list.append(df_comments)

Importing  [35mdata/cleaned/sub_comments/funny_comments_2018.csv[0m ..... 

  if (yield from self.run_code(code, result)):


Done
Importing  [35mdata/cleaned/train/2018/author_counts/funny_author_counts_train_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/sub_comments/science_comments_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/train/2018/author_counts/science_author_counts_train_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/sub_comments/news_comments_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/train/2018/author_counts/news_author_counts_train_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/sub_comments/politics_comments_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/train/2018/author_counts/politics_author_counts_train_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/sub_comments/pics_comments_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/train/2018/author_counts/pics_author_counts_train_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/sub_comments/AskReddit_comments_2018.csv[0m ..... Done
Importing  [35mdata/cleaned/train/2018/author_counts/AskRed

In [21]:
df_comment_total = pd.concat(df_comment_list, axis = 0, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [22]:
comments_df = df_comment_total

In [24]:
comments_df['is_science'] = comments_df['subreddit'] == 'science'
print(comments_df['is_science'].value_counts())

False    66062089
True       604267
Name: is_science, dtype: int64


In [25]:
comments_df_not_science = comments_df[~comments_df['is_science']]

In [26]:
comments_df_science = comments_df[comments_df['is_science']]

In [27]:
comments_df_sampled_not_science = comments_df_not_science.sample(len(comments_df_science), random_state=42)
comments_sampled_balanced = comments_df_sampled_not_science.append(comments_df_science)

In [28]:
comments_sampled_balanced.to_csv('data/cleaned/comments_sampled_balanced.csv', index=False, quoting=csv.QUOTE_ALL, escapechar='\\')

In [30]:
comments_sampled_balanced = pd.read_csv('data/cleaned/comments_sampled_balanced.csv', quoting=csv.QUOTE_ALL, escapechar='\\')

In [31]:
len(comments_sampled_balanced)

1208534

In [32]:
# comments_df_sampled = posts_df[['full_text', 'is_science']].sample(100000, random_state=42)
# print(len(comments_df_sampled))

In [33]:
X_train_comments, X_test_comments, y_train_comments, y_test_comments = train_test_split(comments_sampled_balanced['body'], comments_sampled_balanced['is_science'], test_size=0.33, random_state=42)

In [34]:
len(X_train_comments), len(X_test_comments), len(y_train_comments), len(y_test_comments)

(809717, 398817, 809717, 398817)

In [35]:
# grid_comment = GridSearchCV(text_clf_pipeline, parameters, cv=2, verbose=1, scoring='f1')
# grid_comment.fit(X_train_comments, y_train_comments)
comment_clf = text_clf_pipeline.fit(X_train_comments, y_train_comments)



In [55]:
joblib.dump(comment_clf, 'comment_clf_sampled.pkl', compress = 1)

['comment_clf_sampled.pkl']

In [36]:
pred_comment = comment_clf.predict(X_test_comments)

In [37]:
np.mean(y_test_comments == pred_comment)

0.7252173302542269

In [38]:
f1_score(y_test_comments, pred_comment, average='micro')  

0.7252173302542269

In [None]:
# joblib.dump(grid_comment.best_estimator_, 'best_comment_est.pkl', compress = 1)
# joblib.dump(grid_comment.cv_results_, 'comment_grid_results.pkl', compress = 1)

In [None]:
print(grid_comment.cv_results_['mean_test_score'])

In [35]:
# text_clf.fit(X_train_comments, y_train_comments) 
# predicted = text_clf.predict(X_test_comments)
# print('comment accuracy:', np.mean(predicted == y_test_comments))


In [36]:
# science_comments = [line for line in comments if line['subreddit'] == 'science']
# print(len(science_comments))
# background_comments = [line for line in comments if line['subreddit'] != 'science']
# print(len(background_comments))

# science_text_comments = [line['body'] for line in science_comments]
# background_text_comments = [line['body'] for line in background_comments]

In [37]:
# # combine the texts but with seperate labels
# science_text_labeled_comments = [(0, line) for line in science_text_comments]
# background_text_labeled_comments = [(1, line) for line in background_text_comments]
# all_text_labled_comments = science_text_labeled_comments + background_text_labeled_comments

# # do a train/test split
# X_train_comments, X_test_comments, y_train_comments, y_test_comments = train_test_split([line[1] for line in all_text_labled_comments], [line[0] for line in all_text_labled_comments] , test_size=0.33, random_state=42)

In [45]:
s

0.9882961310833319