In [3]:
import re
import time

import numpy as np
import pandas as pd
import requests
from progressbar import progressbar

In [4]:
# !pip3 install progressbar2

## Scraping

### Scraping Functions

In [5]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

headers = {'User-Agent' : 'override this bad boy!'}

def scraper_bike(url):
    posts = []
    after = {}

    for page in progressbar(range(40)):
        params = {'after' : after}
        url = url
        pagepull = requests.get(url = url, params = params, headers = headers)
        page_dict = pagepull.json()
        posts.extend(page_dict['data']['children'])
        after = page_dict['data']['after']
        time.sleep(.2)
        
    return posts

In [6]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)

def posts_to_df(post_list):
    i = 0
    post_dict = {}
    
    for post in post_list:
        ind = post_list[i]['data']
        post_dict[ind['name']] = [ind['subreddit'], ind['title'], ind['selftext']]
        i += 1

    df_name = pd.DataFrame(post_dict)
    df_name = df_name.T
    df_name.columns = ['subreddit', 'title', 'selftext'] #'selftext'
    
    return df_name

In [7]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    
    return posts_to_df(scrape_func(url))

In [8]:
#### If you want to scrape repeatedly over time and add to a csv
# scrape, import csv, concat, drop duplicate, and output to csv
# takes in scraper function, url, csv filename to import, csv filename to output
# Outputs - Concatenated DataFrame as csv

def scrape_add(scrape_func, url, import_file, export_file):
    scrape_df = posts_to_df(scrape_func(url))
    imported_df = pd.read_csv(import_file, index_col = 'Unnamed: 0')
    concat_df = pd.concat([imported_df, scrape_df])
    concat_df = concat_df[~concat_df.index.duplicated(keep='first')]
    concat_df.to_csv(export_file)

### Run Scrape

In [98]:
# Run this and comment out pd.read_csv lines in data cleaning / preprocessing to use freshly scraped data
# You can also put in any 2 subreddits in as the URL and get results for those

nfltest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nfl.json')
nbatest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nba.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:25 Time:  0:00:25
100% (40 of 40) |########################| Elapsed Time: 0:00:25 Time:  0:00:25


In [99]:
politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:29 Time:  0:00:29
100% (40 of 40) |########################| Elapsed Time: 0:00:27 Time:  0:00:27


In [100]:
nbatest.shape

(734, 3)

In [101]:
nfltest.shape

(934, 3)

In [102]:
nfltest.head()

Unnamed: 0,subreddit,title,selftext
t3_dcc7g6,nfl,Water Cooler Wednesday,"Welcome to today's open thread, where /r/nfl users can discuss anything they wish not related directly to the NFL.\n\nWant to talk about personal life? Cool things about your fandom? Whatever happens to be dominating today's news cycle? Do you have something to talk about that didn't warrant its..."
t3_dbxfyg,nfl,Official Week 4 /r/NFL Power Rankings,"Good afternoon, r/nfl! We're through the first quarter of this race, with only 3 teams remaining undefeated while 6 have yet to win a game. It's early going and there's still a lot of moving around to be done, as these horses are anything but predictable. Some wild swings appear in the rankings ..."
t3_dcajbu,nfl,Tom Brady has been wearing the same shoulder pads since his freshman year at Michigan in 1995. They're older than 5 of his current team-mates.,
t3_dca6d5,nfl,Percy Harvin Says He Was High Every Game He Played,
t3_dc44f6,nfl,[Jaguars] The Jaguars are giving out a bandana and a mustache to any fan who purchases tickets to the team's 2 home games this month,


##### These scrape_add functions add to already built csvs

In [103]:
# scrape_add(scraper_bike, 'https://www.reddit.com/r/CollegeBasketball/new.json', 'NCAA_Posts_Update2.csv', 'NCAA_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/AskScience/new.json', 'AskSci_Posts_Update2.csv', 'AskSci_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/nba/new.json', 'NBA_Posts_Update2.csv', 'NBA_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/nfl/new.json', 'NFL_Posts_Update2.csv', 'NFL_Posts_Update3.csv')

### Data Cleaning / Preprocessing

In [104]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pd.set_option('max_colwidth', 300)

In [105]:
# drop column

nfltest = nfltest.drop(columns = 'selftext')
nbatest = nbatest.drop(columns = 'selftext')

In [106]:
# merge subreddit data

train = pd.concat([nfltest, nbatest])

In [107]:
train.head()

Unnamed: 0,subreddit,title
t3_dcc7g6,nfl,Water Cooler Wednesday
t3_dbxfyg,nfl,Official Week 4 /r/NFL Power Rankings
t3_dcajbu,nfl,Tom Brady has been wearing the same shoulder pads since his freshman year at Michigan in 1995. They're older than 5 of his current team-mates.
t3_dca6d5,nfl,Percy Harvin Says He Was High Every Game He Played
t3_dc44f6,nfl,[Jaguars] The Jaguars are giving out a bandana and a mustache to any fan who purchases tickets to the team's 2 home games this month


##### Tokenize (grab only word characters)

In [108]:
word_tokenizer = RegexpTokenizer(r'\w+')

In [109]:
train['title'] = train['title'].map(lambda x: word_tokenizer.tokenize(x.lower()))

In [110]:
# rejoin list of tokenized words into single string for each row

train['title'] = train['title'].map(lambda x: ' '.join(x))

In [111]:
train['title'][0:5]

t3_dcc7g6                                                                                                                          water cooler wednesday
t3_dbxfyg                                                                                                            official week 4 r nfl power rankings
t3_dcajbu    tom brady has been wearing the same shoulder pads since his freshman year at michigan in 1995 they re older than 5 of his current team mates
t3_dca6d5                                                                                              percy harvin says he was high every game he played
t3_dc44f6              jaguars the jaguars are giving out a bandana and a mustache to any fan who purchases tickets to the team s 2 home games this month
Name: title, dtype: object

### Train test split and converting series to list of strings then to array

In [112]:
X = train[['title']]
y = train['subreddit']

In [113]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=42,
                                                    stratify=y)

In [114]:
# baseline is

y.value_counts(normalize=True)

nfl    0.559952
nba    0.440048
Name: subreddit, dtype: float64

In [115]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)

In [116]:
len(clean_train_data)

1251

In [117]:
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [118]:
len(clean_test_data)

417

### Count Vectorizer

In [119]:
# instantiate our CountVectorizer. This counts the number of appearances of all the words in our training data and
# eliminates common english stop words. 5000 max features works well for our purposes (tested various numbers). Our
# data is already preprocessed and tokenized manually earlier. ngram_range is 1,3, although all or nearly all our
# features are single words

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             ngram_range=(1, 3))

In [120]:
# fit our training data and test data lists to our count_vectorizer

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

In [121]:
# convert to array

train_data_features = train_data_features.toarray()

In [122]:
# check shapes

train_data_features.shape, test_data_features.shape

((1251, 5000), (417, 5000))

In [123]:
# I wanted check that the features corpus was as expected - removed print statement for readability

vocab = vectorizer.get_feature_names()

In [124]:
vocab[0:200]

['00',
 '000',
 '000 career',
 '04',
 '10',
 '10 2014',
 '10 catches',
 '10 catches 223',
 '10 points',
 '10 reb',
 '10 yards',
 '100',
 '100 greatest',
 '100 greatest games',
 '1000',
 '106',
 '11',
 '11 games',
 '12',
 '12 games',
 '13',
 '13 games',
 '132',
 '14',
 '140',
 '15',
 '15 000',
 '15 000 career',
 '150',
 '150 yard',
 '150 yard receiver',
 '150 yard rusher',
 '151',
 '16',
 '16 17',
 '16 games',
 '16 tds',
 '16m',
 '17',
 '17 straight',
 '17 straight games',
 '17 year',
 '179',
 '18',
 '18 65',
 '18 65 mph',
 '18 mph',
 '18 mph vs',
 '18 sacks',
 '18 snapped',
 '18 snapped streaks',
 '19',
 '19 matt',
 '19 matt ryan',
 '19 season',
 '1950',
 '1969',
 '1970',
 '1970 merger',
 '1986',
 '1994',
 '1994 1999',
 '1997',
 '1999',
 '1st',
 '1st amp',
 '1st amp 2nd',
 '1st place',
 '20',
 '200',
 '200 snaps',
 '2000',
 '2003',
 '2004',
 '2005',
 '2006',
 '2008',
 '2010',
 '2010s',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2018 2019',
 '2019',
 '2019

## MODELING

### Logistic Regression

In [125]:
from sklearn.linear_model import LogisticRegression

In [126]:
# fit logistic regression model

lr = LogisticRegression(penalty='l2')

In [127]:
# shape check

train_data_features.shape, y_train.shape

((1251, 5000), (1251,))

In [128]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [129]:
lr.score(train_data_features, y_train)

0.996802557953637

In [130]:
lr.score(test_data_features, y_test)

0.9304556354916067

### Feature comparison

Creates a dataframe that matches features to coefficients

In [131]:
coef_list = lr.coef_.tolist()

In [132]:
coef_list = coef_list[0]

In [133]:
coef_df = pd.DataFrame({'features': vectorizer.get_feature_names(),
                        'coefs': coef_list})

In [134]:
coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
1796,nba,-2.605586
1554,lebron,-1.488739
1457,kawhi,-1.209259
381,basketball,-1.180088
1191,harden,-1.089459
1519,lakers,-0.985407
3823,rockets,-0.977707
4456,warriors,-0.898148
1460,kd,-0.866865
978,finals,-0.825422


### Let's throw out these unfair words and rerun

In [135]:
stopwords = set(stopwords.words('english'))

extra_stopwords = ['nba', 'basketball', 'football', 'nfl']

stopwords.update(extra_stopwords)

In [136]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = stopwords,
                             max_features = 5000,
                             ngram_range = (1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1251, 5000), (417, 5000))

In [137]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [138]:
lr.score(train_data_features, y_train)

0.9896083133493205

In [139]:
lr.score(test_data_features, y_test)

0.8896882494004796

In [140]:
coef_list = lr.coef_.tolist()
coef_list = coef_list[0]

coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
1646,lebron,-1.431763
1530,kawhi,-1.196018
1271,harden,-1.021395
1594,lakers,-0.987757
4174,rockets,-0.986822
1403,india,-0.947695
1016,finals,-0.943875
4842,warriors,-0.868276
4623,think,-0.834553
78,2010s,-0.760639


### Decision Tree

In [141]:
from sklearn.tree import DecisionTreeClassifier

In [142]:
tree = DecisionTreeClassifier()

In [143]:
tree.fit(train_data_features, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [144]:
tree.score(train_data_features, y_train)

1.0

In [145]:
tree.score(test_data_features, y_test)

0.7937649880095923

### Random Forest

In [146]:
from sklearn.ensemble import RandomForestClassifier

In [147]:
forest = RandomForestClassifier(n_estimators = 100)

In [148]:
forest.fit(train_data_features, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [149]:
forest.score(train_data_features, y_train)

1.0

In [150]:
forest.score(test_data_features, y_test)

0.8465227817745803

###  Matrix on Logistic Regression

In [151]:
from sklearn.metrics import confusion_matrix

In [152]:
y_pred = lr.predict(test_data_features)

In [153]:
cm = confusion_matrix(y_test, y_pred)

In [154]:
cm_df = pd.DataFrame(cm,
                    columns=['predict_neg', 'predict_pos'],
                    index = ['actual_neg', 'actual_pos'])

In [155]:
cm_df

Unnamed: 0,predict_neg,predict_pos
actual_neg,168,16
actual_pos,30,203


## Checking where our model failed

In [156]:
comparison_df = pd.DataFrame({'y_actual' : y_test,
             'y_predicted' : y_pred})

In [157]:
mismatch_df = comparison_df[comparison_df['y_actual'] != comparison_df['y_predicted']]

In [158]:
mismatch2_df = pd.concat([mismatch_df, X_test], axis = 1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [159]:
# All incorrect predictions with titles

mismatches = mismatch2_df.dropna()

In [160]:
mismatches

Unnamed: 0,y_actual,y_predicted,title
t3_da2dfq,nba,nfl,in his first 13 seasons john stockton missed 4 nba games total
t3_da2ict,nba,nfl,kyrie explains in depth what went wrong in boston says he failed them as a leader
t3_da4c56,nba,nfl,robert horry game winner vs portland game 3 playoffs 2002
t3_daagf7,nba,nfl,westbrook i don t have to have the ball to impact the game i don t have to score i don t have to do anything i can defend i can rebound i can pass i can lead
t3_dakizu,nba,nfl,first take stephen a repeatedly changes subject to discredit jeremy lin melo comes to his defense
t3_dakreq,nba,nfl,miami toronto trade idea
t3_dakzg1,nba,nfl,who hit the most game winners at the buzzer regular season and playoffs combined
t3_dan006,nba,nfl,silver asks silver adam silver and nate silver in conversation 2017
t3_davzae,nfl,nba,kempski nfl should discipline andrew sendejo for reckless friendly fire shot on avonte maddox
t3_dayv8t,nba,nfl,highlight pelicans draft and stash prospect didi louzada scores 24 points on 10 for 18 shooting


### Let's try TF-IDF

Term Frequency / Inverse Document Frequency

TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)

IDF(w) = log_e(Total number of documents / Number of documents with term w in it)

In [161]:
tfidf_vec = TfidfVectorizer(analyzer="word",
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=['nba', 'nfl', 'football', 'basketball'],
                            max_features=5000,
                            ngram_range=(1, 3))

In [162]:
train_data_features = tfidf_vec.fit_transform(clean_train_data)

test_data_features = tfidf_vec.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1251, 5000), (417, 5000))

In [163]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [164]:
lr.score(train_data_features, y_train)

0.9776179056754596

In [165]:
lr.score(test_data_features, y_test)

0.8657074340527577

### Let's try on some other subreddits

In [166]:
train = pd.concat([politics_test, conservative_test])

In [167]:
X = train[['title']]
y = train['subreddit']

In [168]:
# politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
# conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [170]:
politics_test = politics_test.drop(columns = 'selftext')
conservative_test = conservative_test.drop(columns = 'selftext')

train = pd.concat([politics_test, conservative_test])
tokenizer = RegexpTokenizer(r'\w+')

train['title'] = train['title'].map(lambda x: tokenizer.tokenize(x.lower()))
train['title'] = train['title'].map(lambda x: ' '.join(x))

In [171]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)
    
    
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [172]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english',
                             max_features = 5000,
                             ngram_range = (1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

vocab = vectorizer.get_feature_names()

### Modeling

In [173]:
lr = LogisticRegression(penalty = 'l2')

In [174]:
train_data_features.shape, y_train.shape

((1386, 5000), (1386,))

In [175]:
lr.fit(train_data_features, y_train)

lr.score(train_data_features, y_train)



0.9805194805194806

In [176]:
lr.score(test_data_features, y_test)

0.7408207343412527

In [177]:
coef_list = lr.coef_.tolist()

coef_list = coef_list[0]

In [178]:
coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
265,biden,-1.347266
663,dems,-1.291742
651,democrats,-1.080588
1522,media,-0.977672
1184,illegal,-0.960153
627,deduction,-0.953604
1230,individual,-0.896006
89,admits,-0.893303
166,aoc,-0.866329
448,cia,-0.840527


In [180]:
pc_forest = RandomForestClassifier(n_estimators = 100)
pc_forest.fit(train_data_features, y_train)
print(pc_forest.score(train_data_features, y_train))
print(pc_forest.score(test_data_features, y_test))

0.9963924963924964
0.755939524838013


In [None]:
from sklearn import PCA

In [None]:
stop_words = list(s_words).extend(['nba'....])

In [None]:
s_words = set(stopwords.words('english') + stopwords.words('spanish'))