# Transform our Subreddits into Classification Model Inputs

In [21]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import brown, stopwords
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import os

pd.set_option('display.max_rows', 500)

%matplotlib inline

### Load Data

In [9]:
files = os.listdir('./data')

df = pd.DataFrame()

for file in files:
    
    if file == '.ipynb_checkpoints':
        pass
    else:
        df = pd.concat([df, pd.read_csv(f'./data/{file}', low_memory = False) ])
        
df.head()

Unnamed: 0.1,Unnamed: 0,title,created_utc,subreddit,score,selftext,num_comments,num_crossposts
0,0,New Beyonce,1577854814,hiphopheads,1,check insta 🥳🖤,1,0
1,1,MF Doom-Hoe Cakes,1577854884,hiphopheads,1,,2,0
2,2,MF DOOM feat Ghostface Killah - Angels,1577855065,hiphopheads,1,,0,0
3,3,[FRESH] TeeJayx6 - 2020,1577855147,hiphopheads,1,,33,0
4,4,Eminem &amp; DJ Drama - Freestyle Part 1,1577855891,hiphopheads,1,,0,0


In [10]:
# Check Nulls
df.isnull().sum()

Unnamed: 0            0
title                 0
created_utc           0
subreddit             0
score                 0
selftext          52293
num_comments          0
num_crossposts        0
dtype: int64

In [11]:
# Define Quality Post
df.loc[:,'quality_post'] = (df['score'] > 2) | (df['num_comments'] > 2)
df['quality_post'].value_counts(normalize = True)

False    0.873654
True     0.126346
Name: quality_post, dtype: float64

In [15]:
# Subreddit Statistics
df.groupby('subreddit').agg({'score': 'mean', 'num_comments':'mean', 'quality_post': 'mean'})

Unnamed: 0_level_0,score,num_comments,quality_post
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hiphopheads,2.317263,12.953495,0.199691
listentothis,1.164341,1.451618,0.050249


### Preprocessing Test Data

In [49]:
# Create subset of data for easier transformation
data = df[['title', 'subreddit']]

In [18]:
# Tokenize the corpus
data.loc[:, "tokenized"] = data.loc[:,"title"].apply(nltk.word_tokenize)  #https://stackoverflow.com/questions/33098040/how-to-use-word-tokenize-in-data-frame

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [19]:
# Lemmatize

lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in text]

data.loc[:,'lemmatized'] = data.loc[:,'tokenized'].apply(lemmatize_text)

In [22]:
# Remove stopwords

def remove_stopwords(df):
    stop_words = set(stopwords.words('english'))
    
    return [text.lower() for text in df if text not in stop_words]  

data.loc[:,'stop_words'] = data.loc[:,'lemmatized'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [23]:
# Remove punctuation
garbage = "~`!@#$%^&*()_-+={[}]|\:;'<,>.?/"
    
data.loc[:,'cleaned'] = [[text for text in group if text not in garbage] for group in data.loc[:,'stop_words']]    

In [24]:
# Turn list into strings to input into model
data.loc[:,'feature'] = [' '.join(x) for x in data.loc[:,'cleaned']]

### Prepare Data for Classification

In [27]:
# Target Vector is predicting which subreddit the post came from
y = data['subreddit']
# Feature is the text from the title
X = data['feature']

In [28]:
#Baseline Model
y.value_counts()

hiphopheads     31674
listentothis    30528
Name: subreddit, dtype: int64

In [29]:
# Train Test Split our Data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

### Pipeline 1.1 Count Vectorizer and Logistic Regression

In [30]:
# Set up our pipeline and parameters for gridsearch
pipe = Pipeline([

    ('cvec', CountVectorizer()),
    (('lr'), LogisticRegression(solver = 'liblinear'))
])

pipe_params = {
    
    'cvec__max_features' : [1000, 2000, None],
    'cvec__ngram_range'  : [(1,1), (1,2)],
    'cvec__stop_words'   : ['english', None],
    'cvec__min_df'       :  [1,2]  
    
}

In [31]:
# Instantiate and fit our GridSearch
gs = GridSearchCV(pipe,
                 param_grid = pipe_params,
                 cv = 5,
                 n_jobs=3)

gs.fit(X_train, y_train);

In [32]:
# Training and Testing Acuracy Scores
print('Training score (Accuracy): '+str(gs.score(X_train, y_train)))
print('Testing score (Accuracy): '+str(gs.score(X_test, y_test)))


Training score (Accuracy): 0.9863347190004823
Testing score (Accuracy): 0.8836611114088205


### 1.1 Interpretation - Top 20 Words per Subreddit

In [33]:
# Pull coefficients from Estimator and Features from Transformer
lr = gs.best_estimator_.named_steps['lr']
cvec = gs.best_estimator_.named_steps['cvec']

# Turn into DataFrames
features_df = pd.DataFrame(lr.coef_.T,
                           cvec.get_feature_names()
                           , columns=['Importance'])
top_20_hiphop = features_df.sort_values('Importance', ascending = True).head(20)
top_20_listen = features_df.sort_values('Importance', ascending = False).head(20)




In [34]:
# r/hiphopheads Top 20 Distinguishing Words
top_20_hiphop.T

Unnamed: 0,fresh,pop smoke,eminem,hype,original,thread,jay,drake,kanye,pete rock,rappers,albums,aesop rock,aesop,leak,rapper,nas,tupac,schoolboy,hop playlist
Importance,-4.404412,-3.213142,-2.521578,-2.223327,-2.119303,-2.098899,-2.063866,-2.027939,-1.999949,-1.997338,-1.964576,-1.944476,-1.903494,-1.902975,-1.869498,-1.837395,-1.817742,-1.77314,-1.731072,-1.722597


In [35]:
# r/listentothis Top 20 Distinguishing Words
top_20_listen.T

Unnamed: 0,indie,folk,electronic,metal,pop,2019,2018,punk,liked youtube,2017,original song,alternative,rock,2014,band,ambient,techno,chillhop,2013,2016
Importance,3.789652,3.071333,3.003983,2.88604,2.778851,2.684443,2.669384,2.666096,2.634973,2.521862,2.515557,2.475594,2.464409,2.411144,2.35839,2.35551,2.251942,2.225665,2.068627,2.028744


### 1.1 Confusion Matrix

In [36]:
# Create Predictions for Confusion Matrix
X_transform = cvec.transform(X_test)
y_pred = lr.predict(X_transform)

In [37]:
# Confusion Matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Print Scores
print("Accuracy: " + str((tp + tn) / (tp + tn + fp + fn)))
print("Sensitivity: " + str((tp) / (tp + fn)))
print("Specificity: " + str((tn) / (tn + fp)))
print("Precision: " + str((tp) / (tp + fp )))

Accuracy: 0.8836611114088205
Sensitivity: 0.8442221980650071
Specificity: 0.922003804692454
Precision: 0.9132173095014111


### Pipeline 1.2  TF-IDF Vectorizer and Logistic Regression

In [38]:
# Set up pipeline
pipe = Pipeline([

    ('tvec', TfidfVectorizer()),
    (('lr'), LogisticRegression(solver = 'liblinear'))
])

# Parameters for GridSearch
pipe_params = {
    
    'tvec__max_features' : [2000,3000,4000,5000,None],
    'tvec__ngram_range'  : [(1,1), (1,2)],
    'tvec__stop_words'   : ['english', None],
    'tvec__min_df'       :  (2,5),
    'tvec__max_df'       : (0.75, 0.95)
    
    
}


In [39]:
# Instantiate and fit our Gridsearch
gs = GridSearchCV(pipe,
                 param_grid = pipe_params,
                 cv = 5,
                 n_jobs=3)

gs.fit(X_train, y_train);

In [40]:
print('Training Accuracy: ' + str(gs.score(X_train, y_train)))
print('Testing Accuracy: ' + str(gs.score(X_test, y_test)))

Training Accuracy: 0.9306630532142119
Testing Accuracy: 0.8811424896843685


In [41]:
# Pull coefficients from Estimator and Features from Transformer
lr = gs.best_estimator_.named_steps['lr']
tvec = gs.best_estimator_.named_steps['tvec']


#turn it into a data frame
features_df = pd.DataFrame(lr.coef_.T,
                           tvec.get_feature_names()
                           , columns=['Importance'])
top_20_hiphop = features_df.sort_values('Importance', ascending = True).head(20)
top_20_listen = features_df.sort_values('Importance', ascending = False).head(20)


### 1.2 Interpretation - Top 20 Words per Subreddit

In [42]:
# r/hiphopheads top 20 words
top_20_hiphop.T

Unnamed: 0,fresh,eminem,original,pop smoke,lil,album,rapper,jay,drake,kanye,feat,hype,smoke,leak,ft,thread,kendrick,top,future,fresh video
Importance,-15.511374,-5.518547,-4.887793,-4.813343,-4.752123,-4.720969,-4.699521,-4.347822,-4.06415,-3.619456,-3.445985,-3.444556,-3.413657,-3.412053,-3.359101,-3.168982,-3.065876,-2.938429,-2.846222,-2.841309


In [43]:
# r/listentothis top 20 words
top_20_listen.T

Unnamed: 0,indie,2020,2019,rock,pop,electronic,folk,2018,metal,punk,alternative,2017,jazz,band,house,liked youtube,2016,hop 2020,2015,2014
Importance,10.546029,10.063975,9.808461,9.800317,8.503466,7.041222,6.693268,6.510102,6.456165,6.367025,6.17211,5.225184,4.917715,4.758196,4.636138,4.329111,4.215458,4.143717,3.987855,3.956869


In [44]:
combined = pd.DataFrame()
combined.loc[:, 'r/hiphopheads'] = top_20_hiphop.index.to_list()
combined.loc[:, 'r/listentothis'] = top_20_listen.index.to_list()

In [45]:
combined

Unnamed: 0,r/hiphopheads,r/listentothis
0,fresh,indie
1,eminem,2020
2,original,2019
3,pop smoke,rock
4,lil,pop
5,album,electronic
6,rapper,folk
7,jay,2018
8,drake,metal
9,kanye,punk


### 1.2 Confusion Matrix

In [46]:
# Create Predictions for Confusion Matrix
X_transform = tvec.transform(X_test)
y_pred = lr.predict(X_transform)

In [47]:
# Confusion Matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# Print Scores
print("Accuracy: " + str((tp + tn) / (tp + tn + fp + fn)))
print("Sensitivity: " + str((tp) / (tp + fn)))
print("Specificity: " + str((tn) / (tn + fp)))
print("Precision: " + str((tp) / (tp + fp )))

Accuracy: 0.8811424896843685
Sensitivity: 0.8395477769322752
Specificity: 0.9215810610864511
Precision: 0.9123449497932664
