# Reddit Post Classifier: Modelling

This notebook contains the model training/tuning/selection processes.

In [90]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
english = set(nltk.corpus.words.words())

import time

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb

from sklearn.decomposition import PCA
from sklearn import svm

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

## 1. Grabbing Data and Further Processing

In [23]:
x = pd.read_csv('data/lemmatized.csv')['0']
posts = pd.read_csv('data/arts-programming-reddit-posts.csv')
y = posts.label

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

### a. Preprocessing Pipeline

In [9]:
# 1. tfidf vectorizer

tfidf = TfidfVectorizer()
response = tfidf.fit_transform(x_train)
tfidf_df = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names())

# pickle this
pickle.dump(tfidf, open("pickles/tfidf.pkl", "wb"))

In [10]:
# 2. only keep words w average tfidf scores over 0.0001

relevant = []
for word in tfidf_df.columns:
    if tfidf_df[word].mean() > 0.0001:
        relevant.append(tfidf_df[word])
len(relevant)

4901

In [11]:
# 3. we need to save the relevant words to process test data

relevant_df = pd.DataFrame(relevant).transpose()
pickle.dump(relevant_df.columns, open("pickles/relevantwords.pkl", "wb"))

In [12]:
# 4. pipeline for processing test data

def preprocess(data):
    
    # lemmatize
    def lemmadata(doc):
        pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
        raw_tokens = nltk.regexp_tokenize(doc, pattern)
        tokens = [i.lower() for i in raw_tokens]
        stop_words = set(stopwords.words('english'))
        listed = [w for w in tokens if not w in stop_words]
        lemmatized = [wordnet_lemmatizer.lemmatize(word, pos="v") for word in listed]
        lemmatized = list(filter(lambda w: w != 'lb', lemmatized))
        words = list(filter(lambda w: w in english, lemmatized))
        return " ".join(words)
    
    lemmatized = [lemmadata(post) for post in data]
    
    tfidf = pickle.load(open("pickles/tfidf.pkl", "rb"))
    
    transformed = tfidf.transform(lemmatized)
    tfidf_df = pd.DataFrame(transformed.toarray(), columns=tfidf.get_feature_names())
    
    relevant = pickle.load(open("pickles/relevantwords.pkl", "rb"))
    
    testset = [tfidf_df[word] for word in relevant if word in tfidf_df.columns]
    
    return pd.DataFrame(testset).transpose()

In [25]:
x_train = preprocess(x_train)
x_test = preprocess(x_test)

## 2. Models

Testing a bunch of different models to see which works best:

1. K Nearest Neighbors
2. Decision Tree
3. Bagged Trees
4. Random Forest
5. AdaBoost
6. Gradient Boost
7. XGBoost
8. Support Vector Machine (+ PCA)
9. Multinomial Naive Bayes


In [31]:
def evaluate(model, name):
    
    output = {'model': name}
    start1 = time.time()
    model.fit(x_train, y_train)
    traintime = time.time() - start1
    
    # training metrics
    
    trainpred = model.predict(x_train)
    output['train_precision'] = precision_score(y_train, trainpred)
    output['train_recall'] = recall_score(y_train, trainpred)
    output['train_accuracy'] = accuracy_score(y_train, trainpred)
    output['train_f1'] = f1_score(y_train, trainpred)
    output['train_time'] = traintime
    
    # testing metrics
    
    start2 = time.time()
    pred = model.predict(x_test)
    testtime = time.time() - start2
    
    output['test_precision'] = precision_score(y_test, pred)
    output['test_recall'] = recall_score(y_test, pred)
    output['test_accuracy'] = accuracy_score(y_test, pred)
    output['test_f1'] = f1_score(y_test, pred)
    output['test_time'] = testtime
    
    # confusion matrix for test set
    
    conf = pd.crosstab(y_test, pred)
    
    return output, conf


### a. K Nearest Neighbors

In [21]:
# find optimal k 
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        f1 = f1_score(y_test, preds)
        if f1 > best_score:
            best_k = k
            best_score = f1
    
    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))

In [26]:
find_best_k(x_train, y_train, x_test, y_test)

Best Value for k: 1
F1-Score: 0.709277650454121


In [32]:
knn = KNeighborsClassifier(n_neighbors=1)
knn_results = evaluate(knn, 'knn')

In [33]:
# results

knn_results[0]

{'model': 'knn',
 'train_precision': 1.0,
 'train_recall': 1.0,
 'train_accuracy': 1.0,
 'train_f1': 1.0,
 'train_time': 15.505398035049438,
 'test_precision': 0.5544055201698513,
 'test_recall': 0.9842167255594817,
 'test_accuracy': 0.5922619047619048,
 'test_f1': 0.709277650454121,
 'test_time': 1787.0238852500916}

In [34]:
# confusion matrix

knn_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,797,3358
1,67,4178


### b. Trees

#### Decision Tree

In [35]:
dt = DecisionTreeClassifier()
dt_results = evaluate(dt, 'decision_tree')

In [36]:
dt_results[0]

{'model': 'decision_tree',
 'train_precision': 1.0,
 'train_recall': 1.0,
 'train_accuracy': 1.0,
 'train_f1': 1.0,
 'train_time': 441.2796437740326,
 'test_precision': 0.8998357963875205,
 'test_recall': 0.9036513545347468,
 'test_accuracy': 0.9004761904761904,
 'test_f1': 0.9017395392571698,
 'test_time': 0.14611506462097168}

In [37]:
dt_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3728,427
1,409,3836


#### Bagged Trees

In [38]:
bt = BaggingClassifier()
bt_results = evaluate(bt, 'bagged_trees')

In [39]:
bt_results[0]

{'model': 'bagged_trees',
 'train_precision': 0.9969428126123966,
 'train_recall': 0.9925992241122054,
 'train_accuracy': 0.9947916666666666,
 'train_f1': 0.9947662768789066,
 'train_time': 712.2519800662994,
 'test_precision': 0.9341978866474544,
 'test_recall': 0.9163722025912838,
 'test_accuracy': 0.9251190476190476,
 'test_f1': 0.9251991913426091,
 'test_time': 6.73611307144165}

In [40]:
bt_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3881,274
1,355,3890


#### Random Forest

In [41]:
rf = RandomForestClassifier()
rf_results = evaluate(rf, 'random_forest')



In [42]:
rf_results[0]

{'model': 'random_forest',
 'train_precision': 0.9991021727419644,
 'train_recall': 0.9962399283795882,
 'train_accuracy': 0.9976785714285714,
 'train_f1': 0.9976689976689976,
 'train_time': 14.12683892250061,
 'test_precision': 0.9535968573533022,
 'test_recall': 0.9149587750294464,
 'test_accuracy': 0.9345238095238095,
 'test_f1': 0.9338783361384949,
 'test_time': 0.1742258071899414}

In [43]:
rf_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3966,189
1,361,3884


### c. Boosting

#### AdaBoost

In [44]:
ada = AdaBoostClassifier()
ada_results = evaluate(ada, 'adaboost')

In [45]:
ada_results[0]

{'model': 'adaboost',
 'train_precision': 0.9536901107700521,
 'train_recall': 0.8529991047448523,
 'train_accuracy': 0.9060416666666666,
 'train_f1': 0.9005387353895593,
 'train_time': 258.65357208251953,
 'test_precision': 0.9543426922067699,
 'test_recall': 0.8567726737338045,
 'test_accuracy': 0.9069047619047619,
 'test_f1': 0.9029294935451838,
 'test_time': 6.550574779510498}

In [46]:
ada_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3981,174
1,608,3637


#### Gradient Boosting

In [47]:
gb = GradientBoostingClassifier()
gb_results = evaluate(gb, 'gradient_boosting')

In [48]:
gb_results[0]

{'model': 'gradient_boosting',
 'train_precision': 0.968716719914803,
 'train_recall': 0.8686362279916443,
 'train_accuracy': 0.9205059523809523,
 'train_f1': 0.9159507851096637,
 'train_time': 1091.8649680614471,
 'test_precision': 0.9648478488982162,
 'test_recall': 0.8664310954063604,
 'test_accuracy': 0.916547619047619,
 'test_f1': 0.9129949112572918,
 'test_time': 0.1918017864227295}

In [49]:
gb_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4021,134
1,567,3678


#### XGBoost

In [80]:
D_train = xgb.DMatrix(x_train, label=y_train)
D_test = xgb.DMatrix(x_test, label=y_test)

  if getattr(data, 'base', None) is not None and \


In [81]:
param = {
    'eta': 0.3, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 3} 

steps = 20

In [93]:
def evaluateXGB(model, name, train, test):
    
    output = {'model': name}
    start1 = time.time()
    model = xgb.train(param, train, steps)
    traintime = time.time() - start1
    
    # training metrics
    
    trainpred = model.predict(D_train)
    trainpreds = np.asarray([np.argmax(line) for line in trainpred])
    
    output['train_precision'] = precision_score(y_train, trainpreds)
    output['train_recall'] = recall_score(y_train, trainpreds)
    output['train_accuracy'] = accuracy_score(y_train, trainpreds)
    output['train_f1'] = f1_score(y_train, trainpreds)
    output['train_time'] = traintime
    
    # testing metrics
    
    start2 = time.time()
    testpred = model.predict(D_test)
    pred = np.asarray([np.argmax(line) for line in testpred])
    testtime = time.time() - start2
    
    output['test_precision'] = precision_score(y_test, pred)
    output['test_recall'] = recall_score(y_test, pred)
    output['test_accuracy'] = accuracy_score(y_test, pred)
    output['test_f1'] = f1_score(y_test, pred)
    output['test_time'] = testtime
    
    # confusion matrix for test set
    
    conf = pd.crosstab(y_test, pred)
    
    return output, conf

In [94]:
xgb_results = evaluateXGB(xgb, 'xgboost', D_train, D_test)

In [95]:
xgb_results[0]

{'model': 'xgboost',
 'train_precision': 0.9594844773568784,
 'train_recall': 0.8042375410325276,
 'train_accuracy': 0.8854464285714285,
 'train_f1': 0.8750284100133122,
 'train_time': 234.82280683517456,
 'test_precision': 0.9548369110677446,
 'test_recall': 0.806831566548881,
 'test_accuracy': 0.883095238095238,
 'test_f1': 0.8746169560776301,
 'test_time': 1.2996101379394531}

In [96]:
xgb_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3993,162
1,820,3425


### d. Support Vector Machine

In [50]:
pca = PCA(n_components=50)
pca.fit(x_train)
pca50_train = pca.transform(x_train)
pca50_test = pca.transform(x_test)

In [57]:
def evaluatePCA(model, name, x_train, x_test, y_train, y_test):
    
    output = {'model': name}
    start1 = time.time()
    model.fit(x_train, y_train)
    traintime = time.time() - start1
    
    # training metrics
    
    trainpred = model.predict(x_train)
    output['train_precision'] = precision_score(y_train, trainpred)
    output['train_recall'] = recall_score(y_train, trainpred)
    output['train_accuracy'] = accuracy_score(y_train, trainpred)
    output['train_f1'] = f1_score(y_train, trainpred)
    output['train_time'] = traintime
    
    # testing metrics
    
    start2 = time.time()
    pred = model.predict(x_test)
    testtime = time.time() - start2
    
    output['test_precision'] = precision_score(y_test, pred)
    output['test_recall'] = recall_score(y_test, pred)
    output['test_accuracy'] = accuracy_score(y_test, pred)
    output['test_f1'] = f1_score(y_test, pred)
    output['test_time'] = testtime
    
    # confusion matrix for test set
    
    conf = pd.crosstab(y_test, pred)
    
    return output, conf

In [74]:
svm_results = evaluatePCA(svm, "svm50", pca50_train, pca50_test, y_train, y_test)

In [75]:
svm_results[0]

{'model': 'svm50',
 'train_precision': 0.9683286296644716,
 'train_recall': 0.9215159653834676,
 'train_accuracy': 0.9458333333333333,
 'train_f1': 0.94434250764526,
 'train_time': 69.82057690620422,
 'test_precision': 0.9715994020926756,
 'test_recall': 0.9187279151943463,
 'test_accuracy': 0.9453571428571429,
 'test_f1': 0.944424264438794,
 'test_time': 4.882258892059326}

In [76]:
svm_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4041,114
1,345,3900


### e. Naive Bayes

In [51]:
mnb = MultinomialNB()
mnb_results = evaluate(mnb, 'naive_bayes')

In [52]:
mnb_results[0]

{'model': 'naive_bayes',
 'train_precision': 0.9717605004468275,
 'train_recall': 0.9735004476275738,
 'train_accuracy': 0.9726785714285714,
 'train_f1': 0.9726296958855097,
 'train_time': 0.363353967666626,
 'test_precision': 0.9729408972228816,
 'test_recall': 0.9656065959952885,
 'test_accuracy': 0.969047619047619,
 'test_f1': 0.9692598723102388,
 'test_time': 0.038023948669433594}

In [53]:
mnb_results[1]

col_0,0,1
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4041,114
1,146,4099


### Results Summary

In [97]:
result_dicts = [knn_results, dt_results, bt_results, 
                rf_results, ada_results, gb_results,
                xgb_results, svm_results, mnb_results]
results = pd.DataFrame([i[0] for i in result_dicts])

In [98]:
results

Unnamed: 0,model,test_accuracy,test_f1,test_precision,test_recall,test_time,train_accuracy,train_f1,train_precision,train_recall,train_time
0,knn,0.592262,0.709278,0.554406,0.984217,1787.023885,1.0,1.0,1.0,1.0,15.505398
1,decision_tree,0.900476,0.90174,0.899836,0.903651,0.146115,1.0,1.0,1.0,1.0,441.279644
2,bagged_trees,0.925119,0.925199,0.934198,0.916372,6.736113,0.994792,0.994766,0.996943,0.992599,712.25198
3,random_forest,0.934524,0.933878,0.953597,0.914959,0.174226,0.997679,0.997669,0.999102,0.99624,14.126839
4,adaboost,0.906905,0.902929,0.954343,0.856773,6.550575,0.906042,0.900539,0.95369,0.852999,258.653572
5,gradient_boosting,0.916548,0.912995,0.964848,0.866431,0.191802,0.920506,0.915951,0.968717,0.868636,1091.864968
6,xgboost,0.883095,0.874617,0.954837,0.806832,1.29961,0.885446,0.875028,0.959484,0.804238,234.822807
7,svm50,0.945357,0.944424,0.971599,0.918728,4.882259,0.945833,0.944343,0.968329,0.921516,69.820577
8,naive_bayes,0.969048,0.96926,0.972941,0.965607,0.038024,0.972679,0.97263,0.971761,0.9735,0.363354


In [None]:
# looking at metrics, we decide on the Multinomial Naive Bayes model.