# Reddit Post Classifier

## The right way this time

In [51]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; preprocess() uses it to verb-lemmatize tokens.
wordnet_lemmatizer = WordNetLemmatizer()
# Word list from NLTK's `words` corpus as a set; preprocess() keeps only
# lemmas that are members of this set.
english = set(nltk.corpus.words.words())

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

## Data Grab + Train/Test Split

In [7]:
# Feature text: the column named '0' in lemmatized.csv
# (presumably posts lemmatized in an earlier step -- TODO confirm).
x = pd.read_csv('lemmatized.csv')['0']

In [4]:
# The `label` column of the posts table is the classification target.
# NOTE(review): x and y are read from different CSV files and are assumed to
# be row-aligned -- confirm both files share the same row order.
posts = pd.read_csv('arts-programming-reddit-posts.csv')
y = posts.label

In [11]:
# Hold out 20% of the rows for evaluation.
# NOTE(review): no random_state is passed, so a fresh kernel draws a different
# split; artifacts pickled from one split will not match another.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [86]:
# Display the size of the held-out split.
x_test.shape

(8400,)

## Preprocessing Pipeline

In [64]:
# 1. tfidf vectorizer

# Fit the vocabulary and IDF weights on the training split only.
tfidf = TfidfVectorizer()
response = tfidf.fit_transform(x_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
tfidf_df = pd.DataFrame(response.toarray(), columns=feature_names)

# Persist the fitted vectorizer. preprocess() reloads it from "tfidf.pkl", so
# the file must exist and must match the vectorizer fitted on this split.
with open("tfidf.pkl", "wb") as fh:
    pickle.dump(tfidf, fh)

In [44]:
# Feature selection: keep the TF-IDF column of every word whose mean score
# over the training documents is above 0.0001.
relevant = [
    tfidf_df[term]
    for term in tfidf_df.columns
    if tfidf_df[term].mean() > 0.0001
]
len(relevant)

4883

In [35]:
# A list of Series becomes one row per word, so flip it back to one column
# per word and one row per training document.
relevant_df = pd.DataFrame(relevant).T

In [95]:
# Save the selected words; preprocess() reloads them to pick the same columns.
# The context manager guarantees the file is flushed and closed.
with open("relevantwords.pkl", "wb") as fh:
    pickle.dump(relevant_df.columns, fh)

In [102]:
def preprocess(data):
    """Convert raw post texts into the feature frame the classifier expects.

    Each text is tokenized, lower-cased, stripped of English stop words,
    verb-lemmatized and filtered to words found in ``english``. The cleaned
    texts are then scored with the pickled TF-IDF vectorizer and reduced to
    the words stored in ``relevantwords.pkl``.

    Args:
        data: An iterable of post strings. A single string is treated as one
            post instead of being iterated character by character.

    Returns:
        A DataFrame with one row per post and one column per relevant word
        (in the order stored in ``relevantwords.pkl``) that is also present
        in the vectorizer's vocabulary.

    Raises:
        FileNotFoundError: If ``tfidf.pkl`` or ``relevantwords.pkl`` is missing.
    """
    if isinstance(data, str):
        data = [data]

    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    # Built once per call rather than once per document.
    stop_words = set(stopwords.words('english'))

    # lemmatize
    def lemmadata(doc):
        """Return `doc` reduced to a space-joined string of the kept lemmas."""
        tokens = [tok.lower() for tok in nltk.regexp_tokenize(doc, pattern)]
        lemmas = [
            wordnet_lemmatizer.lemmatize(tok, pos="v")
            for tok in tokens
            if tok not in stop_words
        ]
        # 'lb' is dropped explicitly; every other lemma must be in `english`.
        return " ".join(w for w in lemmas if w != 'lb' and w in english)

    lemmatized = [lemmadata(post) for post in data]

    # Context managers close the pickle files instead of leaking the handles.
    with open("tfidf.pkl", "rb") as fh:
        tfidf = pickle.load(fh)
    with open("relevantwords.pkl", "rb") as fh:
        relevant = pickle.load(fh)

    transformed = tfidf.transform(lemmatized)

    # Select the relevant columns while the matrix is still sparse, so only
    # the kept columns are densified. vocabulary_ maps each term to its column
    # index, which also avoids get_feature_names() (removed in
    # scikit-learn 1.2).
    vocabulary = tfidf.vocabulary_
    kept_words = [word for word in relevant if word in vocabulary]
    kept_columns = [vocabulary[word] for word in kept_words]

    return pd.DataFrame(
        transformed[:, kept_columns].toarray(), columns=kept_words
    )

## Model

In [36]:
# Fit a multinomial Naive Bayes classifier on the selected TF-IDF columns of
# the training split.
mnb = MultinomialNB()
mnb.fit(relevant_df, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [106]:
# Persist the fitted classifier; the context manager guarantees the file is
# flushed and closed.
with open('mnb.pkl', 'wb') as fh:
    pickle.dump(mnb, fh)

In [37]:
def print_metrics(labels, preds):
    """Print precision, recall, accuracy and F1 for `preds` against `labels`.

    Each metric is computed and printed in turn, one per line.
    """
    report = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, metric in report:
        print(template.format(metric(labels, preds)))

In [103]:
# Run the held-out texts through the same pipeline used at prediction time.
# NOTE(review): x is read from lemmatized.csv, so these texts look already
# lemmatized and are lemmatized a second time here -- confirm this is intended.
test = preprocess(x_test)

In [104]:
# Inspect the resulting feature frame.
test

Unnamed: 0,aa,abandon,ability,able,abort,abroad,absolute,absolutely,absorb,abstract,...,young,younger,youth,yr,za,zero,zip,zipper,zone,zoom
0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.070944,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.071235,0.0,0.0,0.0,0.0
5,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
6,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
8,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.047393,0.0,0.0,0.0,0.0
9,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.185875,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [105]:
# Predict labels for the held-out features and report the metrics against
# the true test labels.
mnb_pred = mnb.predict(test)
print_metrics(y_test, mnb_pred)

Precision Score: 0.965958474167069
Recall Score: 0.966191741125332
Accuracy Score: 0.9665476190476191
F1 Score: 0.9660750935651333


In [119]:
def classify_text(text):
    """Predict the label of a single post.

    The text is wrapped in a one-element list because preprocess() expects a
    collection of posts; the return value is whatever mnb.predict() yields
    for that one-row feature frame.
    """
    features = preprocess([text])
    return mnb.predict(features)
    