# Reddit Post Classifier

In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; preprocess() uses it to reduce each token to its verb lemma.
wordnet_lemmatizer = WordNetLemmatizer()
# Vocabulary taken from NLTK's `words` corpus; preprocess() drops any token not in this set.
english = set(nltk.corpus.words.words())

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

## Data Load + Train/Test Split

In [2]:
# Feature text: the column named '0' of the lemmatized-posts CSV.
x = (
    pd.read_csv('data/lemmatized.csv')
    .loc[:, '0']
)

In [3]:
# Labels come from the raw posts file.
# NOTE(review): rows are assumed to line up one-to-one with `x` — confirm.
posts = pd.read_csv('data/arts-programming-reddit-posts.csv')
y = posts['label']

In [4]:
# Hold out 20% of the posts for evaluation. Fixing random_state makes the
# split (and therefore everything fitted on it) reproducible across kernel
# restarts; without it every Run All produces a different train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [5]:
# Sanity check: number of held-out posts.
x_test.shape

(8400,)

## Preprocessing Pipeline

In [6]:
# 1. tfidf vectorizer

# Fit the vocabulary and IDF weights on the training text only.
tfidf = TfidfVectorizer()
response = tfidf.fit_transform(x_train)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
tfidf_df = pd.DataFrame(response.toarray(), columns=feature_names)

# pickle this (the context manager closes the file even if dump() raises)
with open("pickles/tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [7]:
# only keep words w average tfidf scores over 0.0001

MIN_MEAN_TFIDF = 0.0001
relevant = [
    tfidf_df[word]
    for word in tfidf_df.columns
    if tfidf_df[word].mean() > MIN_MEAN_TFIDF
]
len(relevant)

4885

In [8]:
# One column per retained word, one row per training post.
relevant_df = pd.DataFrame(relevant).transpose()

# Persist the retained column labels so preprocess() can select the same
# features later; the context manager guarantees the file handle is closed.
with open("pickles/relevantwords.pkl", "wb") as relevant_file:
    pickle.dump(relevant_df.columns, relevant_file)

In [9]:
def preprocess(data):
    """Turn raw post text into the feature frame the classifier is fitted on.

    Each document is tokenized, lower-cased, stripped of English stop words,
    lemmatized as a verb, and reduced to tokens found in the ``english``
    vocabulary. The cleaned text is then vectorized with the TF-IDF
    vectorizer stored at ``pickles/tfidf.pkl`` and restricted to the columns
    stored at ``pickles/relevantwords.pkl``.

    Parameters
    ----------
    data : iterable of str
        Raw documents. Wrap a single document in a list.

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per retained word.

    Notes
    -----
    Both artifacts are read with ``pickle.load``, which can execute arbitrary
    code; only load pickles that were written by this notebook.
    """
    # Loop-invariant: build the stop-word set once rather than once per document.
    stop_words = set(stopwords.words('english'))

    # lemmatize
    def lemmadata(doc):
        """Return the cleaned, space-joined tokens of a single document."""
        pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
        raw_tokens = nltk.regexp_tokenize(doc, pattern)
        tokens = [i.lower() for i in raw_tokens]
        listed = [w for w in tokens if w not in stop_words]
        lemmatized = [wordnet_lemmatizer.lemmatize(word, pos="v") for word in listed]
        # Drop the literal token 'lb', then anything outside the English word list.
        words = [w for w in lemmatized if w != 'lb' and w in english]
        return " ".join(words)

    lemmatized = [lemmadata(post) for post in data]

    # Context managers close the pickle files even if loading fails.
    with open("pickles/tfidf.pkl", "rb") as tfidf_file:
        tfidf = pickle.load(tfidf_file)

    transformed = tfidf.transform(lemmatized)

    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever this installation provides.
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    tfidf_df = pd.DataFrame(transformed.toarray(), columns=feature_names)

    with open("pickles/relevantwords.pkl", "rb") as relevant_file:
        relevant = pickle.load(relevant_file)

    # Keep only the stored columns, in their stored order.
    testset = [tfidf_df[word] for word in relevant if word in tfidf_df.columns]

    return pd.DataFrame(testset).transpose()

## Model

In [20]:
# Fit multinomial naive Bayes on the retained TF-IDF columns of the training posts.
mnb = MultinomialNB()
mnb.fit(relevant_df, y_train)

# Persist the fitted model; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open('pickles/mnb.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)

In [21]:
def print_metrics(labels, preds):
    """Print precision, recall, accuracy and F1 of ``preds`` against ``labels``.

    Each metric function is called with its default arguments and the result
    is printed on its own line, in the order listed above.
    """
    report = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, metric in report:
        print(template.format(metric(labels, preds)))

In [22]:
# Run the held-out text through the same cleaning / TF-IDF / column-selection steps.
test = preprocess(x_test)

In [23]:
# Score the held-out posts and report precision, recall, accuracy and F1.
mnb_pred = mnb.predict(test)
print_metrics(y_test, mnb_pred)

Precision Score: 0.9689737470167065
Recall Score: 0.9668968802095738
Accuracy Score: 0.9679761904761904
F1 Score: 0.9679341995470259


In [24]:
def classify_text(text):
    """Predict the label of a single raw post using the fitted ``mnb`` model.

    The text is wrapped in a one-element list because ``preprocess`` expects
    an iterable of documents. Returns whatever ``mnb.predict`` yields for the
    resulting one-row feature frame.
    """
    features = preprocess([text])
    return mnb.predict(features)
    