In [179]:
#import libraries
import pandas as pd
import numpy as np
import emoji
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle

In [76]:
data = pd.read_csv("WSB_Comments.csv")

In [77]:
#counts of each class
print(data['Label'].value_counts())

Neutral    596
Bear       594
Bull       591
Name: Label, dtype: int64


In [78]:
def cleanText(text):
    """
    Applies pre-processing to text comment

    Steps:
        1. emoji to text
        2. remove html tags
        3. lowercase
        4. remove punctuation 
    """
    #demojize emojis
    text = emoji.demojize(text, delimiters=("", ""))

    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # convert text to lowercase
    text = text.strip().lower()
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)   
    
    # replace punctuation characters with spaces
    filters ='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self

    def transform(self, X):
        return pd.Series(X).apply(cleanText).values

In [101]:
#split X and y
X = data['Comment']
y = data['Label']

In [176]:
#create pipelines and test few basic models
classifier_names = ["Naive Bayes", "Logistic Regression", "Random Forest", "KNN", "SVM"]
classifiers = [MultinomialNB(), LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier(), LinearSVC()]
zipped_clf = zip(classifier_names, classifiers)
for n, c in zipped_clf:
    pipeline = Pipeline([
    ('clean', TextCleaner()),
    ('cv', CountVectorizer(stop_words = "english")),
    ('clf', c),
    ])
    print(n + ": {0:.2f}%".format(np.mean(cross_val_score(pipeline, X, y, cv = 5))*100))
    

Naive Bayes: 58.84%
Logistic Regression: 63.00%
Random Forest: 61.65%
KNN: 47.61%
SVM: 59.46%


In [181]:
#save the best model
pipeline = Pipeline([
    ('clean', TextCleaner()),
    ('cv', CountVectorizer(stop_words = "english")),
    ('clf', LogisticRegression()),
    ])
pipeline.fit(X, y)
with open("model.pk", 'wb') as file:
    pickle.dump(pipeline, file) 