In [1]:
#Import modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the Reddit comments dataset and preview the first rows.
df = pd.read_csv("Reddit_Data.csv")
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [3]:
# Count missing values per column.
df.isna().sum()

clean_comment    100
category           0
dtype: int64

In [4]:
# Drop every row that contains a NaN (the check above reports 100 missing
# clean_comment values). The index is not reset, so it keeps gaps afterwards.
df = df.dropna()

In [5]:
# Re-check: no missing values should remain after dropna().
df.isna().sum()

clean_comment    0
category         0
dtype: int64

In [6]:
# Class distribution of the original labels in the `category` column.
df['category'].value_counts()

category
 1    15830
 0    13042
-1     8277
Name: count, dtype: int64

In [7]:
# Separate the comment text from the original labels.
# NOTE: despite the names, this is not a train/test split: `train` holds the
# comment text and `test` holds the original `category` label of every row.
train = df['clean_comment']
test = df['category']  

In [8]:
#Let's generate Weak Labels with the help of Labelling Functions
from snorkel.preprocess import preprocessor
from textblob import TextBlob

# Label values returned by the labelling rule; they use the same encoding as
# the dataset's `category` column (1, 0, -1).
POSITIVE = 1
NEUTRAL = 0
NEGATIVE = -1

In [9]:
def polarity_score(df, positive_threshold=0.5, neutral_floor=0.0):
    """Assign a weak sentiment label to each comment from its TextBlob polarity.

    Parameters
    ----------
    df : iterable of str
        The comment texts to label. Despite the name, the argument is iterated
        directly, so pass a Series or list of strings rather than a DataFrame
        (iterating a DataFrame yields its column names).
    positive_threshold : float, default 0.5
        Polarity at or above this value is labelled POSITIVE.
    neutral_floor : float, default 0.0
        Polarity at or above this value (and below ``positive_threshold``) is
        labelled NEUTRAL; anything lower is labelled NEGATIVE.

    Returns
    -------
    list of int
        One of POSITIVE, NEUTRAL or NEGATIVE per comment, in input order.
    """
    scores = []
    for comment in df:
        score = TextBlob(comment).sentiment.polarity
        if score >= positive_threshold:
            scores.append(POSITIVE)
        elif score >= neutral_floor:
            # Reaching this branch already implies score < positive_threshold.
            scores.append(NEUTRAL)
        else:
            scores.append(NEGATIVE)
    return scores

In [10]:
# Weak-label every comment with the TextBlob polarity rule.
train_label_pred = polarity_score(train)

In [12]:
from collections import Counter
# Distribution of the weak labels, for comparison with the original category counts.
Counter(train_label_pred)

Counter({0: 25994, -1: 8277, 1: 2878})

In [13]:
df['category'].value_counts()
# The negative count (8277) is identical to the weak-label count, but the
# neutral and positive counts differ, so some 0/1 comments are mislabelled.

category
 1    15830
 0    13042
-1     8277
Name: count, dtype: int64

In [14]:
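# Didn't work; kept for reference.
# NOTE(review): `labeling_function` is not imported in the cells shown, and the
# preprocessor reads `x.text` while the text column is `clean_comment` - these
# are likely causes; confirm before reviving this approach.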
# @preprocessor(memoize=True)
# def textblob_sentiment(x):
#     scores = TextBlob(x.text)
#     x.polarity = scores.sentiment.polarity #Polarity == sentiment, +1 is positive, -1 is negative, 0 is a neutral sentiment.
#     return x

# @labeling_function(pre=[textblob_sentiment])
# def textblob_polarity(x):
#     if x.polarity >= 0.75:
#         return POSITIVE
#     elif x.polarity >=0 and x.polarity < 0.75:
#         return NEUTRAL
#     else:
#         return NEGATIVE

# lfs = [textblob_polarity]

In [15]:
# Convert `train` from a Series to a DataFrame (presumably for the
# PandasLFApplier attempt in the next cell - confirm; `train_df` is not used
# anywhere else in the cells shown).
train_df = train.to_frame()
type(train_df)

pandas.core.frame.DataFrame

In [16]:
# Did not work, so the labelling rule was applied manually with polarity_score() instead.
# Apply the LFS in the train dataset (without labels = y_train)
# applier = PandasLFApplier(lfs=lfs)
# L_train = applier.apply(df=X_train)

In [17]:
# Compare the weak labels (train_label_pred) with the original labels (test)

In [20]:
# Confusion matrix: rows are the original category, columns the TextBlob weak label.
df_confusion = pd.crosstab(test, train_label_pred)
df_confusion
# 12952 comments that were actually positive were classified as neutral

col_0,-1,0,1
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,8277,0,0
0,0,13042,0
1,0,12952,2878


In [21]:
#Training a model for this (SVM since there is a heavy imbalance per class)
#To replicate having no label column in the dataset initially (and we only used TextBlob to get it), I will remove the original and replace 
#with TextBlob results

In [22]:
# Remove the original labels so that only the TextBlob-derived labels remain.
# NOTE: not idempotent - re-running this cell raises KeyError once the column is gone.
df = df.drop('category', axis=1)

In [24]:
# Attach the weak labels as the new target column (the list is assigned
# positionally, one label per remaining row).
df['sentiment'] = train_label_pred

In [25]:
# Preview: `category` is gone and `sentiment` holds the weak labels.
df.head()

Unnamed: 0,clean_comment,sentiment
0,family mormon have never tried explain them t...,0
1,buddhism has very much lot compatible with chr...,0
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,0


In [26]:
# Features: the comment text; target: the TextBlob-derived sentiment label.
X = df['clean_comment']
y = df['sentiment']

In [28]:
#Build simple SVM model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

# Bag-of-words counts over word tokens, fitted on every comment in X.
vectorizer = CountVectorizer(analyzer='word')
comments = vectorizer.fit_transform(X)

# SGDClassifier is used with its default loss (hinge per the scikit-learn docs,
# i.e. a linear SVM); tol=None disables the tolerance-based stopping check.
# Wrapping it in CalibratedClassifierCV gives the model a predict_proba method,
# which is what the prediction cell further down calls.
lsvm = SGDClassifier(alpha=0.001, random_state=5, tol=None)
lsvm_calibrated = CalibratedClassifierCV(lsvm)

In [29]:
# Fit on the full dataset.
# NOTE(review): no held-out split is made in the cells shown (train_test_split
# is imported but unused), so the model is never evaluated on unseen data.
lsvm_calibrated.fit(comments, y)

In [32]:
import pickle
from pathlib import Path

# Persist the fitted classifier together with its vectorizer so that the load
# cell below also works on a fresh kernel / fresh checkout. The save is skipped
# when the file already exists, so a previously saved model is never overwritten;
# delete "SVMModel" to force a re-save after retraining.
model_path = Path("SVMModel")
if not model_path.exists():
    with model_path.open("wb") as fout:
        pickle.dump((lsvm_calibrated, vectorizer), fout)

In [33]:
#load model
# Expects a file named 'SVMModel' in the working directory containing a pickled
# (model, vectorizer) tuple.
# NOTE: pickle.load can execute arbitrary code embedded in the file - only load
# a file you trust.
# This rebinds `vectorizer` to the unpickled one, replacing the one fitted above.
with open('SVMModel', 'rb') as f:
    svm_model, vectorizer = pickle.load(f)

In [38]:
# Vectorize a sample message with the loaded vectorizer and get the class
# probabilities from the loaded model.
messageCV = vectorizer.transform(["I LOVE HAPPY DOGS THAT KISS ME"])
pred = svm_model.predict_proba(messageCV)

In [40]:
#Order of pred results = -1, 0, 1
# NOTE(review): the column order should follow svm_model.classes_ - check that
# attribute rather than relying on this comment.
pred

array([[0.02610456, 0.46377403, 0.51012142]])