In [105]:
import pandas as pd
import re
import numpy as np
import string
import nltk
import textblob
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Shenyiyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Shenyiyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Shenyiyang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<font size=5 > Read data </font>

In [111]:
train_raw_full = pd.read_pickle('train.pkl')
train_raw_text = train_raw_full['text']

train_emb_full = pd.read_pickle('train_emb.pkl')
train_emb = pd.DataFrame(train_emb_full.TFIDF.to_list())
train_emb_label = train_emb_full['Sentiment']

dev_raw = pd.read_pickle('dev.pkl')
dev_raw_text = dev_raw['text']

dev_emb_full = pd.read_pickle('dev_emb.pkl')
dev_emb = pd.DataFrame(dev_emb_full.TFIDF.to_list())
dev_emb_label = dev_emb_full['Sentiment']

test_raw_full = pd.read_pickle('test.pkl')
test_raw_text = test_raw_full['text']

test_emb_full = pd.read_pickle('test_emb.pkl')
test_emb = pd.DataFrame(test_emb_full.TFIDF.to_list())

unlabel_raw_full = pd.read_pickle('unlabeled.pkl')
unlabel_raw_text = unlabel_raw_full['text']

unlabel_emb_full = pd.read_pickle('unlabeled_emb.pkl')
unlabel_emb = pd.DataFrame(unlabel_emb_full.TFIDF.to_list())
unlabel_emb_label = unlabel_emb_full['Sentiment']

# train_raw_text
# train_emb
# unlabel_emb


<font size=5 > Feature Engineering </font>

In [84]:
# Feature Engineering

def process_text(text):
    
#      remove _TWITTER-ENTITY_
    text = re.sub(r'_TWITTER-ENTITY_', '', text)
    
#     remove any url
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags = re.MULTILINE)
    
#     remove symbols
    text = re.sub(r'\@\w+|\#', '', text)
      
#     remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    
#     transfer text to lowercase
    text = text.lower()
    
    return text
    
    
train_raw_text = train_raw_text.apply(process_text)

# train_raw_text

dev_raw_text = dev_raw_text.apply(process_text)

# dev_raw_text

test_raw_text = test_raw_text.apply(process_text)
# test_raw_text

unlabel_raw_text = unlabel_raw_text.apply(process_text)
# unlabel_raw_text


0                                  whats school these days
1        so he got in some trouble and him and his fagg...
2                 happy gilmore is the greatest movie ever
3        a day like this and my dad cancels netflix why...
4        its so dead at work  i shouldnt even have to b...
                               ...                        
99995              thats prolly who stole my fuckn mk belt
99996    getting my car services without my dad or  is ...
99997                                               nf  fb
99998    kinley is having a milk watching the race that...
99999    i swear one day me and  will do the red nose t...
Name: text, Length: 100000, dtype: object

In [85]:
stop_words = stopwords.words('english')

train_raw_text_nostopwords = train_raw_text.str.split()
train_raw_text_nostopwords = train_raw_text_nostopwords.apply(lambda x: [word for word in x if word not in (stop_words)])

dev_raw_text_nostopwords = dev_raw_text.str.split()
dev_raw_text_nostopwords = dev_raw_text_nostopwords.apply(lambda x: [word for word in x if word not in (stop_words)])

test_raw_text_nostopwords = test_raw_text.str.split()
test_raw_text_nostopwords = test_raw_text_nostopwords.apply(lambda x: [word for word in x if word not in (stop_words)])

unlabel_raw_text_nostopwords = unlabel_raw_text.str.split()
unlabel_raw_text_nostopwords = unlabel_raw_text_nostopwords.apply(lambda x: [word for word in x if word not in (stop_words)])


# strmming
ps = PorterStemmer()

train_raw_text_stem = train_raw_text_nostopwords.apply(lambda x: [ps.stem(t) for t in x])

dev_raw_text_stem = dev_raw_text_nostopwords.apply(lambda x: [ps.stem(t) for t in x])

test_raw_text_stem = test_raw_text_nostopwords.apply(lambda x: [ps.stem(t) for t in x])

unlabel_raw_text_stem = unlabel_raw_text_nostopwords.apply(lambda x: [ps.stem(t) for t in x])


# lemmatizer
lemm = WordNetLemmatizer()

train_raw_text_final = train_raw_text_stem.apply(lambda x: [lemm.lemmatize(t, pos = 'a') for t in x])
train_text_final = train_raw_text_final.apply(', '.join)
train_text_final = train_text_final.replace(',','', regex = True)
train_text_final = train_text_final.to_frame()

dev_raw_text_final = dev_raw_text_stem.apply(lambda x: [lemm.lemmatize(t, pos = 'a') for t in x])
dev_raw_text_final = dev_raw_text_final.apply(', '.join)
dev_raw_text_final = dev_raw_text_final.replace(',','', regex = True)
dev_raw_text_final = dev_raw_text_final.to_frame()

test_raw_text_final = test_raw_text_stem.apply(lambda x: [lemm.lemmatize(t, pos = 'a') for t in x])
test_raw_text_final = test_raw_text_final.apply(', '.join)
test_raw_text_final = test_raw_text_final.replace(',','', regex = True)
test_raw_text_final = test_raw_text_final.to_frame()

unlabel_raw_text_final = unlabel_raw_text_stem.apply(lambda x: [lemm.lemmatize(t, pos = 'a') for t in x])
unlabel_raw_text_final = unlabel_raw_text_final.apply(', '.join)
unlabel_raw_text_final = unlabel_raw_text_final.replace(',','', regex = True)
unlabel_raw_text_final = unlabel_raw_text_final.to_frame()



Unnamed: 0,text
0,what school day
1,got troubl faggot friend cri babi band made song
2,happi gilmor great movi ever
3,day like dad cancel netflix whyyyy
4,dead work shouldnt even
...,...
99995,that prolli stole fuckn mk belt
99996,get car servic without dad terrifi idk saywhatnow
99997,nf fb
99998,kinley milk watch race that best commerci ever...


In [87]:
# create function to get the subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

# create function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

df_train_text_sp = pd.DataFrame()
df_train_text_sp['subjectivity'] = train_text_final['text'].apply(getSubjectivity)
df_train_text_sp['polarity'] = train_text_final['text'].apply(getPolarity)

# append the vlaue to the feature of train_emb dataset
train_emb['384'] = df_train_text_sp['subjectivity']
train_emb['385'] = df_train_text_sp['polarity']


df_dev_text_sp = pd.DataFrame()
df_dev_text_sp['subjectivity'] = dev_raw_text_final['text'].apply(getSubjectivity)
df_dev_text_sp['polarity'] = dev_raw_text_final['text'].apply(getPolarity)

dev_emb['384'] = df_dev_text_sp['subjectivity']
dev_emb['385'] = df_dev_text_sp['polarity']


df_test_text_sp = pd.DataFrame()
df_test_text_sp['subjectivity'] = test_raw_text_final['text'].apply(getSubjectivity)
df_test_text_sp['polarity'] = test_raw_text_final['text'].apply(getPolarity)

test_emb['384'] = df_test_text_sp['subjectivity']
test_emb['385'] = df_test_text_sp['polarity']


df_unlabel_text_sp = pd.DataFrame()
df_unlabel_text_sp['subjectivity'] = unlabel_raw_text_final['text'].apply(getSubjectivity)
df_unlabel_text_sp['polarity'] = unlabel_raw_text_final['text'].apply(getPolarity)

unlabel_emb['384'] = df_unlabel_text_sp['subjectivity']
unlabel_emb['385'] = df_unlabel_text_sp['polarity']



<font size=5 > Models and Evaluation</font>

In [104]:
# baseline model

GaussianNB_model = GaussianNB()
GaussianNB_model.fit(train_emb, train_emb_label)
GaussianNB_predict = GaussianNB_model.predict(dev_emb)

acc_score = accuracy_score(GaussianNB_predict, dev_emb_label)

print('Base model Accuracy result:', acc_score)
print('---------------------------------------------------------------------------------')
print(classification_report(GaussianNB_predict, dev_emb_label))
print('---------------------------------------------------------------------------------')

# logistic Regression model

LR_model = LogisticRegression()
LR_model.fit(train_emb, train_emb_label)
LR_predict = LR_model.predict(dev_emb)

LR_acc_score = accuracy_score(LR_predict, dev_emb_label)

print('logistic Regression Accuracy result:', LR_acc_score)
print('---------------------------------------------------------------------------------')
print(classification_report(LR_predict, dev_emb_label))
print('---------------------------------------------------------------------------------')

# Radom Forest and Decision Tree

dtc_model = DecisionTreeClassifier(random_state=0)
rfc_model = RandomForestClassifier(random_state=0)
dtc = dtc_model.fit(train_emb, train_emb_label)
rfc = rfc_model.fit(train_emb, train_emb_label)
dtc_predict = dtc_model.predict(dev_emb)
rfc_predict = rfc_model.predict(dev_emb)

dtc_score = dtc.score(dev_emb, dev_emb_label)
rfc_score = rfc.score(dev_emb, dev_emb_label)

print("Single Tree Accuracy score:{}".format(dtc_score))
print('---------------------------------------------------------------------------------')
print(classification_report(dtc_predict, dev_emb_label))
print('---------------------------------------------------------------------------------')
print("Random Forest Accuracy score:{}".format(rfc_score))
print(classification_report(rfc_predict, dev_emb_label))
print('---------------------------------------------------------------------------------')


Base model Accuracy result: 0.61525
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

    negative       0.70      0.60      0.65      2339
    positive       0.53      0.64      0.58      1661

    accuracy                           0.62      4000
   macro avg       0.62      0.62      0.61      4000
weighted avg       0.63      0.62      0.62      4000

---------------------------------------------------------------------------------
logistic Regression Accuracy result: 0.6995
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

    negative       0.68      0.71      0.69      1922
    positive       0.72      0.69      0.71      2078

    accuracy                           0.70      4000
   macro avg       0.70      0.70      0.70      4000
weighted avg       0.70      0.70      0.70      4000

--------------------

<font size=5 > Improve Models </font>

In [79]:
# adjust parameters

params = {'C':[0.0001, 1, 100, 1000],
          'max_iter':[1, 10, 100, 500],
          'class_weight':['balanced', None],
          'solver':['liblinear','sag','lbfgs','newton-cg']
         }
lr = LogisticRegression()
clf = GridSearchCV(lr, param_grid=params, cv=10)
clf.fit(dev_emb, dev_emb_label)
clf.best_params_

{'C': 1, 'class_weight': 'balanced', 'max_iter': 10, 'solver': 'lbfgs'}

In [83]:
# new_lr = LogisticRegression(**clf.best_params_)
# new_lr.fit(train_emb, train_emb_label)
# new_predict = new_lr.predict(dev_emb)
# new_acc_score = accuracy_score(dev_emb_label, new_predict)
# print('new logistic Regression Accuracy result:', new_acc_score)

<font size=5 > semi-supervised learning </font>

In [134]:
semi_train_emb = pd.concat([train_emb, unlabel_emb]).reset_index(drop = True)
replace_label = pd.Series([-1]*100000)
semi_train_emb_label = pd.concat([train_emb_label, replace_label]).reset_index(drop = True)

new_lr = LogisticRegression()
semi_model = SelfTrainingClassifier(new_lr, threshold = 0.6)
semi_model.fit(semi_train_emb, semi_train_emb_label)
semi_predict = semi_model.predict(dev_emb)

semi_acc_score = accuracy_score(semi_predict, dev_emb_label)

print('Semi-supervised Accuracy result:', semi_acc_score)
print('---------------------------------------------------------------------------------')
print(classification_report(semi_predict, dev_emb_label))
print('---------------------------------------------------------------------------------')


Semi-supervised Accuracy result: 0.70175
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

    negative       0.67      0.71      0.69      1877
    positive       0.73      0.69      0.71      2123

    accuracy                           0.70      4000
   macro avg       0.70      0.70      0.70      4000
weighted avg       0.70      0.70      0.70      4000

---------------------------------------------------------------------------------


<font size=5 > Predict the test dataset </font>

In [122]:
new_lr = LogisticRegression()
semi_model = SelfTrainingClassifier(new_lr, threshold = 0.6)
semi_model.fit(semi_train_emb, semi_train_emb_label)
semi_predict = semi_model.predict(test_emb)
predict_result = pd.DataFrame(semi_predict)
predict_result.to_csv('predict.csv', header = 'Category', index = 'Id')

In [None]:
# rfc_model = RandomForestClassifier(random_state=0)
# semi_model_2 = SelfTrainingClassifier(rfc_model, threshold = 0.6)
# semi_model_2.fit(semi_train_emb, semi_train_emb_label)
# semi_predict_2 = semi_model_2.predict(test_emb)
# predict_result_2 = pd.DataFrame(semi_predict_2)
# predict_result_2.to_csv('predict_2.csv', header = 'Category', index = 'Id')

In [125]:
new_lr = LogisticRegression()
semi_model = SelfTrainingClassifier(new_lr, threshold = 0.9)
semi_model.fit(semi_train_emb, semi_train_emb_label)
semi_predict = semi_model.predict(test_emb)
predict_result = pd.DataFrame(semi_predict)
predict_result.to_csv('predict_3.csv', header = 'Category', index = 'Id')

In [131]:
rfc_model_new = RandomForestClassifier(random_state=0)
rfc_new = rfc_model_new.fit(train_emb, train_emb_label)
rfc_predict_new = rfc_new.predict(test_emb)
predict_result_7 = pd.DataFrame(rfc_predict_new)
predict_result_7.to_csv('predict_7.csv', header = 'Category', index = 'Id')

In [132]:
rfc_model = RandomForestClassifier(random_state=0)
semi_model_3 = SelfTrainingClassifier(rfc_model, threshold = 0.9)
semi_model_3.fit(semi_train_emb, semi_train_emb_label)
semi_predict_3 = semi_model_3.predict(test_emb)
predict_result_8 = pd.DataFrame(semi_predict_3)
predict_result_8.to_csv('predict_8.csv', header = 'Category', index = 'Id')