# Train a Sentiment Analysis Classifier

In [2]:
# load data and take a quick look
import pandas as pd
import numpy as np
import re, string, nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/venkatasandeep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venkatasandeep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
raw_data = pd.read_csv('training_data.csv')
raw_data.head()

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,Enjoy the opening credits. They're the best th...,neg
1,1,"Well, the Sci-Fi channel keeps churning these ...",neg
2,2,It takes guts to make a movie on Gandhi in Ind...,pos
3,3,The Nest is really just another 'nature run am...,neg
4,4,Waco: Rules of Engagement does a very good job...,pos


In [4]:
#checking for null data
raw_data.isnull().sum()

if('Unnamed: 0' in raw_data.columns):
    del raw_data['Unnamed: 0']

raw_data.sentiment.value_counts()

raw_data['sentiment'] = raw_data.sentiment.map(lambda x: int(1) if x =='pos' else int(0))

raw_data.head()

Unnamed: 0,text,sentiment
0,Enjoy the opening credits. They're the best th...,0
1,"Well, the Sci-Fi channel keeps churning these ...",0
2,It takes guts to make a movie on Gandhi in Ind...,1
3,The Nest is really just another 'nature run am...,0
4,Waco: Rules of Engagement does a very good job...,1


In [5]:
negate_handle = {
"aren\'t" : "are not",
"can\'t" : "cannot",
"couldn\'t" : "could not",
"didn\'t" : "did not",
"doesn\'t" : "does not",
"don\'t" : "do not",
"hadn\'t" : "had not",
"hasn\'t" : "has not",
"haven\'t" : "have not",
"he\'d" : "he would",
"he\'ll" : "he will",
"he\'s" : "he is",
"i\'d" : "I would",
"i\'d" : "I had",
"i\'ll" : "I will",
"i\'m" : "I am",
"isn\'t" : "is not",
"it\'s" : "it is",
"it\'ll":"it will",
"i\'ve" : "I have",
"let\'s" : "let us",
"mightn\'t" : "might not",
"mustn\'t" : "must not",
"shan\'t" : "shall not",
"she\'d" : "she would",
"she\'ll" : "she will",
"she\'s" : "she is",
"shouldn\'t" : "should not",
"that\'s" : "that is",
"there\'s" : "there is",
"they\'d" : "they would",
"they\'ll" : "they will",
"they\'re" : "they are",
"they\'ve" : "they have",
"we\'d" : "we would",
"we\'re" : "we are",
"weren\'t" : "were not",
"we\'ve" : "we have",
"what\'ll" : "what will",
"what\'re" : "what are",
"what\'s" : "what is",
"what\'ve" : "what have",
"where\'s" : "where is",
"who\'d" : "who would",
"who\'ll" : "who will",
"who\'re" : "who are",
"who\'s" : "who is",
"who\'ve" : "who have",
"won\'t" : "will not",
"wouldn\'t" : "would not",
"you\'d" : "you would",
"you\'ll" : "you will",
"you\'re" : "you are",
"you\'ve" : "you have",
"\'re": " are",
"wasn\'t": "was not",
"we\'ll":" will",
"didn\'t": "did not"
}

In [6]:
# check the size of the data and its class distribution
all_text = raw_data['text'].tolist()
all_lables = raw_data['sentiment'].tolist()

print('entry num', len(all_text))
print('num of pos entries', len([l for l in all_lables if l==1]))
print('num of neg entries', len([l for l in all_lables if l==0]))

#all_text

entry num 40000
num of pos entries 20000
num of neg entries 20000


['Enjoy the opening credits. They\'re the best thing about this second-rate but inoffensive time-killer which features passable performances from the likes of Eric Roberts and Martin Kove. The main part, however, goes to newcomer Tommy Lee Thomas who looks a bit diminutive for this kind of action but who, nevertheless, occasionally manages to project a banty-rooster kind of belligerence. The first time we see him he\'s bare-chested, sweaty, and engaged in that favorite "beefcake" activity -- chopping wood. After this he has seven more scenes without his shirt including one in which he\'s hanged by his wrists and zapped with electricity a la Mel Gibson in "Lethal Weapon." He could use a better script, however, since the manner in which he exposes the truth about corruption and violence inside the prison is never very convincing. There\'s also talk about millions of dollars which apparently is tied in with this investigation but which is never explained. There are a few pluses, though. S

In [6]:
#all_text[25]
#ratings = [re.findall('(\d{1,2}[\/]\d{1,2})', x) for x in all_text]
#print(ratings)

In [7]:
# text cleaning and preprocessing:

def text_preprocessor(data):
    
    data = [re.sub(re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'), '', line) for line in data]
    data = [re.sub(r'[^a-zA-Z\s]', ' ', line) for line in data]
    
    # strip whitespace and converting to lower case
    lower_data = [line.strip().lower() for line in data]
    
    # Replace apostrophes with words
    processed_data = []
    for line in lower_data:
        line = line.replace("-", " ")
        ref_words = [negate_handle[word] if word in negate_handle else word for word in line.split()]
        processed_data.append(" ".join(ref_words))
        
    #remove punctuation
    processed_data = [line.translate(str.maketrans('', '', string.punctuation)) for line in processed_data] 
    
    # removing stopwords
    stops = set(stopwords.words("english"))
    processed_data = [" ".join([word for word in line.split() if word not in stops]) for line in processed_data]
    
    return processed_data

def stem_lemmatize(data):
    # stemming
    # stemmer= PorterStemmer()
    # processed_data = [" ".join([stemmer.stem(word) for word in line.split()]) for line in data]
    
    # lemmatization
    lemmatizer=WordNetLemmatizer()
    processed_data = [" ".join([lemmatizer.lemmatize(word) for word in line.split()]) for line in data]
    
    return processed_data
 
processed_data = text_preprocessor(all_text)
processed_data = stem_lemmatize(processed_data)

In [8]:
processed_data[0]

'enjoy opening credit best thing second rate inoffensive time killer feature passable performance like eric robert martin kove main part however go newcomer tommy lee thomas look bit diminutive kind action nevertheless occasionally manages project banty rooster kind belligerence first time see bare chested sweaty engaged favorite beefcake activity chopping wood seven scene without shirt including one hanged wrist zapped electricity la mel gibson lethal weapon could use better script however since manner expose truth corruption violence inside prison never convincing also talk million dollar apparently tied investigation never explained plus though sending john woodrow undercover john wilson amusing play presidential name co star jody ross nolan show promise inmate early proceeding shown hanged wrist getting punched burly guard one final note movie low budget painfully responsible lack extra despite impressive size prison seems hold inmate note cast credit end helpful record burly bald 

In [9]:
raw_data['text'] = processed_data

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import precision_recall_fscore_support,accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.preprocessing import MaxAbsScaler

In [11]:
train_text, test_text, train_labels, test_labels = train_test_split(raw_data['text'], raw_data['sentiment'], test_size=0.25, random_state=2908)

In [12]:
# tf-idf initialization
train_vectorizer = TfidfVectorizer(max_features=5000,use_idf=True)
train_vecs = train_vectorizer.fit_transform(train_text)
test_vecs = TfidfVectorizer(max_features=5000,vocabulary=train_vectorizer.vocabulary_).fit_transform(test_text)

tfidf_feature_names = train_vectorizer.get_feature_names()

In [13]:
#train_vectorizer.get_feature_names()
#df = pd.DataFrame(train_vecs[5].T.todense(), index=train_vectorizer.get_feature_names(), columns=["tf-idf"])
#df.sort_values(by=["tfidf"],ascending=False)

In [14]:
# CountVectorizer initialization
count_vect = CountVectorizer(max_features=5000)
X_train_vectorized = count_vect.fit_transform(train_text)
X_test_vectorized = CountVectorizer(max_features=5000,vocabulary=count_vect.vocabulary_).fit_transform(test_text)

cv_feature_names = count_vect.get_feature_names()

In [15]:
train_vecs.shape

(30000, 5000)

In [16]:
# Create hyperparameter options

log_hyperparameters = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000], penalty=['l1', 'l2'])
log_clf = GridSearchCV(LogisticRegression(), log_hyperparameters, cv=10, verbose=0)

In [17]:
# training: tf-idf + logistic regression
# train model

best_model = log_clf.fit(train_vecs, train_labels)
best_penalty = best_model.best_estimator_.get_params()['penalty']
best_c = best_model.best_estimator_.get_params()['C']
print('Best Penalty:', best_penalty)
print('Best C:', best_c)

best_clf = LogisticRegression(C=best_c,penalty=best_penalty).fit(train_vecs, train_labels)

# test model
test_pred = best_clf.predict(test_vecs)

acc = accuracy_score(test_labels, test_pred)
print('accuracy', acc)

# confusion matrix
print(confusion_matrix(test_labels, test_pred))

# classification_report
print(classification_report(test_labels, test_pred))

Best Penalty: l2
Best C: 1
accuracy 0.8849
[[4412  623]
 [ 528 4437]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      5035
           1       0.88      0.89      0.89      4965

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.89      0.88      0.88     10000



In [18]:
weighted_dict = {}
for (i,w) in enumerate(tfidf_feature_names):
    weighted_dict[w] = abs(best_clf.coef_[0][i])

sorted_weighted_dict = {k: v for k, v in sorted(weighted_dict.items(), key=lambda item: item[1], reverse = True)}

print("Non Zero weights:",np.count_nonzero(best_clf.coef_))
df = pd.DataFrame([sorted_weighted_dict])
df.iloc[:,0:10]

Non Zero weights: 5000


Unnamed: 0,worst,waste,bad,awful,great,excellent,boring,poor,terrible,nothing
0,9.505981,7.745703,7.319295,7.002033,6.814151,6.490482,6.202134,5.440128,5.284875,5.040415


In [19]:
# training: CountVectorizer + logistic regression
# train model

best_model = log_clf.fit(train_vecs, train_labels)
best_penalty = best_model.best_estimator_.get_params()['penalty']
best_c = best_model.best_estimator_.get_params()['C']
print('Best Penalty:', best_penalty)
print('Best C:', best_c)

clf = LogisticRegression(C=best_c,penalty=best_penalty).fit(X_train_vectorized, train_labels)

# test model
test_pred = clf.predict(X_test_vectorized)
acc = accuracy_score(test_labels, test_pred)
print('accuracy', acc)

# confusion matrix
print(confusion_matrix(test_labels, test_pred))

# classification_report
print(classification_report(test_labels, test_pred))

Best Penalty: l2
Best C: 1
accuracy 0.8663
[[4367  668]
 [ 669 4296]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      5035
           1       0.87      0.87      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [20]:
weighted_dict = {}
for (i,w) in enumerate(cv_feature_names):
    weighted_dict[w] = abs(clf.coef_[0][i])

sorted_weighted_dict = {k: v for k, v in sorted(weighted_dict.items(), key=lambda item: item[1], reverse = True)}

print("Non Zero weights:",np.count_nonzero(clf.coef_))
df = pd.DataFrame([sorted_weighted_dict])
df.iloc[:,0:10]


Non Zero weights: 5000


Unnamed: 0,waste,forgettable,sensitive,mst,disappointment,unwatchable,worst,tedious,disappoint,subtle
0,2.618997,2.429935,2.281061,2.203693,2.173855,2.030097,1.998058,1.931919,1.92398,1.91087


In [21]:
# training: tf-idf + Naive Bayes
# train model

clf = naive_bayes.MultinomialNB()
clf.fit(train_vecs, train_labels)

# test model
test_pred = clf.predict(test_vecs)
acc = accuracy_score(test_labels, test_pred)
print('accuracy', acc)

# confusion matrix
print(confusion_matrix(test_labels, test_pred))

# classification_report
print(classification_report(test_labels, test_pred))

accuracy 0.8549
[[4294  741]
 [ 710 4255]]
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      5035
           1       0.85      0.86      0.85      4965

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [22]:
# training: CountVectorizer + Naive Bayes
# train model

clf = naive_bayes.MultinomialNB()
clf.fit(X_train_vectorized, train_labels)

# test model
test_pred = clf.predict(X_test_vectorized)
acc = accuracy_score(test_labels, test_pred)
print('accuracy', acc)

# confusion matrix
print(confusion_matrix(test_labels, test_pred))

# classification_report
print(classification_report(test_labels, test_pred))

accuracy 0.8491
[[4318  717]
 [ 792 4173]]
              precision    recall  f1-score   support

           0       0.85      0.86      0.85      5035
           1       0.85      0.84      0.85      4965

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



## SAVE YOUR TRAINED MODEL
After you have found the best model, save your trained model and other necessary components (e.g. vocabulary, vectorizer) to a file. I will load your model from the saved file and apply your trained model on some held-out test data. **At submission time, you are supposed to submit the saved model file, and I will NOT re-run your code to train your model; instead, I will directly use your trained model to run test (see notebook *cw1-test.ipybn*)**. 

Below is a sample code for saving the model (and other necessary components) obtained above, using the *pickle* package in Python. *You should adjust the code to save the necessary components for re-running your model!*

In [23]:
import pickle

# save model and other necessary modules
all_info_want_to_save = {
    'model': best_clf,
    'vectorizer': TfidfVectorizer(max_features=5000,vocabulary=train_vectorizer.vocabulary_),
    'negate_handle' : negate_handle,
    'feature_names' : tfidf_feature_names
    #'text_preprocessor' : text_preprocessor(test_text)
}
with open("sample_trained_model.pickle", "wb") as f:
    pickle.dump(all_info_want_to_save, f)


In *cw1-test.ipynb*, I provide a sample code to illustrate how to re-load your saved model and apply it to some test data. 