Sentiment Analysis using Bag of Words ( words ) + Bag of Words ( character ) + TF-IDF (word+char) + Logistic Regression 

*F1 - score of 89% on validation set*

In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import textblob

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import re
import pickle
from scipy.sparse import hstack

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, make_scorer,roc_curve, roc_auc_score

import os
print(os.listdir("../input/"))

['testData.tsv', 'sampleSubmission.csv', 'labeledTrainData.tsv', 'unlabeledTrainData.tsv']


In [2]:
# Load Data
print("Loading data...")
train = pd.read_csv('../input/labeledTrainData.tsv', sep="\t")
print("Train shape:", train.shape)
test = pd.read_csv('../input/testData.tsv', sep="\t")
print("Test shape:", test.shape)

sample = pd.read_csv('../input/sampleSubmission.csv', sep=",")

Loading data...
Train shape: (25000, 3)
Test shape: (25000, 2)


In [3]:
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
sample.head()

Unnamed: 0,id,sentiment
0,12311_10,0
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0


In [5]:
train['sentiment'].value_counts() # balanced dataset

1    12500
0    12500
Name: sentiment, dtype: int64

In [6]:
# Check the first review
print('The first review is:\n\n',train["review"][0])

The first review is:

 With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film b

In [7]:
# all the words in the test set that also occurs in the train set
from sklearn.feature_extraction.text import CountVectorizer

cv1 = CountVectorizer()
cv1.fit(train["review"])

cv2 = CountVectorizer()
cv2.fit(test["review"])

print("Train Set Vocabulary Size:", len(cv1.vocabulary_))
print("Test Set Vocabulary Size:", len(cv2.vocabulary_))
print("Number of Words that occur in both:", len(set(cv1.vocabulary_.keys()).intersection(set(cv2.vocabulary_.keys()))))

Train Set Vocabulary Size: 74849
Test Set Vocabulary Size: 73822
Number of Words that occur in both: 46776


In [8]:
# clean description
print("Cleaning train data...\n")
train['review'] = train['review'].map(lambda x: BeautifulSoup(x).get_text())
print("Cleaning test data...")
test['review'] = test['review'].map(lambda x: BeautifulSoup(x).get_text())

Cleaning train data...

Cleaning test data...


In [9]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [10]:
# clean description
train['review'] = train['review'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))
test['review'] = test['review'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [11]:
y = train['sentiment']

In [12]:
# Bag of Words (word based)
ctv_word = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',min_df = 200, max_features=5000,
            ngram_range=(1,2), stop_words = 'english')

# Fitting CountVectorizer to both training and test sets
ctv_word.fit(list(train['review']) + list(test['review']))
train_ctv_word =  ctv_word.transform(train['review']) 
test_ctv_word = ctv_word.transform(test['review'])

In [13]:
# Bag of words (charater based)
ctv_char = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode',analyzer='char',
    stop_words='english', ngram_range=(2, 6), max_features=10000)

# Fitting CountVectorizer to both training and test sets
ctv_char.fit(list(train['review']) + list(test['review']))
train_ctv_char =  ctv_char.transform(train['review']) 
test_ctv_char = ctv_char.transform(test['review'])

In [14]:
# TF - IDF (words)

tfv_word = TfidfVectorizer(min_df=150,  max_features= 5000, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1,2),
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv_word.fit(list(train['review']) + list(test['review']))
train_tfv_word =  tfv_word.transform(train['review'])
test_tfv_word = tfv_word.transform(test['review'])

In [15]:
# TF-IDF(char)

tfv_char = TfidfVectorizer(sublinear_tf=True,strip_accents='unicode',analyzer='char',
    stop_words='english',ngram_range=(2, 6),max_features=10000)
tfv_char.fit(list(train['review']) + list(test['review']))
train_tfv_char = tfv_char.transform(train['review'])
test_tfv_char = tfv_char.transform(test['review'])

In [16]:
# bag of words for training set (words + char)
train_bow = hstack([train_ctv_word, train_ctv_char])
test_bow = hstack([test_ctv_word, test_ctv_char])

# TF-IDF for test set (words + char)
train_tfidf = hstack([train_tfv_word, train_tfv_char])
test_tfidf = hstack([test_tfv_word, test_tfv_char])

In [17]:
# let's check cross validation score of the model
# cv score acts a unbiased estimate of models accuracy on unseen data

mod1 = MultinomialNB()

In [18]:
print(cross_val_score(mod1, train_bow, y, cv=3, scoring=make_scorer(f1_score)))

[0.83948864 0.84285029 0.84180387]


In [19]:
## Naive Bayes 2 - tfidf is giving higher CV score
print(cross_val_score(mod1, train_tfidf, y, cv=3, scoring=make_scorer(f1_score)))

[0.84241359 0.84772296 0.85149218]


In [51]:
clf_lr = LogisticRegression() # Logistic Regression Model

In [41]:
## 5-fold cross validation
print(cross_val_score(clf_lr, train_tfidf, y, cv=5, scoring=make_scorer(f1_score)))



[0.88523941 0.88779914 0.87996804 0.88554455 0.87939599]


In [52]:
clf_lr.fit(train_tfidf,y)

# Make predictions on test data
preds = clf_lr.predict(test_tfidf)



In [53]:
# Make submission

sample['sentiment'] = preds
sample.to_csv("sub_lr.csv")