## Sentiment Analysis using Spacy for Amazon and IMDB Review Dataset.

## The dataset has two classes:
## Class 0 : Bad Review
## Class 1 : Positive Review

In [1]:
import pandas as pd
import spacy

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
data_imdb = pd.read_csv('imdb_labelled.txt', sep='\t', header=None)


In [4]:
data_imdb

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [5]:
columns_name = ['Review', 'Sentiment']
data_imdb.columns = columns_name

In [6]:
data_imdb

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [7]:
data_amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)

In [8]:
data_amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [9]:
data_amazon.columns = columns_name

In [10]:
data_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [11]:
data_amazon.shape

(1000, 2)

In [12]:
new_data = data_imdb.append([data_amazon], ignore_index=True)
new_data.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [13]:
new_data.shape

(1748, 2)

In [14]:
new_data['Sentiment'].value_counts()

1    886
0    862
Name: Sentiment, dtype: int64

In [15]:
new_data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [16]:
## Tokenization.

In [17]:
import string

In [18]:
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
import spacy

nlp = spacy.load('en_core_web_sm')


In [20]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
len(stopwords)

326

In [21]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens = []
    
    for token in doc:
        if token.lemma_ != "-PRON-": # Performing Sentence Lemmatization if it is not pronoun.
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        
        tokens.append(temp) # Lemmatized and converted text into lowercase.
        
        # Clean tokens.
        cleaned_tokens = []
        
        for token in tokens:
            if token not in stopwords and token not in punct:
                cleaned_tokens.append(token)
        return cleaned_tokens
        
        
            

In [22]:
# TF-IDF 

In [23]:
from sklearn.svm import LinearSVC

tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)
classifier = LinearSVC()

In [24]:
X = new_data['Review']
y = new_data['Sentiment']


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

In [26]:
# Creating a Pipeline to perform TFidf and classifier

clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])



In [27]:
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x000001E45852B700>)),
                ('clf', LinearSVC())])

In [28]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.98      0.68       256
           1       0.89      0.12      0.21       269

    accuracy                           0.54       525
   macro avg       0.70      0.55      0.44       525
weighted avg       0.71      0.54      0.44       525



In [29]:
# Testing an example with our trained model.

clf.predict(['Bad Product Bad food']) # 0 Means Negative Review.

array([0], dtype=int64)

In [30]:
# Testing an example with our trained model.

clf.predict(['Good Product']) # 1 Means Positive Review.

array([1], dtype=int64)