### Importing Necessary Libraries



In [47]:
import spacy
from spacy import displacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC
import string

## Loading SpaCy's small english model

In [48]:
# Loading Spacy small model as nlp
nlp = spacy.load("en_core_web_sm")

## Gathering all the Stop words which does not convey much meaning in the Sentiment

In [49]:
# Gathering all the stopwords
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(len(stopwords))

326


In [50]:
# Loading train dataset
data_train = pd.read_csv("train.csv",header = None, sep=",", on_bad_lines='skip')
data_train=data_train.iloc[:, 0:2] 
data_train.head()

Unnamed: 0,0,1
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [51]:
# Adding column names to the dataframe
columnName = ['Review','Sentiment']
data_train.columns = columnName
data_train.head()

Unnamed: 0,Review,Sentiment
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## So here we can deduce that Sentiment 1 is Positive and 0 is negative

In [52]:
print(data_train.shape)

(40000, 2)


In [53]:
# Adding test dataset and adding its column name
data_test = pd.read_csv("test.csv", sep=',', header= None)
data_test.columns = columnName
data_test.head()

Unnamed: 0,Review,Sentiment
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [54]:
print(data_test.shape)

(5000, 2)


In [55]:
# Adding valid dataset and adding its column name
data_valid = pd.read_csv("valid.csv",sep=',', header= None)
data_valid.columns = columnName
data_valid.head()

Unnamed: 0,Review,Sentiment
0,It's been about 14 years since Sharon Stone aw...,0
1,someone needed to make a car payment... this i...,0
2,The Guidelines state that a comment must conta...,0
3,This movie is a muddled mish-mash of clichés f...,0
4,Before Stan Laurel became the smaller half of ...,0


In [56]:
print(data_valid.shape)

(5000, 2)


## Appending all the Datasets

In [57]:
# Merging all the three dataframes
data = data_train.append([data_test, data_valid], ignore_index=True)
print(data.shape)
data

(50000, 2)


  data = data_train.append([data_test, data_valid], ignore_index=True)


Unnamed: 0,Review,Sentiment
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
49995,"Man, I loved this movie! This really takes me ...",1
49996,Recovery is an incredibly moving piece of work...,1
49997,"You can take the crook out of the joint, but i...",1
49998,FUTZ is the only show preserved from the exper...,1


In [58]:
# Sentiment ditribution in the dataset
data.Sentiment.value_counts()

0    25000
1    25000
Name: Sentiment, dtype: int64

In [59]:
# Getting information regarding the null entries in the dataset
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [60]:
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [61]:
def dataCleaning(sentence):
  doc = nlp(sentence)
  tokens = []
  for token in doc:
    if token.lemma_ != '-PRON-':
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_
    tokens.append(temp)
  clean_tokens = []
  for token in tokens:
    if token not in punct and token not in stopwords:
      clean_tokens.append(token)
  return clean_tokens

## Here after passing a particular sentence in dataCleaning method we are returned with relevant words which contribute to the sentiments

In [66]:
# Spillting the train and test data
X = data['Review']
y = data['Sentiment']
# print(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print(X_train.shape,y_test.shape)
# clm = ['Review']
# X_train.columns = clm
# X_train.head()

(40000,) (10000,)


## Preparing Model

In [18]:
# Creating the model and pipeline
tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = LinearSVC()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

In [19]:
# Training the model
out=pipe.fit(X_train,y_train)

In [20]:
# Testing on the test dataset
y_pred = out.predict(X_test)

In [21]:
# Printing the classification report and the confusion matrix
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4976
           1       0.88      0.90      0.89      5024

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000




[[4378  598]
 [ 478 4546]]


## Testing on the Random Manual Examples

**Here '1' represent that the input is positive sentiment**

In [22]:
# Testing on random inputs
pipe.predict(["very nice but not upto the mark"])

array([1], dtype=int64)

**Here '0' represent that input is negative sentiment**

In [23]:
pipe.predict(["event"])

array([0], dtype=int64)

In [25]:
# Saving our model
import joblib
joblib.dump(out, 'model.pkl')
print("Model dumped!")

Model dumped!


In [26]:
# Load the model that we just saved
mp = joblib.load('model.pkl')

In [29]:
mp.predict(["this is bad"])

array([0], dtype=int64)

In [43]:
model_columns= list(X_train)
joblib.dump(model_columns, 'model_columns.pkl')
print("Models columns dumped!")

Models columns dumped!
