In [156]:
import re
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.pipeline import Pipeline
from string import punctuation

In [195]:
nlp = spacy.load('en')
dataset = [('I like the movie ' , 'positive') , 
          ("It's a good movie .Nice story" , 'positive') , 
          ("Hero's acting is bat but heroine looks good . Overall nice movie" , 'positive'),
          ('Overall nice movie' , 'postive'),
          ('Nice songs .But sadly boring ending.' , 'negative'),
          ('sad movie , boring movie' , 'negative')]

In [196]:
dataset = pd.DataFrame(dataset)

In [197]:
dataset.columns = ['Text' , 'Reviews']

In [198]:
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = list(STOP_WORDS)
punct = list(punctuation)

In [199]:
dataset.head()

Unnamed: 0,Text,Reviews
0,I like the movie,positive
1,It's a good movie .Nice story,positive
2,Hero's acting is bat but heroine looks good . ...,positive
3,Overall nice movie,postive
4,Nice songs .But sadly boring ending.,negative


In [200]:
def change_into_tokens(text):
    doc = nlp(text)
    tokens = []
    cleaned_text=[]
    
    for token in doc:
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
        
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_text.append(token)
            
    return cleaned_text

In [201]:
change_into_tokens('Nice movie. I like the story')

['nice', 'movie', 'like', 'story']

In [202]:
X = dataset['Text']
y = dataset['Reviews']

In [203]:
from sklearn.model_selection import train_test_split

In [204]:
X_train , X_test  , y_train , y_test = train_test_split(X , y , test_size=0.25 , random_state=42)

X_train

5                             sad movie , boring movie
2    Hero's acting is bat but heroine looks good . ...
4                 Nice songs .But sadly boring ending.
3                                   Overall nice movie
Name: Text, dtype: object

# Now use the pipeline

In [205]:
# fitting naive bayes to the training set 
from sklearn.naive_bayes import GaussianNB , MultinomialNB
from sklearn.metrics import confusion_matrix 

In [206]:
vectorizer = TfidfVectorizer(tokenizer = change_into_tokens)
classifier = MultinomialNB()

In [207]:
#use the pipeline now 

clf = Pipeline([('vect', vectorizer ), ('clf', classifier)])

In [208]:
clf.fit(X_train , y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function change_into_tokens at 0x000001D62B96E168>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False

In [209]:
y_pred = clf.predict(X_test)

In [210]:
cm = confusion_matrix(y_test , y_pred)

In [211]:
cm

array([[0, 0],
       [2, 0]], dtype=int64)

In [212]:
from sklearn.metrics import classification_report , accuracy_score

In [213]:
print(classification_report(y_test , y_pred))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       0.0
    positive       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



In [214]:
print(accuracy_score(y_test ,y_pred))

0.0
