In [20]:
import spacy
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report , accuracy_score , confusion_matrix

In [3]:
df = pd.read_csv('moviereviews.tsv' , sep='\t') 

In [4]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [5]:
#get shape
df.shape

(2000, 2)

In [6]:
df[df['label'] == 'neg'].shape

(1000, 2)

# first split the data

In [7]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [8]:
X = df['review'].apply(lambda x: np.str(x)) #changing unicode to str
y = df['label']

In [9]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.3,shuffle=True , random_state=42)

In [10]:
X_train.head() , X_train.shape

(836     as a devout atheist and an avowed believer in ...
 575     the summer movie season is always the biggest ...
 557      " the 44 caliber killer has struck again . " ...
 1235    the crown jewel of 1970's irwin allen disaster...
 1360     " my name is jack carter , and you don't want...
 Name: review, dtype: object, (1400,))

In [11]:
X_test.head()  , X_test.shape

(1860    only a year after the initial release of " scr...
 353     susan granger's review of " songcatcher " in t...
 1333    the laserman : somehow the title of writer-dir...
 905     there have been merchant-ivory costume dramas ...
 1289    it seemed wholly appropriate that at a weekend...
 Name: review, dtype: object, (600,))

In [12]:
#now change the text into tokens
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en')
stopwords = list(STOP_WORDS)
import string
punct = string.punctuation

In [13]:
def change_into_tokens(text):
    doc = nlp(text)
    cleaned_text = []
    
    tokens = []
    
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
        
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_text.append(token)    
    return cleaned_text

In [14]:
change_into_tokens('hi how are you')

['hi']

In [21]:
#using vectorizer

tfidf= TfidfVectorizer(tokenizer=change_into_tokens)

classifier = RandomForestClassifier()
classifier2 = MultinomialNB()

In [22]:
clf = Pipeline([('tfidf' , tfidf) , ('clf' , classifier2)])

In [23]:
clf.fit(X_train , y_train) #adding vocabulary to our model (fit means add vocabulary and data to classify the test set)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function change_into_tokens at 0x0000016CB26728B8>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=Fals

In [24]:
y_pred  = clf.predict(X_test)

In [25]:
print(accuracy_score(y_pred , y_test))

0.79
