### Create an NLP Pipeline to 'Clean' Reviews Data
- Load Input File and Read Reviews
- Tokenize
- Remove Stopwords
- Perform Stemming
- Write cleaned data to output

In [113]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from nltk import sent_tokenize,word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
# Shared text-processing helpers used by clean_review.
tokenizer = RegexpTokenizer(r'\w+')  # keeps runs of word characters, so punctuation is dropped
en_stopwords = set(stopwords.words('english'))  # a set, so the stopword filter does fast membership tests
ps = PorterStemmer()  # stemmer applied to every token that survives the stopword filter

In [34]:
def clean_review(review):
    """Normalise one raw review into a string of stemmed, stopword-free tokens.

    Steps: lowercase the text, replace HTML ``<br />`` line breaks with
    spaces, tokenize with the module-level ``tokenizer``, drop tokens found
    in ``en_stopwords`` and stem the remaining tokens with ``ps``.

    Args:
        review: Raw review text (a ``str``).

    Returns:
        The cleaned review: stemmed tokens joined by single spaces.
    """
    # Replace every break tag, not only doubled ones; a lone "<br />" would
    # otherwise survive tokenization as the junk token "br".
    text = review.lower().replace("<br />", " ")
    tokens = tokenizer.tokenize(text)
    stemmed_words = [ps.stem(w) for w in tokens if w not in en_stopwords]
    # Local name deliberately differs from the function name to avoid shadowing it.
    return ' '.join(stemmed_words)

In [35]:
# Example: clean the training review stored under index label 0.
# NOTE(review): x_data is loaded in a later cell of this export — run that cell first.
clean_review(x_data.review[0])

'matur intellig highli charg melodrama unbelivebl film china 1948 wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take'

In [126]:
# Load the training split and prepare raw reviews plus numeric labels.
x_data = pd.read_csv("../data/movie_rating_prediction/Train.csv")
le = LabelEncoder()
x_reviews = x_data.review.values
# Encode only the label column. Applying the encoder to the whole frame
# would also label-encode every review text, which is wasted work and
# produces values that are never used.
rev_type = le.fit_transform(x_data.label)
# LabelEncoder numbers classes in sorted order, so for "neg"/"pos" labels
# neg = 0 and pos = 1 in rev_type.

In [172]:
# Read the test split and keep only the raw review texts as an array.
x_test = pd.read_csv("../data/movie_rating_prediction/Test.csv")["review"].values

In [146]:
# Clean every training review into a NEW array. Writing the results back
# into the array returned by `.values` would overwrite the raw reviews held
# by x_data (the array can share memory with the frame) and fails outright
# when pandas hands back a read-only array under copy-on-write.
x_cleaned_reviews = np.array(
    [clean_review(review) for review in x_data.review.values], dtype=object
)
l = x_cleaned_reviews.shape[0]  # number of training reviews

In [173]:
# Clean the test reviews into a fresh object array instead of assigning
# element by element into the array obtained from `.values`, which may be
# read-only under pandas copy-on-write.
x_test = np.array([clean_review(review) for review in x_test], dtype=object)
l2 = x_test.shape[0]  # number of test reviews

In [111]:
# Wrap the cleaned training reviews in a one-column DataFrame named "review".
x_cleaned_reviews = pd.DataFrame(data=x_cleaned_reviews, columns=["review"])

In [131]:
# Persist the cleaned reviews so the cleaning step does not have to be
# repeated on every run.
x_cleaned_reviews.to_csv(
    path_or_buf="../data/movie_rating_prediction/cleaned_data.csv",
    index=False,
)

In [177]:
# Build plain Python lists of cleaned reviews for the vectorizer.
# Every available review is used instead of a hard-coded count, so the
# lists always match the size of the loaded data.
if isinstance(x_cleaned_reviews, pd.DataFrame):
    # The cleaned reviews may have been wrapped in a one-column DataFrame;
    # integer indexing on a frame would look up a column label, not a row.
    x_reviews_list = x_cleaned_reviews["review"].tolist()
else:
    x_reviews_list = list(x_cleaned_reviews)
x_test_list = list(x_test)

### Vectorization

In [215]:
# Bag-of-words features: token counts per review.
cv = CountVectorizer()
# Learn the vocabulary from the training reviews and vectorize them.
x_vec = cv.fit_transform(x_reviews_list)
# Vectorize the test reviews with the SAME fitted vectorizer (transform only, no refit).
x_test_vec = cv.transform(x_test_list)

In [None]:
# Display the shapes of the test and train feature matrices side by side.
(x_test_vec.shape, x_vec.shape)

### Multinomial Naive Bayes

In [193]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

In [216]:
# Instantiate both Naive Bayes classifiers with their default settings.
mnb, bnb = MultinomialNB(), BernoulliNB()

In [221]:
# Train each classifier on the vectorized training reviews, then predict
# the test set. fit() returns the estimator itself, so the calls chain.
x_test_pred_mnb = mnb.fit(x_vec, rev_type).predict(x_test_vec)
x_test_pred_bnb = bnb.fit(x_vec, rev_type).predict(x_test_vec)

BernoulliNB()

In [223]:
# Map numeric predictions back to string labels (truthy -> "pos",
# falsy -> "neg"). Iterating over the prediction arrays themselves covers
# every prediction, whatever the test-set size, instead of a fixed 10000.
rev_test_type_mnb = ["pos" if pred else "neg" for pred in x_test_pred_mnb]
rev_test_type_bnb = ["pos" if pred else "neg" for pred in x_test_pred_bnb]

In [224]:
# One-column "label" frame per model, with the row index named "Id" so it
# is written out as the Id column of each CSV.
ans_df_mnb = pd.DataFrame({"label": rev_test_type_mnb}).rename_axis("Id")
ans_df_bnb = pd.DataFrame({"label": rev_test_type_bnb}).rename_axis("Id")

# Write the predictions of each model to its own file.
ans_df_mnb.to_csv("../data/movie_rating_prediction/ans_mnb.csv")
ans_df_bnb.to_csv("../data/movie_rating_prediction/ans_bnb.csv")

In [236]:
# Class probabilities from the multinomial model for a single test review (row 5).
# NOTE(review): column order presumably follows mnb.classes_ — confirm before interpreting.
mnb.predict_proba(x_test_vec[5])

array([[9.99998157e-01, 1.84254866e-06]])

In [238]:
# Accuracy of the multinomial model on the same data it was fitted on
# (x_vec / rev_type), i.e. training accuracy — likely optimistic as an
# estimate of performance on unseen reviews.
mnb.score(x_vec,rev_type)

0.89035

In [239]:
# Accuracy of the Bernoulli model on the same data it was fitted on
# (x_vec / rev_type), i.e. training accuracy — likely optimistic as an
# estimate of performance on unseen reviews.
bnb.score(x_vec,rev_type)

0.885725