## Movie Rating Prediction

In [1]:
import numpy as np
import pandas as pd


In [None]:
df = pd.read_csv('./Train/Train.csv')

In [3]:
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [4]:
df.label.value_counts()

label
pos    20011
neg    19989
Name: count, dtype: int64

### We'll need to perform some text preprocessing

- Remove HTML tags
- Make everything lowercase
- Remove special chars
- Stopword Removal
- Stemming

### Removing HTML tags

In [5]:
import re
# as per recommendation from @freylis, compile once only
CLEANR = re.compile('<.*?>') 

def cleanhtml(text):
  cleantext = re.sub(CLEANR, '',text)
  return cleantext

In [6]:
df.shape

(40000, 2)

In [7]:
df['review'] = df['review'].apply(cleanhtml)
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


### Converting the reviews in lowercase


In [8]:
def lowercase(text):
    return text.lower()


In [9]:
df['review'] = df['review'].apply(lowercase)

In [10]:
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,title: opera (1987) director: dario argento ca...,pos
3,i think a lot of people just wrote this off as...,pos
4,this is a story of two dogs and a cat looking ...,pos


### Removing Special Characters

In [11]:
def remove_special(text):
    x = ''
    for i in text:
        if i.isalnum():
            x = x + i

        else:
            x = x + ' '
        
    return x

In [12]:
df['review'] = df['review'].apply(remove_special)
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http video google com videoplay docid 211772...,pos
2,title opera 1987 director dario argento ca...,pos
3,i think a lot of people just wrote this off as...,pos
4,this is a story of two dogs and a cat looking ...,pos


### Stopword Removal

In [13]:
from nltk.corpus import stopwords

In [14]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [15]:
def remove_stopword(text):
    final_text = []
    for i in text.split():
        if i not in stopwords.words('english'):
            final_text.append(i)
    return final_text

In [16]:
df['review'] = df['review'].apply(remove_stopword)


In [17]:
df.head()

Unnamed: 0,review,label
0,"[mature, intelligent, highly, charged, melodra...",pos
1,"[http, video, google, com, videoplay, docid, 2...",pos
2,"[title, opera, 1987, director, dario, argento,...",pos
3,"[think, lot, people, wrote, another, one, tom,...",pos
4,"[story, two, dogs, cat, looking, way, back, ho...",pos


### Stemming

In [18]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


In [19]:
def stemmer(text):
    actual_text = []
    for i in text:
        actual_text.append(ps.stem(i))
    
    return actual_text


In [20]:
df['review'] = df['review'].apply(stemmer)

In [21]:
def join_back(text):
    return " ".join(text)

In [22]:
df['review'] = df['review'].apply(join_back)

In [23]:
df.head()

Unnamed: 0,review,label
0,matur intellig highli charg melodrama unbelive...,pos
1,http video googl com videoplay docid 211772166...,pos
2,titl opera 1987 director dario argento cast cr...,pos
3,think lot peopl wrote anoth one tom cruis weir...,pos
4,stori two dog cat look way back home old wise ...,pos


In [111]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1000)

In [112]:
X = cv.fit_transform(df['review']).toarray()


In [113]:
X.shape

(40000, 1000)

In [114]:
y = df.iloc[:,-1].values
y

array(['pos', 'pos', 'pos', ..., 'neg', 'pos', 'pos'], dtype=object)

## Training and Testing

In [115]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [116]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

In [117]:
gb = GaussianNB()
cb = CategoricalNB()
bb = BernoulliNB()
mb = MultinomialNB()


In [118]:
gb.fit(X_train, y_train)
cb.fit(X_train, y_train)
bb.fit(X_train, y_train)
mb.fit(X_train, y_train)

In [119]:
gb_pred = gb.predict(X_test)
bb_pred = bb.predict(X_test)
mb_pred = mb.predict(X_test)


In [120]:
from sklearn.metrics import accuracy_score
gb_accuracy = accuracy_score(y_test, gb_pred)
bb_accuracy = accuracy_score(y_test, bb_pred)
mb_accuracy = accuracy_score(y_test, mb_pred)


In [121]:
print(f"Accuracy of Gaussian : {gb_accuracy}")
print(f"Accuracy of Bernaulli : {bb_accuracy}")
print(f"Accuracy of Multinomial : {mb_accuracy}")


Accuracy of Gaussian : 0.786
Accuracy of Bernaulli : 0.836625
Accuracy of Multinomial : 0.832


### We are going with bernaulli classifier as it is giving the best possible accuracy

In [157]:
mb.fit(X, y)

### Working on Test set

In [35]:
test = pd.read_csv('Train/Test.csv')
test.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


### Applying the same text preprocessing steps on the test data

In [124]:
test['review'] = test['review'].apply(cleanhtml)

In [125]:
test['review'] = test['review'].apply(remove_special)


In [126]:
test['review'] = test['review'].apply(lowercase)


In [127]:
test['review'] = test['review'].apply(remove_stopword)


In [128]:
test['review'] = test['review'].apply(stemmer)

In [129]:
test['review'] = test['review'].apply(join_back)


In [134]:
Y = cv.fit_transform(test['review']).toarray()

In [158]:
pred = mb.predict(Y)

In [159]:
predictions = pd.DataFrame(pred,columns=['label'],index=None)

In [160]:
predictions['Id'] = predictions.index

In [161]:
predictions = predictions[['Id', 'label']]

In [162]:
predictions

Unnamed: 0,Id,label
0,0,pos
1,1,pos
2,2,neg
3,3,pos
4,4,neg
...,...,...
9995,9995,neg
9996,9996,neg
9997,9997,pos
9998,9998,pos


In [163]:
predictions.head()

Unnamed: 0,Id,label
0,0,pos
1,1,pos
2,2,neg
3,3,pos
4,4,neg


In [164]:
predictions.to_csv('submission.csv',index=False)