# Movie Reviews

In [15]:
import pickle
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, cross_val_score


with open("reviews", "rb") as pkl_reviews: data = pickle.load(pkl_reviews)

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [16]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

def lower_all(text): return text.lower()

data["clean_text"] = data['reviews'].apply(remove_punctuations)
data["clean_text"] = data['clean_text'].apply(lower_all)
print(data.head())

  target                                            reviews  \
0    neg  plot : two teen couples go to a church party ,...   
1    neg  the happy bastard's quick movie review \ndamn ...   
2    neg  it is movies like these that make a jaded movi...   
3    neg   " quest for camelot " is warner bros . ' firs...   
4    neg  synopsis : a mentally unstable man undergoing ...   

                                          clean_text  
0  plot  two teen couples go to a church party  d...  
1  the happy bastards quick movie review \ndamn t...  
2  it is movies like these that make a jaded movi...  
3    quest for camelot  is warner bros   first fe...  
4  synopsis  a mentally unstable man undergoing p...  


## Bag-of-Words modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a Bag-of-Word representation of the texts.

In [30]:
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(data["clean_text"])


model = MultinomialNB()
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = model
results = cross_val_score(clf, X_bow, data["target"], cv=k_fold, n_jobs=1)
print("Accuracies with cross_validation:")
print(results)
print(f"Mean accuracy: {results.mean()}")

Accuracies with cross_validation:
[0.81  0.87  0.825 0.81  0.795 0.83  0.835 0.81  0.76  0.8  ]
Mean accuracy: 0.8145000000000001


## N-gram modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a 2-gram Bag-of-Word representation of the texts.

In [31]:
vectorizer = TfidfVectorizer(ngram_range=(2, 2))
X_bow = vectorizer.fit_transform(data["clean_text"])
X_bow_sparse = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names())
print(X_bow_sparse.head())



   00 am  00 feet  00 for  00 if  00 showing  00 sunday  00 wasnt  000 000  \
0    0.0      0.0     0.0    0.0         0.0        0.0       0.0      0.0   
1    0.0      0.0     0.0    0.0         0.0        0.0       0.0      0.0   
2    0.0      0.0     0.0    0.0         0.0        0.0       0.0      0.0   
3    0.0      0.0     0.0    0.0         0.0        0.0       0.0      0.0   
4    0.0      0.0     0.0    0.0         0.0        0.0       0.0      0.0   

   000 and  000 at  ...  zwick thinks  zwicks 1994  zwicks courage  \
0      0.0     0.0  ...           0.0          0.0             0.0   
1      0.0     0.0  ...           0.0          0.0             0.0   
2      0.0     0.0  ...           0.0          0.0             0.0   
3      0.0     0.0  ...           0.0          0.0             0.0   
4      0.0     0.0  ...           0.0          0.0             0.0   

   zwicks favorite  zwicks latest  zwicks the  zwigoffs brilliant  \
0              0.0            0.0        

In [32]:
model = MultinomialNB()
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = model
results = cross_val_score(clf, X_bow, data["target"], cv=k_fold, n_jobs=1)
print("Accuracies with cross_validation:")
print(results)
print(f"Mean accuracy: {results.mean()}")

Accuracies with cross_validation:
[0.805 0.855 0.805 0.78  0.835 0.88  0.865 0.865 0.8   0.81 ]
Mean accuracy: 0.8299999999999998


⚠️ Please push the exercise once you are done 🙃

## 🏁 