## Final Project: Predicting Movie MPAA content rating given the movie script

In [1]:
import numpy as np
import pandas as pd
import pylab as pl
import time
from nltk.stem.snowball import EnglishStemmer
from sklearn import datasets 
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split 
from nltk.stem.snowball import EnglishStemmer

### First gather and format all the features

In [2]:
# NOTE: set to the proper data root directory on your local machine
data_dir = "/Users/tshprecher/Workspace/mpaa_ml/data" 

# load all the features, drop 'title'
features_all = pd.read_csv(data_dir + "/movie_features.txt")
features_all = features_all.drop('title', axis=1)

# make the content rating numeric for training
features_all.content_rating = features_all.content_rating.map({'G': 0, 'PG': 1, 'PG-13': 2, 'R': 3, 'NC-17': 4})

# remove the NC-17 films since they are so rare. instead, let's just classify over G, PG, PG-13, and R
features_all = features_all[features_all['content_rating'] < 4]
features_all.head()

Unnamed: 0,content_rating,0,1,10,100,1000,10000,100_continued,100_ext,100_int,...,zippo,zips,zombie,zone,zoo,zoom,zoom_in,zooming,zooms,zooms_in
0,3,0,0,3,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3,0,2,6,7,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,9,5,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,1,6,2,1,0,0,0,0,0,...,0,1,0,2,0,1,1,0,0,0
4,3,2,1,2,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0


In [3]:
# Use unigrams for now, and perhaps explore bigrams later. Ignoring bigrams helps with having to deal with covariance
features_unigrams = features_all.filter(regex='^((content_rating)|([a-z]+))$', axis=1)
#features_bigrams = features_all.filter(regex='_', axis=1)

features_unigrams.head()

Unnamed: 0,content_rating,aback,abandon,abandoned,abandoning,abdomen,abilities,ability,ablaze,able,...,zipper,zipping,zippo,zips,zombie,zone,zoo,zoom,zooming,zooms
0,3,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,3,0,0,0,0,0,2,0,0,7,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,1,0,0,0,0,0,3,...,0,0,0,1,0,2,0,1,0,0
4,3,0,0,1,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,2


In [4]:
# stem all the words
features_unigrams_stemmed = pd.DataFrame()
features_unigrams_stemmed['content_rating'] = features_unigrams['content_rating']

seen = dict()
es = EnglishStemmer()
for col in features_unigrams.columns.tolist()[1:]:
    word = es.stem(col)
    if word not in seen:
        features_unigrams_stemmed[word] = features_unigrams[col]        
        seen[word] = True
    else:
        existing = features_unigrams_stemmed[word]
        features_unigrams_stemmed[word] = existing + features_unigrams[col]        
        
features_unigrams_stemmed.head()
features_unigrams = features_unigrams_stemmed
print(features_unigrams.shape)
features_unigrams.head()

(480, 6589)


Unnamed: 0,content_rating,aback,abandon,abdomen,abil,ablaz,abl,aboard,abort,abov,...,yup,z,zero,zip,zipper,zippo,zombi,zone,zoo,zoom
0,3,0,0,0,0,0,1,0,0,26,...,0,0,0,0,0,0,0,0,0,1
1,3,0,0,0,2,0,7,0,0,5,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,1,0,2,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,2,0,1,0,0,0,3,0,0,8,...,0,0,2,2,0,0,0,2,0,1
4,3,0,2,0,0,1,1,0,0,4,...,0,0,1,0,0,0,0,0,0,2


## Let's focus on unigrams. Find the features with the strongest correlation to the content_rating, and pick the top quartile ranked by correlation

In [5]:
features_unigrams_corr = features_unigrams.drop("content_rating", axis=1) \
    .apply(lambda x: x.corr(features_unigrams['content_rating'])) \
    .abs() \
    .sort_values(ascending=False)
    
# get the top 25% of words
features_unigrams_corr = features_unigrams_corr.head(n=int(.25 * features_unigrams_corr.size))

In [6]:
target = features_unigrams['content_rating']
features = features_unigrams[features_unigrams_corr.keys().tolist()]
features.head()

Unnamed: 0,fuck,shit,blood,ha,fli,jesus,christ,bullshit,paw,tail,...,deadlin,cow,complet,unti,folder,clumsi,fellow,bum,davi,papa
0,20,10,16,0,8,1,0,0,0,1,...,0,0,3,0,0,0,2,0,0,0
1,1,0,6,2,1,3,0,0,0,1,...,0,0,5,1,0,2,3,0,3,4
2,0,1,2,0,0,2,0,0,0,1,...,0,0,2,0,2,0,0,0,0,0
3,1,7,0,0,23,3,2,2,0,4,...,0,0,6,0,1,0,1,0,0,0
4,163,60,9,0,4,3,1,3,0,1,...,0,0,5,0,4,0,0,0,1,0


# Training classifiers

In [7]:
# define a helper function to print out results
def print_classifier_results(classifier, y_test, y_pred_test, y_train, y_pred_train):
    # print the results for the naive bayes classifier
    print("****** " + classifier + " Classifier Results *********")
    print()
    print("** TEST DATA **")
    print()
    print("Accuracy (test): {:.2f}%".format(accuracy_score(y_test, y_pred_test) * 100))
    print("\nConfusion Matrix (test):\n", confusion_matrix(y_test, y_pred_test))
    print()
    print("** TRAINING DATA **")
    print()
    print("Accuracy (training): {:.2f}%".format(accuracy_score(y_train, y_pred_train) * 100))
    print("\nConfusion Matrix (training):\n", confusion_matrix(y_train, y_pred_train))

## Train a gaussian naive bayes classifier

In [8]:
from sklearn.naive_bayes import GaussianNB

# Since we're working with quantitative and not qualitative features, use Gaussian NB
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2,random_state = 0)
nb_model = GaussianNB().fit(X_train, y_train)

# run on training set to detect overfitting
y_pred_training_nb = nb_model.predict(X_train)

# run on test set
y_pred_test_nb = nb_model.predict(X_test)

# print results
print_classifier_results("Naive Bayes", y_test, y_pred_test_nb, y_train, y_pred_training_nb)

****** Naive Bayes Classifier Results *********

** TEST DATA **

Accuracy (test): 70.83%

Confusion Matrix (test):
 [[ 0  0  1  0]
 [ 1  0  3  3]
 [ 0  0 24  9]
 [ 2  0  9 44]]

** TRAINING DATA **

Accuracy (training): 95.83%

Confusion Matrix (training):
 [[  5   0   0   0]
 [  0  28   0   0]
 [  1   0 129   4]
 [  3   1   7 206]]


## Train random forest classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators = 100, criterion = 'gini', oob_score = True, random_state=1) 
rf_model.fit(X_train, y_train)

# run on training set to detect overfitting
y_pred_training_rf = rf_model.predict(X_train)

# run on test set
y_pred_test_rf = rf_model.predict(X_test)

# print results
print_classifier_results("Random Forest", y_test, y_pred_test_rf, y_train, y_pred_training_rf)

****** Random Forest Classifier Results *********

** TEST DATA **

Accuracy (test): 71.88%

Confusion Matrix (test):
 [[ 0  0  1  0]
 [ 0  1  5  1]
 [ 0  0 18 15]
 [ 0  0  5 50]]

** TRAINING DATA **

Accuracy (training): 99.48%

Confusion Matrix (training):
 [[  4   0   0   1]
 [  0  28   0   0]
 [  0   0 133   1]
 [  0   0   0 217]]


## Train Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=1, solver='liblinear',
                          multi_class='ovr', max_iter=10000).fit(X_train, y_train)

# run on training set to detect overfitting
y_pred_training_lrm = lr_model.predict(X_train)

# run on test set
y_pred_test_lrm = lr_model.predict(X_test)

# print results
print_classifier_results("Logistic Regression", y_test, y_pred_test_lrm, y_train, y_pred_training_lrm)

****** Logistic Regression Classifier Results *********

** TEST DATA **

Accuracy (test): 67.71%

Confusion Matrix (test):
 [[ 0  0  1  0]
 [ 1  0  4  2]
 [ 0  4 22  7]
 [ 0  1 11 43]]

** TRAINING DATA **

Accuracy (training): 99.48%

Confusion Matrix (training):
 [[  4   0   0   1]
 [  0  28   0   0]
 [  0   0 133   1]
 [  0   0   0 217]]


# The Random Forest classifier is the most accurate with nearly 72% accuracy, but I consider the Naive Bayes classifier slightly better. The other two classifiers have an accuracy of over 99% on the training data. The Naive Bayes classifier has an accuracy of 95% on the training data with an accuracy of 71% on the test data. This suggests to me that the Naive Bayes classifier is the most generalizable of the three.