# Movie Ratings - Classification Model

In [1]:
# Import initial dependencies
import pandas as pd
import os
import nltk

In [2]:
# Import movies dataframe
movies_df = pd.read_csv("https://data-bootcamp-ztc.s3.amazonaws.com/movies_complete_cleaned.csv")
movies_df.head()

Unnamed: 0,name,production,director,runtime,released,year,month,country_kaggle,country_omdb,star_kaggle,...,plot,awards,score_imdb,votes_imdb,score_metacritic,budget,genre_kaggle,gross,genres_omdb,rating
0,Doctor Strange,Marvel Studios,Scott Derrickson,115,2016-11-04,2016,11,USA,USA,Benedict Cumberbatch,...,"Marvel's ""Doctor Strange"" follows the story of...",Nominated for 1 Oscar. Another 19 wins & 67 no...,7.5,348307,72.0,165000000,Action,232641920,"Action, Adventure, Fantasy, Sci-Fi",PG-13
1,Sleight,Diablo Entertainment (II),J.D. Dillard,89,2017-04-28,2016,4,USA,USA,Jacob Latimore,...,A young street magician (Jacob Latimore) is le...,3 nominations.,5.9,4012,62.0,250000,Action,3986245,"Crime, Drama, Sci-Fi",R
2,Silence,Cappa Defina Productions,Martin Scorsese,161,2017-01-13,2016,1,USA,"USA, UK, Taiwan, Japan, Mexico, Italy",Andrew Garfield,...,The story of two Catholic missionaries (Andrew...,Nominated for 1 Oscar. Another 6 wins & 56 nom...,7.2,61798,79.0,46000000,Adventure,7100177,"Drama, History",R
3,Manchester by the Sea,Amazon Studios,Kenneth Lonergan,137,2016-12-16,2016,12,USA,USA,Casey Affleck,...,"Lee Chandler is a brooding, irritable loner wh...",Won 2 Oscars. Another 127 wins & 263 nominations.,7.9,159673,96.0,8500000,Drama,47695371,Drama,R
4,Dirty Grandpa,Lionsgate,Dan Mazer,102,2016-01-22,2016,1,USA,"United States, United Kingdom",Robert De Niro,...,"Jason Kelly, the grandson of Dick Kelly, loses...",2 wins & 11 nominations,6.0,82289,21.0,27500000,Comedy,35593113,Comedy,R


## Cleaning Data for Tokenization

In [3]:
# Extract plot data
plot_df = movies_df[["name", "plot", "genre_kaggle", "rating"]]
plot_df

Unnamed: 0,name,plot,genre_kaggle,rating
0,Doctor Strange,"Marvel's ""Doctor Strange"" follows the story of...",Action,PG-13
1,Sleight,A young street magician (Jacob Latimore) is le...,Action,R
2,Silence,The story of two Catholic missionaries (Andrew...,Adventure,R
3,Manchester by the Sea,"Lee Chandler is a brooding, irritable loner wh...",Drama,R
4,Dirty Grandpa,"Jason Kelly, the grandson of Dick Kelly, loses...",Comedy,R
...,...,...,...,...
6251,Hoosiers,Based on the true story of a small-town Indian...,Drama,PG
6252,Off Beat,Traumatised paramedic looks for the woman of h...,Comedy,PG
6253,Big Trouble in Little China,"Truck driver Jack Burton arrives in Chinatown,...",Action,PG-13
6254,Biggles: Adventures in Time,One minute the New Yorker advertising expert J...,Adventure,PG


In [4]:
plot_df[plot_df["plot"].isna()]
plot_df["plot"].fillna("", inplace=True)
plot_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Unnamed: 0,name,plot,genre_kaggle,rating
0,Doctor Strange,"Marvel's ""Doctor Strange"" follows the story of...",Action,PG-13
1,Sleight,A young street magician (Jacob Latimore) is le...,Action,R
2,Silence,The story of two Catholic missionaries (Andrew...,Adventure,R
3,Manchester by the Sea,"Lee Chandler is a brooding, irritable loner wh...",Drama,R
4,Dirty Grandpa,"Jason Kelly, the grandson of Dick Kelly, loses...",Comedy,R
...,...,...,...,...
6251,Hoosiers,Based on the true story of a small-town Indian...,Drama,PG
6252,Off Beat,Traumatised paramedic looks for the woman of h...,Comedy,PG
6253,Big Trouble in Little China,"Truck driver Jack Burton arrives in Chinatown,...",Action,PG-13
6254,Biggles: Adventures in Time,One minute the New Yorker advertising expert J...,Adventure,PG


In [5]:
# Remove punctuation
import string


def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)


plot_df["plot"] = [remove_punct(x) for x in plot_df["plot"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [6]:
# Remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))


def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]

    return " ".join(text)


plot_df["plot"] = [remove_stopwords(x) for x in plot_df["plot"]]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alannahmarie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [7]:
# # Implement stemming
# from nltk.stem.porter import PorterStemmer

# stemmer = PorterStemmer()


# def stemming(text):
#     text = [stemmer.stem(word) for word in text.split()]

#     return " ".join(text)

In [8]:
# plot_df["plot"] = [stemming(x) for x in plot_df["plot"]]

## Tokenization

In [9]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

plot_df["tokens"] = plot_df["plot"].map(tokenizer.tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
plot_df[["plot", "tokens"]].head()

Unnamed: 0,plot,tokens
0,marvels doctor strange follows story talented ...,"[marvels, doctor, strange, follows, story, tal..."
1,young street magician jacob latimore left care...,"[young, street, magician, jacob, latimore, lef..."
2,story two catholic missionaries andrew garfiel...,"[story, two, catholic, missionaries, andrew, g..."
3,lee chandler brooding irritable loner works ha...,"[lee, chandler, brooding, irritable, loner, wo..."
4,jason kelly grandson dick kelly loses grandmot...,"[jason, kelly, grandson, dick, kelly, loses, g..."


In [11]:
plot_df = plot_df[plot_df["rating"] != "NC-17"]

In [12]:
# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf(data, ngrams=(1, 1)):
    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngrams)
    train = tfidf_vectorizer.fit_transform(data)

    return train, tfidf_vectorizer


train_tfidf, tfidf_vectorizer = tfidf(plot_df["plot"])
# test_tfidf = tfidf_vectorizer.transform(test["text"])

In [13]:
# Test TFIDF
[x for x in train_tfidf.todense()[0][0:].tolist()[0] if x != 0]

[0.13208518554434392,
 0.10010192923231205,
 0.110688149121315,
 0.15810799752318755,
 0.17284733473105102,
 0.1689703819261967,
 0.14336866031532408,
 0.09538473722900326,
 0.11470273586100549,
 0.09276911399057444,
 0.17284733473105102,
 0.1297200426309384,
 0.18370971913406017,
 0.32743493745710167,
 0.1423311682756979,
 0.09811315665210747,
 0.1775923414782809,
 0.11415746576218766,
 0.18370971913406017,
 0.09786150438138708,
 0.12147445341205515,
 0.18370971913406017,
 0.17284733473105102,
 0.19233167868614437,
 0.13901298394480913,
 0.18370971913406017,
 0.18370971913406017,
 0.05758307997817369,
 0.10356596870013475,
 0.09836782279409588,
 0.09190106004819884,
 0.11046072055022857,
 0.14134194951150433,
 0.07258751515350385,
 0.31963829504698205,
 0.12709146989530906,
 0.12147445341205515,
 0.1333743298546905,
 0.19233167868614437,
 0.15095312782778209,
 0.11115043621259274,
 0.13523015967037505,
 0.0830597167066226]

In [14]:
plot_df.rating.value_counts()

R        3261
PG-13    1903
PG        927
G         143
Name: rating, dtype: int64

In [15]:
no_nc_df = plot_df[plot_df["rating"] != "NC-17"]
no_nc_df.head()

Unnamed: 0,name,plot,genre_kaggle,rating,tokens
0,Doctor Strange,marvels doctor strange follows story talented ...,Action,PG-13,"[marvels, doctor, strange, follows, story, tal..."
1,Sleight,young street magician jacob latimore left care...,Action,R,"[young, street, magician, jacob, latimore, lef..."
2,Silence,story two catholic missionaries andrew garfiel...,Adventure,R,"[story, two, catholic, missionaries, andrew, g..."
3,Manchester by the Sea,lee chandler brooding irritable loner works ha...,Drama,R,"[lee, chandler, brooding, irritable, loner, wo..."
4,Dirty Grandpa,jason kelly grandson dick kelly loses grandmot...,Comedy,R,"[jason, kelly, grandson, dick, kelly, loses, g..."


In [16]:
# Check to make sure that NC-17 rows have been removed
no_nc_df.rating.value_counts()

R        3261
PG-13    1903
PG        927
G         143
Name: rating, dtype: int64

In [17]:
# Grab ratings data to apply label encoding 
ratings = no_nc_df[["name", "rating"]]
ratings

Unnamed: 0,name,rating
0,Doctor Strange,PG-13
1,Sleight,R
2,Silence,R
3,Manchester by the Sea,R
4,Dirty Grandpa,R
...,...,...
6251,Hoosiers,PG
6252,Off Beat,PG
6253,Big Trouble in Little China,PG-13
6254,Biggles: Adventures in Time,PG


In [18]:
# Import label encoder 
from sklearn import preprocessing

In [19]:
# Set label encoder method to variable for use
label_encoder = preprocessing.LabelEncoder()

In [20]:
# Encode ratigs  
ratings["encoded_rating"] = label_encoder.fit_transform(ratings["rating"]) 
ratings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,name,rating,encoded_rating
0,Doctor Strange,PG-13,2
1,Sleight,R,3
2,Silence,R,3
3,Manchester by the Sea,R,3
4,Dirty Grandpa,R,3
...,...,...,...
6251,Hoosiers,PG,1
6252,Off Beat,PG,1
6253,Big Trouble in Little China,PG-13,2
6254,Biggles: Adventures in Time,PG,1


In [21]:
ratings.encoded_rating.value_counts()

3    3261
2    1903
1     927
0     143
Name: encoded_rating, dtype: int64

In [22]:
ratings.rating.value_counts()

R        3261
PG-13    1903
PG        927
G         143
Name: rating, dtype: int64

In [23]:
# # Read in tokenized plot Dataframe 
# path = os.path.join('..','resources','cleaned_data', 'plot_features.csv')
# plots_df = pd.read_csv(path)
# plots_df.head()

In [24]:
# # Merge ratings and plot dataframes
# merged_df = ratings.merge(plots_df, on="name")
# merged_df.head()

In [25]:
target = ratings["encoded_rating"]
target_names = ["G", "PG", "PG-13", "R"]
target

0       2
1       3
2       3
3       3
4       3
       ..
6251    1
6252    1
6253    2
6254    1
6255    2
Name: encoded_rating, Length: 6234, dtype: int64

In [26]:
data = train_tfidf
data

<6234x34827 sparse matrix of type '<class 'numpy.float64'>'
	with 280250 stored elements in Compressed Sparse Row format>

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [28]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

In [29]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.564


In [30]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

           G       0.00      0.00      0.00        38
          PG       0.45      0.13      0.20       215
       PG-13       0.46      0.25      0.32       489
           R       0.59      0.89      0.71       817

    accuracy                           0.56      1559
   macro avg       0.38      0.32      0.31      1559
weighted avg       0.52      0.56      0.50      1559



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
