As a baseline for comparison, this notebook creates a Naive Bayes classifier to predict the genre of a piece of media based on its synopsis.

In [35]:
# sklearn for various machine learning algorithms
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# CountVectorizer for converting text data into a matrix of token counts
# Not the only way to do this, but a simple and common way
from sklearn.feature_extraction.text import CountVectorizer

# Pandas for working with dataframes (including the one already created by the data processing notebook)
import pandas as pd

In [36]:
# Read the one-hot encoded plot/genre table written by the data processing notebook.
# The path is relative, so the notebook must be run from the directory that
# contains the 'Datasets' folder.
dataset_path = 'Datasets/onehotplotgenre.csv'
data = pd.read_csv(dataset_path)

# Preview the first five rows as a sanity check on the columns that were loaded
preview = data.head()
print(preview)

                                    Title  \
0                             # (2012/II)   
1                               #1 (2018)   
2          #1 Cheerleader Camp (2010) (V)   
3                 #1 Serial Killer (2013)   
4  #1 at the Apocalypse Box Office (2015)   

                                                Plot  Action  Adult  \
0  The night falls on the big city and a hooded f...       0      0   
1  After reaching #1 at Mutual of New York and se...       0      0   
2  When they're hired to work at a cheerleading c...       0      0   
3  Years of seething rage against the racism he's...       0      0   
4  Jules is, self declared, the most useless pers...       0      0   

   Adventure  Animation  Biography  Comedy  Crime  Documentary  ...  \
0          0          1          0       0      0            0  ...   
1          0          0          0       0      0            1  ...   
2          0          0          0       1      0            0  ...   
3          0      

To start, we'll train a single binary Naive Bayes classifier that predicts whether or not a title belongs to the Action genre.

In [37]:
# Keep only the columns needed for the binary "Action or not" task:
# the title (for reference), the plot synopsis (the input text) and the
# one-hot Action column (the label).
actiondata = data[['Title', 'Plot', 'Action']]
print(actiondata.head())

# Fixed seed for the shuffle performed by train_test_split. Without it every
# run produces a different split, so the accuracy reported further down
# cannot be reproduced on "Restart Kernel -> Run All".
SPLIT_SEED = 42

# Hold out 20% of the rows for evaluation.
# The raw text splits get their own names (plots_train / plots_test) so that
# X_train / X_test only ever refer to the vectorized matrices.
plots_train, plots_test, y_train, y_test = train_test_split(
    actiondata['Plot'],
    actiondata['Action'],
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Create a CountVectorizer object
# This object will convert the text data into a matrix of token counts
vectorizer = CountVectorizer()

# Learn the vocabulary from the training plots ONLY (so nothing from the test
# set leaks into the features) and convert them to token counts in one pass.
# fit_transform is equivalent to calling fit() followed by transform().
X_train = vectorizer.fit_transform(plots_train)

# Encode the test plots with the vocabulary learned above
X_test = vectorizer.transform(plots_test)


                                    Title  \
0                             # (2012/II)   
1                               #1 (2018)   
2          #1 Cheerleader Camp (2010) (V)   
3                 #1 Serial Killer (2013)   
4  #1 at the Apocalypse Box Office (2015)   

                                                Plot  Action  
0  The night falls on the big city and a hooded f...       0  
1  After reaching #1 at Mutual of New York and se...       0  
2  When they're hired to work at a cheerleading c...       0  
3  Years of seething rage against the racism he's...       0  
4  Jules is, self declared, the most useless pers...       0  


In [38]:
# Build a Multinomial Naive Bayes model and train it on the token counts.
# fit() returns the estimator itself, so construction and training are chained.
actionclassifier = MultinomialNB().fit(X_train, y_train)

# Score the trained model on the held-out test split
# (for a classifier, score() reports mean accuracy)
score = actionclassifier.score(X_test, y_test)
print('Accuracy:', score)

Accuracy: 0.884546682350891


In [42]:
def action_probabilities(synopsis, vectorizer, classifier):
    """Vectorize a single synopsis and score it with a fitted classifier.

    Parameters
    ----------
    synopsis : str
        Free-text plot description to classify.
    vectorizer
        Fitted vectorizer; its ``transform`` is called with ``[synopsis]``.
    classifier
        Fitted classifier; its ``predict_proba`` is called on the
        vectorized synopsis.

    Returns
    -------
    tuple
        ``(counts, probabilities)`` - the result of
        ``vectorizer.transform([synopsis])`` and the result of
        ``classifier.predict_proba(counts)``.
    """
    # transform() expects an iterable of documents, hence the one-element list
    counts = vectorizer.transform([synopsis])
    return counts, classifier.predict_proba(counts)


# Try the classifier on two hand-written synopses.
# Each printed result is a single row with one probability per class; the
# column order follows actionclassifier.classes_ - presumably [0, 1], i.e.
# [P(not Action), P(Action)] - TODO confirm.
# As before, `plot` / `plot2` end up holding the vectorized synopsis.
plot, prediction = action_probabilities(
    "So many fuzzy kittens.", vectorizer, actionclassifier
)
print(prediction)

plot2, prediction2 = action_probabilities(
    "James Bond saves the world from evil.", vectorizer, actionclassifier
)
print(prediction2)

[[0.98720028 0.01279972]]
[[0.30579668 0.69420332]]


In [43]:
# The first two columns ('Title' and 'Plot') are not genres;
# every column after them is a one-hot genre flag.
N_NON_GENRE_COLUMNS = 2

# Collect the genre column names from the dataset
# NOTE(review): the printed list contains both 'Reality-TV' and 'Reality-tv' -
# this looks like one genre with inconsistent casing; consider merging the two
# columns in the data processing notebook.
genres = data.columns[N_NON_GENRE_COLUMNS:]
print(genres)

Index(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'Game-Show', 'Hardcore', 'History', 'Horror', 'Lifestyle', 'Music',
       'Musical', 'Mystery', 'News', 'Reality-TV', 'Reality-tv', 'Romance',
       'Sci-Fi', 'Sex', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War',
       'Western'],
      dtype='object')
