As a baseline for comparison, this notebook creates a Naive Bayes classifier to predict the genre of a piece of media based on its synopsis.

In [24]:
# sklearn for various machine learning algorithms
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# CountVectorizer for converting text data into a matrix of token counts
# Not the only way to do this, but a simple and common way
from sklearn.feature_extraction.text import CountVectorizer

# Pandas for working with dataframes (including the one already created by the data processing notebook)
import pandas as pd

In [25]:
# Load the data
data = pd.read_csv('Datasets/onehotplotgenre.csv')

# Print the first few rows of the data
print(data.head())

                                    Title  \
0                             # (2012/II)   
1                               #1 (2018)   
2          #1 Cheerleader Camp (2010) (V)   
3                 #1 Serial Killer (2013)   
4  #1 at the Apocalypse Box Office (2015)   

                                                Plot  Action  Adult  \
0  The night falls on the big city and a hooded f...       0      0   
1  After reaching #1 at Mutual of New York and se...       0      0   
2  When they're hired to work at a cheerleading c...       0      0   
3  Years of seething rage against the racism he's...       0      0   
4  Jules is, self declared, the most useless pers...       0      0   

   Adventure  Animation  Biography  Comedy  Crime  Documentary  ...  News  \
0          0          1          0       0      0            0  ...     0   
1          0          0          0       0      0            1  ...     0   
2          0          0          0       1      0            0  ...    

To start, we'll work on training a single Naive Bayes classifier - action movie or not.

In [26]:
# Extract just the title, plot, and Action columns
# This is the data that we will use to train the action classifier
actiondata = data[['Title', 'Plot', 'Action']]
print(actiondata.head())

# Split the data into training and testing sets
# The training set will be used to train the classifier
# The testing set will be used to evaluate the classifier
X_train, X_test, y_train, y_test = train_test_split(actiondata['Plot'], actiondata['Action'], test_size=0.2, random_state=42)

# Create a CountVectorizer object
# This object will convert the text data into a matrix of token counts
vectorizer = CountVectorizer()

# Fit the vectorizer to the training data
# This step determines which words are in the vocabulary
vectorizer.fit(X_train)

# Transform the training data using the vectorizer
# The data is transformed into a matrix of token counts
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)


                                    Title  \
0                             # (2012/II)   
1                               #1 (2018)   
2          #1 Cheerleader Camp (2010) (V)   
3                 #1 Serial Killer (2013)   
4  #1 at the Apocalypse Box Office (2015)   

                                                Plot  Action  
0  The night falls on the big city and a hooded f...       0  
1  After reaching #1 at Mutual of New York and se...       0  
2  When they're hired to work at a cheerleading c...       0  
3  Years of seething rage against the racism he's...       0  
4  Jules is, self declared, the most useless pers...       0  


In [27]:
# Create a Multinomial Naive Bayes classifier
actionclassifier = MultinomialNB()

# Train the classifier
actionclassifier.fit(X_train, y_train)

# Evaluate the classifier
score = actionclassifier.score(X_test, y_test)
print('Accuracy:', score)

Accuracy: 0.8836161149154563


In [28]:
# Predict if a new plot is an action movie
plot = "So many fuzzy kittens."

# Convert the plot into a matrix of token counts
plot = vectorizer.transform([plot])

# Predict if the plot is an action movie
prediction = actionclassifier.predict_proba(plot)

# Print the prediction
print(prediction)

# Predict if a new plot is an action movie
plot2 = "James Bond saves the world from evil."

# Convert the plot into a matrix of token counts
plot2 = vectorizer.transform([plot2])

# Predict if the plot is an action movie
prediction2 = actionclassifier.predict_proba(plot2)

# Print the prediction
print(prediction2)

[[0.97074974 0.02925026]]
[[0.30885402 0.69114598]]


In [29]:
# Get the list of genres from the dataset
genres = data.columns[2:]
print(genres)

# Create a dictionary to store the classifiers for each genre
classifiers = {}

Index(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
       'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV',
       'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War',
       'Western'],
      dtype='object')


In [30]:
# Train a classifier for each genre
for genre in genres:
    print('Training classifier for:', genre)
    # If the genre has already been trained, skip it
    if genre in classifiers:
        continue
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(data['Plot'], data[genre], test_size=0.2, random_state=42)
    
    # Create a Multinomial Naive Bayes classifier
    classifier = MultinomialNB()

    # Use the vectorizer to convert the text data into a matrix of token counts
    X_train = vectorizer.transform(X_train)
    
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Store the classifier in the dictionary
    classifiers[genre] = classifier
    print('Done training classifier for:', genre)

Training classifier for: Action
Done training classifier for: Action
Training classifier for: Adult
Done training classifier for: Adult
Training classifier for: Adventure
Done training classifier for: Adventure
Training classifier for: Animation
Done training classifier for: Animation
Training classifier for: Biography
Done training classifier for: Biography
Training classifier for: Comedy
Done training classifier for: Comedy
Training classifier for: Crime
Done training classifier for: Crime
Training classifier for: Documentary
Done training classifier for: Documentary
Training classifier for: Drama
Done training classifier for: Drama
Training classifier for: Family
Done training classifier for: Family
Training classifier for: Fantasy
Done training classifier for: Fantasy
Training classifier for: History
Done training classifier for: History
Training classifier for: Horror
Done training classifier for: Horror
Training classifier for: Music
Done training classifier for: Music
Training c

In [31]:
# Evaluate the classifiers
scores = {}
for genre, classifier in classifiers.items():
    # Use the vectorizer to convert the text data into a matrix of token counts
    X_test = vectorizer.transform(data['Plot'])
    
    # Evaluate the classifier
    score = classifier.score(X_test, data[genre])
    scores[genre] = score
    print('Accuracy for', genre, ':', score)

Accuracy for Action : 0.8943100069205762
Accuracy for Adult : 0.9898284976534013
Accuracy for Adventure : 0.8984274589854087
Accuracy for Animation : 0.9390727590999761
Accuracy for Biography : 0.8096347215195027
Accuracy for Comedy : 0.8102017435199562
Accuracy for Crime : 0.8964646905222999
Accuracy for Documentary : 0.8568516612290711
Accuracy for Drama : 0.7538368488697361
Accuracy for Family : 0.9120156323603817
Accuracy for Fantasy : 0.9184069880372897
Accuracy for History : 0.8333507801641166
Accuracy for Horror : 0.9239928816930405
Accuracy for Music : 0.8920709969700671
Accuracy for Musical : 0.9597065443062268
Accuracy for Mystery : 0.9093288204197707
Accuracy for News : 0.9025478188553716
Accuracy for Reality-TV : 0.9856965065629162
Accuracy for Romance : 0.8169711138638333
Accuracy for Sci-Fi : 0.9383748858686486
Accuracy for Short : 0.7912864711458497
Accuracy for Sport : 0.9671069083634292
Accuracy for Talk-Show : 0.9924629691016627
Accuracy for Thriller : 0.8486487429558

In [32]:
# Write the scores to a file, alongside the number of movies in each genre
with open('scores.txt', 'w') as f:
    for genre, score in scores.items():
        num_movies = data[genre].sum()
        f.write(genre + ': ' + str(score) + ' (' + str(num_movies) + ')\n')