In [1]:
import pandas as pd
import numpy as np

import timeit
import time
import ast
import json
import joblib
from collections import Counter

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, accuracy_score, f1_score, classification_report
from sentence_transformers import SentenceTransformer
import xgboost as xgb


# Preprocessing

In [2]:
df = pd.read_csv('../data/movies_metadata.csv')
df.shape

  df = pd.read_csv('../data/movies_metadata.csv')


(45466, 24)

In [3]:
df = df[['overview', 'genres']]
# df = df.head(5000)
df.dropna(inplace=True)

# Clean up the genres column
df['genres'] = df['genres'].apply(ast.literal_eval) \
                          .apply(lambda x: [i['name'] for i in x])


In [4]:
counter = Counter([genre for sublist in df['genres'] for genre in sublist])
counter

Counter({'Drama': 20023,
         'Comedy': 12806,
         'Thriller': 7586,
         'Romance': 6673,
         'Action': 6565,
         'Horror': 4660,
         'Crime': 4269,
         'Documentary': 3886,
         'Adventure': 3470,
         'Science Fiction': 3028,
         'Family': 2732,
         'Mystery': 2451,
         'Fantasy': 2290,
         'Animation': 1920,
         'Foreign': 1599,
         'Music': 1588,
         'History': 1379,
         'War': 1310,
         'Western': 1035,
         'TV Movie': 751,
         'Carousel Productions': 1,
         'Vision View Entertainment': 1,
         'Telescene Film Group Productions': 1,
         'Aniplex': 1,
         'GoHands': 1,
         'BROSTA TV': 1,
         'Mardock Scramble Production Committee': 1,
         'Sentai Filmworks': 1,
         'Odyssey Media': 1,
         'Pulser Productions': 1,
         'Rogue State': 1,
         'The Cartel': 1})

In [5]:
# Filter out rare genres
GENRE_FREQ_THRESHOLD = 10
popular_genres = [genre for genre, count in counter.items() if count >= GENRE_FREQ_THRESHOLD]

df['genres'] = df['genres'].apply(lambda genres: [genre for genre in genres if genre in popular_genres])

# Drop rows with no popular genre
df['genres'] = df['genres'].apply(lambda genres: genres if len(genres) > 0 else np.nan)
df.dropna(subset=['genres'], inplace=True)

In [6]:
# One hot encoding of genres
mlb = MultiLabelBinarizer()
df_genres = pd.DataFrame(mlb.fit_transform(df.pop('genres')),
                          columns=mlb.classes_,
                          index=df.index)

joblib.dump(mlb, '../models/mlb.joblib')
mlb.classes_

array(['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
       'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
       'TV Movie', 'Thriller', 'War', 'Western'], dtype=object)

In [7]:
df = df.join(df_genres)
df.to_csv('../data/processed_movies_metadata.csv', index=False)

# Model training

In [8]:
df = pd.read_csv('../data/processed_movies_metadata.csv')

X = df['overview']
y = df.drop(['overview'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reset indices so that SentenceTransformer encode doesn't complain
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [9]:
def train_model(X_train, y_train, encoder, multilabeler, estimator):
    if encoder == 'TfIdf':
        encoder_model = TfidfVectorizer().fit(X_train)
    elif encoder == 'SentenceTransformer':
        encoder_model = SentenceTransformer('all-MiniLM-L6-v2')
        encoder_model.transform = encoder_model.encode
    
    X_encoded = encoder_model.transform(X_train)
    classifier_model = multilabeler(estimator=estimator)
    classifier_model.fit(X_encoded, y_train)
    return (encoder_model, classifier)


In [12]:
# for encoder in ['TfIdf', 'SentenceTransformer']:
#     for multilabeler in [OneVsRestClassifier, MultiOutputClassifier]:
#         for estimator in [DecisionTreeClassifier, RandomForestClassifier, 
#                           MultinomialNB, LogisticRegression, xgb.XGBClassifier]:
#             if encoder == 'SentenceTransformer' and estimator == MultinomialNB:
#                 continue

for encoder in ['TfIdf', 'SentenceTransformer']:
    multilabeler = OneVsRestClassifier
    for estimator in [LogisticRegression, xgb.XGBClassifier]:
        start = time.time()
        print(encoder, multilabeler.__name__, estimator.__name__)
        encoder_model, classifier_model = train_model(X_train, y_train, 
                                               encoder, multilabeler, estimator())
        y_pred = classifier_model.predict(encoder_model.transform(X_test))
        report = classification_report(y_test, y_pred, output_dict=True)
        print("F1 Score: %.4f"%(report['micro avg']['f1-score']))
        print("Hamming Loss: %.4f"%(hamming_loss(y_test, y_pred)))
        stop = time.time()
        print('time taken: %.2fs' %(stop-start))
        print()


TfIdf OneVsRestClassifier LogisticRegression
(33859, 65531)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1 Score: 0.4589
time taken: 14.43s

TfIdf OneVsRestClassifier XGBClassifier
(33859, 65531)
F1 Score: 0.4456
time taken: 125.03s



In [None]:
joblib.dump(encoder_model, '../models/encoder_model.joblib')
joblib.dump(classifier_model, '../models/classifier_model.joblib')


In [None]:
# for encoder in ['TfIdf', 'SentenceTransformer']:
#     for multilabeler in [OneVsRestClassifier, MultiOutputClassifier]:
#         for estimator in [DecisionTreeClassifier, RandomForestClassifier, 
#                           MultinomialNB, LogisticRegression, xgb.XGBClassifier]:
#             if encoder == 'SentenceTransformer' and estimator == MultinomialNB:
#                 continue
#             start = time.time()
#             print(encoder, multilabeler.__name__, estimator.__name__)
#             encoder_model, genre_clf = train_model(X_train, y_train, 
#                                                    encoder, multilabeler, estimator())
#             y_pred = genre_clf.predict(encoder_model.transform(X_test))

#             # Calculate metrics
# #             print("Hamming Loss: %.4f"%(hamming_loss(y_test, y_pred)))
# #             print("Accuracy Score: %.4f"%(accuracy_score(y_test, y_pred)))
#             print("F1 Score: %.4f"%(f1_score(y_test, y_pred, average='micro')))  # average can be: micro, macro, weighted, samples
# #             report = classification_report(y_test, y_pred, output_dict=True)
# #             print("Micro F1 Score: %.4f"%(report['micro avg']['f1-score']))  # average can be: micro, macro, weighted, samples
# #             print("Macro F1 Score: %.4f"%(report['macro avg']['f1-score']))  # average can be: micro, macro, weighted, samples
# #             print("Weighted F1 Score: %.4f"%(report['weighted avg']['f1-score']))  # average can be: micro, macro, weighted, samples

# #             print("Classification Report:\n", classification_report(y_test, y_pred))

#             stop = time.time()
#             duration = stop - start
#             print('time taken: %.2fs' %(duration))
#             print()
