In [3]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("data/IMDb movies.csv")
df.head(5)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [4]:
def read_vectors(file: str):
    result = {}

    with open(file, "r") as fd:
        for line in fd:
            line = line.strip()
            film_id = line[:9]
            vector = np.array(eval(line[11:]))
            result[film_id] = vector

    return result

In [13]:
# Load film embeddings

# fasttext description embeddings
id_to_ft = read_vectors("fasttext-vecs")

# fasttext desciprions + PCA actors embeddings
id_to_ftac = read_vectors("concated_vectors")

# PCA actors embeddings
id_to_ac = read_vectors("pca_vectors")

In [19]:
# Build dataset to run classification on
from typing import Dict, Tuple

common_film_keys = list(id_to_genre.keys() & id_to_ft.keys() & id_to_ftac.keys() & id_to_ac.keys())
id_to_genre = {row["imdb_title_id"]: row["genre"].split(",")[0] for i, row in df.iterrows()}
class_to_genre = list(set(id_to_genre.values()))
genre_to_class = {k: v for v, k in enumerate(class_to_genre)}

def get_x_y_from_vecs(vecs: Dict[str, np.array]) -> Tuple[np.array, np.array]:
    vectors = []
    classes = []
    
    for key in common_film_keys:
        vectors.append(vecs[key])
        classes.append(genre_to_class[id_to_genre[key]])
    
    return np.array(vectors), np.array(classes)

print(f"Films in dataset {len(common_film_keys)}")

Films in dataset 82755


In [75]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from collections import Counter


print(f"X shape: {X.shape}")
print(f"Class balance: {Counter(id_to_genre.values())}")

def train_and_eval_model(model, X, y):
    # Split 80% train, 20% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    # Can be changed 
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")
    f1 = f1_score(y_test, y_pred, average="micro")
#     Not working, fix if you want to
#     roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")

    print(f"{acc = }")
    print(f"{precision= }")
    print(f"{recall = }")
    print(f"{f1 = }")
#     print(f"{roc_auc = }")

X shape: (82755, 96)
Class balance: Counter({'Drama': 25386, 'Comedy': 24297, 'Action': 12182, 'Crime': 5557, 'Horror': 5113, 'Adventure': 3599, 'Animation': 2108, 'Biography': 2067, 'Thriller': 1375, 'Romance': 767, 'Family': 638, 'Western': 617, 'Mystery': 593, 'Fantasy': 502, 'Sci-Fi': 417, 'Musical': 324, 'War': 107, 'History': 80, 'Music': 77, 'Film-Noir': 29, 'Sport': 16, 'Adult': 2, 'Documentary': 2})


In [76]:
from sklearn.tree import DecisionTreeClassifier

print("Decision tree with description fasttext")
X, y = get_x_y_from_vecs(id_to_ft)
model = DecisionTreeClassifier(random_state=42)
train_and_eval_model(model, X, y)

Decision tree with description fasttext
acc = 0.21950335327170564
precision= 0.21950335327170564
recall = 0.21950335327170564
f1 = 0.21950335327170564


In [77]:
from sklearn.tree import DecisionTreeClassifier

print("Decision tree with actors PCA")
X, y = get_x_y_from_vecs(id_to_ac)
model = DecisionTreeClassifier(random_state=42)
train_and_eval_model(model, X, y)

Decision tree with actors PCA
acc = 0.19062292308621837
precision= 0.19062292308621837
recall = 0.19062292308621837
f1 = 0.19062292308621837


In [78]:
from sklearn.tree import DecisionTreeClassifier

print("Decision tree with actors PCA and description fasttext")
X, y = get_x_y_from_vecs(id_to_ftac)
model = DecisionTreeClassifier(random_state=42)
train_and_eval_model(model, X, y)

Decision tree with actors PCA and description fasttext
acc = 0.19062292308621837
precision= 0.19062292308621837
recall = 0.19062292308621837
f1 = 0.19062292308621837


In [80]:
from sklearn.neighbors import KNeighborsClassifier

print("KNeighbors classifier with description fasttext")
X, y = get_x_y_from_vecs(id_to_ft)
model = KNeighborsClassifier(n_jobs=-1)
train_and_eval_model(model, X, y)

KNeighbors classifier with description fasttext
acc = 0.2751495377922784
precision= 0.2751495377922784
recall = 0.2751495377922784
f1 = 0.2751495377922784


In [81]:
from sklearn.neighbors import KNeighborsClassifier

print("KNeighbors classifier with actors PCA")
X, y = get_x_y_from_vecs(id_to_ac)
model = KNeighborsClassifier(n_jobs=-1)
train_and_eval_model(model, X, y)

KNeighbors classifier with actors PCA
acc = 0.2454836565766419
precision= 0.2454836565766419
recall = 0.2454836565766419
f1 = 0.2454836565766419


In [82]:
from sklearn.neighbors import KNeighborsClassifier

print("KNeighbors classifier with description fasttext and actors")
X, y = get_x_y_from_vecs(id_to_ftac)
model = KNeighborsClassifier(n_jobs=-1)
train_and_eval_model(model, X, y)

KNeighbors classifier with description fasttext and actors
acc = 0.2454836565766419
precision= 0.2454836565766419
recall = 0.2454836565766419
f1 = 0.2454836565766419


In [83]:
from sklearn.ensemble import RandomForestClassifier

print("Random forest classifier with actors PCA")
X, y = get_x_y_from_vecs(id_to_ac)
model = RandomForestClassifier(n_jobs=-1, random_state=42)
train_and_eval_model(model, X, y)

Random forest classifier with actors PCA
acc = 0.273336958491934
precision= 0.273336958491934
recall = 0.273336958491934
f1 = 0.273336958491934


In [84]:
from sklearn.ensemble import RandomForestClassifier

print("Random forest classifier with description fasttext")
X, y = get_x_y_from_vecs(id_to_ft)
model = RandomForestClassifier(n_jobs=-1, random_state=42)
train_and_eval_model(model, X, y)

Random forest classifier with description fasttext
acc = 0.33291039816325296
precision= 0.33291039816325296
recall = 0.33291039816325296
f1 = 0.33291039816325296


In [85]:
from sklearn.ensemble import RandomForestClassifier

print("Random forest classifier with description fasttext and fine-tuned parameters")
X, y = get_x_y_from_vecs(id_to_ft)
model = RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=20, max_features=13, min_samples_leaf=7)
train_and_eval_model(model, X, y)

Random forest classifier with description fasttext and fine-tuned parameters
acc = 0.3350854933236662
precision= 0.3350854933236662
recall = 0.3350854933236662
f1 = 0.3350854933236662


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

parameters = {
    'max_features': [4, 7, 10, 13], 
    'min_samples_leaf': [1, 3, 5, 7], 
    'max_depth': [5, 10, 15, 20]
}

X, y = get_x_y_from_vecs(id_to_ft)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
gcv = GridSearchCV(model, parameters, n_jobs=-1, cv=skf, verbose=1)
gcv.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits




In [65]:
gcv.best_params_, gcv.best_score_

({'max_depth': 20, 'max_features': 13, 'min_samples_leaf': 7},
 0.33925442571445835)