In [1]:
from pathlib import Path
from tqdm import tqdm
import json
import re
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd

from PIL import Image

from text_features import bag_of_words, tf_idf, spacy_approach
from img_features import resnet50_features

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [2]:
data_dir = Path("data")
imgs_dir = data_dir / "imgs"
embeds_dir = Path("embeddings")

movie_info = pd.read_csv(data_dir / "movie_info.csv")
movie_info["plot"].fillna("No description", inplace=True)
movie_info["genres"] = movie_info["genres"].map(lambda x: json.loads(x.replace("\'", "\"")))

classes = ["Action", "Adventure", "Animation", "Biography", "Comedy", "Crime", "Documentary", "Drama",
            "Family", "Fantasy", "Film-Noir", "History", "Horror", "Music", "Musical", "Mystery",
            "Romance", "Sci-Fi", "Short", "Sport", "Superhero", "Thriller", "War", "Western"]

for genre in classes:
    movie_info[genre] = movie_info["genres"].map(lambda x: genre in x).astype(int)

In [3]:
for i in tqdm(range(len(movie_info))):
    if (imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = str(movie_info.iloc[i]["imdb_id"])
    elif (imgs_dir / f"00{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = "00" + str(movie_info.iloc[i]["imdb_id"])
    elif (imgs_dir / f"0{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = "0" + str(movie_info.iloc[i]["imdb_id"])

100%|██████████| 4702/4702 [00:01<00:00, 2639.74it/s]


In [4]:
# inds = []
# for i in tqdm((2487,)):
#     try:
#         Image.open(imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").convert("RGB").resize((224, 224))
#     except Exception as e:
#         print(e)
#         inds.append(i)
# len(inds)

In [4]:
def clean_text(text):
    text = re.sub("[^ a-zA-Z0-9]", " ", text)  # deleting everything besides whitespaces and letters
    text = re.sub(" +", " ", text)  # merging multiple whitespaces into one
    text = text.lower()  # text to lowercase

    stop_words = stopwords.words('english')
    text = [word for word in text.split(" ") if not word in stop_words]  # removing stop_words

    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(token) for token in text]  # lemmatization
    return ' '.join(text)

In [5]:
for col in ("title", "plot"):
    movie_info[col] = movie_info[col].map(clean_text)

In [7]:
image_methods = [resnet50_features]
text_methods = [bag_of_words, tf_idf, spacy_approach]
classif_methods = [LogisticRegression, CatBoostClassifier]
classif_methods_params = [{"random_state": 0, "solver": "saga"}, {"random_state": 0, "silent": True, "iterations": 100}]

val_size = 0.2

y = movie_info[classes].values
y_train, y_val = train_test_split(y, test_size=val_size, random_state=0)

best_score = - np.inf
for im in image_methods:
    print(f"retrieving image embeddings for {im.__name__}")
    if (embeds_dir / f"{im.__name__}.npy").exists():
        image_embeds = np.load(embeds_dir / f"{im.__name__}.npy")
    else:
        imgs = [Image.open(imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").convert("RGB").resize((224, 224)) 
                            for i in range(len(movie_info))]
        image_embeds = []
        batch_size = 512
        for idx in range(0, len(imgs), batch_size):
            image_embeds.append(im(imgs[idx: idx + batch_size]))
        image_embeds = np.concatenate(image_embeds, axis=0)
        np.save(embeds_dir / f"{im.__name__}.npy", image_embeds)

    for tm in text_methods[2:]:
        print(f"retrieving text embeddings for {tm.__name__}")
        if (embeds_dir / f"{tm.__name__}_title.npy").exists():
            tm_title_embeds = np.load(embeds_dir / f"{tm.__name__}_title.npy")
            tm_plot_embeds = np.load(embeds_dir / f"{tm.__name__}_plot.npy")
        else:
            tm_title_embeds, tm_title = tm(movie_info.title.values)
            tm_plot_embeds, tm_plot = tm(movie_info["plot"].values)
            np.save(embeds_dir / f"{tm.__name__}_title.npy", tm_title_embeds)
            np.save(embeds_dir / f"{tm.__name__}_plot.npy", tm_plot_embeds)

        X = np.column_stack([image_embeds, tm_title_embeds, tm_plot_embeds])
        X_train, X_val = train_test_split(X, test_size=val_size, random_state=0)
        for cm, cmp in zip(classif_methods, classif_methods_params):
            print(f"Using {im.__name__} + {tm.__name__} + {cm.__name__}")
            print(f"Embeds size: {X.shape[1]}")

            models = [cm(**cmp) for _ in range(len(classes))]
            score = 0
            for i in tqdm(range(len(models))):
                models[i].fit(X_train, y_train[:, i])
                score += f1_score(y_val[:, i], models[i].predict(X_val))
            score /= len(models)
            print(f"Score {score}")
            if score > best_score:
                best_score = score
                best_comb = (im, tm, cm)
print(best_score, [x.__name__ for x in best_comb])

retrieving image embeddings for resnet50_features
retrieving text embeddings for spacy_appoach
Using resnet50_features + spacy_appoach + LogisticRegression
Embeds size: 2240


100%|██████████| 24/24 [03:13<00:00,  8.08s/it]


Score 0.6305036171827988
Using resnet50_features + spacy_appoach + CatBoostClassifier
Embeds size: 2240


100%|██████████| 24/24 [10:17<00:00, 25.72s/it]

Score 0.5968512397586431
0.6305036171827988 ['resnet50_features', 'spacy_appoach', 'LogisticRegression']





Method | number of generated features

resnet50_features | 2048

bag_of_words, tf-idf | 22997

spacy | 192

Pipeline                                               | mean f1 score

resnet50_features + bag_of_words + LogisticRegression  | 0.67489

resnet50_features + tf_idf + LogisticRegression        | 0.63171

resnet50_features + spacy_approach + LogisticRegression | 0.63050

resnet50_features + bag_of_words + CatBoostClassifier  | 0.57421

resnet50_features + tf_idf + CatBoostClassifier        | 0.57266

resnet50_features + spacy_approach + CatBoostClassifier | 0.59685

In [6]:
import torch
from torch import nn

In [7]:
class SimpleNet(nn.Module):
    def __init__(self, input_size, classes_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, classes_size)

    def forward(self, X):
        X = nn.functional.relu(self.fc1(X))
        X = nn.functional.relu(self.fc2(X))
        X = nn.functional.sigmoid(self.fc3(X))
        return X

In [11]:
image_methods = [resnet50_features]
text_methods = [bag_of_words, tf_idf, spacy_approach]

val_size = 0.2

y = movie_info[classes].values

best_score = - np.inf
for im in image_methods:
    print(f"retrieving image embeddings for {im.__name__}")
    if (embeds_dir / f"{im.__name__}.npy").exists():
        image_embeds = np.load(embeds_dir / f"{im.__name__}.npy")
    else:
        imgs = [Image.open(imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").convert("RGB").resize((224, 224)) 
                            for i in range(len(movie_info))]
        image_embeds = []
        batch_size = 512
        for idx in range(0, len(imgs), batch_size):
            image_embeds.append(im(imgs[idx: idx + batch_size]))
        image_embeds = np.concatenate(image_embeds, axis=0)
        np.save(embeds_dir / f"{im.__name__}.npy", image_embeds)

    for tm in text_methods:
        print(f"retrieving text embeddings for {tm.__name__}")
        if (embeds_dir / f"{tm.__name__}_title.npy").exists():
            tm_title_embeds = np.load(embeds_dir / f"{tm.__name__}_title.npy")
            tm_plot_embeds = np.load(embeds_dir / f"{tm.__name__}_plot.npy")
        else:
            tm_title_embeds, tm_title = tm(movie_info.title.values)
            tm_plot_embeds, tm_plot = tm(movie_info["plot"].values)
            np.save(embeds_dir / f"{tm.__name__}_title.npy", tm_title_embeds)
            np.save(embeds_dir / f"{tm.__name__}_plot.npy", tm_plot_embeds)

        X = np.column_stack([image_embeds, tm_title_embeds, tm_plot_embeds])
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=0)

        # to tensors
        batch_size = 128
        train_dl = torch.utils.data.DataLoader(list(zip(X_train, y_train)), batch_size=batch_size, shuffle=True)
        val_dl = torch.utils.data.DataLoader(list(zip(X_val, y_val)), batch_size=batch_size, shuffle=False)

        # training
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = SimpleNet(X.shape[1], len(classes)).float().to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=4e-4)
        criterion = nn.L1Loss()
        epochs = 5
        losses = []
        model.train()
        for epoch in range(1, epochs + 1):
            losses.append(0)
            for X_cur, y_cur in tqdm(train_dl, leave=False):
                model.zero_grad()
                y_hat = model(X_cur.float().to(device))
                loss = criterion(y_cur.float().to(device), y_hat)
                loss.backward()
                optim.step()
                losses[- 1] += loss.item()
            losses[- 1] /= len(train_dl)
            print(f"Epoch #{epoch} loss: {losses[- 1]: 0.5f}")
        model.eval()
        preds = []
        with torch.no_grad():
            for X_cur, y_cur in val_dl:
                preds.append(model(X_cur.float().to(device)).cpu().numpy())
        preds = np.row_stack(preds)
        score = np.mean([f1_score(y_val[:, i], (preds[:, i] > 0.5).astype(int)) for i in range(len(classes))])
        print(f"Score: {score}")

retrieving image embeddings for resnet50_features
retrieving text embeddings for bag_of_words


                                               

Epoch #1 loss:  0.21696


                                               

Epoch #2 loss:  0.14580


                                               

Epoch #3 loss:  0.14559


                                               

Epoch #4 loss:  0.14557


                                               

Epoch #5 loss:  0.14485
Score: 0.03177733946964716
retrieving text embeddings for tf_idf


                                               

Epoch #1 loss:  0.22168


                                               

Epoch #2 loss:  0.14566


                                               

Epoch #3 loss:  0.14548


                                               

Epoch #4 loss:  0.14552


                                               

Epoch #5 loss:  0.14576
Score: 0.03177733946964716
retrieving text embeddings for spacy_appoach


                                                

Epoch #1 loss:  0.21506


                                                

Epoch #2 loss:  0.14554


                                                

Epoch #3 loss:  0.14568


                                                

Epoch #4 loss:  0.14524


                                                

Epoch #5 loss:  0.14566
Score: 0.03177733946964716




In [43]:
avg_weights = []
with torch.no_grad():
    for i in range(model.fc1.weight.shape[1]):
        avg_weights.append(model.fc1.weight[:, i].sum().item())
np.min(avg_weights), np.max(avg_weights), np.mean(avg_weights), np.std(avg_weights)

(-4.56007719039917, 4.925796985626221, 3.1783318028253102, 1.4912840477227125)

Pipeline with SimpleNet          | mean f1 score

resnet50_features + bag_of_words | 0.03177

resnet50_features + tf_idf       | 0.03177

resnet50_features + spacy        | 0.03177