In [1]:
from pathlib import Path
from tqdm import tqdm
import json
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd

from PIL import Image

from text_features import bag_of_words, tf_idf, spacy_appoach
from img_features import resnet50_features

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [2]:
data_dir = Path("data")
imgs_dir = data_dir / "imgs"
embeds_dir = Path("embeddings")

movie_info = pd.read_csv(data_dir / "movie_info.csv")
movie_info["plot"].fillna("No description", inplace=True)
movie_info["genres"] = movie_info["genres"].map(lambda x: json.loads(x.replace("\'", "\"")))

classes = ["Action", "Adventure", "Animation", "Biography", "Comedy", "Crime", "Documentary", "Drama",
            "Family", "Fantasy", "Film-Noir", "History", "Horror", "Music", "Musical", "Mystery",
            "Romance", "Sci-Fi", "Short", "Sport", "Superhero", "Thriller", "War", "Western"]

for genre in classes:
    movie_info[genre] = movie_info["genres"].map(lambda x: genre in x).astype(int)

In [3]:
for i in tqdm(range(len(movie_info))):
    if (imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = str(movie_info.iloc[i]["imdb_id"])
    elif (imgs_dir / f"00{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = "00" + str(movie_info.iloc[i]["imdb_id"])
    elif (imgs_dir / f"0{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = "0" + str(movie_info.iloc[i]["imdb_id"])

100%|██████████| 4702/4702 [00:02<00:00, 2288.63it/s]


In [4]:
# inds = []
# for i in tqdm((2487,)):
#     try:
#         Image.open(imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").convert("RGB").resize((224, 224))
#     except Exception as e:
#         print(e)
#         inds.append(i)
# len(inds)

In [7]:
image_methods = [resnet50_features]
text_methods = [bag_of_words, tf_idf, spacy_appoach]
classif_methods = [LogisticRegression, lambda: CatBoostClassifier(random_state=42, silent=True, iterations=100)]

val_size = 0.2

y = movie_info[classes].values
y_train, y_val = train_test_split(y, test_size=val_size)

best_score = - np.inf
for im in image_methods:
    print(f"retrieving image embeddings for {im.__name__}")
    if (embeds_dir / f"{im.__name__}.npy").exists():
        image_embeds = np.load(embeds_dir / f"{im.__name__}.npy")
    else:
        imgs = [Image.open(imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").convert("RGB").resize((224, 224)) 
                            for i in range(len(movie_info))]
        image_embeds = []
        batch_size = 512
        for idx in range(0, len(imgs), batch_size):
            image_embeds.append(im(imgs[idx: idx + batch_size]))
        image_embeds = np.concatenate(image_embeds, axis=0)
        np.save(embeds_dir / f"{im.__name__}.npy", image_embeds)

    for tm in text_methods:
        print(f"retrieving text embeddings for {tm.__name__}")
        if (embeds_dir / f"{tm.__name__}_title.npy").exists():
            tm_title_embeds = np.load(embeds_dir / f"{tm.__name__}_title.npy")
            tm_plot_embeds = np.load(embeds_dir / f"{tm.__name__}_plot.npy")
        else:
            tm_title_embeds, tm_title = tm(movie_info.title.values)
            tm_plot_embeds, tm_plot = tm(movie_info["plot"].values)
            np.save(embeds_dir / f"{tm.__name__}_title.npy", tm_title_embeds)
            np.save(embeds_dir / f"{tm.__name__}_plot.npy", tm_plot_embeds)

        X = np.column_stack([image_embeds, tm_title_embeds, tm_plot_embeds])
        X_train, X_val = train_test_split(X, test_size=val_size)
        for cm in classif_methods[1:2]:
            print(f"Using {im.__name__} + {tm.__name__} + {cm.__name__}")
            print(f"Embeds size: {X.shape[1]}")

            models = [cm() for _ in range(len(classes))]
            score = 0
            for i in range(len(models)):
                models[i].fit(X_train, y_train[:, i])
                score += f1_score(y_val[:, i], models[i].predict(X_val))
            score /= len(models)
            print(f"Score {score}")
            if score > best_score:
                best_score = score
                best_comb = (im, tm, cm)
print(best_score, [x.__name__ for x in best_comb])

retrieving image embeddings for resnet50_features
retrieving text embeddings for bag_of_words
Using resnet50_features + bag_of_words + <lambda>
Embeds size: 27598
Score 0.06191386135236041
retrieving text embeddings for tf_idf
Using resnet50_features + tf_idf + <lambda>
Embeds size: 27598
Score 0.06213319344385634
retrieving text embeddings for spacy_appoach
Using resnet50_features + spacy_appoach + <lambda>
Embeds size: 2240
Score 0.0682355012858485
0.0682355012858485 ['resnet50_features', 'spacy_appoach', '<lambda>']


Method | number of generated features

resnet50_features | 2048

bag_of_words, tf-idf | 25550

spacy | 192

Pipeline                                              | mean f1 score

resnet50_features + bag_of_words + LogisticRegression | 0.12156873831875155

resnet50_features + tf_idf + LogisticRegression       | 0.14007789891666134

resnet50_features + spacy_appoach + LogisticRegression| 0.133247543506684

resnet50_features + bag_of_words + CatBoostClassifier | 0.06191386135236041

resnet50_features + tf_idf + CatBoostClassifier       | 0.06213319344385634

resnet50_features + spacy_appoach + CatBoostClassifier| 0.0682355012858485

In [8]:
import torch
from torch import nn

In [10]:
class SimpleNet(nn.Module):
    def __init__(self, input_size, classes_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, classes_size)

    def forward(self, X):
        X = nn.functional.relu(self.fc1(X))
        X = nn.functional.relu(self.fc2(X))
        X = nn.functional.sigmoid(self.fc3(X))
        return X

In [34]:
image_methods = [resnet50_features]
text_methods = [bag_of_words, tf_idf, spacy_appoach]

val_size = 0.2

y = movie_info[classes].values
y_train, y_val = train_test_split(y, test_size=val_size)

best_score = - np.inf
for im in image_methods:
    print(f"retrieving image embeddings for {im.__name__}")
    if (embeds_dir / f"{im.__name__}.npy").exists():
        image_embeds = np.load(embeds_dir / f"{im.__name__}.npy")
    else:
        imgs = [Image.open(imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").convert("RGB").resize((224, 224)) 
                            for i in range(len(movie_info))]
        image_embeds = []
        batch_size = 512
        for idx in range(0, len(imgs), batch_size):
            image_embeds.append(im(imgs[idx: idx + batch_size]))
        image_embeds = np.concatenate(image_embeds, axis=0)
        np.save(embeds_dir / f"{im.__name__}.npy", image_embeds)

    for tm in text_methods:
        print(f"retrieving text embeddings for {tm.__name__}")
        if (embeds_dir / f"{tm.__name__}_title.npy").exists():
            tm_title_embeds = np.load(embeds_dir / f"{tm.__name__}_title.npy")
            tm_plot_embeds = np.load(embeds_dir / f"{tm.__name__}_plot.npy")
        else:
            tm_title_embeds, tm_title = tm(movie_info.title.values)
            tm_plot_embeds, tm_plot = tm(movie_info["plot"].values)
            np.save(embeds_dir / f"{tm.__name__}_title.npy", tm_title_embeds)
            np.save(embeds_dir / f"{tm.__name__}_plot.npy", tm_plot_embeds)

        X = np.column_stack([image_embeds, tm_title_embeds, tm_plot_embeds])
        X_train, X_val = train_test_split(X, test_size=val_size)

        # to tensors
        batch_size = 512
        train_dl = torch.utils.data.DataLoader(list(zip(X_train, y_train)), batch_size=batch_size, shuffle=True, num_workers=2)
        val_dl = torch.utils.data.DataLoader(list(zip(X_val, y_val)), batch_size=batch_size, shuffle=False, num_workers=2)

        # training
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = SimpleNet(X.shape[1], len(classes)).float().to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=4e-4)
        criterion = nn.L1Loss()
        epochs = 5
        losses = []
        model.train()
        for epoch in range(1, epochs + 1):
            losses.append(0)
            for X_cur, y_cur in train_dl:
                model.zero_grad()
                y_hat = model(X_cur.float().to(device))
                loss = criterion(y_cur.float().to(device), y_hat)
                loss.backward()
                optim.step()
                losses[- 1] += loss.item()
            losses[- 1] /= len(train_dl)
            print(f"Epoch #{epoch} loss: {losses[- 1]: 0.5f}")
        model.eval()
        preds = []
        with torch.no_grad():
            for X_cur, y_cur in val_dl:
                preds.append(model(X_cur.float().to(device)).cpu().numpy())
        preds = np.row_stack(preds)
        score = np.mean([f1_score(y_val[:, i], (preds[:, i] > 0.5).astype(int)) for i in range(len(classes))])
        print(f"Score: {score}")

retrieving image embeddings for resnet50_features
retrieving text embeddings for bag_of_words
Epoch #1 loss:  0.39641
Epoch #2 loss:  0.16206
Epoch #3 loss:  0.14414
Epoch #4 loss:  0.14372
Epoch #5 loss:  0.14393
Score: 0.030349099099099094
retrieving text embeddings for tf_idf
Epoch #1 loss:  0.41277
Epoch #2 loss:  0.17078
Epoch #3 loss:  0.14486
Epoch #4 loss:  0.14307


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x0000016B6C6AA5E0>
Traceback (most recent call last):
  File "c:\Users\Владислав\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\dataloader.py", line 1466, in __del__
    self._shutdown_workers()
  File "c:\Users\Владислав\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\dataloader.py", line 1424, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x0000016B6C6AA5E0>
Traceback (most recent call last):
  File "c:\Users\Владислав\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\data\dataloader.py", line 1466, in __del__
    self._shutdown_workers()
  File "c:\Users\Владислав\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\utils\da

Epoch #5 loss:  0.14470
Score: 0.030349099099099094
retrieving text embeddings for spacy_appoach
Epoch #1 loss:  0.39228
Epoch #2 loss:  0.15688
Epoch #3 loss:  0.14460
Epoch #4 loss:  0.14381
Epoch #5 loss:  0.14393
Score: 0.030349099099099094


In [43]:
avg_weights = []
with torch.no_grad():
    for i in range(model.fc1.weight.shape[1]):
        avg_weights.append(model.fc1.weight[:, i].sum().item())
np.min(avg_weights), np.max(avg_weights), np.mean(avg_weights), np.std(avg_weights)

(-4.56007719039917, 4.925796985626221, 3.1783318028253102, 1.4912840477227125)

Pipeline with SimpleNet          | mean f1 score

resnet50_features + bag_of_words | 0.030349099099099094

resnet50_features + tf_idf       | 0.030349099099099094

resnet50_features + spacy        | 0.030349099099099094