In [1]:
!pip install catboost
!pip install transformers
! pip install sentence_transformers
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
from pathlib import Path
from tqdm import tqdm
import json
import re
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd

from PIL import Image

from text_features_3 import bag_of_words, tf_idf, spacy_approach, word2vec_approach, transformer_clip, transformer_distil_bert
from img_features import resnet50_features
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

# raif_trans = pd.read_csv('/content/drive/My Drive/Raifdata.csv', header=0, sep=',')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
data_dir = Path("/content/drive/My Drive")
imgs_dir = data_dir / "imgs"
embeds_dir = Path("embeddings")

movie_info = pd.read_csv(data_dir / "movie_info.csv")
movie_info["plot"].fillna("No description", inplace=True)
movie_info["genres"] = movie_info["genres"].map(lambda x: json.loads(x.replace("\'", "\"")))

classes = ["Action", "Adventure", "Animation", "Biography", "Comedy", "Crime", "Documentary", "Drama",
            "Family", "Fantasy", "Film-Noir", "History", "Horror", "Music", "Musical", "Mystery",
            "Romance", "Sci-Fi", "Short", "Sport", "Superhero", "Thriller", "War", "Western"]

for genre in classes:
    movie_info[genre] = movie_info["genres"].map(lambda x: genre in x).astype(int)

In [5]:
for i in tqdm(range(len(movie_info))):
    if (imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = str(movie_info.iloc[i]["imdb_id"])
    elif (imgs_dir / f"00{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = "00" + str(movie_info.iloc[i]["imdb_id"])
    elif (imgs_dir / f"0{movie_info.iloc[i].imdb_id}.jpg").exists():
        movie_info["imdb_id"].iloc[i] = "0" + str(movie_info.iloc[i]["imdb_id"])

100%|██████████| 4702/4702 [00:07<00:00, 640.74it/s]


In [7]:
def clean_text(text):
    text = re.sub("[^ a-zA-Z0-9]", " ", text)  # deleting everything besides whitespaces and letters
    text = re.sub(" +", " ", text)  # merging multiple whitespaces into one
    text = text.lower()  # text to lowercase

    stop_words = stopwords.words('english')
    text = [word for word in text.split(" ") if not word in stop_words]  # removing stop_words

    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(token) for token in text]  # lemmatization
    return ' '.join(text)

In [8]:
for col in ("title", "plot"):
    movie_info[col] = movie_info[col].map(clean_text)

In [15]:
image_methods = [transformer_clip, resnet50_features]
text_methods = [transformer_clip, transformer_distil_bert]#, bag_of_words, tf_idf, spacy_approach]
classif_methods = [LogisticRegression]#, CatBoostClassifier]
classif_methods_params = [{"random_state": 0, "solver": "saga"}, {"random_state": 0, "silent": True, "iterations": 100}]

val_size = 0.2

y = movie_info[classes].values
y_train, y_val = train_test_split(y, test_size=val_size, random_state=0)

best_score = - np.inf
for im in image_methods:
    print(f"retrieving image embeddings for {im.__name__}")
    if (embeds_dir / f"{im.__name__}.npy").exists():
        image_embeds = np.load(embeds_dir / f"{im.__name__}.npy")
    else:
        imgs = [Image.open(imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").convert("RGB").resize((224, 224)) 
                            for i in range(len(movie_info))]
        
        image_embeds, im_model = im(imgs)
        image_embeds = np.array(image_embeds)
        
        np.save(embeds_dir / f"{im.__name__}.npy", image_embeds)

    for tm in text_methods:
        print(f"retrieving text embeddings for {tm.__name__}")
        if (embeds_dir / f"{tm.__name__}_title.npy").exists():
            tm_title_embeds = np.load(embeds_dir / f"{tm.__name__}_title.npy")
            tm_plot_embeds = np.load(embeds_dir / f"{tm.__name__}_plot.npy")
        else:
            tm_title_embeds, tm_title = tm(movie_info.title.values, data_type_text=True)
            tm_plot_embeds, tm_plot = tm(movie_info["plot"].values, data_type_text=True)

            np.save(embeds_dir / f"{tm.__name__}_title.npy", tm_title_embeds)
            np.save(embeds_dir / f"{tm.__name__}_plot.npy", tm_plot_embeds)

        X = np.column_stack([image_embeds, tm_title_embeds, tm_plot_embeds])
        X_train, X_val = train_test_split(X, test_size=val_size, random_state=0)

        for cm, cmp in zip(classif_methods, classif_methods_params):
            print(f"Using {im.__name__} + {tm.__name__} + {cm.__name__}")
            print(f"Embeds size: {X.shape[1]}")

            models = [cm(**cmp) for _ in range(len(classes))]
            score = 0
            for i in tqdm(range(len(models))):

                models[i].fit(X_train, y_train[:, i])
                score += f1_score(y_val[:, i], models[i].predict(X_val))
            score /= len(models)
            print(f"Score {score}")
            if score > best_score:
                best_score = score
                best_comb = (im, tm, cm)
print(best_score, [x.__name__ for x in best_comb])

retrieving image embeddings for transformer_clip


ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


retrieving text embeddings for transformer_clip


ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.
ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


Using transformer_clip + transformer_clip + LogisticRegression
Embeds size: 1536


100%|██████████| 24/24 [03:08<00:00,  7.87s/it]


Score 0.722937328444511
retrieving text embeddings for transformer_distil_bert
Using transformer_clip + transformer_distil_bert + LogisticRegression
Embeds size: 2048


100%|██████████| 24/24 [04:12<00:00, 10.54s/it]


Score 0.6607348537810084
retrieving image embeddings for resnet50_features
retrieving text embeddings for transformer_clip
Using resnet50_features + transformer_clip + LogisticRegression
Embeds size: 3072


100%|██████████| 24/24 [06:13<00:00, 15.57s/it]


Score 0.7036974529671972
retrieving text embeddings for transformer_distil_bert
Using resnet50_features + transformer_distil_bert + LogisticRegression
Embeds size: 3584


100%|██████████| 24/24 [07:17<00:00, 18.22s/it]

Score 0.64425909164199
0.722937328444511 ['transformer_clip', 'transformer_clip', 'LogisticRegression']





Method | number of generated features

resnet50_features | 2048

bag_of_words, tf-idf | 22997

spacy | 192

word2vec | 22798

Pipeline                                               | mean f1 score

resnet50_features + bag_of_words + LogisticRegression  | 0.67489

resnet50_features + tf_idf + LogisticRegression        | 0.63171

resnet50_features + spacy_approach + LogisticRegression | 0.63050

resnet50_features + word2vec_approach + LogisticRegression | 0.5764033562527832


resnet50_features + bag_of_words + CatBoostClassifier  | 0.57421

resnet50_features + tf_idf + CatBoostClassifier        | 0.57266

resnet50_features + spacy_approach + CatBoostClassifier | 0.59685

resnet50_features + word2vec_approach + CatBoostClassifier | 0.5720593239340869

-------------------------------------------------
#### with transformers \\/
-------------------------------------------------

transformer_clip + transformer_clip + LogisticRegression | 0.722937328444511

transformer_clip + transformer_distil_bert + LogisticRegression | 0.6607348537810084

resnet50_features + transformer_clip + LogisticRegression | 0.7036974529671972

resnet50_features + transformer_distil_bert + LogisticRegression | 0.64425909164199

In [None]:
import torch
from torch import nn

In [None]:
class SimpleNet(nn.Module):
    def __init__(self, input_size, classes_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, classes_size)

    def forward(self, X):
        X = nn.functional.relu(self.fc1(X))
        X = nn.functional.relu(self.fc2(X))
        X = nn.functional.sigmoid(self.fc3(X))
        return X

In [None]:
image_methods = [resnet50_features]
text_methods = [bag_of_words, tf_idf, spacy_approach]

val_size = 0.2

y = movie_info[classes].values

best_score = - np.inf
for im in image_methods:
    print(f"retrieving image embeddings for {im.__name__}")
    if (embeds_dir / f"{im.__name__}.npy").exists():
        image_embeds = np.load(embeds_dir / f"{im.__name__}.npy")
    else:
        imgs = [Image.open(imgs_dir / f"{movie_info.iloc[i].imdb_id}.jpg").convert("RGB").resize((224, 224)) 
                            for i in range(len(movie_info))]
        image_embeds = []
        batch_size = 512
        for idx in range(0, len(imgs), batch_size):
            image_embeds.append(im(imgs[idx: idx + batch_size]))
        image_embeds = np.concatenate(image_embeds, axis=0)
        np.save(embeds_dir / f"{im.__name__}.npy", image_embeds)

    for tm in text_methods:
        print(f"retrieving text embeddings for {tm.__name__}")
        if (embeds_dir / f"{tm.__name__}_title.npy").exists():
            tm_title_embeds = np.load(embeds_dir / f"{tm.__name__}_title.npy")
            tm_plot_embeds = np.load(embeds_dir / f"{tm.__name__}_plot.npy")
        else:
            tm_title_embeds, tm_title = tm(movie_info.title.values)
            tm_plot_embeds, tm_plot = tm(movie_info["plot"].values)
            np.save(embeds_dir / f"{tm.__name__}_title.npy", tm_title_embeds)
            np.save(embeds_dir / f"{tm.__name__}_plot.npy", tm_plot_embeds)

        X = np.column_stack([image_embeds, tm_title_embeds, tm_plot_embeds])
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=0)

        # to tensors
        batch_size = 128
        train_dl = torch.utils.data.DataLoader(list(zip(X_train, y_train)), batch_size=batch_size, shuffle=True)
        val_dl = torch.utils.data.DataLoader(list(zip(X_val, y_val)), batch_size=batch_size, shuffle=False)

        # training
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = SimpleNet(X.shape[1], len(classes)).float().to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=4e-4)
        criterion = nn.L1Loss()
        epochs = 5
        losses = []
        model.train()
        for epoch in range(1, epochs + 1):
            losses.append(0)
            for X_cur, y_cur in tqdm(train_dl, leave=False):
                model.zero_grad()
                y_hat = model(X_cur.float().to(device))
                loss = criterion(y_cur.float().to(device), y_hat)
                loss.backward()
                optim.step()
                losses[- 1] += loss.item()
            losses[- 1] /= len(train_dl)
            print(f"Epoch #{epoch} loss: {losses[- 1]: 0.5f}")
        model.eval()
        preds = []
        with torch.no_grad():
            for X_cur, y_cur in val_dl:
                preds.append(model(X_cur.float().to(device)).cpu().numpy())
        preds = np.row_stack(preds)
        score = np.mean([f1_score(y_val[:, i], (preds[:, i] > 0.5).astype(int)) for i in range(len(classes))])
        print(f"Score: {score}")

In [None]:
avg_weights = []
with torch.no_grad():
    for i in range(model.fc1.weight.shape[1]):
        avg_weights.append(model.fc1.weight[:, i].sum().item())
np.min(avg_weights), np.max(avg_weights), np.mean(avg_weights), np.std(avg_weights)

Pipeline with SimpleNet          | mean f1 score

resnet50_features + bag_of_words | 0.03177

resnet50_features + tf_idf       | 0.03177

resnet50_features + spacy        | 0.03177