In [1]:
import pandas as pd
import json
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pickle

### Loading JSON files into a DataFrame

In [2]:
def combine_json_to_dataframe(directory_path, file_pattern='*.json'):
    """
    Combine several JSON files into a single movie DataFrame.

    Each file is expected to hold one JSON object whose top-level keys are
    IMDb IDs and whose values are the per-movie records. The keys are
    discarded and every record becomes one row.

    Files are read in sorted path order. ``glob.glob`` returns its matches in
    arbitrary order, so without sorting the row order of the result (and any
    seeded split computed from it) could change between machines or runs.

    Args:
        directory_path: Directory that contains the JSON files.
        file_pattern: Glob pattern, relative to ``directory_path``, selecting
            the files to load.

    Returns:
        pandas.DataFrame with one row per movie record. It is empty when no
        file matches the pattern or when no file could be processed.
    """
    search_path = os.path.join(directory_path, file_pattern)
    all_json_files = sorted(glob.glob(search_path))

    if not all_json_files:
        print(f"No files found matching pattern '{file_pattern}' in '{directory_path}'")
        return pd.DataFrame()

    data_list = []
    for file_path in all_json_files:
        # Deliberately best-effort: a file that cannot be read or parsed is
        # reported and skipped so the remaining files still load.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                movie_data = json.load(f)
            # Flatten the file's {imdb_id: movie} mapping into plain records.
            data_list.extend(movie_data.values())
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    return pd.DataFrame(data_list)


In [3]:
# Directory holding the batch JSON files, relative to the notebook.
json_directory = '../data/batches'
movies_df = combine_json_to_dataframe(
    directory_path=json_directory,
    file_pattern='*.json',
)

In [4]:
# Quick sanity check of what was loaded: row count and available columns.
total_movies = len(movies_df)
print(f"Total rows (movies): {total_movies}\n")
print(movies_df.columns)

Total rows (movies): 5000

Index(['id', 'title', 'description', 'summary', 'image', 'url',
       'datePublished', 'duration', 'genre', 'keywords', 'aggregateRating',
       'actors', 'directors', 'creators', 'trailer', 'review'],
      dtype='object')


# TODO: 
Maybe we should perform some data cleaning (for example: check for missing values, typos, etc.)?
Calculate statistics (how many occurrences of each genre, how many movies are missing a summary, etc.)?

### Preparing the dataset

In [5]:
# Sorted list of every distinct genre found in any movie's genre list.
all_genres = sorted({genre for genres in movies_df['genre'] for genre in genres})
print("Unique genres:", len(all_genres))
print(all_genres)

Unique genres: 22
['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


In [6]:
# Join description and summary into one text field used as model input.
def _join_text_fields(row):
    """
    Return the row's description and summary joined by a single space.

    Only string values are used, so a missing description or summary
    (None/NaN) is skipped instead of raising a TypeError or putting NaN into
    the text column. A row with neither field yields an empty string.
    """
    parts = [row[col] for col in ('description', 'summary') if isinstance(row[col], str)]
    return " ".join(parts)


movies_df['text'] = movies_df.apply(_join_text_fields, axis=1)

In [7]:
# Encode each movie's genre list as one row of a binary indicator matrix
# (multi-label): one column per genre known to the binarizer.
mlb = MultiLabelBinarizer()
genre_lists = movies_df['genre']
y = mlb.fit_transform(genre_lists)
print(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [8]:
# Model input (text) plus the movie ids, which are kept for later analysis.
X, ids = movies_df['text'], movies_df['id']

# Hold out 20% of the rows with a fixed seed; the labels and ids are passed
# through the same call so they are split together with the text.
split_parts = train_test_split(X, y, ids, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test, ids_train, ids_test = split_parts

In [9]:
#pip3 install -U "transformers>=4.45" "datasets>=3.0" "accelerate>=1.0" "torch" "evaluate"
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict

def _to_text_label_frame(texts, label_matrix):
    """Build a two-column frame: raw text and its label row as Python floats."""
    return pd.DataFrame({
        'text': list(texts),
        'labels': [[float(value) for value in row] for row in label_matrix],
    })


train_df = _to_text_label_frame(X_train, y_train)
test_df = _to_text_label_frame(X_test, y_test)

# Wrap both splits as Hugging Face datasets without the pandas index.
train_split = Dataset.from_pandas(train_df, preserve_index=False)
test_split = Dataset.from_pandas(test_df, preserve_index=False)
ds = DatasetDict({'train': train_split, 'test': test_split})

# Number of label columns in the binarized genre matrix.
_, num_labels = y_train.shape
num_labels

22

In [10]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer and the classification model.
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_fn(batch):
    """Tokenize a batch's 'text' entries, truncated and padded to 256 tokens."""
    encode_options = {'truncation': True, 'padding': 'max_length', 'max_length': 256}
    return tokenizer(batch['text'], **encode_options)


# Tokenize both splits in batches; the raw text column is dropped afterwards.
tokenized = ds.map(tokenize_fn, batched=True, remove_columns=['text'])
tokenized.set_format('torch')

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [19]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a classification head sized to the genre labels and
# configured for multi-label classification.
model_options = {
    "num_labels": num_labels,
    "problem_type": "multi_label_classification",
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **model_options)

def sigmoid(x):
    """
    Element-wise logistic function ``1 / (1 + exp(-x))``.

    It is evaluated as ``0.5 * (1 + tanh(x / 2))``. This is the same function,
    but it never takes ``exp`` of a large positive number, so strongly
    negative logits do not trigger NumPy overflow warnings.

    Args:
        x: Scalar or array-like of logits.

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    x = np.asarray(x)
    return 0.5 * (1.0 + np.tanh(0.5 * x))

def compute_metrics(eval_pred):
    """
    Compute multi-label F1, precision and recall from raw logits.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with the keys ``micro_f1``, ``macro_f1``, ``micro_precision``
        and ``micro_recall``.
    """
    logits, labels = eval_pred
    # A label counts as predicted when its sigmoid probability is at least 0.5.
    preds = (sigmoid(logits) >= 0.5).astype(int)

    micro = {"average": 'micro', "zero_division": 0}
    scores = {}
    scores["micro_f1"] = f1_score(labels, preds, **micro)
    scores["macro_f1"] = f1_score(labels, preds, average='macro', zero_division=0)
    scores["micro_precision"] = precision_score(labels, preds, **micro)
    scores["micro_recall"] = recall_score(labels, preds, **micro)
    return scores

# Fine-tuning hyper-parameters, collected in one mapping so they are easy to
# scan and tweak.
training_config = {
    "output_dir": "../models/distilbert_multilabel",
    "do_eval": True,
    "save_steps": 500,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 50,
    "weight_decay": 0.01,
    "logging_steps": 50,
    "fp16": False,  # set True if GPU supports it
}
args = TrainingArguments(**training_config)

# Wire the model, arguments, tokenized splits and metric function together.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` - confirm against the installed version.
trainer_options = dict(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [20]:
import os
# Fine-tune the model, then evaluate it on the held-out split.
train_result = trainer.train()
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Write the evaluation metrics as "key: value" lines, sorted by key.
os.makedirs("../models", exist_ok=True)
metric_lines = [f"{k}: {v}\n" for k, v in sorted(eval_metrics.items())]
with open("../models/metrics_distilbert.txt", "w") as f:
    f.writelines(metric_lines)

# Save the fine-tuned model next to the metrics.
trainer.save_model("../models/distilbert_multilabel")



Step,Training Loss
50,0.6697
100,0.6221
150,0.5806
200,0.5449
250,0.5145
300,0.4857
350,0.4602
400,0.4343
450,0.4188
500,0.4039




{'eval_loss': 0.23211325705051422, 'eval_micro_f1': 0.5327490774907749, 'eval_macro_f1': 0.21743663899677934, 'eval_micro_precision': 0.6830277942046127, 'eval_micro_recall': 0.43667296786389415, 'eval_runtime': 1.4496, 'eval_samples_per_second': 689.855, 'eval_steps_per_second': 43.461, 'epoch': 50.0}


In [13]:
# Save the dataframe, the training/test datasets and the fitted label
# binarizer to pickle files.
pickle_targets = {
    # The whole DataFrame goes here, not the training split: writing the
    # training tuple would only duplicate train_data.pkl under another name.
    '../data/movies_df.pkl': movies_df,
    '../data/train_data.pkl': (X_train, y_train, ids_train),
    '../data/test_data.pkl': (X_test, y_test, ids_test),
    '../data/mlb.pkl': mlb,
}

for pickle_path, payload in pickle_targets.items():
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

Imports and training (TF-IDF + OneVsRest LogisticRegression)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Text features: lower-cased uni- and bigrams, English stop words removed,
# capped at 50,000 features.
text_vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    lowercase=True,
    stop_words='english',
)
# A logistic regression wrapped in a one-vs-rest classifier.
genre_classifier = OneVsRestClassifier(
    LogisticRegression(max_iter=200, C=2.0, solver='liblinear')
)
tfidf_clf = Pipeline(steps=[('tfidf', text_vectorizer), ('ovr', genre_classifier)])

tfidf_clf.fit(X_train, y_train)



0,1,2
,steps,"[('tfidf', ...), ('ovr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,estimator,LogisticRegre...r='liblinear')
,n_jobs,
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,2.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,200


Evaluation

In [15]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

# Predict on the held-out texts and score the TF-IDF baseline.
y_pred = tfidf_clf.predict(X_test)

micro_avg = {'average': 'micro', 'zero_division': 0}
micro_f1 = f1_score(y_test, y_pred, **micro_avg)
macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
micro_precision = precision_score(y_test, y_pred, **micro_avg)
micro_recall = recall_score(y_test, y_pred, **micro_avg)
report = classification_report(
    y_test, y_pred, target_names=list(mlb.classes_), zero_division=0
)

# Headline numbers first, then the per-genre breakdown.
headline_scores = (
    ("micro_f1", micro_f1),
    ("macro_f1", macro_f1),
    ("micro_precision", micro_precision),
    ("micro_recall", micro_recall),
)
for metric_name, metric_value in headline_scores:
    print(f"{metric_name}: {metric_value:.4f}")
print()
print(report)

micro_f1: 0.4394
macro_f1: 0.1469
micro_precision: 0.7475
micro_recall: 0.3112

              precision    recall  f1-score   support

      Action       0.81      0.44      0.57       311
   Adventure       0.83      0.32      0.46       223
   Animation       0.00      0.00      0.00        47
   Biography       0.50      0.01      0.03        76
      Comedy       0.74      0.43      0.55       336
       Crime       0.71      0.21      0.32       178
 Documentary       0.00      0.00      0.00        13
       Drama       0.72      0.76      0.74       516
      Family       0.00      0.00      0.00        57
     Fantasy       0.00      0.00      0.00       115
     History       0.00      0.00      0.00        32
      Horror       1.00      0.10      0.18       123
       Music       0.00      0.00      0.00        28
     Musical       0.00      0.00      0.00        10
     Mystery       0.62      0.04      0.07       131
        News       0.00      0.00      0.00         1
 

Persist model and metrics

In [16]:
import os
import pickle

os.makedirs('../models', exist_ok=True)

# Store the fitted pipeline together with the label binarizer.
baseline_bundle = {'pipeline': tfidf_clf, 'mlb': mlb}
with open('../models/baseline_tfidf_logreg.pkl', 'wb') as f:
    pickle.dump(baseline_bundle, f)

# Plain-text metrics: headline scores, a blank line, then the full report.
metrics_text = "\n".join([
    f"micro_f1: {micro_f1}",
    f"macro_f1: {macro_f1}",
    f"micro_precision: {micro_precision}",
    f"micro_recall: {micro_recall}",
    "",
    report,
])
with open('../models/metrics_tfidf.txt', 'w') as f:
    f.write(metrics_text)

Embeddings baseline (SBERT + OneVsRest)

In [17]:
import numpy as np

try:
    from sentence_transformers import SentenceTransformer
except ImportError as e:
    # Only a failed import means the package is missing or broken. The
    # original error is chained so the underlying cause stays visible.
    raise RuntimeError("Please install sentence-transformers: pip install sentence-transformers") from e

# Sentence-embedding model used for the embeddings baseline.
sbert_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
sbert = SentenceTransformer(sbert_model_name)

def encode_sbert(texts, batch_size=256):
    """
    Encode texts with the module-level ``sbert`` model, one slice at a time.

    Args:
        texts: Sliceable, sized collection of strings.
        batch_size: Number of texts passed to ``sbert.encode`` per call.

    Returns:
        The per-slice embedding arrays stacked vertically with ``np.vstack``.
        Normalized embeddings are requested from the model.
    """
    slice_starts = range(0, len(texts), batch_size)
    encoded_slices = [
        sbert.encode(
            list(texts[start:start + batch_size]),
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        for start in slice_starts
    ]
    return np.vstack(encoded_slices)

# Embed both splits once; the classifier is trained on the embeddings.
X_train_emb = encode_sbert(X_train)
X_test_emb = encode_sbert(X_test)

# A logistic regression wrapped in a one-vs-rest classifier.
sbert_base_estimator = LogisticRegression(max_iter=300, C=2.0, solver='liblinear')
sbert_clf = OneVsRestClassifier(sbert_base_estimator)
sbert_clf.fit(X_train_emb, y_train)



0,1,2
,estimator,LogisticRegre...r='liblinear')
,n_jobs,
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,2.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,300


In [18]:
# Predict on the held-out embeddings and score the SBERT baseline.
y_pred_sbert = sbert_clf.predict(X_test_emb)

micro_avg_sbert = {'average': 'micro', 'zero_division': 0}
micro_f1_sbert = f1_score(y_test, y_pred_sbert, **micro_avg_sbert)
macro_f1_sbert = f1_score(y_test, y_pred_sbert, average='macro', zero_division=0)
micro_precision_sbert = precision_score(y_test, y_pred_sbert, **micro_avg_sbert)
micro_recall_sbert = recall_score(y_test, y_pred_sbert, **micro_avg_sbert)
report_sbert = classification_report(
    y_test, y_pred_sbert, target_names=list(mlb.classes_), zero_division=0
)

# Headline numbers first, then the per-genre breakdown.
sbert_scores = (
    ("micro_f1", micro_f1_sbert),
    ("macro_f1", macro_f1_sbert),
    ("micro_precision", micro_precision_sbert),
    ("micro_recall", micro_recall_sbert),
)
for metric_name, metric_value in sbert_scores:
    print(f"SBERT {metric_name}: {metric_value:.4f}")
print()
print(report_sbert)

# Store the classifier with the label binarizer and the name of the embedding
# model it was trained with.
sbert_bundle = {'clf': sbert_clf, 'mlb': mlb, 'sbert_model_name': sbert_model_name}
with open('../models/baseline_sbert_logreg.pkl', 'wb') as f:
    pickle.dump(sbert_bundle, f)

# Plain-text metrics: headline scores, a blank line, then the full report.
sbert_metrics_text = "\n".join([
    f"micro_f1: {micro_f1_sbert}",
    f"macro_f1: {macro_f1_sbert}",
    f"micro_precision: {micro_precision_sbert}",
    f"micro_recall: {micro_recall_sbert}",
    "",
    report_sbert,
])
with open('../models/metrics_sbert.txt', 'w') as f:
    f.write(sbert_metrics_text)

SBERT micro_f1: 0.5602
SBERT macro_f1: 0.3323
SBERT micro_precision: 0.6901
SBERT micro_recall: 0.4715

              precision    recall  f1-score   support

      Action       0.76      0.66      0.71       311
   Adventure       0.71      0.54      0.61       223
   Animation       0.41      0.15      0.22        47
   Biography       0.60      0.24      0.34        76
      Comedy       0.71      0.57      0.63       336
       Crime       0.58      0.51      0.54       178
 Documentary       0.00      0.00      0.00        13
       Drama       0.71      0.72      0.72       516
      Family       0.80      0.07      0.13        57
     Fantasy       0.69      0.10      0.17       115
     History       0.33      0.06      0.11        32
      Horror       0.75      0.48      0.58       123
       Music       0.82      0.32      0.46        28
     Musical       0.00      0.00      0.00        10
     Mystery       0.68      0.27      0.39       131
        News       0.00      0.