# Configurate Session


In [None]:
%load_ext autoreload

import os
import sys

project_root = "C:/Users/vasco/repos/Natural-Language"
if project_root not in sys.path:
    sys.path.append(project_root)
os.chdir(project_root)

print("Current working directory:", os.getcwd())

# Load Data


In [None]:
%autoreload 2
from src.dataset import read_data

path = "data/raw/train.txt"
columns = ["title", "from", "genre", "director", "description"]

df = read_data(path, columns)
df.head(3)

# EDA


### First Impressions


In [None]:
%autoreload 2
print(df.info())
print("*" * 20)
repeated_titles = df["title"].value_counts()[df["title"].value_counts() > 1].head(5)
print(repeated_titles)
print("*" * 20)
popular_directors = df["director"].value_counts().head(5)
print(popular_directors)
print("*" * 20)
print(df["from"].value_counts().head(5))
print("*" * 20)
print(df["genre"].value_counts())
print("*" * 20)
print("Number of duplicates:", df.duplicated().sum())

### Drop Duplicates


In [None]:
df[df.duplicated(keep=False)].sort_values("title").head(6)

In [5]:
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

### Plots


In [6]:
%autoreload 2

from src.plots import (
    plot_movie_data,
    plot_stopword_frequency,
    get_text_statistics,
    plot_histograms,
    plot_boxplots,
    plot_pca_tfidf,
    plot_correlation_matrix,
    plot_pca,
)

In [None]:
plot_movie_data(df)

In [None]:
plot_stopword_frequency(df, "description")

In [None]:
plot_stopword_frequency(df, "title")

In [10]:
stats = get_text_statistics(df, "description")

In [None]:
plot_histograms(stats)

In [None]:
plot_boxplots(stats)

In [None]:
pca_description_df, pca_description = plot_pca_tfidf(df, "description", "genre")

In [None]:
pca_title_df, pca_title = plot_pca_tfidf(df, "title", "genre")

In [None]:
plot_correlation_matrix(stats)
pca_stats_df, pca_stats = plot_pca(stats, df["genre"])

# Train-Test Split

In [6]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df["genre"])

In [7]:
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Duplicate Candidates & Cleaning Directors


In [8]:
%autoreload 2

from src.director_encoder import (
    filter_duplicate_descriptions,
    find_similar_descriptions,
    print_differences,
    get_encoding_map,
    encode_directors,
)

In [None]:
filter_duplicate_descriptions(train_df, "description", "title").head(6)

In [13]:
similar_pairs = find_similar_descriptions(train_df, "description")

In [None]:
print_differences(train_df, similar_pairs, "title")
print("\n")
print_differences(train_df, similar_pairs, "director")
print("\n")
print_differences(train_df, similar_pairs, "genre")
print("\n")
print_differences(train_df, similar_pairs, "from")

In [9]:
mapping = get_encoding_map(train_df, test_df)

In [10]:
train_df = encode_directors(train_df, "train", mapping)
test_df = encode_directors(test_df, "test", mapping)

train_df.drop("director", axis=1, inplace=True)
test_df.drop("director", axis=1, inplace=True)

In [12]:
train_df.drop_duplicates(inplace=True)

# Generate Embeddings

In [None]:
from src.embedding import process_and_save_in_chunks
from sentence_transformers import SentenceTransformer

# Example of how embeddings were generated, in practice we ran the embedding.py file
model = SentenceTransformer("avsolatorio/GIST-small-Embedding-v0")
output_path = "data/processed/processed_embeddings_train.csv"
process_and_save_in_chunks(train_df, "description", model, chunk_size=300, output_path=output_path)

In [None]:
import pandas as pd

embedding_train_path = "data/processed/processed_embeddings_train.csv"
embedding_train_df = pd.read_csv(embedding_train_path)

embedding_train_df = pd.concat([train_df["genre"], embedding_train_df], axis=1)
embedding_train_df.drop_duplicates(inplace=True)
embedding_train_df.reset_index(drop=True, inplace=True)
embedding_train_df.head(3)

In [None]:
output_path = "data/processed/processed_embeddings_test.csv"
process_and_save_in_chunks(test_df, "description", model, chunk_size=300, output_path=output_path)

In [None]:
embedding_test_path = "data/processed/processed_embeddings_test.csv"
embedding_test_df = pd.read_csv(embedding_test_path)

embedding_test_df = pd.concat([test_df["genre"], embedding_test_df], axis=1)
embedding_test_df.reset_index(drop=True, inplace=True)
embedding_test_df.head(3)

In [None]:
embeddings_aux = embedding_train_df.drop("genre", inplace=False, axis=1)
pca_embedding_df, pca_embedding = plot_pca(embeddings_aux, embedding_train_df["genre"])

# Preprocessing


In [13]:
%autoreload 2

from src.preprocessing import preprocess_sentence

# Expand contractions, extract noun-phrases, tokenize and lemmatize (optionally remove stopwords)
train_df["title"] = train_df["title"].apply(preprocess_sentence)
train_df["description"] = train_df["description"].apply(preprocess_sentence)

test_df["title"] = test_df["title"].apply(preprocess_sentence)
test_df["description"] = test_df["description"].apply(preprocess_sentence)

# Feature Engineering

In [14]:
%autoreload 2

from src.features import REGION_MAP, select_tokens

In [22]:
train_df["region"] = train_df["from"].replace(REGION_MAP)
test_df["region"] = test_df["from"].replace(REGION_MAP)

# Feature Selection (Log Ratio Analysis)


In [19]:
%autoreload 2
from src.logratioanalysis import LogRatioAnalysis, plot_scree_subplots_for_genres

In [None]:
logratio_title = LogRatioAnalysis(train_df, "title", "genre")
logratio_description = LogRatioAnalysis(train_df, "description", "genre")

In [None]:
plot_scree_subplots_for_genres(logratio_title)

In [None]:
plot_scree_subplots_for_genres(logratio_description)

In [22]:
description_tokens = logratio_description.feature_selection(25000)
title_tokens = logratio_title.feature_selection(1000)

In [23]:
train_df["selected_description"] = train_df["description"].apply(select_tokens, selected_tokens=description_tokens)
test_df["selected_description"] = test_df["description"].apply(select_tokens, selected_tokens=description_tokens)

# Modelling


In [16]:
from sklearn.preprocessing import LabelEncoder
from mlxtend.feature_selection import ColumnSelector
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC
from skrub import SelectCols, SimilarityEncoder
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from src.plots import plot_confusion_matrix

In [24]:
label_encoder = LabelEncoder()
label_encoder.fit(train_df["genre"])
train_df["genre"] = label_encoder.transform(train_df["genre"])
decoded_class_names = label_encoder.inverse_transform(range(len(label_encoder.classes_)))

X_train = train_df.drop("genre", axis=1)
y_train = train_df["genre"]

In [25]:
test_df["genre"] = label_encoder.transform(test_df["genre"])

X_test = test_df.drop("genre", axis=1)
y_test = test_df["genre"]

### Scenario 1: HistGradientBoostingClassifier w/ SVD

In [None]:
text_pipeline_1 = make_union(
    make_pipeline(
        ColumnSelector("title", drop_axis=True),
        TfidfVectorizer(),
        TruncatedSVD(),
    ),
    make_pipeline(
        ColumnSelector("description", drop_axis=True),
        TfidfVectorizer(),
        TruncatedSVD(),
    ),
    make_pipeline(SelectCols("region"), SimilarityEncoder()),
    make_pipeline(SelectCols("from"), OneHotEncoder(sparse_output=False)),
    make_pipeline(SelectCols("encoded_director")),
)

pipeline_1 = make_pipeline(text_pipeline_1, HistGradientBoostingClassifier())
pipeline_1

In [None]:
param_distributions = {
    "histgradientboostingclassifier__learning_rate": np.logspace(-3, 0, 100),
    "histgradientboostingclassifier__max_iter": np.arange(100, 500, 50),
    "histgradientboostingclassifier__max_depth": np.arange(3, 15),
    "histgradientboostingclassifier__min_samples_leaf": np.arange(1, 51, 5),
    "histgradientboostingclassifier__max_leaf_nodes": np.arange(10, 301, 10),
    "histgradientboostingclassifier__l2_regularization": np.logspace(-4, 0, 100),
    "histgradientboostingclassifier__scoring": ["accuracy"],
    "featureunion__pipeline-1__truncatedsvd__n_components": np.arange(200, 400, 50),
    "featureunion__pipeline-2__truncatedsvd__n_components": np.arange(4000, 6000, 500),
    "featureunion__pipeline-5__truncatedsvd__n_components": np.arange(200, 400, 50),
}

In [None]:
random_search_1 = RandomizedSearchCV(
    pipeline_1,
    param_distributions=param_distributions,
    n_iter=10,
    cv=4,
    scoring="accuracy",
    random_state=42,
    verbose=4,
    error_score="raise",
)
random_search_1.fit(X_train, y_train)

In [None]:
best_params = random_search_1.best_params_
best_score = random_search_1.best_score_
best_model = random_search_1.best_estimator_

print("\n Best Parameters:", best_params)
print("\n Best Score:", best_score)

In [None]:
# TODO: use best metrics to predict on test set
pipeline_1.set_params(**best_params)
pipeline_1.fit(X_train, y_train)
y_pred = pipeline_1.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm,decoded_class_names )

### Scenario 2: SVM w/o feature selection

In [None]:
text_pipeline_2 = make_union(
    make_pipeline(
        ColumnSelector("title", drop_axis=True),
        TfidfVectorizer(),
    ),
    make_pipeline(
        ColumnSelector("description", drop_axis=True),
        TfidfVectorizer(),
    ),
    make_pipeline(SelectCols("region"), SimilarityEncoder()),
    make_pipeline(SelectCols("from"), OneHotEncoder(sparse_output=False)),
    make_pipeline(SelectCols("encoded_director")),
)

pipeline_2 = make_pipeline(text_pipeline_2, SVC())
pipeline_2

In [None]:
param_distributions = {
    "svc__C": np.logspace(-3, 3, 100),
    "svc__kernel": ["poly", "sigmoid", "rbf", "linear"],
    "svc__gamma": ["scale", "auto"] + list(np.logspace(-4, 1, 100)),
    "svc__degree": np.arange(2, 6),
    "svc__coef0": np.linspace(-1, 1, 100),
    "svc__class_weight": ["balanced", None],
}

In [None]:
random_search_2 = RandomizedSearchCV(
    pipeline_2,
    param_distributions=param_distributions,
    n_iter=50,
    cv=4,
    scoring="accuracy",
    random_state=42,
    verbose=4,
    error_score="raise",
)
random_search_2.fit(X_train, y_train)

In [None]:
best_params = random_search_2.best_params_
best_score = random_search_2.best_score_
best_model = random_search_2.best_estimator_

print("\nBest Parameters:", best_params)
print("\nBest Score:", best_score)

Best Parameters: {
    
    'svc__C': np.float64(3.9676050770529883), 

    'svc__coef0': np.float64(0.3991305878561679), 
    
    'svc__degree': 4, 'svc__gamma': 'auto',
    
    'svc__kernel': 'linear'}

Best Score: 0.6512520009226962

In [None]:
# TODO: use best metrics to predict on test set
pipeline_2.set_params(**best_params)
pipeline_2.fit(X_train, y_train)
y_pred = pipeline_2.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, decoded_class_names )

### Scenario 3: SVM w/ Feature Selection

In [None]:
text_pipeline_3 = make_union(
    make_pipeline(
        ColumnSelector("title", drop_axis=True),
        TfidfVectorizer(ngram_range=(2, 4)),
    ),
    make_pipeline(
        ColumnSelector("selected_description", drop_axis=True),
        TfidfVectorizer(),
    ),
    make_pipeline(SelectCols("region"), SimilarityEncoder()),
    make_pipeline(SelectCols("from"), OneHotEncoder(sparse_output=False)),
    make_pipeline(SelectCols("encoded_director")),
)

pipeline_3 = make_pipeline(text_pipeline_3, SVC())
pipeline_3

In [None]:
random_search_3 = RandomizedSearchCV(
    pipeline_3,
    param_distributions=param_distributions,
    n_iter=50,
    cv=4,
    scoring="accuracy",
    random_state=42,
    verbose=4,
    error_score="raise",
)
random_search_3.fit(X_train, y_train)

In [None]:
best_params = random_search_3.best_params_
best_score = random_search_3.best_score_
best_model = random_search_3.best_estimator_

print("\nBest Parameters:", best_params)
print("\nBest Score:", best_score)

Best Parameters: {
    
    'svc__C': np.float64(3.9676050770529883), 

    'svc__coef0': np.float6(03991305878561679), 

    'svc__degree': 4, 

    'svc__gamma': 'auto', 

    'svc__kernel': 'linear'}

Best Score: 0.6364193794168839

In [None]:
# TODO: use best metrics to predict on test set
pipeline_3.set_params(**best_params)
pipeline_3.fit(X_train, y_train)
y_pred = pipeline_3.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, decoded_class_names )

### Modelling with Embeddings

In [17]:
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(train_df["genre"])
embedding_train_df["genre"] = label_encoder.transform(embedding_train_df["genre"])
decoded_class_names = label_encoder.inverse_transform(range(len(label_encoder.classes_)))

X_train = embedding_train_df.drop("genre", axis=1)
y_train = embedding_train_df["genre"]

In [18]:
embedding_test_df["genre"] = label_encoder.transform(embedding_test_df["genre"])

X_test = embedding_test_df.drop("genre", axis=1)
y_test = embedding_test_df["genre"]

### Scenario 4: SVM w/ Embeddings  

In [19]:
param_distributions = {
    "C": np.logspace(-3, 3, 100),
    "kernel": ["poly", "sigmoid"],
    "gamma": ["scale", "auto"] + list(np.logspace(-4, 1, 100)),
    "degree": np.arange(2, 6),
    "coef0": np.linspace(-1, 1, 100),
    "class_weight": ["balanced", None],
}

In [None]:
random_search_4 = RandomizedSearchCV(
    SVC(),
    param_distributions=param_distributions,
    n_iter=50,
    scoring="accuracy",
    cv=4,
    verbose=4,
    random_state=42,
    n_jobs=-1,
)

random_search_4.fit(X_train, y_train)

In [None]:
best_params = random_search_4.best_params_
best_score = random_search_4.best_score_
best_model = random_search_4.best_estimator_

print("\nBest Parameters:", best_params)
print("\nBest Score:", best_score)

Best Parameters: 
{'kernel': 'poly',

'gamma': 7.054802310718645,

'degree': 3,

'coef0': -0.050505050505050386,

'class_weight': None,

'C': 0.0026560877829466868}

Best Score: 0.6867000556483027

In [22]:
# TODO: use best metrics to predict on test set
svc = SVC()
svc.set_params(**best_params)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

Test Accuracy: 0.676214196762142

In [None]:
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
cm = cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, decoded_class_names)

### Scenario 5: HistGradientBoosting w/ Embeddings

In [32]:
param_distributions = {
    "learning_rate": np.logspace(-3, 0, 100),
    "max_iter": np.arange(100, 500, 50),
    "max_depth": np.arange(3, 15),
    "min_samples_leaf": np.arange(1, 51, 5),
    "max_leaf_nodes": np.arange(10, 301, 10),
    "l2_regularization": np.logspace(-4, 0, 100),
    "scoring": ["accuracy"],
}

In [None]:
random_search_5 = RandomizedSearchCV(
    HistGradientBoostingClassifier(),
    param_distributions=param_distributions,
    n_iter=50,
    scoring="accuracy",
    cv=4,
    verbose=4,
    random_state=42,
    n_jobs=-1,
)

random_search_5.fit(X_train, y_train)

In [None]:
best_params = random_search_5.best_params_
best_score = random_search_5.best_score_
best_model = random_search_5.best_estimator_

print("\nBest Parameters:", best_params)
print("\nBest Score:", best_score)

Best parameters found: {
    
    'scoring': 'accuracy', 

    'min_samples_leaf': 6, 

    'max_leaf_nodes': 50, 

    'max_iter': 350, 

    'max_depth': 6, 

    'learning_rate': 0.1, 

    'l2_regularization': 0.0004430621457583882, 
    }
Best score found: 0.671883263271661

In [None]:
# TODO: use best metrics to predict on test set
hgb = HistGradientBoostingClassifier()
hgb.set_params(**best_params)
hgb.fit(X_train, y_train)
y_pred = hgb.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, decoded_class_names)

# Predict Test Set

In [None]:
path_test = "data/raw/test_no_labels.txt"
columns_test = ["title", "from", "director", "description"]

df_no_labels_test = read_data(path_test, columns_test)
df_no_labels_test.head(3)

In [None]:
path_train = "data/raw/train.txt"
columns = ["title", "from", "genre", "director", "description"]

df = read_data(path, columns)
df.head(3)

In [None]:
train_description = df["description"].dropna().unique()
test_description = df_no_labels_test["description"].dropna().unique()

overlapping_descriptions = set(train_description).intersection(set(test_description))
print(f"{len(overlapping_descriptions)} overlapping descriptions found.")

In [None]:
rule_based_dict = df[df["description"].isin(overlapping_descriptions)].set_index("description")["genre"].to_dict()


def rule_based_prediction(description):
    if description in rule_based_dict:
        return rule_based_dict[description]
    return None


df_no_labels_test["rule_based_prediction"] = df_no_labels_test["description"].apply(rule_based_prediction)

df_no_labels_test[df_no_labels_test["description"].isin(overlapping_descriptions)].head(15)

In [None]:
output_path = "data/processed/processed_embeddings_no_labels_test.csv"
process_and_save_in_chunks(df_no_labels_test, "description", model, chunk_size=300, output_path=output_path)