# Configure Session


In [None]:
%load_ext autoreload

import os
import sys

# Absolute path of the repository checkout.
# NOTE(review): hard-coded for one machine; anyone else must edit this line
# before the notebook can run.
project_root = "C:/Users/vasco/repos/Natural-Language"
# Make the repository importable (later cells import from `src.*`); the
# membership test avoids appending the same entry again when the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)
# Relative paths used later (e.g. "data/raw/train.txt") resolve against the
# repository root from here on.
os.chdir(project_root)

print("Current working directory:", os.getcwd())

# Load Data


In [None]:
%autoreload 2
from src.dataset import read_data

# Relative path; it resolves against the working directory set by os.chdir()
# in the session-setup cell.
path = "data/raw/train.txt"
# Column names handed to read_data.
# NOTE(review): presumably the raw file has no header row and these are
# assigned positionally — confirm in src/dataset.py.
columns = ["title", "from", "genre", "director", "description"]

df = read_data(path, columns)
# Last expression of the cell: the notebook renders the first three rows.
df.head(3)

# EDA


### First Impressions


In [None]:
%autoreload 2
# Quick overview of the raw frame: schema, repeated titles, most frequent
# directors / origins and the number of fully duplicated rows.

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() adds a stray "None" line.
df.info()
print("*" * 20)
# Count titles once and reuse the result for both the mask and the selection.
title_counts = df["title"].value_counts()
repeated_titles = title_counts[title_counts > 1].head(5)
print(repeated_titles)
print("*" * 20)
popular_directors = df["director"].value_counts().head(5)
print(popular_directors)
print("*" * 20)
print(df["from"].value_counts().head(5))
print("*" * 20)
# duplicated() flags rows that repeat an earlier row across all columns.
print("Number of duplicates:", df.duplicated().sum())

### Drop Duplicates


In [None]:
# keep=False flags every member of a duplicate group (not only the repeats),
# so sorting by title shows the identical rows next to each other.
df[df.duplicated(keep=False)].sort_values("title").head(20)

In [6]:
# Remove fully identical rows in place; pandas keeps the first occurrence by
# default and the index is not reset.
df.drop_duplicates(inplace=True)


### Plots


In [None]:
%autoreload 2

from src.plots import plot_movie_data

# Overview plots from the project helper; what is drawn is defined in
# src/plots.py, not in this notebook.
plot_movie_data(df)

In [None]:
%autoreload 2

from src.plots import plot_stopword_frequency

# Project helper applied to the "description" column.
# NOTE(review): presumably plots stop-word counts, going by the name — confirm
# in src/plots.py.
plot_stopword_frequency(df, "description")

In [None]:
# Same helper as in the previous cell, now applied to the "title" column.
plot_stopword_frequency(df, "title")

### Duplicate Candidates


In [None]:
%autoreload 2

from src.dataset import filter_duplicate_descriptions

# Project helper called with the "description" and "title" column names; only
# the first ten rows of its result are rendered.
# NOTE(review): presumably returns rows sharing a description — confirm in
# src/dataset.py.
filter_duplicate_descriptions(df, "description", "title").head(10)

In [9]:
%autoreload 2
from src.dataset import find_similar_descriptions

# Result is kept for the print_differences calls in the next cell.
# NOTE(review): similarity measure and threshold live in src/dataset.py and
# are not visible here.
similar_pairs = find_similar_descriptions(df, "description")

In [None]:
%autoreload 2
from src.dataset import print_differences

# Report, column by column, where the near-duplicate pairs disagree. Each of
# the first three reports is followed by the same blank separator; the last
# call stays a bare trailing expression so the cell output is unchanged.
for column in ("title", "director", "genre"):
    print_differences(df, similar_pairs, column)
    print("\n")
print_differences(df, similar_pairs, "from")

### Process Directors (NEEDS WORK!)


In [None]:
import re
from collections import defaultdict

import pandas as pd


def clean_director_name(name: str):
    """Return a comparison key for a director name.

    The name is lower-cased and every space, hyphen and period is removed, so
    spellings that differ only in those characters collapse to the same key.
    """
    # One translation pass deletes all three separator characters at once.
    separators = str.maketrans("", "", " -.")
    lowered = name.lower()
    return lowered.translate(separators)


def create_name_map(df: pd.DataFrame):
    """Group the raw director spellings by their normalised form.

    Each entry of the "director" column is treated as a comma-separated list
    of names. Every name is stripped of surrounding whitespace and filed under
    the key returned by ``clean_director_name``.

    Args:
        df: Frame with a "director" column of strings. Missing or non-string
            entries (e.g. NaN) are skipped instead of raising.

    Returns:
        A ``defaultdict(set)`` mapping normalised name -> set of raw spellings
        seen for it.
    """
    name_map = defaultdict(set)

    # Only one column is needed, so iterate it directly; iterrows() builds a
    # full Series per row and its index value was never used.
    for directors in df["director"]:
        if not isinstance(directors, str):
            # NaN / None has no names to split.
            continue

        for raw_name in directors.split(","):
            director = raw_name.strip()
            name_map[clean_director_name(director)].add(director)

    return name_map


# Normalised director name -> every raw spelling found in the data.
director_name_map = create_name_map(df)

# Keep only the keys that collect at least two distinct raw spellings; these
# are the candidates for inconsistent naming.
filtered_name_map = {
    normalized: spellings
    for normalized, spellings in director_name_map.items()
    if len(spellings) >= 2
}

print("Directors with more than 1 value:")
# Trailing expression: the notebook renders the dictionary.
filtered_name_map

In [5]:
import numpy as np

# Placeholder director values carry no usable name, so mark them as missing.
# The column is the lower-case "director" passed to read_data, and .loc needs
# a (row mask, column label) pair. Without the comma Python concatenates the
# two adjacent string literals, and df["Director"] raises KeyError first.
df.loc[df["director"] == "3 directors", "director"] = np.nan
df.loc[:, "director"] = df.loc[:, "director"].replace("Unknown", np.nan)

# Preprocessing (NEEDS WORK!)


In [22]:
%autoreload 2
from src.dataset import lemmatize_tokens, extract_noun_phrases, word_tokenize
import string


def preprocess_sentence(sentence: str) -> str:
    """Turn a raw sentence into a space-separated string of unique tokens.

    Punctuation is stripped and the text lower-cased before it is passed to
    ``word_tokenize`` and ``lemmatize_tokens``. Noun phrases come from
    ``extract_noun_phrases`` on the untouched sentence, and each phrase is
    collapsed into a single token by removing its internal whitespace.

    Args:
        sentence: Raw text of one title or description.

    Returns:
        The lemmatised tokens followed by the joined noun phrases, with
        duplicates removed and first-seen order preserved.
    """
    cleaned_sentence = sentence.translate(str.maketrans("", "", string.punctuation)).lower()
    tokens = word_tokenize(cleaned_sentence)
    # Noun phrases are extracted from the original sentence, which still has
    # its punctuation and casing.
    noun_phrases = extract_noun_phrases(sentence)
    noun_phrases_joined = ["".join(phrase.split()) for phrase in noun_phrases]
    lemmatized_tokens = lemmatize_tokens(tokens)
    # dict.fromkeys() removes duplicates but keeps insertion order. A set has
    # arbitrary order that changes between interpreter sessions (string hash
    # randomisation), which made order-sensitive features such as the title
    # n-grams non-reproducible.
    combined_tokens = list(dict.fromkeys(lemmatized_tokens + noun_phrases_joined))
    return " ".join(combined_tokens)


# Overwrite both free-text columns with their preprocessed form, title first
# and description second.
for text_column in ("title", "description"):
    df[text_column] = df[text_column].apply(preprocess_sentence)

# Feature Engineering


In [21]:
# Coarse geographic region for each value of the "from" column, written down
# grouped by region and then inverted into the origin -> region lookup that
# Series.map expects. Origins missing from the table end up as NaN.
origins_by_region = {
    "Western": ["American", "British", "Canadian", "Australian"],
    "South Asian": [
        "Bollywood",
        "Telugu",
        "Tamil",
        "Malayalam",
        "Bengali",
        "Kannada",
        "Marathi",
        "Punjabi",
        "Assamese",
        "Bangladeshi",
    ],
    "East Asian": ["Chinese", "Japanese", "South_Korean", "Hong Kong"],
    "Southeast Asian": ["Filipino", "Malaysian"],
    "European": ["Russian"],
    "Middle Eastern": ["Turkish", "Egyptian"],
}
region_by_origin = {
    origin: region
    for region, origins in origins_by_region.items()
    for origin in origins
}

df["region"] = df["from"].map(region_by_origin)

# Feature Selection


In [None]:
%autoreload 2
from src.logratioanalysis import plot_scree_subplots_for_genres

# Project helper called with the text column ("description") and the label
# column ("genre"). It runs before the label-encoding cell, so "genre" still
# holds the original labels at this point.
# NOTE(review): presumably draws one scree plot per genre — confirm in
# src/logratioanalysis.py.
plot_scree_subplots_for_genres(df, "description", "genre")

# Modelling


In [None]:
%autoreload 2
from sklearn.preprocessing import LabelEncoder

# Replace the genre labels by integer codes. The fitted encoder is kept so
# predictions can be mapped back to genre names later.
label_encoder = LabelEncoder()
label_encoder.fit(df["genre"])
df["genre"] = label_encoder.transform(df["genre"])

# Target is the encoded genre; every remaining column is a model input.
y_train = df["genre"]
X_train = df.drop(columns="genre")

In [None]:
%autoreload 2
from mlxtend.feature_selection import ColumnSelector
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from skrub import SelectCols, SimilarityEncoder

# Feature extraction: five parallel branches whose outputs make_union
# concatenates column-wise. The search cell addresses them as "pipeline-1" ...
# "pipeline-5" in this order, so reordering the branches requires updating the
# parameter names there as well.
# NOTE(review): despite its name, `text_pipeline` also holds the two
# categorical branches (region, from).
text_pipeline = make_union(
    # Branch 1 — title: TF-IDF over 2- to 4-grams, scaling without centering
    # (with_mean=False), then truncated SVD.
    make_pipeline(
        ColumnSelector("title", drop_axis=True),
        TfidfVectorizer(ngram_range=(2, 4)),
        StandardScaler(with_mean=False),
        TruncatedSVD(),
    ),
    # Branch 2 — description: same recipe with the vectorizer's default n-gram range.
    make_pipeline(
        ColumnSelector("description", drop_axis=True),
        TfidfVectorizer(),
        StandardScaler(with_mean=False),
        TruncatedSVD(),
    ),
    # Branch 3 — region: skrub similarity encoding of the engineered region column.
    make_pipeline(SelectCols("region"), SimilarityEncoder()),
    # Branch 4 — from: dense one-hot encoding.
    # NOTE(review): OneHotEncoder is left at its default handling of unknown
    # categories; a value seen only in a validation fold may raise — confirm.
    make_pipeline(SelectCols("from"), OneHotEncoder(sparse_output=False)),
    # Branch 5 — director: TF-IDF over unigrams and bigrams, scaled, then SVD.
    # NOTE(review): "Unknown" directors are replaced by NaN earlier in the
    # notebook; TfidfVectorizer is expected to reject NaN documents — confirm
    # they are filled before fitting.
    make_pipeline(
        ColumnSelector("director", drop_axis=True),
        TfidfVectorizer(ngram_range=(1, 2)),  # unigrams + bigrams
        StandardScaler(with_mean=False),
        TruncatedSVD(),
    ),
)

# Full model: concatenated features feeding a histogram-based gradient boosting
# classifier. TruncatedSVD is left at its default size here; n_components for
# branches 1, 2 and 5 is set by the randomized search.
pipeline = make_pipeline(text_pipeline, HistGradientBoostingClassifier())
# Trailing expression: the notebook renders the estimator diagram.
pipeline

In [None]:
from scipy.stats import loguniform, randint
from sklearn.model_selection import RandomizedSearchCV


# Search space. Keys use scikit-learn's "<step>__<param>" addressing with the
# step names auto-generated by make_pipeline / make_union
# ("histgradientboostingclassifier", "featureunion", "pipeline-N").
# scipy's randint(low, high) samples integers in [low, high), i.e. `high` is excluded.
param_distributions = {
    "histgradientboostingclassifier__learning_rate": loguniform(0.03, 0.1),
    "histgradientboostingclassifier__max_iter": randint(200, 400),
    "histgradientboostingclassifier__max_depth": randint(3, 7),
    "histgradientboostingclassifier__min_samples_leaf": randint(80, 100),
    # NOTE(review): with max_depth <= 6 a tree has at most 2**6 = 64 leaves, so
    # a leaf cap of 100-149 probably never binds — confirm this range is intended.
    "histgradientboostingclassifier__max_leaf_nodes": randint(100, 150),
    "histgradientboostingclassifier__l2_regularization": loguniform(0.001, 0.01),
    # SVD output sizes for the title (1), description (2) and director (5) branches.
    "featureunion__pipeline-1__truncatedsvd__n_components": randint(100, 200),
    "featureunion__pipeline-2__truncatedsvd__n_components": randint(400, 800),
    "featureunion__pipeline-5__truncatedsvd__n_components": randint(100, 200),
}

# 10 sampled configurations x 3 folds, scored by accuracy. random_state fixes
# the sampled configurations; error_score="raise" aborts on the first failing
# fit instead of recording a NaN score.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    random_state=42,
    verbose=4,
    error_score="raise",
)

# Run the search on the full training frame.
random_search.fit(X_train, y_train)

# Best configuration, its mean cross-validated accuracy, and the fitted best estimator.
best_params = random_search.best_params_
best_score = random_search.best_score_
best_model = random_search.best_estimator_

print("\n Best Parameters:", best_params)
print("\n Best Score:", best_score)