# Configure Session


In [None]:
%load_ext autoreload

import os
import sys

# Machine-specific checkout location. It stays the fallback so the notebook
# behaves exactly as before when no override is supplied.
default_project_root = "C:/Users/franc/Desktop/Cadeiras_Mestrado/NLP/Projeto/Natural-Language"

# Other machines can point at their own checkout through the NLP_PROJECT_ROOT
# environment variable instead of editing this cell.
project_root = os.environ.get("NLP_PROJECT_ROOT", default_project_root)

# Make the project-local packages (imported in later cells as `src.*`) importable.
if project_root not in sys.path:
    sys.path.append(project_root)

# Relative paths used later (e.g. "data/raw/train.txt") resolve against this directory.
os.chdir(project_root)

print("Current working directory:", os.getcwd())

# Load Data


In [None]:
# Install the notebook's third-party dependencies into the environment of the running kernel.
%pip install ipykernel python-dotenv numpy pandas scikit-learn spacy nltk matplotlib seaborn skrub textblob mlxtend rapidfuzz


In [None]:
%autoreload 2
from src.dataset import read_data

# Raw training file, given relative to the current working directory.
path = "data/raw/train.txt"
# Column names handed to read_data.
columns = ["title", "from", "genre", "director", "description"]

# Load the file through the project helper.
# NOTE(review): presumably returns a pandas DataFrame with the columns above — confirm in src/dataset.py.
df = read_data(path, columns)
# Preview the first three rows.
df.head(3)

# EDA


### First Impressions


In [None]:
%autoreload 2
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("*" * 20)
# Count the titles once, then keep the ones that occur more than once.
title_counts = df["title"].value_counts()
repeated_titles = title_counts[title_counts > 1].head(5)
print(repeated_titles)
print("*" * 20)
# Five directors with the most rows.
popular_directors = df["director"].value_counts().head(5)
print(popular_directors)
print("*" * 20)
# Five most frequent values of the "from" column.
print(df["from"].value_counts().head(5))
print("*" * 20)
# Rows that are exact copies of an earlier row.
print("Number of duplicates:", df.duplicated().sum())

### Drop Duplicates


In [None]:
# Show up to 20 fully duplicated rows, sorted by title so identical rows sit next to each other.
# keep=False flags every member of a duplicate group, not only the later repeats.
df[df.duplicated(keep=False)].sort_values("title").head(20)

In [18]:
# Remove rows that are exact copies of an earlier row, modifying df in place.
df.drop_duplicates(inplace=True)
# Renumber the index so it is continuous again. reset_index returns a new
# frame by default, so inplace=True is needed for the change to reach df.
df.reset_index(drop=True, inplace=True)

### Plots


In [None]:
%autoreload 2

from src.plots import plot_movie_data

# Draw the project's overview plots for df (the figures themselves are defined in src/plots.py).
plot_movie_data(df)

In [None]:
%autoreload 2

from src.plots import plot_stopword_frequency

# Run the project's stop-word frequency plot on the "description" column
# (presumably counts stop words in that column — confirm in src/plots.py).
plot_stopword_frequency(df, "description")

In [None]:
# Same stop-word frequency plot, this time for the "title" column.
plot_stopword_frequency(df, "title")

## Standardize Directors (with mapping)

In [None]:
# NOTE(review): defaultdict and re are not referenced in this cell — confirm whether they are still needed.
from collections import defaultdict
import re
# NOTE(review): the other project helpers in this notebook are imported from the `src`
# package (src.dataset, src.plots, ...); confirm that `directors` is importable as a
# top-level module from the project root.
from directors import create_name_map, map_director_names

# Create the mapping dictionary that maps the original director name to a standardized version
name_map = create_name_map(df)
# Map the names. The return value is not assigned, so this presumably updates df in
# place — verify against map_director_names.
map_director_names(df, name_map)

## Candidate Duplicates

In [None]:
from src.dataset import (
    find_similar_descriptions,
    validate_and_filter_duplicates_fuzzy,
)

# Step 1: collect candidate duplicates, i.e. pairs of rows with similar descriptions
# (the similarity criterion is defined inside find_similar_descriptions).
similar_pairs = find_similar_descriptions(df, "description")

# Step 2: validate the candidates by fuzzy-matching the director's name and filter
# the duplicates.
# NOTE(review): the result is not assigned; presumably df is modified in place — confirm
# against validate_and_filter_duplicates_fuzzy.
validate_and_filter_duplicates_fuzzy(
    df,
    similar_pairs,
    ["director"],
    threshold=80,
)


# Feature Engineering


In [22]:
# Values of the "from" column, grouped by the coarse region they are assigned to.
origins_by_region = {
    "Western": ("American", "British", "Canadian", "Australian"),
    "South Asian": (
        "Bollywood",
        "Telugu",
        "Tamil",
        "Malayalam",
        "Bengali",
        "Kannada",
        "Marathi",
        "Punjabi",
        "Assamese",
        "Bangladeshi",
    ),
    "East Asian": ("Chinese", "Japanese", "South_Korean", "Hong Kong"),
    "Southeast Asian": ("Filipino", "Malaysian"),
    "European": ("Russian",),
    "Middle Eastern": ("Turkish", "Egyptian"),
}

# Invert the grouping into the origin -> region lookup that Series.map expects.
region_by_origin = {
    origin: region
    for region, origins in origins_by_region.items()
    for origin in origins
}

# Origins that are absent from the lookup end up as NaN in "region".
df["region"] = df["from"].map(region_by_origin)

# Feature Selection


In [23]:
%autoreload 2
from src.logratioanalysis import LogRatioAnalysis

# Build one LogRatioAnalysis object per text column ("title" and "description"); both
# receive "genre" as the third argument.
# NOTE(review): presumably "genre" is the class column the tokens are scored against —
# confirm in src/logratioanalysis.py.
logratio_title = LogRatioAnalysis(df, "title", "genre")
logratio_description = LogRatioAnalysis(df, "description", "genre")

In [None]:
%autoreload 2
import nltk
# Fetch the NLTK "punkt" tokenizer resources (both the classic and the "punkt_tab" variant).
nltk.download('punkt')  # Download the necessary NLTK data
nltk.download('punkt_tab')

from src.logratioanalysis import plot_scree_subplots_for_genres

# Distinct genre values. NOTE(review): `genres` is not used in the cells visible here.
genres = df.genre.unique()
# Scree plots for the title analysis (presumably one subplot per genre — confirm in
# src/logratioanalysis.py).
plot_scree_subplots_for_genres(logratio_title)


In [None]:
%autoreload 2
# Same scree plots, this time for the description analysis.
plot_scree_subplots_for_genres(logratio_description)

In [33]:
%autoreload 2

# Select tokens for each text column. The integer argument is presumably the number of
# tokens to keep — confirm in LogRatioAnalysis.feature_selection.
description_tokens = logratio_description.feature_selection(25000)
# NOTE(review): title_tokens is not used in the cells visible here.
title_tokens = logratio_title.feature_selection(1000)

In [34]:
import nltk

def select_tokens(text, selected_tokens, tokenizer=None):
    """
    Keep only the tokens of a document that appear in ``selected_tokens``.

    Parameters:
    text (str): The text document to clean. Any non-string value (for example a
        missing value) is returned unchanged.
    selected_tokens (set or list): The tokens to retain in the text. A list or
        tuple is converted to a frozenset so each membership test is O(1);
        pass a set to avoid that conversion on every call.
    tokenizer (function): Callable that turns a string into a sequence of tokens.
        Defaults to nltk.word_tokenize, looked up only when no tokenizer is given.

    Returns:
    str: The retained tokens, in their original order, joined by single spaces.
    """
    # Leave non-text values untouched.
    if not isinstance(text, str):
        return text

    if tokenizer is None:
        # Deferred import: nltk is only needed when the default tokenizer is used.
        import nltk

        tokenizer = nltk.word_tokenize

    # Scanning a list costs O(len(selected_tokens)) per token; hash it once instead.
    if isinstance(selected_tokens, (list, tuple)):
        selected_tokens = frozenset(selected_tokens)

    return " ".join(token for token in tokenizer(text) if token in selected_tokens)

In [35]:
# Build the lookup structure once: testing membership against a list costs
# O(len(list)) for every token of every row. Only list/tuple inputs are converted,
# so any other container returned by feature_selection is passed through unchanged.
if isinstance(description_tokens, (list, tuple)):
    description_vocabulary = frozenset(description_tokens)
else:
    description_vocabulary = description_tokens

# Keep only the selected tokens in each description.
df["cleaned_description"] = df["description"].apply(
    select_tokens, selected_tokens=description_vocabulary
)

# Modelling


In [None]:
%autoreload 2
from sklearn.preprocessing import LabelEncoder

# Missing directors become empty strings, so the text vectorizer applied to this
# column later only ever receives strings.
df["director"] = df["director"].fillna("")

# Replace the genre values with integer codes; label_encoder keeps the fitted mapping.
# NOTE(review): "genre" is overwritten in place, so re-running this cell fits the
# encoder on the integer codes and the original genre names are lost from it.
label_encoder = LabelEncoder()
df["genre"] = label_encoder.fit_transform(df["genre"])

# Target = encoded genre; features = every other column of df.
y_train = df["genre"]
X_train = df.drop(columns="genre")

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {len(y_train)}")


In [39]:
%autoreload 2
from mlxtend.feature_selection import ColumnSelector
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC  # Import the correct SVM classifier
from skrub import SelectCols, SimilarityEncoder

# Feature union: every branch picks one column of X and encodes it; the encoded
# blocks are concatenated side by side.
text_pipeline = make_union(
    # Title: TF-IDF over 2- to 4-grams.
    make_pipeline(
        ColumnSelector("title", drop_axis=True),
        TfidfVectorizer(ngram_range=(2, 4)),
    ),
    # Token-filtered description: TF-IDF with default settings.
    make_pipeline(
        ColumnSelector("cleaned_description", drop_axis=True),
        TfidfVectorizer(),
    ),
    # Region: skrub similarity encoding.
    make_pipeline(SelectCols("region"), SimilarityEncoder()),
    # Origin: one-hot encoding. handle_unknown="ignore" encodes a category that was
    # not present during fit as all zeros instead of raising; without it a rare
    # origin that lands only in a validation fold (or in unseen data) aborts the run.
    make_pipeline(
        SelectCols("from"),
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
    ),
    # Director: TF-IDF over unigrams and bigrams.
    make_pipeline(
        ColumnSelector("director", drop_axis=True),
        TfidfVectorizer(ngram_range=(1, 2)),
    ),
)

# Full model: the feature union followed by a support vector classifier.
# make_pipeline names this last step "svc", which is the prefix the
# hyper-parameter search uses ("svc__C", "svc__kernel", ...).
pipeline = make_pipeline(text_pipeline, SVC())


In [None]:
from scipy.stats import loguniform, randint
from sklearn.model_selection import RandomizedSearchCV

# Search space for the SVC step of `pipeline` (addressed through the "svc__" prefix).
param_distributions = {
    "svc__C": loguniform(0.01, 10),  # regularization strength, sampled log-uniformly
    "svc__kernel": ["linear", "rbf", "poly", "sigmoid"],  # candidate kernels
    "svc__gamma": ["scale", "auto"],  # kernel coefficient heuristics
    "svc__degree": randint(2, 5),  # polynomial degree; upper bound is exclusive
    "svc__coef0": loguniform(0.1, 1.0),  # independent term of the poly/sigmoid kernels
}

# Search configuration: 10 sampled candidates, 3-fold CV, accuracy as the metric.
# error_score="raise" makes a failing fit stop the search instead of being scored.
search_settings = {
    "param_distributions": param_distributions,
    "n_iter": 10,
    "cv": 3,
    "scoring": "accuracy",
    "random_state": 42,
    "verbose": 4,
    "error_score": "raise",
}
random_search = RandomizedSearchCV(pipeline, **search_settings)

# Run the search on the training data.
random_search.fit(X_train, y_train)

# Winning configuration, its mean cross-validated score, and the refitted pipeline.
best_params, best_score, best_model = (
    random_search.best_params_,
    random_search.best_score_,
    random_search.best_estimator_,
)

print("\nBest Parameters:", best_params)
print("\nBest Score:", best_score)

# Report the size of the data the search was run on.
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {len(y_train)}",
    sep="\n",
)