In [None]:
%load_ext autoreload

import os
import sys

# Root of the repository checkout. The default is the original author's
# local path; set the NL_PROJECT_ROOT environment variable to run the
# notebook from a different machine or checkout without editing this cell.
project_root = os.environ.get(
    "NL_PROJECT_ROOT", "C:/Users/vasco/repos/Natural-Language"
)

# Make `src.*` importable and resolve relative data paths from the root.
if project_root not in sys.path:
    sys.path.append(project_root)
os.chdir(project_root)

print("Current working directory:", os.getcwd())

In [None]:
%autoreload 2
from src.dataset import get_data

# Raw training file; the path is relative to the working directory set in
# the first cell.
path = "data/raw/train.txt"

# Column names passed to get_data, in this order.
columns = [
    "Title",
    "From",
    "Genre",
    "Director",
    "Description",
]

df = get_data(path, columns)

# Preview the first three rows.
df.head(3)

In [None]:
%autoreload 2
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("*" * 20)

# Titles that appear more than once (top five by frequency). The counts are
# computed once and reused for both the filter and the selection.
title_counts = df["Title"].value_counts()
repeated_titles = title_counts[title_counts > 1].head(5)
print(repeated_titles)
print("*" * 20)

# Five most frequent values in the Director column.
popular_directors = df["Director"].value_counts().head(5)
print(popular_directors)
print("*" * 20)

# Number of movies per origin.
print(df["From"].value_counts())

- Duplicated movies (check both the titles and the descriptions)
- "Unknown" placeholder values in the Director column
- Unusual symbols in some titles


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot styles
sns.set(style="whitegrid")

# 1. Distribution of Genres (most frequent genre first)
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    y="Genre",
    data=df,
    order=df["Genre"].value_counts().index,
    ax=ax,
)
ax.set_title("Distribution of Movie Genres")
ax.set_xlabel("Count")
ax.set_ylabel("Genre")
plt.show()

# The ten most frequent directors are computed once and shared by plots 2
# and 3 so that both figures use exactly the same set and ordering.
top_directors = df["Director"].value_counts().head(10)
top_10_directors = top_directors.index

# 2. Distribution of Directors (Top 10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_directors.values, y=top_directors.index, ax=ax)
ax.set_title("Top 10 Most Popular Directors")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Director")
plt.show()

# 3. Genre vs Director Count (Top 10 Directors)
filtered_df = df[df["Director"].isin(top_10_directors)]

fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(
    y="Director",
    hue="Genre",
    data=filtered_df,
    # Keep the same most-to-least-frequent director order as plot 2;
    # without it the rows appear in order of first occurrence in the data.
    order=list(top_10_directors),
    ax=ax,
)
ax.set_title("Top 10 Directors by Genre")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Director")
ax.legend(loc="upper right")
plt.show()

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    Two empty sets have an empty union; in that case 0.0 is returned
    instead of dividing by zero.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if not combined:
        return 0.0
    return len(shared) / len(combined)


def tokenize_description(description):
    """Lower-case a description and return its whitespace-separated words as a set."""
    lowered = description.lower()
    return {word for word in lowered.split()}


import numpy as np

# Load and clean the descriptions (missing descriptions become empty strings)
descriptions = df["Description"].fillna("")

# Step 1: Compute cosine similarity using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions)
cosine_sim = cosine_similarity(tfidf_matrix)

# Threshold for cosine similarity
cosine_threshold = 0.8

# Step 2: Filter candidates using cosine similarity.
# Only the strict upper triangle (j > i) is inspected so each unordered pair
# is reported once and self-matches are skipped. np.nonzero returns hits in
# row-major order, i.e. the same (i ascending, then j ascending) order as a
# nested Python loop, without iterating over every cell in Python.
above_threshold = np.triu(cosine_sim >= cosine_threshold, k=1)
candidate_pairs = [
    (int(i), int(j), cosine_sim[i, j])
    for i, j in zip(*np.nonzero(above_threshold))
]

# Step 3: Apply Jaccard similarity to the candidate pairs
final_similar_pairs = []
jaccard_threshold = 0.7  # Adjust the threshold as needed

for i, j, cos_sim in candidate_pairs:
    # i and j are row *positions* in the TF-IDF matrix, so the text is looked
    # up positionally (.iloc) rather than by index label (.loc); the two only
    # coincide when df has a default 0..n-1 index. Reading from the
    # NaN-filled `descriptions` also guarantees a string is tokenized.
    set1 = tokenize_description(descriptions.iloc[i])
    set2 = tokenize_description(descriptions.iloc[j])
    jac_sim = jaccard_similarity(set1, set2)

    if jac_sim >= jaccard_threshold:
        final_similar_pairs.append((i, j, cos_sim, jac_sim))

In [None]:
print("\n Different Titles: \n")
# The pair indices are row positions in the similarity matrix, so titles are
# fetched positionally (.iloc) instead of by index label (.loc).
titles = df["Title"]
for i, j, cos_sim, jac_sim in final_similar_pairs:
    title_i = titles.iloc[i]
    title_j = titles.iloc[j]
    # Near-duplicate descriptions filed under different titles.
    if title_i != title_j:
        print(
            f"Different Titles: {title_i} and {title_j}, with cosine similarity of {cos_sim:.4f} and Jaccard similarity of {jac_sim:.4f}"
        )

In [None]:
# Show the rows for the two titles under inspection.
df[df["Title"].isin(["Lakshmi Putrudu", "Vambu Sandai"])]

In [None]:
# Header now names what this cell reports (it previously said "Titles").
print("\n Different Directors: \n")
# The pair indices are row positions in the similarity matrix, so directors
# are fetched positionally (.iloc) instead of by index label (.loc).
directors = df["Director"]
# The fourth tuple element is unpacked as `jac_sim` so the message prints
# this pair's Jaccard score; the previous name `jac_simin` left the f-string
# reading a stale `jac_sim` from an earlier cell.
for i, j, cos_sim, jac_sim in final_similar_pairs:
    director_i = directors.iloc[i]
    director_j = directors.iloc[j]
    # NOTE(review): two missing (NaN) directors compare as different here.
    if director_i != director_j:
        print(
            f"Different Directors: {director_i} and {director_j}, with cosine similarity of {cos_sim:.4f} and Jaccard similarity of {jac_sim:.4f}"
        )

In [None]:
print("\n Different Genre: \n")
# The pair indices are row positions in the similarity matrix, so genres are
# fetched positionally (.iloc) instead of by index label (.loc).
genres = df["Genre"]
# The fourth tuple element is unpacked as `jac_sim` so the message prints
# this pair's Jaccard score; the previous name `jac_simin` left the f-string
# reading a stale `jac_sim` from an earlier cell.
for i, j, cos_sim, jac_sim in final_similar_pairs:
    genre_i = genres.iloc[i]
    genre_j = genres.iloc[j]
    # Near-duplicate descriptions that carry conflicting genre labels.
    if genre_i != genre_j:
        print(
            f"Different Genre: {genre_i} and {genre_j}, with cosine similarity of {cos_sim:.4f} and Jaccard similarity of {jac_sim:.4f}"
        )

In [5]:
%autoreload 2
import numpy as np

from src.dataset import preprocess_sentence

# Treat the literal "Unknown" director placeholder as a missing value.
df.loc[:, "Director"] = df["Director"].replace("Unknown", np.nan)

# Run both free-text columns through the project's preprocess_sentence helper.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# feeds already-processed values back into preprocess_sentence — confirm the
# helper tolerates that.
for text_column in ("Title", "Description"):
    df[text_column] = df[text_column].apply(preprocess_sentence)

In [6]:
%autoreload 2
# Origins (values of the "From" column) grouped by the coarse region they
# are assigned to.
origins_by_region = {
    "Western": ["American", "British", "Canadian", "Australian"],
    "South Asian": [
        "Bollywood",
        "Telugu",
        "Tamil",
        "Malayalam",
        "Bengali",
        "Kannada",
        "Marathi",
        "Punjabi",
        "Assamese",
        "Bangladeshi",
    ],
    "East Asian": ["Chinese", "Japanese", "South_Korean", "Hong Kong"],
    "Southeast Asian": ["Filipino", "Malaysian"],
    "European": ["Russian"],
    "Middle Eastern": ["Turkish", "Egyptian"],
}

# Invert the grouping into the origin -> region lookup used by Series.map.
region_by_origin = {
    origin: region
    for region, origins in origins_by_region.items()
    for origin in origins
}

# Any "From" value that is not a key of the lookup maps to NaN.
df["Region"] = df["From"].map(region_by_origin)

In [None]:
%autoreload 2
from joblib import Memory
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skrub import SelectCols, SimilarityEncoder
from xgboost import XGBClassifier


def identity_tokenizer(text):
    """Return the input unchanged.

    Passed as the ``tokenizer`` of the TfidfVectorizer steps in this cell,
    so the vectorizer uses each document as-is as its token sequence.
    """
    tokens = text
    return tokens


class SelectColumnsTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns a copy of ``X[columns]``.

    ``columns`` is used directly as the indexing key, so its type (single
    label or list of labels) determines what the selection returns.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return an independent copy of the configured column selection."""
        return X[self.columns].copy()


# NOTE(review): `memory` is created but not passed to any pipeline in this
# cell, so no caching takes effect here; pass it via `memory=` to
# make_pipeline if transformer caching is intended.
memory = Memory(location="cache_directory")

# Feature extraction: one branch per input column, concatenated side by side.
text_pipeline = make_union(
    # Title: TF-IDF over the column's values used directly as tokens
    # (identity tokenizer), scaled without centering, then reduced by SVD.
    make_pipeline(
        SelectColumnsTransformer("Title"),
        TfidfVectorizer(
            tokenizer=identity_tokenizer,
            # A custom tokenizer makes token_pattern unused; None silences
            # scikit-learn's "token_pattern will not be used" warning.
            token_pattern=None,
            lowercase=False,
        ),
        StandardScaler(with_mean=False),
        # Seeded so the randomized SVD gives reproducible components.
        TruncatedSVD(n_components=50, random_state=42),
    ),
    # Description: same recipe with a larger latent dimension.
    make_pipeline(
        SelectColumnsTransformer("Description"),
        TfidfVectorizer(
            tokenizer=identity_tokenizer,
            token_pattern=None,
            lowercase=False,
        ),
        StandardScaler(with_mean=False),
        TruncatedSVD(n_components=100, random_state=42),
    ),
    make_pipeline(SelectCols("Region"), SimilarityEncoder()),
    # handle_unknown="ignore": a category that is absent from a training
    # fold but present in the validation fold is encoded as all zeros
    # instead of raising during cross-validation.
    make_pipeline(SelectCols("From"), OneHotEncoder(handle_unknown="ignore")),
)

# Full model: features followed by a seeded gradient-boosted classifier.
pipeline = make_pipeline(text_pipeline, XGBClassifier(random_state=42))

pipeline

In [None]:
from scipy.stats import loguniform, randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

# Encode the genre labels as integer class ids.
# NOTE: this overwrites df["Genre"] in place, so cells that read the column
# afterwards see the encoded integers rather than the original labels.
label_encoder = LabelEncoder()
df["Genre"] = label_encoder.fit_transform(df["Genre"])

# Target is the encoded genre; features are every remaining column.
y_train = df["Genre"]
X_train = df.drop(columns="Genre")

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale],
# so uniform(0.3, 0.7) covers 0.3 to 1.0. Keys address pipeline steps by the
# names that make_pipeline / make_union generate automatically.
param_distributions = {
    "xgbclassifier__n_estimators": randint(100, 500),
    # Log scale for learning rate
    "xgbclassifier__learning_rate": loguniform(1e-3, 1),
    "xgbclassifier__max_depth": randint(3, 10),
    "xgbclassifier__subsample": uniform(0.3, 0.7),
    # Replaces min_samples_split
    "xgbclassifier__min_child_weight": randint(1, 10),
    # Regularization term
    "xgbclassifier__gamma": uniform(0, 5),
    # Feature subsampling
    "xgbclassifier__colsample_bytree": uniform(0.3, 0.7),
    "featureunion__pipeline-1__truncatedsvd__n_components": randint(50, 200),
    "featureunion__pipeline-2__truncatedsvd__n_components": randint(50, 200),
}

# Set up the RandomizedSearchCV: 100 sampled configurations, 5-fold CV,
# scored by accuracy, using all available cores.
search_settings = {
    "n_iter": 100,
    "cv": 5,
    "scoring": "accuracy",
    "random_state": 42,
    "n_jobs": -1,
    "verbose": 1,
}
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    **search_settings,
)

random_search.fit(X_train, y_train)

# Keep the winning configuration, its mean CV score and the refit model.
best_params, best_score, best_model = (
    random_search.best_params_,
    random_search.best_score_,
    random_search.best_estimator_,
)

In [None]:
# Report the best hyper-parameters and their cross-validated accuracy,
# one per line.
print(best_params, best_score, sep="\n")