In [None]:
%load_ext autoreload

import os
import sys

# The repository location can be overridden with the NL_PROJECT_ROOT environment
# variable so the notebook is not tied to a single machine; the original
# checkout path remains the default when the variable is unset.
project_root = os.environ.get("NL_PROJECT_ROOT", "C:/Users/vasco/repos/Natural-Language")

# Make the project importable; the membership test avoids duplicate entries
# when this cell is executed more than once.
if project_root not in sys.path:
    sys.path.append(project_root)

# Relative paths used later in the notebook resolve against this directory.
os.chdir(project_root)

print("Current working directory:", os.getcwd())

In [None]:
%autoreload 2
from src.dataset import read_data

# Raw train file, given relative to the working directory set in the first cell.
path = "data/raw/train.txt"

# Column names handed to `read_data`, in this order.
columns = [
    "Title",
    "From",
    "Genre",
    "Director",
    "Description",
]

df = read_data(path, columns)

# Preview the first three rows.
df.head(3)

In [None]:
%autoreload 2
# `DataFrame.info()` prints its summary itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
df.info()
print("*" * 20)

# Titles that occur more than once (the counts are computed once and reused).
title_counts = df["Title"].value_counts()
repeated_titles = title_counts[title_counts > 1].head(5)
print(repeated_titles)
print("*" * 20)

# Five most frequent directors.
popular_directors = df["Director"].value_counts().head(5)
print(popular_directors)
print("*" * 20)

# Five most frequent values of the `From` column.
print(df["From"].value_counts().head(5))
print("*" * 20)

# Fully identical rows.
print("Number of duplicates:", df.duplicated().sum())

In [31]:
# Drop exact duplicate rows and renumber the index 0..n-1. Without the
# renumbering the index keeps gaps, and the similarity cells below, which use
# matrix row positions as row labels, would address the wrong rows.
df = df.drop_duplicates(ignore_index=True)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid theme to the figures drawn below.
sns.set_theme(style="whitegrid")

# 1. Number of movies per genre, most frequent genre first.
genre_order = df["Genre"].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(y="Genre", data=df, order=genre_order, ax=ax)
ax.set_title("Distribution of Movie Genres")
ax.set_xlabel("Count")
ax.set_ylabel("Genre")
plt.show()

# 2. The ten directors with the most movies.
top_directors = df["Director"].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_directors.values, y=top_directors.index, ax=ax)
ax.set_title("Top 10 Most Popular Directors")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Director")
plt.show()

# 3. Genre breakdown for those same ten directors.
top_10_directors = top_directors.index
filtered_df = df[df["Director"].isin(top_10_directors)]

fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(y="Director", hue="Genre", data=filtered_df, ax=ax)
ax.set_title("Top 10 Directors by Genre")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Director")
ax.legend(loc="upper right")
plt.show()

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets.

    The index is the size of the intersection divided by the size of the
    union. When the union is empty (both inputs empty) the result is 0.0.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    return len(shared) / len(combined) if combined else 0.0


def tokenize_description(description):
    """Lower-case the text and return its distinct whitespace-separated tokens as a set."""
    lowered = description.lower()
    return {token for token in lowered.split()}


# Missing descriptions become empty strings so the vectorizer only sees text.
descriptions = df["Description"].fillna("")

# Pairwise cosine similarity between the TF-IDF vectors of all descriptions.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions)
cosine_sim = cosine_similarity(tfidf_matrix)

cosine_threshold = 0.8

# Collect every pair (i, j) with i < j whose similarity reaches the threshold.
# One vectorised comparison replaces the Python-level double loop over the
# matrix; `nonzero()` reports hits in row-major order, so the pairs come out in
# the same order the nested loops produced.
row_idx, col_idx = (cosine_sim >= cosine_threshold).nonzero()
above_diagonal = row_idx < col_idx  # skips self-matches and mirrored pairs
candidate_pairs = [
    (int(i), int(j), cosine_sim[i, j])
    for i, j in zip(row_idx[above_diagonal], col_idx[above_diagonal])
]

final_similar_pairs = []
jaccard_threshold = 0.7

# Second filter: keep only candidates whose token sets also overlap strongly.
for i, j, cos_sim in candidate_pairs:
    # i and j are row *positions* in the similarity matrix, so the text must be
    # read positionally (.iloc), not by index label. `descriptions` is used
    # rather than df["Description"] because its missing values were replaced by
    # "" and can therefore always be tokenised.
    set1 = tokenize_description(descriptions.iloc[i])
    set2 = tokenize_description(descriptions.iloc[j])
    jac_sim = jaccard_similarity(set1, set2)

    if jac_sim >= jaccard_threshold:
        final_similar_pairs.append((i, j, cos_sim, jac_sim))

In [None]:
# Near-duplicate descriptions that are attached to different titles.
print("\n Different Titles: \n")
for i, j, cos_sim, jac_sim in final_similar_pairs:
    # The pair indices are row positions in the similarity matrix, hence .iloc.
    title_i = df["Title"].iloc[i]
    title_j = df["Title"].iloc[j]
    if title_i != title_j:
        print(f"{title_i} ({i}) and {title_j} ({j}) : (Cosine {cos_sim:.4f}, Jaccard  {jac_sim:.4f})")

In [None]:
# Near-duplicate descriptions that are credited to different directors.
print("\n Different Directors: \n")
for i, j, cos_sim, jac_sim in final_similar_pairs:
    # The pair indices are row positions in the similarity matrix, hence .iloc.
    director_i = df["Director"].iloc[i]
    director_j = df["Director"].iloc[j]
    if director_i != director_j:
        print(f"{director_i} ({i}) and {director_j} ({j}) : (Cosine {cos_sim:.4f}, Jaccard {jac_sim:.4f})")

In [None]:
# Near-duplicate descriptions that are labelled with different genres.
print("\n Different Genre: \n")
for i, j, cos_sim, jac_sim in final_similar_pairs:
    # The pair indices are row positions in the similarity matrix, hence .iloc.
    genre_i = df["Genre"].iloc[i]
    genre_j = df["Genre"].iloc[j]
    if genre_i != genre_j:
        print(f" {genre_i} ({i}) and {genre_j} ({j}) : (Cosine {cos_sim:.4f}, Jaccard {jac_sim:.4f})")

In [None]:
# Step 1: Utility function to clean a director's name by removing spaces
import re
from collections import defaultdict

import pandas as pd


def clean_director_name(name: str):
    """Normalise a director name for matching.

    The name is lower-cased and every space, hyphen and period is removed;
    all other characters are kept as they are.
    """
    strip_table = str.maketrans("", "", " -.")
    return name.lower().translate(strip_table)


# Step 2: Create a mapping of cleaned names to the original names
def create_name_map(df: pd.DataFrame):
    """Group the spellings of each director under a normalised key.

    Every non-missing ``Director`` cell is split on commas; each resulting name
    is stripped of surrounding whitespace and stored under
    ``clean_director_name(name)``. Rows whose ``Director`` is missing
    (NaN/None) are skipped; previously they raised ``AttributeError`` because a
    float has no ``split`` method.

    Args:
        df: Frame with a ``Director`` column of comma-separated names.

    Returns:
        defaultdict mapping each cleaned name to the set of original spellings.
    """
    # A set (not a list) so that a spelling seen many times is stored once.
    name_map = defaultdict(set)

    for directors in df["Director"].dropna():
        for director in directors.split(","):
            director = director.strip()
            name_map[clean_director_name(director)].add(director)

    return name_map


# Step 3: Build the cleaned-name -> original-names map for the whole frame
director_name_map = create_name_map(df)

# Show the mapping as a plain dict (the cell's last expression is displayed)
print("Director Name Map (cleaned name -> original names):")
dict(director_name_map.items())

In [5]:
import numpy as np

# Rows whose Director is the literal "3 directors" are stored as missing instead.
is_three_directors = df["Director"].eq("3 directors")
df.loc[is_three_directors, "Director"] = np.nan

In [None]:
def create_name_map(df: pd.DataFrame):
    """Return the cleaned director names that correspond to several spellings.

    Every non-missing ``Director`` cell is split on commas; each resulting name
    is stripped and grouped under ``clean_director_name(name)``. Only groups
    holding more than one distinct original spelling are returned.

    Missing directors (NaN/None) are skipped. The notebook sets some Director
    values to NaN before this function is called, and calling ``.split`` on
    them raised ``AttributeError``.

    Args:
        df: Frame with a ``Director`` column of comma-separated names.

    Returns:
        dict mapping a cleaned name to the set of its original spellings,
        restricted to cleaned names with at least two spellings.
    """
    # A set so that a spelling seen many times counts as one variant.
    name_map = defaultdict(set)

    for directors in df["Director"].dropna():
        for director in directors.split(","):
            director = director.strip()
            name_map[clean_director_name(director)].add(director)

    # Keep only cleaned names shared by more than one distinct spelling.
    return {key: spellings for key, spellings in name_map.items() if len(spellings) > 1}


# Rebuild the map; this version keeps only cleaned names with several spellings
director_name_map = create_name_map(df)

# Show the ambiguous names (the cell's last expression is displayed)
print("Directors with more than 1 value:")
director_name_map


In [None]:
# Number of distinct titles attached to each description text.
description_title_counts = df.groupby("Description")["Title"].nunique()

# Descriptions that are shared by more than one distinct title.
shared_by_several_titles = description_title_counts > 1
duplicate_descriptions = description_title_counts[shared_by_several_titles].index

# Rows carrying one of those descriptions.
filtered_df = df.loc[df["Description"].isin(duplicate_descriptions)]

# Sorted so that rows with the same description sit next to each other.
filtered_df.sort_values("Description")

In [None]:
# Size of each (Title, Description) group, broadcast back onto the rows.
counts = df.groupby(["Title", "Description"]).transform("size")

# Rows whose (Title, Description) pair occurs more than once, ordered by title.
repeated_pairs = df[counts > 1]
repeated_pairs.sort_values("Title")

In [None]:
from collections import Counter

import pandas as pd
from nltk.corpus import stopwords

# The NLTK stopword corpus is a separate download. Fetch it on first use so the
# notebook also runs in a fresh environment instead of stopping with LookupError.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    import nltk

    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))


def plot_stopword_frequency(df: pd.DataFrame, column_name: str):
    """Plot how often each English stopword occurs in a text column.

    Each non-missing value of ``df[column_name]`` is lower-cased and split on
    whitespace. Tokens found in the module-level ``stop_words`` set are counted
    and drawn as a bar chart, sorted by descending frequency.

    Args:
        df: Frame holding the text column.
        column_name: Name of the column to analyse.
    """
    # Drop missing values first: `.str.split()` leaves NaN in place, and
    # iterating over NaN while collecting tokens would raise TypeError.
    token_lists = df[column_name].dropna().str.lower().str.split()
    stopword_counts = Counter(word for tokens in token_lists for word in tokens if word in stop_words)

    stopword_df = pd.DataFrame(stopword_counts.items(), columns=["Stopword", "Frequency"]).sort_values(
        by="Frequency", ascending=False
    )

    plt.figure(figsize=(20, 6))
    plt.bar(stopword_df["Stopword"], stopword_df["Frequency"])
    plt.xticks(rotation=45)
    # Label the figure so it can be read without the surrounding code.
    plt.xlabel("Stopword")
    plt.ylabel("Frequency")
    plt.title(f"Stopword frequency in {column_name}")
    plt.tight_layout()
    plt.show()


# Stopword usage in the Title column.
plot_stopword_frequency(df, column_name="Title")

In [None]:
from scipy.stats import chi2_contingency


def check_significant_stopwords(df: pd.DataFrame, column_name: str):
    """Find stopwords whose presence in a text column is associated with ``Genre``.

    For every word in the module-level ``stop_words`` set that occurs in at
    least one row, a presence/absence-by-genre contingency table is built and
    tested with a chi-square test of independence.

    Note: each stopword is tested separately at the 0.05 level; no correction
    for multiple comparisons is applied.

    Args:
        df: Frame containing ``column_name`` and a ``Genre`` column.
        column_name: Text column whose tokens are checked.

    Returns:
        dict mapping each stopword with p < 0.05 to a tuple
        ``(contingency_table, p_value)``.
    """
    # Tokenise once into sets: membership tests become cheap, and missing
    # values turn into empty sets instead of NaN (which breaks the `in` test).
    token_sets = df[column_name].fillna("").str.lower().str.split().apply(lambda tokens: set(tokens))

    results = {}
    for stopword in stop_words:
        # 1/0 indicator per row, named after the stopword so that the
        # contingency table's row axis carries the stopword as its label.
        has_stopword = token_sets.apply(lambda tokens: int(stopword in tokens)).rename(stopword)
        if has_stopword.sum() > 0:  # Only test stopwords that appear in at least one row
            contingency_table = pd.crosstab(has_stopword, df["Genre"])
            _, p, _, _ = chi2_contingency(contingency_table)
            if p < 0.05:  # type: ignore
                results[stopword] = (contingency_table, p)

    return results


significant_stopwords = check_significant_stopwords(df, "Title")

# Print every stopword that passed the significance filter with its table.
for stopword, result in significant_stopwords.items():
    table, p_value = result
    print(f"Stopword: {stopword} P-value: {p_value}\nCross-tabulation:\n{table}\n")

In [None]:
%autoreload 2
import numpy as np

import warnings

# The literal "Unknown" is stored as a missing director.
df.loc[:, "Director"] = df.loc[:, "Director"].replace("Unknown", np.nan)

# Coarse region for each known value of the `From` column.
FROM_TO_REGION = {
    # Western
    "American": "Western",
    "British": "Western",
    "Canadian": "Western",
    "Australian": "Western",
    # South Asian
    "Bollywood": "South Asian",
    "Telugu": "South Asian",
    "Tamil": "South Asian",
    "Malayalam": "South Asian",
    "Bengali": "South Asian",
    "Kannada": "South Asian",
    "Marathi": "South Asian",
    "Punjabi": "South Asian",
    "Assamese": "South Asian",
    "Bangladeshi": "South Asian",
    # East Asian
    "Chinese": "East Asian",
    "Japanese": "East Asian",
    "South_Korean": "East Asian",
    "Hong Kong": "East Asian",
    # Southeast Asian
    "Filipino": "Southeast Asian",
    "Malaysian": "Southeast Asian",
    # European
    "Russian": "European",
    # Middle Eastern
    "Turkish": "Middle Eastern",
    "Egyptian": "Middle Eastern",
}
df["Region"] = df["From"].map(FROM_TO_REGION)

# `Series.map` turns every value that is absent from the dict into NaN without
# any message, so report origins the mapping does not cover instead of letting
# them disappear silently.
unmapped_origins = sorted(df.loc[df["Region"].isna() & df["From"].notna(), "From"].astype(str).unique())
if unmapped_origins:
    warnings.warn(f"No region defined for 'From' values: {unmapped_origins}", stacklevel=2)

In [10]:
# %autoreload 2
# from src.dataset import preprocess_sentence

# df["Title"] = df["Title"].apply(preprocess_sentence)
# df["Description"] = df["Description"].apply(preprocess_sentence)

# df.to_csv("data/processed/train.csv", index=False)

# df = pd.read_csv("data/processed/train.csv")

In [None]:
%autoreload 2
from mlxtend.feature_selection import ColumnSelector
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from skrub import SelectCols, SimilarityEncoder

# Feature union with four branches. The automatically generated step names
# ("featureunion", "pipeline-1", "pipeline-2", ...) are referenced by the
# hyper-parameter search, so the order of the branches must not change.
text_pipeline = make_union(
    # Title: TF-IDF -> scaling -> dimensionality reduction.
    make_pipeline(
        ColumnSelector("Title", drop_axis=True),
        TfidfVectorizer(),
        # with_mean=False: the TF-IDF output is sparse and cannot be centred.
        StandardScaler(with_mean=False),
        # Seeded so the randomized SVD gives the same components on every run.
        TruncatedSVD(random_state=42),
    ),
    # Description: same processing as the title.
    make_pipeline(
        ColumnSelector("Description", drop_axis=True),
        TfidfVectorizer(),
        StandardScaler(with_mean=False),
        TruncatedSVD(random_state=42),
    ),
    make_pipeline(SelectCols("Region"), SimilarityEncoder()),
    # handle_unknown="ignore": an origin that is missing from a training fold
    # would otherwise make transform() raise on the validation fold or on new
    # data; such rows now get an all-zero one-hot vector.
    make_pipeline(SelectCols("From"), OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
)

# Seeded classifier for reproducible fits.
pipeline = make_pipeline(text_pipeline, HistGradientBoostingClassifier(random_state=42))
pipeline

In [None]:
from scipy.stats import loguniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

# Encode the genre names as integers WITHOUT overwriting df["Genre"].
# Replacing the column in place made this cell non-idempotent: on a second run
# the encoder was fitted on the already-encoded integers, so
# `label_encoder.classes_` no longer held the genre names and predictions could
# not be mapped back to them.
label_encoder = LabelEncoder()
y_train = pd.Series(label_encoder.fit_transform(df["Genre"]), index=df.index, name="Genre")

# Every column except the target; the pipeline picks the columns it needs.
X_train = df.drop("Genre", axis=1)

# Search space. The keys follow the step names generated by make_pipeline /
# make_union. Note that scipy's randint excludes its upper bound.
param_distributions = {
    "histgradientboostingclassifier__learning_rate": loguniform(0.01, 0.1),
    "histgradientboostingclassifier__max_iter": randint(200, 450),
    # Depth 4..6; a tree of depth 6 has at most 64 leaves, so the larger
    # max_leaf_nodes values sampled below are not binding.
    "histgradientboostingclassifier__max_depth": randint(4, 7),
    "histgradientboostingclassifier__min_samples_leaf": randint(30, 100),
    "histgradientboostingclassifier__max_leaf_nodes": randint(50, 150),
    "histgradientboostingclassifier__l2_regularization": loguniform(1e-3, 0.1),
    # SVD size for the Title branch (pipeline-1) and the Description branch (pipeline-2).
    "featureunion__pipeline-1__truncatedsvd__n_components": randint(100, 250),
    "featureunion__pipeline-2__truncatedsvd__n_components": randint(200, 500),
}

# Randomized search over the space above.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=50,  # Number of random combinations to try
    cv=3,  # 3-fold cross-validation
    scoring="accuracy",  # Optimizing for accuracy
    random_state=42,
    verbose=4,
    error_score="raise",  # Fail immediately if any fit raises instead of scoring it as NaN
)

random_search.fit(X_train, y_train)

# Best configuration found, its mean cross-validated accuracy, and the refitted model.
best_params = random_search.best_params_
best_score = random_search.best_score_
best_model = random_search.best_estimator_


print("\n Best Parameters:")
print(best_params)
print("\n Best Results:")
print(best_score)