In [28]:
%load_ext autoreload

import os
import sys

# Local checkout of the repository. Imports of `src.*` and the relative data
# paths used in this notebook are resolved against this directory.
# NOTE(review): this is a machine-specific absolute path.
project_root = "C:/Users/vasco/repos/Natural-Language"

# Register the checkout on the import path once, so re-running the cell does
# not pile up duplicate entries.
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)

# Make the checkout the working directory and report where we ended up.
os.chdir(project_root)
print("Current working directory:", os.getcwd())

In [29]:
%autoreload 2
from src.dataset import read_data

# Column names handed to `read_data` together with the raw training file.
# NOTE(review): presumably the names are applied in file order -- confirm
# against src.dataset.read_data.
columns = ["Title", "From", "Genre", "Director", "Description"]
path = "data/raw/train.txt"

df = read_data(path, columns)

# Preview the first three rows.
df.head(3)

In [30]:
%autoreload 2
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print("*" * 20)

# Titles that occur more than once (five most frequent). The counts are
# computed a single time and reused for the filter.
title_counts = df["Title"].value_counts()
repeated_titles = title_counts[title_counts > 1].head(5)
print(repeated_titles)
print("*" * 20)

# Five directors with the most movies.
popular_directors = df["Director"].value_counts().head(5)
print(popular_directors)
print("*" * 20)

# Five most common values of the "From" column.
print(df["From"].value_counts().head(5))

In [31]:
import matplotlib.pyplot as plt
import seaborn as sns

# One seaborn theme for every figure drawn below.
sns.set_theme(style="whitegrid")

# 1. Number of movies per genre, most frequent genre first.
genre_order = df["Genre"].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(y="Genre", data=df, order=genre_order, ax=ax)
ax.set_title("Distribution of Movie Genres")
ax.set_xlabel("Count")
ax.set_ylabel("Genre")
plt.show()

# 2. The ten directors with the most movies.
top_directors = df["Director"].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_directors.values, y=top_directors.index, ax=ax)
ax.set_title("Top 10 Most Popular Directors")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Director")
plt.show()

# 3. Genre breakdown for those same ten directors.
top_10_directors = df["Director"].value_counts().head(10).index
directed_by_top_10 = df["Director"].isin(top_10_directors)
filtered_df = df[directed_by_top_10]

fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(y="Director", hue="Genre", data=filtered_df, ax=ax)
ax.set_title("Top 10 Directors by Genre")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Director")
ax.legend(loc="upper right")
plt.show()

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 & set2| / |set1 | set2| of two sets.

    Two empty sets have an empty union; in that case 0.0 is returned instead
    of dividing by zero.
    """
    combined = set1.union(set2)
    if not combined:
        return 0.0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)


def tokenize_description(description):
    """Return the set of distinct lower-cased, whitespace-separated tokens."""
    lowered = description.lower()
    return {token for token in lowered.split()}


# Missing descriptions become empty strings so every row can be vectorized.
descriptions = df["Description"].fillna("")

# Pairwise cosine similarity between the TF-IDF vectors of all descriptions.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions)
cosine_sim = cosine_similarity(tfidf_matrix)

cosine_threshold = 0.8
jaccard_threshold = 0.7

# Stage 1: every pair (i, j) with i < j whose cosine similarity reaches the
# threshold. The threshold is applied to the whole matrix at once instead of
# visiting all n*(n-1)/2 cells in a Python loop; nonzero() yields hits in
# row-major order, i.e. the same order the nested loop would produce.
rows, cols = (cosine_sim >= cosine_threshold).nonzero()
candidate_pairs = [
    (int(i), int(j), cosine_sim[i, j])
    for i, j in zip(rows, cols)
    if i < j  # upper triangle only: skips the diagonal and mirrored pairs
]

# Stage 2: keep the candidates whose token sets also overlap strongly.
# i and j are row positions in the similarity matrix, so the text is fetched
# positionally from the same NaN-filled series that was vectorized.
final_similar_pairs = []
for i, j, cos_sim in candidate_pairs:
    set1 = tokenize_description(descriptions.iloc[i])
    set2 = tokenize_description(descriptions.iloc[j])
    jac_sim = jaccard_similarity(set1, set2)

    if jac_sim >= jaccard_threshold:
        final_similar_pairs.append((i, j, cos_sim, jac_sim))

In [33]:
# Near-duplicate descriptions that are filed under different titles.
print("\n Different Titles: \n")
for i, j, cos_sim, jac_sim in final_similar_pairs:
    title_i, title_j = df.loc[i, "Title"], df.loc[j, "Title"]
    if title_i == title_j:
        continue
    print(f"{title_i} ({i}) and {title_j} ({j}) : (Cosine {cos_sim:.4f}, Jaccard  {jac_sim:.4f})")

In [34]:
# Near-duplicate descriptions that are credited to different directors.
print("\n Different Directors: \n")
# The fourth element must be unpacked as `jac_sim`: the message below prints
# that name, and any other loop variable name would make it show a stale
# value left over from an earlier cell.
for i, j, cos_sim, jac_sim in final_similar_pairs:
    director_i = df.loc[i, "Director"]
    director_j = df.loc[j, "Director"]
    if director_i != director_j:
        print(f"{director_i} ({i}) and {director_j} ({j}) : (Cosine {cos_sim:.4f}, Jaccard {jac_sim:.4f})")

In [35]:
# Near-duplicate descriptions that are labelled with different genres.
print("\n Different Genre: \n")
# The fourth element must be unpacked as `jac_sim`: the message below prints
# that name, and any other loop variable name would make it show a stale
# value left over from an earlier cell.
for i, j, cos_sim, jac_sim in final_similar_pairs:
    genre_i = df.loc[i, "Genre"]
    genre_j = df.loc[j, "Genre"]
    if genre_i != genre_j:
        print(f" {genre_i} ({i}) and {genre_j} ({j}) : (Cosine {cos_sim:.4f}, Jaccard {jac_sim:.4f})")

In [45]:
from collections import Counter
import pandas as pd
from nltk.corpus import stopwords


# English stopword list from NLTK, as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# For every title keep only the (lower-cased, whitespace-split) tokens that
# are stopwords, then flatten the per-title lists and tally each stopword.
title_tokens = df["Title"].str.lower().str.split()
stopwords_in_titles = title_tokens.apply(lambda tokens: [token for token in tokens if token in stop_words])
all_stopwords = []
for found in stopwords_in_titles:
    all_stopwords.extend(found)
stopword_counts = Counter(all_stopwords)

# Bar chart of stopword frequencies, most frequent first.
stopword_df = pd.DataFrame(stopword_counts.items(), columns=["Stopword", "Frequency"])
stopword_df = stopword_df.sort_values(by="Frequency", ascending=False)

fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(stopword_df["Stopword"], stopword_df["Frequency"])
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [37]:
from scipy.stats import chi2_contingency

stop_words = set(stopwords.words("english"))

# Lower-cased, whitespace-split title tokens. Note that this adds a
# "Title_processed" column to df itself.
df["Title_processed"] = df["Title"].str.lower().str.split()

# One 0/1 indicator column per stopword: 1 when the title contains it.
stopword_df = pd.DataFrame(
    {stopword: df["Title_processed"].apply(lambda x: 1 if stopword in x else 0) for stopword in stop_words}
)

# Put the genre next to the indicators so both can be cross-tabulated.
stopword_df = pd.concat([df["Genre"], stopword_df], axis=1)

# Chi-square test of independence between each stopword indicator and genre.
# NOTE(review): p-values are not corrected for multiple testing.
results = {}
for stopword in stop_words:
    # Only test stopwords that appear in at least one title. The indicator is
    # 0/1, so .sum() is the number of titles containing the word; .count()
    # would merely count non-null rows and never filter anything out.
    if stopword_df[stopword].sum() > 0:
        contingency_table = pd.crosstab(stopword_df[stopword], stopword_df["Genre"])
        chi2, p, dof, ex = chi2_contingency(contingency_table)
        results[stopword] = p

# Convert results to a DataFrame sorted by ascending p-value.
results_df = pd.DataFrame(list(results.items()), columns=["Stopword", "P-value"]).sort_values(by="P-value")

# Display stopwords that are significantly related to genre (p < 0.05).
significant_stopwords = results_df[results_df["P-value"] < 0.05]
print(significant_stopwords)

In [5]:
%autoreload 2
import numpy as np

# Treat the literal placeholder "Unknown" as a missing director.
df.loc[:, "Director"] = df["Director"].replace("Unknown", np.nan)

# Coarse region for each value of the "From" column, written region-first
# and then inverted into the origin -> region lookup that Series.map needs.
# Origins not listed here end up as NaN in "Region".
origins_by_region = {
    "Western": ["American", "British", "Canadian", "Australian"],
    "South Asian": [
        "Bollywood",
        "Telugu",
        "Tamil",
        "Malayalam",
        "Bengali",
        "Kannada",
        "Marathi",
        "Punjabi",
        "Assamese",
        "Bangladeshi",
    ],
    "East Asian": ["Chinese", "Japanese", "South_Korean", "Hong Kong"],
    "Southeast Asian": ["Filipino", "Malaysian"],
    "European": ["Russian"],
    "Middle Eastern": ["Turkish", "Egyptian"],
}
region_by_origin = {
    origin: region for region, origins in origins_by_region.items() for origin in origins
}
df["Region"] = df["From"].map(region_by_origin)

In [10]:
%autoreload 2
from src.dataset import preprocess_sentence

# Try the preprocessing helper on a hand-written example and show what it
# returns, before applying it to the whole dataset.
sentence = "Natural Language Processing is really cool. Deep Learning and transformers are interesting."

output = preprocess_sentence(sentence)
print(output)

In [11]:
%autoreload 2

# Run both free-text columns through the preprocessing helper, overwriting
# the original text in place.
for text_column in ("Title", "Description"):
    df[text_column] = df[text_column].apply(preprocess_sentence)

# Persist the processed frame (without the index) for the modelling cells.
df.to_csv("data/processed/train.csv", index=False)

In [2]:
import pandas as pd

# Rebind df to the processed data read back from disk.
processed_csv = "data/processed/train.csv"
df = pd.read_csv(processed_csv)

In [46]:
%autoreload 2
from mlxtend.feature_selection import ColumnSelector
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from skrub import SelectCols, SimilarityEncoder

def _text_svd_branch(column):
    """Build the feature branch for one free-text column.

    The branch selects `column`, extracts 2- to 4-gram TF-IDF features, scales
    them without centering (the TF-IDF output is sparse) and reduces them with
    truncated SVD. The SVD is seeded so repeated fits give the same components.
    """
    return make_pipeline(
        ColumnSelector(column, drop_axis=True),
        TfidfVectorizer(strip_accents="unicode", ngram_range=(2, 4)),
        StandardScaler(with_mean=False),
        TruncatedSVD(random_state=42),
    )


# Feature union: one text branch per free-text column plus two categorical
# encodings. make_union/make_pipeline name the steps automatically
# ("pipeline-1", "pipeline-2", ... / "truncatedsvd"), which is what the
# hyperparameter search keys rely on, so the branch order must stay as is.
text_pipeline = make_union(
    _text_svd_branch("Title"),
    _text_svd_branch("Description"),
    make_pipeline(SelectCols("Region"), SimilarityEncoder()),
    # A cross-validation fold may contain a "From" value that was absent from
    # its training split; encode such rows as all zeros instead of raising.
    make_pipeline(SelectCols("From"), OneHotEncoder(handle_unknown="ignore")),
)

pipeline = make_pipeline(text_pipeline, SVC())
pipeline

In [47]:
from scipy.stats import loguniform, randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Replace the genre labels in df by their integer codes; the fitted encoder
# keeps the mapping back to the original labels.
df["Genre"] = label_encoder.fit_transform(df["Genre"])

# Target is the encoded genre, features are every other column.
y_train = df["Genre"]
X_train = df.drop(columns="Genre")

# Search space for the SVC step of the pipeline.
svc_space = {
    "svc__C": loguniform(1e-3, 1e3),  # regularization strength
    "svc__gamma": loguniform(1e-4, 1e1),  # kernel coefficient (rbf, poly, sigmoid)
    "svc__kernel": ["linear", "rbf", "poly", "sigmoid"],  # kernel family
    "svc__degree": randint(2, 5),  # polynomial degree (poly only)
    "svc__coef0": uniform(0, 5),  # independent kernel term (poly, sigmoid)
    "svc__class_weight": ["balanced", None],  # class re-weighting on or off
}

# Search space for the SVD size of the first two feature-union branches.
svd_components = [50, 100, 150, 200]
svd_space = {
    "featureunion__pipeline-1__truncatedsvd__n_components": svd_components,
    "featureunion__pipeline-2__truncatedsvd__n_components": svd_components,
}

param_distributions = {**svc_space, **svd_space}

# Randomized search: 10 sampled configurations, 3-fold CV, accuracy scoring,
# all available cores.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Keep the winning configuration, its CV score and the refitted estimator.
best_model = random_search.best_estimator_
best_params, best_score = random_search.best_params_, random_search.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
# Best hyperparameters found by the search, then their cross-validated score.
print(best_params, best_score, sep="\n")