<a href="https://colab.research.google.com/github/zsj-jaz/Capstone-G4-ReelGood/blob/main/code/feature_engineer/%5BG4_ReelGood_Jaz%5DFeature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import json

from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

dataset_path = '/content/drive/MyDrive/datasets/capstone'

In [None]:
meta_df = pd.read_csv(dataset_path+'/cleaned_metadata.csv')
meta_df.head()

Unnamed: 0,id,title,year,genres,first_three_actors,director,original_language,imdb_id
0,862,Toy Story,1995,"Animation, Comedy, Family","Tom Hanks, Tim Allen, Don Rickles",John Lasseter,en,tt0114709
1,8844,Jumanji,1995,"Adventure, Fantasy, Family","Robin Williams, Jonathan Hyde, Kirsten Dunst",Joe Johnston,en,tt0113497
2,15602,Grumpier Old Men,1995,"Romance, Comedy","Walter Matthau, Jack Lemmon, Ann-Margret",Howard Deutch,en,tt0113228
3,31357,Waiting to Exhale,1995,"Comedy, Drama, Romance","Whitney Houston, Angela Bassett, Loretta Devine",Forest Whitaker,en,tt0114885
4,11862,Father of the Bride Part II,1995,Comedy,"Steve Martin, Diane Keaton, Martin Short",Charles Shyer,en,tt0113041


In [None]:
rate_df = pd.read_csv(dataset_path+'/cleaned_ratings.csv')
rate_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,4,223,4.0,1042668576
1,4,415,4.0,1042667925
2,4,648,4.0,1042674800
3,4,1422,4.0,1042674861
4,4,1597,3.0,1042674787


In [None]:
""" Only keep movies that have ratings """
meta_df = meta_df.rename(columns={'id': 'movieId'})
meta_df = meta_df[meta_df['movieId'].isin(rate_df['movieId'])]

""" create new userIdInt and movieIdInt to ensure index start from 0 """
user_mapping = {user_id: idx for idx, user_id in enumerate(sorted(rate_df['userId'].unique()))}
movie_mapping = {movie_id: idx for idx, movie_id in enumerate(sorted(rate_df['movieId'].unique()))}

user_mapping = {int(k): v for k, v in user_mapping.items()}
movie_mapping = {int(k): v for k, v in movie_mapping.items()}

rate_df['userIdInt'] = rate_df['userId'].map(user_mapping)
rate_df['movieIdInt'] = rate_df['movieId'].map(movie_mapping)
meta_df['movieIdInt'] = meta_df['movieId'].map(movie_mapping)

In [None]:
meta_df.head()

Unnamed: 0,movieId,title,year,genres,first_three_actors,director,original_language,imdb_id,movieIdInt
0,862,Toy Story,1995,"Animation, Comedy, Family","Tom Hanks, Tim Allen, Don Rickles",John Lasseter,en,tt0114709,670
1,8844,Jumanji,1995,"Adventure, Fantasy, Family","Robin Williams, Jonathan Hyde, Kirsten Dunst",Joe Johnston,en,tt0113497,2789
5,949,Heat,1995,"Action, Crime, Drama, Thriller","Al Pacino, Robert De Niro, Val Kilmer",Michael Mann,en,tt0113277,744
9,710,GoldenEye,1995,"Adventure, Action, Thriller","Pierce Brosnan, Sean Bean, Izabella Scorupco",Martin Campbell,en,tt0113189,553
14,1408,Cutthroat Island,1995,"Action, Adventure","Geena Davis, Matthew Modine, Frank Langella",Renny Harlin,en,tt0112760,882


In [None]:
rate_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,userIdInt,movieIdInt
0,4,223,4.0,1042668576,0,170
1,4,415,4.0,1042667925,0,316
2,4,648,4.0,1042674800,0,498
3,4,1422,4.0,1042674861,0,894
4,4,1597,3.0,1042674787,0,973


In [None]:
rate_df.to_csv(dataset_path + '/cleaned_ratings_id_remapped.csv', index=False)
meta_df.to_csv(dataset_path + '/cleaned_metadata_id_remapped.csv', index=False)

with open(dataset_path + "/user_id_mapping.json", "w") as f:
    json.dump(user_mapping, f)

with open(dataset_path + "/movie_id_mapping.json", "w") as f:
    json.dump(movie_mapping, f)

print("Updated datasets saved successfully!")
print(f"Unique Users: {rate_df['userIdInt'].nunique()} | Unique Movies: {rate_df['movieIdInt'].nunique()}")

Updated datasets saved successfully!
Unique Users: 120147 | Unique Movies: 7508


In [None]:
""" Leave-One-Last-Out Splitting """

# rate_df = pd.read_csv(dataset_path + "/cleaned_ratings_id_remapped.csv")

rate_df = rate_df.sort_values(by=['userId', 'timestamp'])

test_set = rate_df.groupby('userId').tail(1)
remaining_ratings = rate_df.drop(test_set.index)
val_set = remaining_ratings.groupby('userId').tail(1)
train_set = remaining_ratings.drop(val_set.index)

train_set.to_csv(dataset_path + "/train_ratings.csv", index=False)
val_set.to_csv(dataset_path + "/val_ratings.csv", index=False)
test_set.to_csv(dataset_path + "/test_ratings.csv", index=False)

print(f"Train set size: {len(train_set)}")
print(f"Validation set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")

Train set size: 9972455
Validation set size: 120147
Test set size: 120147


## Feature Engineering of Metadata

In [None]:
meta_df_original = pd.read_csv(dataset_path + '/cleaned_data/cleaned_metadata_id_remapped.csv')
meta_df = meta_df_original.copy()
meta_df.head()

Unnamed: 0,movieId,title,year,genres,first_three_actors,director,original_language,imdb_id,movieIdInt
0,862,Toy Story,1995,"Animation, Comedy, Family","Tom Hanks, Tim Allen, Don Rickles",John Lasseter,en,tt0114709,670
1,8844,Jumanji,1995,"Adventure, Fantasy, Family","Robin Williams, Jonathan Hyde, Kirsten Dunst",Joe Johnston,en,tt0113497,2789
2,949,Heat,1995,"Action, Crime, Drama, Thriller","Al Pacino, Robert De Niro, Val Kilmer",Michael Mann,en,tt0113277,744
3,710,GoldenEye,1995,"Adventure, Action, Thriller","Pierce Brosnan, Sean Bean, Izabella Scorupco",Martin Campbell,en,tt0113189,553
4,1408,Cutthroat Island,1995,"Action, Adventure","Geena Davis, Matthew Modine, Frank Langella",Renny Harlin,en,tt0112760,882


### Language

In [None]:
language_counts = meta_df['original_language'].value_counts()

print(f"Total unique languages: {len(language_counts)}")
print("=" * 80)

for i, (language, count) in enumerate(language_counts.items()):
    print(f"{language:<3} {count:<10}", end="\t")
    if i % 5 == 4:
        print()


Total unique languages: 56
en  5342      	fr  499       	it  264       	de  263       	ja  213       	
es  152       	ru  116       	sv  65        	ko  62        	hi  56        	
zh  49        	pt  40        	nl  38        	fi  37        	pl  35        	
da  34        	cn  27        	cs  24        	tr  18        	th  13        	
no  12        	fa  12        	he  12        	bn  11        	hu  11        	
ro  11        	sr  10        	ta  9         	el  9         	xx  8         	
bs  4         	ka  4         	tl  4         	te  4         	is  4         	
uk  3         	bg  3         	hr  3         	sk  3         	ar  3         	
et  2         	ml  2         	mn  2         	id  2         	wo  2         	
ca  1         	lo  1         	af  1         	lv  1         	zu  1         	
cy  1         	mr  1         	ab  1         	sl  1         	kk  1         	
eu  1         	

In [None]:
threshold = 36

rare_languages = language_counts[language_counts < threshold].index

meta_df['original_language'] = meta_df['original_language'].replace(rare_languages, 'other')
language_counts = meta_df['original_language'].value_counts()
print(f"Total unique languages: {len(language_counts)}")
print("=" * 80)
print(language_counts)


Total unique languages: 15
original_language
en       5342
fr        499
other     312
it        264
de        263
ja        213
es        152
ru        116
sv         65
ko         62
hi         56
zh         49
pt         40
nl         38
fi         37
Name: count, dtype: int64


In [None]:
def tokenize_name(meta_df, col):
  meta_df[col] = (
      meta_df[col]
      .fillna('')
      .str.replace(r'\s+', '', regex=True)
      .str.lower()
      .str.replace(r'[^\w,]', '', regex=True)
  )
  return meta_df

meta_df = tokenize_name(meta_df, 'first_three_actors')
meta_df = tokenize_name(meta_df, 'director')

meta_df.head()

Unnamed: 0,movieId,title,year,genres,first_three_actors,director,original_language,imdb_id,movieIdInt
0,862,Toy Story,1995,"Animation, Comedy, Family","tomhanks,timallen,donrickles",johnlasseter,en,tt0114709,670
1,8844,Jumanji,1995,"Adventure, Fantasy, Family","robinwilliams,jonathanhyde,kirstendunst",joejohnston,en,tt0113497,2789
2,949,Heat,1995,"Action, Crime, Drama, Thriller","alpacino,robertdeniro,valkilmer",michaelmann,en,tt0113277,744
3,710,GoldenEye,1995,"Adventure, Action, Thriller","piercebrosnan,seanbean,izabellascorupco",martincampbell,en,tt0113189,553
4,1408,Cutthroat Island,1995,"Action, Adventure","geenadavis,matthewmodine,franklangella",rennyharlin,en,tt0112760,882


In [None]:
def ohe_feature(df, col, delimiter,prefix):
  df = df.copy()
  df[col] = df[col].str.split(delimiter)
  mlb = MultiLabelBinarizer()
  ohe = pd.DataFrame(mlb.fit_transform(df[col]), columns=mlb.classes_, index=df.index)

  ohe = ohe.rename(columns=lambda x: f"{prefix}_{x}")
  df = df.drop(columns=[col]).reset_index(drop=True)
  df = pd.concat([df, ohe], axis=1)
  return df

meta_df_encoded = ohe_feature(meta_df, 'genres', ', ', 'G')
meta_df_encoded.head()


Unnamed: 0,movieId,title,year,first_three_actors,director,original_language,imdb_id,movieIdInt,G_Action,G_Adult,...,G_Mystery,G_Romance,G_Sci-Fi,G_Science Fiction,G_Short,G_Sport,G_TV Movie,G_Thriller,G_War,G_Western
0,862,Toy Story,1995,"tomhanks,timallen,donrickles",johnlasseter,en,tt0114709,670,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8844,Jumanji,1995,"robinwilliams,jonathanhyde,kirstendunst",joejohnston,en,tt0113497,2789,0,0,...,0,0,0,0,0,0,0,0,0,0
2,949,Heat,1995,"alpacino,robertdeniro,valkilmer",michaelmann,en,tt0113277,744,1,0,...,0,0,0,0,0,0,0,1,0,0
3,710,GoldenEye,1995,"piercebrosnan,seanbean,izabellascorupco",martincampbell,en,tt0113189,553,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1408,Cutthroat Island,1995,"geenadavis,matthewmodine,franklangella",rennyharlin,en,tt0112760,882,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
meta_df_encoded = ohe_feature(meta_df_encoded, 'original_language', ',', 'L')
meta_df_encoded.head()

Unnamed: 0,movieId,title,year,first_three_actors,director,imdb_id,movieIdInt,G_Action,G_Adult,G_Adventure,...,L_hi,L_it,L_ja,L_ko,L_nl,L_other,L_pt,L_ru,L_sv,L_zh
0,862,Toy Story,1995,"tomhanks,timallen,donrickles",johnlasseter,tt0114709,670,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8844,Jumanji,1995,"robinwilliams,jonathanhyde,kirstendunst",joejohnston,tt0113497,2789,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,949,Heat,1995,"alpacino,robertdeniro,valkilmer",michaelmann,tt0113277,744,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,710,GoldenEye,1995,"piercebrosnan,seanbean,izabellascorupco",martincampbell,tt0113189,553,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1408,Cutthroat Island,1995,"geenadavis,matthewmodine,franklangella",rennyharlin,tt0112760,882,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def word2vec(df, col, newcol, delimiter=None):
  df = df.copy()
  if delimiter:
    df[col] = df[col].str.split(delimiter)
  model = Word2Vec(sentences=df[col], vector_size=20, window=5, min_count=1, workers=4)
  def encode_movie(features):
    vectors = [model.wv[fea] for fea in features if fea in model.wv]
    if not vectors:
      return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
  df[newcol] = df[col].apply(encode_movie)
  return df

meta_df_encoded = word2vec(meta_df_encoded, 'first_three_actors', 'actor_vec', delimiter=',')
meta_df_encoded['actor_vec'].head()

Unnamed: 0,actor_vec
0,"[0.009011689, -0.00025350475, 0.011138584, 0.0..."
1,"[-0.009639174, -0.0051140557, 0.003408042, 0.0..."
2,"[0.013628433, 0.017308336, 0.023538781, 0.0111..."
3,"[0.014488584, 0.008962001, 0.022006324, -0.008..."
4,"[0.00695286, 0.01720686, 0.008968999, 0.001311..."


In [None]:
meta_df_encoded = word2vec(meta_df_encoded, 'director', 'director_vec')
meta_df_encoded['director_vec'].head()



Unnamed: 0,director_vec
0,"[-0.42760155, 0.13697457, 0.20941244, 0.056648..."
1,"[-0.42440513, 0.14589114, 0.23616546, 0.096033..."
2,"[-0.43454915, 0.0672729, 0.24271268, 0.0944805..."
3,"[-0.44027948, 0.07801231, 0.1839246, 0.0571525..."
4,"[-0.4635523, 0.107159145, 0.16134076, -0.02284..."


In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
meta_df_encoded['year_norm'] = scaler.fit_transform(meta_df_encoded[['year']])
meta_df_encoded['year_norm'].head()

Unnamed: 0,year_norm
0,0.826772
1,0.826772
2,0.826772
3,0.826772
4,0.826772


In [None]:
encoded_cols = ['movieIdInt']
encoded_cols.extend([col for col in meta_df_encoded.columns if col.startswith('G_') or col.startswith('L_')])
encoded_cols.extend(['actor_vec', 'director_vec', 'year_norm'])
meta_df_encoded_to_save = meta_df_encoded[encoded_cols]

meta_df_encoded_to_save.head()

Unnamed: 0,movieIdInt,G_Action,G_Adult,G_Adventure,G_Animation,G_Biography,G_Comedy,G_Crime,G_Documentary,G_Drama,...,L_ko,L_nl,L_other,L_pt,L_ru,L_sv,L_zh,actor_vec,director_vec,year_norm
0,670,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,"[0.009011689, -0.00025350475, 0.011138584, 0.0...","[-0.42760155, 0.13697457, 0.20941244, 0.056648...",0.826772
1,2789,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"[-0.009639174, -0.0051140557, 0.003408042, 0.0...","[-0.42440513, 0.14589114, 0.23616546, 0.096033...",0.826772
2,744,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,"[0.013628433, 0.017308336, 0.023538781, 0.0111...","[-0.43454915, 0.0672729, 0.24271268, 0.0944805...",0.826772
3,553,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"[0.014488584, 0.008962001, 0.022006324, -0.008...","[-0.44027948, 0.07801231, 0.1839246, 0.0571525...",0.826772
4,882,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"[0.00695286, 0.01720686, 0.008968999, 0.001311...","[-0.4635523, 0.107159145, 0.16134076, -0.02284...",0.826772


In [None]:
def save_df(df, save_path):
  dir_name = os.path.dirname(save_path)
  if not os.path.exists(dir_name):
    os.makedirs(dir_name)
  if save_path.endswith('.csv'):
    df.to_csv(save_path, index=False)
    print(f" DataFrame saved as CSV: {save_path}")
  elif save_path.endswith('.parquet'):
    df.to_parquet(save_path, index=False)
    print(f" DataFrame saved as Parquet: {save_path}")
  else:
    print("Error: Unsupported file format. Use .csv or .parquet.")


save_df(meta_df_encoded_to_save, save_path = dataset_path+"/meta_encoding/meta_data_encoded.csv")

 DataFrame saved as CSV: /content/drive/MyDrive/datasets/capstone/meta_encoding/meta_data_encoded.csv


In [None]:
import pandas as pd

meta_df_encoded = pd.read_csv( dataset_path + "/data_ready_for_model/meta_encoding/meta_data_encoded.csv")
meta_df_encoded.head()


Unnamed: 0,movieIdInt,G_Action,G_Adult,G_Adventure,G_Animation,G_Biography,G_Comedy,G_Crime,G_Documentary,G_Drama,...,L_ko,L_nl,L_other,L_pt,L_ru,L_sv,L_zh,actor_vec,director_vec,year_norm
0,670,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,[ 0.00901169 -0.0002535 0.01113858 0.010536...,[-0.42760155 0.13697457 0.20941244 0.056648...,0.826772
1,2789,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,[-0.00963917 -0.00511406 0.00340804 0.012952...,[-0.42440513 0.14589114 0.23616546 0.096033...,0.826772
2,744,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,[ 0.01362843 0.01730834 0.02353878 0.011125...,[-0.43454915 0.0672729 0.24271268 0.094480...,0.826772
3,553,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,[ 0.01448858 0.008962 0.02200632 -0.008503...,[-0.44027948 0.07801231 0.1839246 0.057152...,0.826772
4,882,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,[ 0.00695286 0.01720686 0.008969 0.001311...,[-0.4635523 0.10715915 0.16134076 -0.022849...,0.826772


In [None]:
import numpy as np

# Function to safely convert space-separated vectors to lists
def parse_vector(vec_str):
    return np.fromstring(vec_str.strip(" []"), sep=" ")

# Apply conversion
meta_df_encoded["actor_vec"] = meta_df_encoded["actor_vec"].apply(parse_vector)
meta_df_encoded["director_vec"] = meta_df_encoded["director_vec"].apply(parse_vector)

# Drop 'movieIdInt' before processing
meta_features = meta_df_encoded.drop(columns=['movieIdInt'])

# Convert actor and director vectors into separate columns
actor_vec_df = pd.DataFrame(meta_features['actor_vec'].to_list(), index=meta_features.index)
director_vec_df = pd.DataFrame(meta_features['director_vec'].to_list(), index=meta_features.index)

# Rename columns for clarity
actor_vec_df.columns = [f'actor_vec_{i}' for i in range(actor_vec_df.shape[1])]
director_vec_df.columns = [f'director_vec_{i}' for i in range(director_vec_df.shape[1])]

# Drop old list-based columns and merge expanded embeddings
meta_features = meta_features.drop(columns=['actor_vec', 'director_vec'])
meta_features = pd.concat([meta_features, actor_vec_df, director_vec_df], axis=1)

# Normalize all features (to prevent one-hot dominance)
scaler = StandardScaler()
meta_features_scaled = scaler.fit_transform(meta_features)

# Compute cosine similarity
meta_sim = cosine_similarity(meta_features_scaled)
print("Meta Similarity Matrix Shape:", meta_sim.shape)
print(meta_sim[:5, :5])  # Preview similarity matrix


Meta Similarity Matrix Shape: (7508, 7508)
[[ 1.          0.28821937 -0.2121643  -0.15100939 -0.02140478]
 [ 0.28821937  1.         -0.09423064  0.06497625 -0.05811079]
 [-0.2121643  -0.09423064  1.          0.47405114  0.05700039]
 [-0.15100939  0.06497625  0.47405114  1.          0.38937208]
 [-0.02140478 -0.05811079  0.05700039  0.38937208  1.        ]]


In [None]:
# Store the order of movieIdInt
meta_movie_ids = meta_df_encoded["movieIdInt"].values

# Convert similarity matrix to DataFrame for indexing
meta_sim_df = pd.DataFrame(meta_sim, index=meta_movie_ids, columns=meta_movie_ids)

# Save for future reference
meta_sim_df.to_csv(dataset_path + "/data_ready_for_model/meta_encoding/meta_similarity_matrix.csv")


In [None]:
dir = "/content/drive/My Drive/datasets/capstone/data_ready_for_model/"

train_df = pd.read_csv(os.path.join(dir, "train_test_split/train_ratings.csv"))

train_movie_ids = np.sort(train_df["movieIdInt"].unique())  # Movies in training set

# Reorder `content_similarity_matrix` to match `arr_train` movie order
meta_sim_df = meta_sim_df.loc[train_movie_ids, train_movie_ids]

# Convert back to NumPy array for training
content_similarity_matrix = meta_sim_df.to_numpy()

# Save as a NumPy binary file for fast loading
np.save(dataset_path + "/data_ready_for_model/meta_encoding/meta_similarity_matrix.npy", content_similarity_matrix)



In [None]:
np.save(dataset_path+"/data_ready_for_model/meta_encoding/meta_similarity.npy", meta_sim)

In [None]:
def get_top_similar_movies_based_on_metadata(meta_sim, movie_df, num_movies=3, top_k=10, seed=42):

  np.random.seed(seed)
  random_indices = np.random.choice(meta_sim.shape[0], num_movies, replace=False)

  for idx in random_indices:
    movie_id = movie_df.iloc[idx]['movieIdInt']
    movie_title = movie_df.iloc[idx]['title']

    similarities = meta_sim[idx]

    top_indices = np.argsort(similarities)[::-1][1:top_k+1]

    similar_movies = movie_df.iloc[top_indices][['movieIdInt', 'title']]
    similar_movies['Similarity'] = similarities[top_indices]

    print(f"\n **Top {top_k} Similar Movies to:** {movie_title} ({movie_id})")
    print(similar_movies.to_string(index=False))


get_top_similar_movies_based_on_metadata(meta_sim, meta_df_encoded, num_movies=3, top_k=10)



 **Top 10 Similar Movies to:** Blood Games (5976)
 movieIdInt                                  title  Similarity
       4483                        Polar Opposites    0.654910
       1938                           Nuits rouges    0.640828
       4618                         A Parting Shot    0.633717
       3946              Diary of a Country Priest    0.629224
       4984                        A Loving Father    0.611742
       1456 An Elephant Can Be Extremely Deceptive    0.601536
       4476                               The Prey    0.598620
        944                              Mouchette    0.590528
       4964                       Special Delivery    0.575449
       5244                                  Brake    0.571522

 **Top 10 Similar Movies to:** The Ice Dragon (6308)
 movieIdInt                                    title  Similarity
       3008 More About the Children of Noisy Village    0.868643
       4102                                    Anita    0.804941
       

In [None]:
def get_top_similar_movies_with_metadata(meta_sim, encoded_df, original_df, num_movies=3, top_k=10, seed=42):
    np.random.seed(seed)
    random_indices = np.random.choice(meta_sim.shape[0], num_movies, replace=False)

    for idx in random_indices:
        movie_id = encoded_df.iloc[idx]['movieIdInt']
        movie_metadata = original_df[original_df['movieIdInt'] == movie_id][['title', 'genres', 'first_three_actors', 'director', 'original_language', 'year']].iloc[0]

        similarities = meta_sim[idx]

        # Get top-k similar movie indices (excluding itself)
        top_indices = np.argsort(similarities)[::-1][1:top_k+1]
        similar_movies = encoded_df.iloc[top_indices][['movieIdInt', 'title']].copy()
        similar_movies['Similarity'] = similarities[top_indices]

        # Merge with original metadata
        similar_movies = similar_movies.merge(original_df[['movieIdInt', 'genres', 'first_three_actors', 'director', 'original_language', 'year']],
                                              on='movieIdInt', how='left')

        print("\n" + "="*80)
        print(f"   **Selected Movie:** {movie_metadata['title']} ({movie_id})")
        print(f"   - Genres: {movie_metadata['genres']}")
        print(f"   - First Three Actors: {movie_metadata['first_three_actors']}")
        print(f"   - Director: {movie_metadata['director']}")
        print(f"   - Language: {movie_metadata['original_language']}")
        print(f"   - Year: {movie_metadata['year']}")
        print("="*80)

        # Print results
        print(f"\n  **Top {top_k} Similar Movies:**")
        print(similar_movies.to_string(index=False))

get_top_similar_movies_with_metadata(meta_sim, meta_df_encoded, meta_df_original, num_movies=3, top_k=10)



   **Selected Movie:** Blood Games (5976)
   - Genres: Action, Thriller
   - First Three Actors: Gregory Scott Cummins, Laura Albert, Shelley Abblett
   - Director: Tanya Rosenberg
   - Language: fr
   - Year: 1990

  **Top 10 Similar Movies:**
 movieIdInt                                  title  Similarity                  genres                                          first_three_actors       director original_language  year
       4483                        Polar Opposites    0.654910                  Action              Charles Shaughnessy, Tracy Nelson, Ken Barnett  Fred Olen Ray                fr  2008
       1938                           Nuits rouges    0.640828        Horror, Thriller       Jacques Champreux, Josephine Chaplin, Patrick Préjean Georges Franju                fr  1974
       4618                         A Parting Shot    0.633717 Drama, Action, Thriller                      Isild Le Besco, Lio, Steven de Almeida   Jeanne Waltz                fr  2007
       394

## Calculate Biases

In [None]:
import os
import pandas as pd
import json

def compute_biases(train_df, user_col, item_col, rating_col, save_path):
    """
    Computes and saves the global bias, user bias, and item bias for a recommendation system.

    Parameters:
    train_df (pd.DataFrame): DataFrame containing user, item, and rating columns.
    user_col (str): Name of the column representing user IDs.
    item_col (str): Name of the column representing item IDs (movies).
    rating_col (str): Name of the column representing ratings.
    save_path (str): Directory path to save the bias files.

    Returns:
    None (Saves bias values to files)
    """
    # Ensure the directory exists
    os.makedirs(save_path, exist_ok=True)

    # Compute Global Bias (μ)
    global_bias = train_df[rating_col].mean()

    # Compute User Bias (b_u)
    user_bias = train_df.groupby(user_col)[rating_col].mean() - global_bias
    user_bias_dict = user_bias.to_dict()

    # Compute Item Bias (b_i)
    item_bias = train_df.groupby(item_col)[rating_col].mean() - global_bias
    item_bias_dict = item_bias.to_dict()

    print(f"Loaded Global Bias: {global_bias}")
    print("Sample User Biases:", list(user_bias_dict.items())[:5])
    print("Sample Item Biases:", list(item_bias_dict.items())[:5])

    # Save biases to files
    with open(os.path.join(save_path, "global_bias.json"), "w") as f:
        json.dump({"global_bias": global_bias}, f)

    with open(os.path.join(save_path, "user_bias.json"), "w") as f:
        json.dump(user_bias_dict, f)

    with open(os.path.join(save_path, "item_bias.json"), "w") as f:
        json.dump(item_bias_dict, f)

    print(f"Biases saved successfully in {save_path}!")

# Load training data
train_df = pd.read_csv(os.path.join(dataset_path, "train_ratings.csv"))

# Define save directory
bias_dir = os.path.join(dataset_path, "biases")

# Compute and save biases
compute_biases(train_df, user_col='userIdInt', item_col='movieIdInt', rating_col='rating', save_path=bias_dir)


Loaded Global Bias: 3.5211848035413547
Sample User Biases: [(0, 0.14548186312531186), (1, 0.11770408534753418), (2, -0.4611848035413546), (3, 0.4595844272278762), (4, -0.19041557277212373)]
Sample Item Biases: [(0, -0.2967395357466982), (1, -0.3902229768641088), (2, -0.47237307105615267), (3, 0.3235563972868025), (4, 0.134832235862266)]
Biases saved successfully in /content/drive/MyDrive/datasets/capstone/biases!


## Similarity Matrix

In [None]:
train_df = pd.read_csv(dataset_path + '/data_ready_for_model/train_test_split/train_ratings.csv')
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,userIdInt,movieIdInt
0,4,4896,4.0,1042667845,0,2073
1,4,415,4.0,1042667925,0,316
2,4,2023,5.0,1042667945,0,1253
3,4,3004,1.0,1042668521,0,1618
4,4,2694,1.0,1042668544,0,1525


In [None]:
# Load biases
bias_dir = dataset_path+"/data_ready_for_model/biases"
with open(os.path.join(bias_dir, "global_bias.json")) as f:
    global_bias = json.load(f)["global_bias"]

with open(os.path.join(bias_dir, "user_bias.json")) as f:
    user_bias = json.load(f)

with open(os.path.join(bias_dir, "item_bias.json")) as f:
    item_bias = json.load(f)

# Compute bias-adjusted ratings
def adjust_rating(row):
    user = row['userIdInt']
    item = row['movieIdInt']

    # Compute bias-adjusted rating: r_ui - (μ + b_u + b_i)
    return row['rating'] - (global_bias + user_bias.get(user, 0) + item_bias.get(item, 0))

train_df["adjusted_rating"] = train_df.apply(adjust_rating, axis=1)



In [None]:
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,userIdInt,movieIdInt,adjusted_rating
0,4,4896,4.0,1042667845,0,2073,0.478815
1,4,415,4.0,1042667925,0,316,0.478815
2,4,2023,5.0,1042667945,0,1253,1.478815
3,4,3004,1.0,1042668521,0,1618,-2.521185
4,4,2694,1.0,1042668544,0,1525,-2.521185


In [None]:
# Define the save path
save_path = os.path.join(dataset_path, "data_ready_for_model", "similarity")
os.makedirs(save_path, exist_ok=True)  # Ensure the directory exists

# Save the first few rows of train_df
train_df.to_csv(os.path.join(save_path, "train_adjusted_ratings.csv"), index=False)

print(f"Saved to {save_path}/train_adjusted_ratings.csv")


Saved to /content/drive/MyDrive/datasets/capstone/data_ready_for_model/similarity/train_adjusted_ratings.csv


In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr
from tqdm import tqdm  # For progress tracking

# Convert user and item IDs to categorical codes (reduces memory usage)
train_df["userIdCat"] = train_df["userIdInt"].astype("category").cat.codes
train_df["movieIdCat"] = train_df["movieIdInt"].astype("category").cat.codes

# Create a sparse matrix (user-item ratings)
user_item_sparse = csr_matrix(
    (train_df["adjusted_rating"], (train_df["userIdCat"], train_df["movieIdCat"]))
)

print("Sparse matrix shape:", user_item_sparse.shape)


Sparse matrix shape: (120147, 7486)


In [None]:
import numpy as np
import scipy.sparse as sp

import numpy as np
import scipy.sparse as sp

def fast_pearson_shrinkage(item_sparse, lambda_=100):
    """
    Computes shrunken Pearson correlation efficiently using sparse matrix operations.
    Assumes that `item_sparse` already contains bias-adjusted ratings.
    """
    # Convert sparse matrix to compressed format
    item_sparse = item_sparse.tocsc()

    # Compute co-rated counts (users who rated both items)
    co_rated_counts = (item_sparse > 0).T @ (item_sparse > 0)  # Sparse matrix

    # Compute Pearson correlation using dot products (vectorized)
    dot_product = item_sparse.T @ item_sparse  # Fast matrix multiplication
    norm_factor = np.sqrt(item_sparse.power(2).sum(axis=0))  # Compute norms
    pearson_sim = dot_product.multiply(1 / (norm_factor.T @ norm_factor))  # Sparse-friendly Pearson formula

    # Convert co_rated_counts to dense before scalar addition
    shrinkage_factor = co_rated_counts.copy()
    shrinkage_factor.data = shrinkage_factor.data / (shrinkage_factor.data + lambda_)

    # Apply shrinkage
    shrunken_similarity = shrinkage_factor.multiply(pearson_sim)

    return shrunken_similarity  # Returns a sparse matrix



# Compute shrunken Pearson similarity
shrunken_similarity = fast_pearson_shrinkage(user_item_sparse)

# Convert to DataFrame for readability
item_similarity_df = pd.DataFrame(shrunken_similarity)


item_similarity_df.head()

Unnamed: 0,0
0,"(0, 0)\t0.009900990099008328\n (0, 164)\t0...."
1,"(0, 1)\t0.009900990099011096\n (0, 3)\t0.00..."
2,"(0, 2)\t0.009900990099011117\n (0, 57)\t0.0..."
3,"(0, 3)\t0.009900990099009294\n (0, 234)\t0...."
4,"(0, 4)\t0.009900990099011228\n (0, 131)\t0...."


In [None]:
from scipy.sparse import save_npz, load_npz

# Save the sparse similarity matrix
save_path = dataset_path+ "/data_ready_for_model/similarity/shrunken_similarity.npz"
save_npz(save_path, shrunken_similarity)

print(f"Sparse similarity matrix saved to {save_path}")


Sparse similarity matrix saved to /content/drive/MyDrive/datasets/capstone/data_ready_for_model/similarity/shrunken_similarity.npz


In [None]:
from scipy.sparse import load_npz, save_npz

# Load the saved sparse similarity matrix
shrunken_similarity = load_npz(dataset_path + "/data_ready_for_model/similarity/shrunken_similarity.npz")

# Convert sparse matrix to dense NumPy array
shrunken_similarity_dense = shrunken_similarity.toarray()

# Convert to DataFrame for easy alignment
shrunken_sim_df = pd.DataFrame(shrunken_similarity_dense, index=train_movie_ids, columns=train_movie_ids)

# Align CF similarity matrix to match train_movie_ids
shrunken_sim_df = shrunken_sim_df.loc[train_movie_ids, train_movie_ids]

# Convert back to NumPy array
similarity_matrix = shrunken_sim_df.to_numpy()

# Save aligned collaborative similarity matrix
np.save(dataset_path + "/data_ready_for_model/similarity/collab_similarity_matrix_aligned.npy", similarity_matrix)

# Print verification
print("Updated Collaborative Similarity Matrix Shape:", similarity_matrix.shape)


Updated Collaborative Similarity Matrix Shape: (7486, 7486)


In [None]:
# Save the similarity matrix
save_path = os.path.join(dataset_path, "data_ready_for_model", "similarity")
os.makedirs(save_path, exist_ok=True)
item_similarity_df.to_csv(os.path.join(save_path, "item_similarity_matrix_shrunken.csv"))

print(f"Saved shrunken Pearson item similarity matrix to {save_path}/item_similarity_matrix_shrunken.csv")

In [None]:
user_item_matrix = train_df.pivot(index='userIdInt', columns='movieIdInt', values='adjusted_rating').fillna(0)
# Pearson
item_similarity = user_item_matrix.T.corr()

# Apply shrinkage
lambda_ = 100
co_rated_counts = user_item_matrix.T.notna().astype(int).dot(user_item_matrix.notna().astype(int))
shrunken_similarity = (co_rated_counts / (co_rated_counts + lambda_)) * item_similarity


In [None]:
shrunken_similarity.to_csv(dataset_path+"/data_ready_for_model/similarity/item_similarity_matrix_shrunken.csv")
print("Shrunken Pearson item similarity matrix saved!")


In [None]:
train_movie_ids = set(train_df['movieIdInt'])
train_meta_df = meta_df[meta_df['movieIdInt'].isin(train_movie_ids)]
train_meta_df.head()

Unnamed: 0,movieId,title,year,genres,first_three_actors,director,original_language,imdb_id,movieIdInt
0,862,Toy Story,1995,"Animation, Comedy, Family","tomhanks,timallen,donrickles",johnlasseter,en,tt0114709,670
1,8844,Jumanji,1995,"Adventure, Fantasy, Family","robinwilliams,jonathanhyde,kirstendunst",joejohnston,en,tt0113497,2789
2,949,Heat,1995,"Action, Crime, Drama, Thriller","alpacino,robertdeniro,valkilmer",michaelmann,en,tt0113277,744
3,710,GoldenEye,1995,"Adventure, Action, Thriller","piercebrosnan,seanbean,izabellascorupco",martincampbell,en,tt0113189,553
4,1408,Cutthroat Island,1995,"Action, Adventure","geenadavis,matthewmodine,franklangella",rennyharlin,en,tt0112760,882


In [None]:
valid_df = pd.read_csv(dataset_path + '/val_ratings.csv')
valid_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,userIdInt,movieIdInt
0,4,1644,3.0,1042674845,0,1005
1,7,39183,3.0,1486253891,1,3808
2,8,1792,1.0,1013444101,2,1087
3,9,150,4.0,1073837284,3,107
4,11,36513,3.0,1231682019,4,3762


In [None]:
test_df = pd.read_csv(dataset_path + '/test_ratings.csv')
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,userIdInt,movieIdInt
0,4,1422,4.0,1042674861,0,894
1,7,88744,1.5,1486253974,1,5375
2,8,2022,3.0,1013444158,2,1252
3,9,2321,4.0,1073837327,3,1419
4,11,6058,3.5,1231683242,4,2384


In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Filter metadata for training movies
train_movie_ids = set(train_df['movieIdInt'])
train_meta_df = meta_df[meta_df['movieIdInt'].isin(train_movie_ids)].copy()

# Step 2: Split genre strings into lists
train_meta_df['genres'] = train_meta_df['genres'].str.split(', ')

# Step 3: Apply MultiLabelBinarizer to one-hot encode genres
mlb = MultiLabelBinarizer()
genre_ohe = pd.DataFrame(mlb.fit_transform(train_meta_df['genres']), columns=mlb.classes_)

# Step 4: Rename genre columns to start with "Genre_"
genre_ohe = genre_ohe.rename(columns=lambda x: f"G_{x}")

# Step 5: Merge One-Hot Encoded Genres with train_meta_df
train_meta_df = train_meta_df.drop(columns=['genres']).reset_index(drop=True)
train_meta_df = pd.concat([train_meta_df, genre_ohe], axis=1)


train_meta_df.head()



Unnamed: 0,movieId,title,year,first_three_actors,director,original_language,imdb_id,movieIdInt,G_Action,G_Adult,...,G_Mystery,G_Romance,G_Sci-Fi,G_Science Fiction,G_Short,G_Sport,G_TV Movie,G_Thriller,G_War,G_Western
0,862,Toy Story,1995,"tomhanks,timallen,donrickles",johnlasseter,en,tt0114709,670,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8844,Jumanji,1995,"robinwilliams,jonathanhyde,kirstendunst",joejohnston,en,tt0113497,2789,0,0,...,0,0,0,0,0,0,0,0,0,0
2,949,Heat,1995,"alpacino,robertdeniro,valkilmer",michaelmann,en,tt0113277,744,1,0,...,0,0,0,0,0,0,0,1,0,0
3,710,GoldenEye,1995,"piercebrosnan,seanbean,izabellascorupco",martincampbell,en,tt0113189,553,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1408,Cutthroat Island,1995,"geenadavis,matthewmodine,franklangella",rennyharlin,en,tt0112760,882,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import pandas as pd

# Step 1: Extract the genre set from training (ensure consistency)
train_genre_set = set(mlb.classes_)  # Genres seen in training
train_genre_list = list(train_genre_set)

# Step 2: Define a function to apply encoding to test/valid sets
def encode_genres(df, genre_list):
    df = df.copy()  # Avoid modifying the original dataframe
    df['genres'] = df['genres'].fillna('').str.split(', ')  # Ensure no NaNs in genres

    # Create a one-hot encoding DataFrame with the same columns as training
    genre_ohe = pd.DataFrame(0, index=df.index, columns=genre_list).fillna(0)

    # Populate one-hot encoding only for genres present in the training set
    for idx, genres in df['genres'].items():
        known_genres = set(genres) & set(genre_list)  # Keep only known genres
        genre_ohe.loc[idx, list(known_genres)] = 1

    # Rename columns with prefix "Genre_"
    genre_ohe = genre_ohe.rename(columns=lambda x: f"G_{x}")

    # Ensure both DataFrames have properly reset indices before merging
    df = df.drop(columns=['genres']).reset_index(drop=True)
    genre_ohe = genre_ohe.reset_index(drop=True)  # Fix: Reset index for genre_ohe

    # Merge one-hot encoding with the original dataframe
    df = pd.concat([df, genre_ohe], axis=1)

    return df

# Step 3: Filter test and validation sets to include only known movie IDs
test_movie_ids = set(test_df['movieIdInt'])
valid_movie_ids = set(valid_df['movieIdInt'])

test_meta_df = meta_df[meta_df['movieIdInt'].isin(test_movie_ids)].copy()
valid_meta_df = meta_df[meta_df['movieIdInt'].isin(valid_movie_ids)].copy()

# Step 4: Apply the genre encoding
test_meta_df = encode_genres(test_meta_df, train_genre_list)
valid_meta_df = encode_genres(valid_meta_df, train_genre_list)

# Print column names to confirm renaming
print("Updated Genre Column Names:", [col for col in test_meta_df.columns if col.startswith("Genre_")])


Updated Genre Column Names: []


In [None]:
test_meta_df.head()

Unnamed: 0,movieId,title,year,first_three_actors,director,original_language,imdb_id,movieIdInt,G_Sport,G_Action,...,G_Drama,G_Western,G_Fantasy,G_Biography,G_Sci-Fi,G_Musical,G_Comedy,G_Short,G_Crime,G_Horror
0,862,Toy Story,1995,"tomhanks,timallen,donrickles",johnlasseter,en,tt0114709,670,0,0,...,0,0,0,0,0,0,1,0,0,0
1,949,Heat,1995,"alpacino,robertdeniro,valkilmer",michaelmann,en,tt0113277,744,0,1,...,1,0,0,0,0,0,0,0,1,0
2,710,GoldenEye,1995,"piercebrosnan,seanbean,izabellascorupco",martincampbell,en,tt0113189,553,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1408,Cutthroat Island,1995,"geenadavis,matthewmodine,franklangella",rennyharlin,en,tt0112760,882,0,1,...,0,0,0,0,0,0,0,0,0,0
4,524,Casino,1995,"robertdeniro,sharonstone,joepesci",martinscorsese,en,tt0112641,391,0,0,...,1,0,0,0,0,0,0,0,1,0


In [None]:
valid_meta_df.head()

Unnamed: 0,movieId,title,year,first_three_actors,director,original_language,imdb_id,movieIdInt,G_Sport,G_Action,...,G_Drama,G_Western,G_Fantasy,G_Biography,G_Sci-Fi,G_Musical,G_Comedy,G_Short,G_Crime,G_Horror
0,862,Toy Story,1995,"tomhanks,timallen,donrickles",johnlasseter,en,tt0114709,670,0,0,...,0,0,0,0,0,0,1,0,0,0
1,8844,Jumanji,1995,"robinwilliams,jonathanhyde,kirstendunst",joejohnston,en,tt0113497,2789,0,0,...,0,0,1,0,0,0,0,0,0,0
2,949,Heat,1995,"alpacino,robertdeniro,valkilmer",michaelmann,en,tt0113277,744,0,1,...,1,0,0,0,0,0,0,0,1,0
3,710,GoldenEye,1995,"piercebrosnan,seanbean,izabellascorupco",martincampbell,en,tt0113189,553,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1408,Cutthroat Island,1995,"geenadavis,matthewmodine,franklangella",rennyharlin,en,tt0112760,882,0,1,...,0,0,0,0,0,0,0,0,0,0


## TF-IDF Encoding for `first_three_actors` and `director`

In [None]:
num_unique_directors = meta_df['director'].nunique()
print(f"Number of unique directors: {num_unique_directors}")


Number of unique directors: 4414


In [None]:
# Split the actor names and flatten the list
actor_names = meta_df['first_three_actors'].dropna().str.split(',').explode()

# Count unique actor names
num_unique_actors = actor_names.nunique()
print(f"Number of unique actors: {num_unique_actors}")


Number of unique actors: 12480


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Step 1: Extract training data for TF-IDF fitting
train_actor_texts = train_meta_df['first_three_actors'].fillna('')
valid_actor_texts = valid_meta_df['first_three_actors'].fillna('')
test_actor_texts = test_meta_df['first_three_actors'].fillna('')

# Step 2: Fit TF-IDF on training data only
actor_vectorizer = TfidfVectorizer()  # Ensures full names as single tokens
actor_tfidf_train = actor_vectorizer.fit_transform(train_actor_texts)

# Step 3: Transform validation and test sets using the trained vectorizer
valid_actor_tfidf = actor_vectorizer.transform(valid_actor_texts)
test_actor_tfidf = actor_vectorizer.transform(test_actor_texts)

# Step 4: Create a mapping from row index to movieIdInt
train_movie_ids = train_meta_df['movieIdInt'].to_numpy()
valid_movie_ids = valid_meta_df['movieIdInt'].to_numpy()
test_movie_ids = test_meta_df['movieIdInt'].to_numpy()

# Convert TF-IDF matrices to a list of tuples (movieIdInt, feature_index, tfidf_score)
def convert_tfidf_to_list(tfidf_matrix, movie_ids):
    coo_matrix = tfidf_matrix.tocoo()  # Convert to COO format for easy iteration
    return [(movie_ids[row], col, score) for row, col, score in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)]

train_actor_tfidf_list = convert_tfidf_to_list(actor_tfidf_train, train_movie_ids)
valid_actor_tfidf_list = convert_tfidf_to_list(valid_actor_tfidf, valid_movie_ids)
test_actor_tfidf_list = convert_tfidf_to_list(test_actor_tfidf, test_movie_ids)

# Print a sample
print("Sample from Training Actor TF-IDF:", train_actor_tfidf_list[:5])

# Step 5: Save to a structured format if needed
import pandas as pd

# Convert to DataFrame for easier inspection
train_actor_tfidf_df = pd.DataFrame(train_actor_tfidf_list, columns=['movieIdInt', 'actor_idex', 'tfidf_score'])
valid_actor_tfidf_df = pd.DataFrame(valid_actor_tfidf_list, columns=['movieIdInt', 'actor_idex', 'tfidf_score'])
test_actor_tfidf_df = pd.DataFrame(test_actor_tfidf_list, columns=['movieIdInt', 'actor_idex', 'tfidf_score'])

train_actor_tfidf_df.head()


Sample from Training Actor TF-IDF: [(670, 11487, 0.5117537866893297), (670, 11396, 0.6007242749976105), (670, 2890, 0.614197368309021), (2789, 10036, 0.5128378575882407), (2789, 5821, 0.6620217422277047)]


Unnamed: 0,movieIdInt,actor_idex,tfidf_score
0,670,11487,0.511754
1,670,11396,0.600724
2,670,2890,0.614197
3,2789,10036,0.512838
4,2789,5821,0.662022


In [None]:
import os

# Define the directory path
tfidf_dir = os.path.join(dataset_path, "tfidf")

# Create the directory if it does not exist
os.makedirs(tfidf_dir, exist_ok=True)

# Save to Parquet for efficient storage
train_actor_tfidf_df.to_parquet(tfidf_dir+'/train_actor_tfidf.parquet', index=False)
valid_actor_tfidf_df.to_parquet(tfidf_dir+'/valid_actor_tfidf.parquet', index=False)
test_actor_tfidf_df.to_parquet(tfidf_dir+'/test_actor_tfidf.parquet', index=False)

print("Saved TF-IDF files successfully!")


Saved TF-IDF files successfully!


In [None]:
# Save actor feature names mapping
actor_feature_names = actor_vectorizer.get_feature_names_out()

# Convert to DataFrame
actor_feature_mapping_df = pd.DataFrame({'actor_index': range(len(actor_feature_names)), 'actor_name': actor_feature_names})

# Save mapping
actor_feature_mapping_df.to_parquet(tfidf_dir+'/actor_feature_mapping.parquet', index=False)

print("Saved Actor Feature Mapping Successfully!")


Saved Actor Feature Mapping Successfully!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

# Step 1: Extract training data for TF-IDF fitting
train_director_texts = train_meta_df['director'].fillna('')
valid_director_texts = valid_meta_df['director'].fillna('')
test_director_texts = test_meta_df['director'].fillna('')

# Step 2: Fit TF-IDF on training data only
director_vectorizer = TfidfVectorizer()  # Treats full director names as single tokens
director_tfidf_train = director_vectorizer.fit_transform(train_director_texts)

# Step 3: Transform validation and test sets using the trained vectorizer
valid_director_tfidf = director_vectorizer.transform(valid_director_texts)
test_director_tfidf = director_vectorizer.transform(test_director_texts)

# Step 4: Create a mapping from row index to movieIdInt
train_movie_ids = train_meta_df['movieIdInt'].to_numpy()
valid_movie_ids = valid_meta_df['movieIdInt'].to_numpy()
test_movie_ids = test_meta_df['movieIdInt'].to_numpy()

# Convert TF-IDF matrices to a list of tuples (movieIdInt, feature_index, tfidf_score)
def convert_tfidf_to_list(tfidf_matrix, movie_ids):
    coo_matrix = tfidf_matrix.tocoo()  # Convert to COO format for easy iteration
    return [(movie_ids[row], col, score) for row, col, score in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)]

train_director_tfidf_list = convert_tfidf_to_list(director_tfidf_train, train_movie_ids)
valid_director_tfidf_list = convert_tfidf_to_list(valid_director_tfidf, valid_movie_ids)
test_director_tfidf_list = convert_tfidf_to_list(test_director_tfidf, test_movie_ids)

# Print a sample
print("Sample from Training Director TF-IDF:", train_director_tfidf_list[:5])

# Step 5: Save to a structured format if needed

# Convert to DataFrame for easier inspection
train_director_tfidf_df = pd.DataFrame(train_director_tfidf_list, columns=['movieIdInt', 'director_index', 'tfidf_score'])
valid_director_tfidf_df = pd.DataFrame(valid_director_tfidf_list, columns=['movieIdInt', 'director_index', 'tfidf_score'])
test_director_tfidf_df = pd.DataFrame(test_director_tfidf_list, columns=['movieIdInt', 'director_index', 'tfidf_score'])

train_director_tfidf_df.head()


Sample from Training Director TF-IDF: [(670, 2038, 1.0), (2789, 1970, 1.0), (744, 2820, 1.0), (553, 2693, 1.0), (882, 3374, 1.0)]


Unnamed: 0,movieIdInt,director_index,tfidf_score
0,670,2038,1.0
1,2789,1970,1.0
2,744,2820,1.0
3,553,2693,1.0
4,882,3374,1.0


In [None]:
# Save to Parquet for efficient storage
train_director_tfidf_df.to_parquet(tfidf_dir+'/train_director_tfidf.parquet', index=False)
valid_director_tfidf_df.to_parquet(tfidf_dir+'/valid_director_tfidf.parquet', index=False)
test_director_tfidf_df.to_parquet(tfidf_dir+'/test_director_tfidf.parquet', index=False)

print("Saved TF-IDF files successfully!")


Saved TF-IDF files successfully!


In [None]:
# Save director feature names mapping
director_feature_names = director_vectorizer.get_feature_names_out()

# Convert to DataFrame
director_feature_mapping_df = pd.DataFrame({'director_index': range(len(director_feature_names)), 'director_name': director_feature_names})

# Save mapping
director_feature_mapping_df.to_parquet(tfidf_dir+'/director_feature_mapping.parquet', index=False)

print("Saved director Feature Mapping Successfully!")


Saved director Feature Mapping Successfully!


## Language

In [None]:
import pandas as pd

# Step 1: Fit One-Hot Encoding on Training Data Only (Defining Language Categories)
train_language_ohe = pd.get_dummies(train_meta_df['original_language'], prefix='L').astype(int)

# Step 2: Apply the Same Encoding on Validation and Test (No New Fitting)
valid_language_ohe = pd.get_dummies(valid_meta_df['original_language'], prefix='L').astype(int)
test_language_ohe = pd.get_dummies(test_meta_df['original_language'], prefix='L').astype(int)

# Step 3: Ensure Validation and Test Have the Same Columns as Training (Avoid Leakage)
valid_language_ohe = valid_language_ohe.reindex(columns=train_language_ohe.columns, fill_value=0)
test_language_ohe = test_language_ohe.reindex(columns=train_language_ohe.columns, fill_value=0)

# Step 4: Create New Encoded DataFrames Without Modifying Original DataFrames
train_meta_encoded = pd.concat([train_meta_df.drop(columns=['original_language']), train_language_ohe], axis=1)
valid_meta_encoded = pd.concat([valid_meta_df.drop(columns=['original_language']), valid_language_ohe], axis=1)
test_meta_encoded = pd.concat([test_meta_df.drop(columns=['original_language']), test_language_ohe], axis=1)


# Step 5: Print Confirmation
print(f"Train language encoding shape: {train_meta_encoded.shape}")
print(f"Valid language encoding shape: {valid_meta_encoded.shape}")
print(f"Test language encoding shape: {test_meta_encoded.shape}")


Train language encoding shape: (7486, 44)
Valid language encoding shape: (3774, 44)
Test language encoding shape: (3851, 44)


In [None]:
# Step 1: Identify columns to keep (encoded features only)
encoded_columns = [col for col in train_meta_encoded.columns if col.startswith("G_") or col.startswith("L_")]
encoded_columns.append("movieIdInt")  # Keep movieIdInt for mapping

# Step 2: Extract the relevant columns
train_ohe_features = train_meta_encoded[encoded_columns]
valid_ohe_features = valid_meta_encoded[encoded_columns]
test_ohe_features = test_meta_encoded[encoded_columns]

train_ohe_features.head()

Unnamed: 0,G_Action,G_Adult,G_Adventure,G_Animation,G_Biography,G_Comedy,G_Crime,G_Documentary,G_Drama,G_Family,...,L_es,L_fr,L_hi,L_it,L_ja,L_ko,L_other,L_ru,L_sv,movieIdInt
0,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,670
1,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2789
2,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,744
3,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,553
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,882


In [None]:
import os

# Define the directory path
ohe_dir = os.path.join(dataset_path, "ohe")

# Create the directory if it does not exist
os.makedirs(ohe_dir, exist_ok=True)

# Save to Parquet for efficient storage
train_ohe_features.to_parquet(os.path.join(ohe_dir, "train_ohe.parquet"), index=False)
valid_ohe_features.to_parquet(os.path.join(ohe_dir, "valid_ohe.parquet"), index=False)
test_ohe_features.to_parquet(os.path.join(ohe_dir, "test_ohe.parquet"), index=False)

print("Saved One-Hot Encoded (OHE) features successfully in:", ohe_dir)


Saved One-Hot Encoded (OHE) features successfully in: /content/drive/MyDrive/datasets/capstone/ohe


In [None]:
train_year = train_meta_encoded[['year',"movieIdInt"]]
valid_year = valid_meta_encoded[['year',"movieIdInt"]]
test_year = test_meta_encoded[['year',"movieIdInt"]]
train_year.head()

Unnamed: 0,year,movieIdInt
0,1995,670
1,1995,2789
2,1995,744
3,1995,553
4,1995,882


In [None]:
import os

# Define the directory path
year_dir = os.path.join(dataset_path, "year")

# Create the directory if it does not exist
os.makedirs(year_dir, exist_ok=True)

# Save to Parquet for efficient storage
train_year.to_parquet(os.path.join(year_dir, "train_year.parquet"), index=False)
valid_year.to_parquet(os.path.join(year_dir, "valid_year.parquet"), index=False)
test_year.to_parquet(os.path.join(year_dir, "test_year.parquet"), index=False)

print("Saved Year feature files successfully in:", year_dir)


Saved Year feature files successfully in: /content/drive/MyDrive/datasets/capstone/year


## Calculate Biases using Training Data