In [23]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.layers import Lambda
from sklearn.model_selection import train_test_split
# Display floating-point values in DataFrame output with one digit after the decimal point.
pd.set_option("display.precision", 1)
import re

In [24]:
# Read the two source tables: one row per movie, and one row per (user, movie) rating.
movies_csv = 'movie-lens/ml-latest-small/movies.csv'
ratings_csv = 'movie-lens/ml-latest-small/ratings.csv'

movies = pd.read_csv(movies_csv)
ratings = pd.read_csv(ratings_csv)

In [25]:
# Preview the first four rows of the movies table.
movies.head(n=4)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance


In [26]:
# Preview the first four rows of the ratings table.
ratings.head(n=4)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815


#### Preparing Movies Set

In [27]:
# Pull the four-digit release year out of the title, e.g. "Toy Story (1995)" -> 1995.0.
# Titles with no "(dddd)" group get NaN, which the year filter below discards.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)').astype(float)

# Keep only movies released after 2000. The explicit .copy() makes `movies` an
# independent frame, so the column assignments further down do not write into a
# slice of the loaded table (this is what triggers SettingWithCopyWarning).
filt = movies['year'] > 2000
movies = movies[filt].copy()

popular_genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Fantasy', 'Horror', 'Mystery', 'Romance',
                  'Sci-Fi', 'Thriller']

# Turn the pipe-separated genre string into a list of genre names.
# NOTE(review): after this line `genres` holds lists, so re-running this cell without
# first reloading `movies` is expected to fail -- confirm before relying on re-runs.
movies['genres'] = movies['genres'].str.split('|')

# Keep a movie only if every one of its genres appears in `popular_genres`.
# Copy again for the same reason as above: new columns are added right after.
only_popular = movies['genres'].apply(lambda g: set(g).issubset(popular_genres))
movies = movies[only_popular].copy()

# Add one 0/1 indicator column per genre that occurs in the remaining movies.
genres = sorted(set(g for sublist in movies['genres'] for g in sublist))
for genre in genres:
    movies[genre] = movies['genres'].apply(lambda x: 1 if genre in x else 0)

In [28]:
# Keep only the ratings that refer to a movie still present in `movies`.
kept_movie_ids = movies['movieId']
filtered_ratings = ratings[ratings['movieId'].isin(kept_movie_ids)]

# Mean rating per movie, as a two-column frame (movieId, rating).
movie_avg_ratings = (
    filtered_ratings
    .groupby('movieId')['rating']
    .mean()
    .reset_index()
)

# Attach the mean to each movie and expose it under the name `avg_rating`.
movies = (
    movies
    .merge(movie_avg_ratings, on='movieId', how='left')
    .rename(columns={'rating': 'avg_rating'})
)
movies.head(n=2)

Unnamed: 0,movieId,title,genres,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller,avg_rating
0,4052,Antitrust (2001),"[Crime, Drama, Thriller]",2001.0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,3.6
1,4053,Double Take (2001),"[Action, Comedy]",2001.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2.5


#### Preparing User Set

In [29]:
# One row per (movie, genre) pair, so that each rating can be grouped by genre.
movies_exploded = movies[['movieId', 'genres']].explode('genres')
ratings_with_genres = filtered_ratings.merge(movies_exploded, on='movieId', how='left')

# Average rating each user gave within each genre. Genres become columns, and
# user/genre combinations with no ratings are filled with 0.
genre_means = ratings_with_genres.groupby(['userId', 'genres'])['rating'].mean()
user_genre_avg = genre_means.unstack(fill_value=0).reset_index()
user_genre_avg.head(n=2)

genres,userId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,2,3.9,5.0,0.0,0.0,4.0,3.8,4.3,3.8,0.0,3.0,4.0,0.0,4.2,3.7
1,3,0.5,0.5,0.0,0.5,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.5


#### Preparing Training Set

In [30]:
# Attach the per-user genre averages and then the per-movie features to every rating.
# The genre column names exist in both tables, so the second merge suffixes them:
# `_x` for the user averages and `_y` for the movie indicator flags.
training_set = (
    filtered_ratings
    .merge(user_genre_avg, on='userId', how='left')
    .merge(movies, on='movieId', how='left')
)
training_set.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp,Action_x,Adventure_x,Animation_x,Children_x,Comedy_x,Crime_x,...,Crime_y,Documentary_y,Drama_y,Fantasy_y,Horror_y,Mystery_y,Romance_y,Sci-Fi_y,Thriller_y,avg_rating
0,2,6874,4.0,1445714952,3.9,5.0,0.0,0.0,4.0,3.8,...,1,0,0,0,0,0,0,0,1,4.0
1,2,8798,3.5,1445714960,3.9,5.0,0.0,0.0,4.0,3.8,...,1,0,1,0,0,0,0,0,1,3.8


In [32]:
# Stack `num_repeats` copies of the training set on top of each other
# (with num_repeats = 2 the dataset is doubled, not increased 5-fold).
# `training_set` is rebound to the result, so every re-run of this cell
# multiplies the row count by `num_repeats` again.
num_repeats = 2
training_set = pd.concat([training_set] * num_repeats, ignore_index=True)

#### Extracting the user_train array, the movie_train array, and the output label y from the training set

In [33]:
# Target vector: the rating of each row.
y = training_set['rating'].values

# User features are the columns that received the `_x` suffix in the merge.
user_feature_cols = [name for name in training_set.columns if name.endswith('_x')]
user_train = training_set[user_feature_cols].to_numpy()

# Movie features: release year, the `_y`-suffixed genre flags, then the average rating.
genre_flag_cols = [name for name in training_set.columns if name.endswith('_y')]
movie_feature_cols = ['year', *genre_flag_cols, 'avg_rating']
movie_train = training_set[movie_feature_cols].to_numpy()

# Report the shape of each extracted array.
for label, array in (
    ("User training set shape:", user_train),
    ("Movie training set shape:", movie_train),
    ("Output y shape:", y),
):
    print(label, array.shape)

User training set shape: (57618, 14)
Movie training set shape: (57618, 16)
Output y shape: (57618,)


In [35]:
# Show the first row of the movie feature matrix (kept 2-D by slicing).
movie_train[0:1]

array([[2.00300000e+03, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.96183206e+00]])

In [36]:
# Show the first row of the user feature matrix (kept 2-D by slicing).
user_train[0:1]

array([[3.9       , 5.        , 0.        , 0.        , 4.        ,
        3.83333333, 4.33333333, 3.8       , 0.        , 3.        ,
        4.        , 0.        , 4.25      , 3.66666667]])

#### Normalising the training sets

In [37]:
# Normalise the training set.
# Movie and user features are standardised with their own StandardScaler, each
# fitted on the full array it then transforms.
scaler_movie = StandardScaler().fit(movie_train)
movie_train_scaled = scaler_movie.transform(movie_train)

scaler_user = StandardScaler().fit(user_train)
user_train_scaled = scaler_user.transform(user_train)

# The target is rescaled to the range (-1, 1); the scalers expect a 2-D input,
# hence the reshape of `y` into a single column.
y_column = y.reshape(-1, 1)
scaler_target = MinMaxScaler((-1, 1)).fit(y_column)
y_scaled = scaler_target.transform(y_column)

In [41]:
# Show the first row of the standardised movie features.
movie_train_scaled[0:1]

array([[-0.75572365,  1.52320607, -0.58043655, -0.31781904, -0.30344247,
        -0.82790506,  2.16878377, -0.16856862, -0.84891659, -0.39978657,
        -0.28126502, -0.31416479, -0.47552455, -0.45047889,  1.57941175,
         0.87621312]])

In [42]:
# Show the first row of the standardised user features.
user_train_scaled[0:1]

array([[ 0.91375432,  2.62862291, -3.6814779 , -3.39760293,  1.02917048,
         0.51368747,  0.96262562,  0.44195318, -4.73935666,  0.15922969,
         0.70270596, -4.93200316,  1.2447853 ,  0.44637493]])