In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from gensim.models import Word2Vec
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [58]:
movie = pd.read_csv("Netflix_Dataset_Movie.csv")

In [59]:
movie.head()

Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [60]:
rating = pd.read_csv("Netflix_Dataset_Rating.csv")

In [61]:
rating.head()

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3
1,1331154,4,3
2,2632461,3,3
3,44937,5,3
4,656399,4,3


In [66]:
data = movie.merge(rating,on="Movie_ID")

In [67]:
data.shape

(17337458, 5)

In [68]:
data.head()

Unnamed: 0,Movie_ID,Year,Name,User_ID,Rating
0,3,1997,Character,712664,5
1,3,1997,Character,1331154,4
2,3,1997,Character,2632461,3
3,3,1997,Character,44937,5
4,3,1997,Character,656399,4


In [69]:
data.isna().sum()

Movie_ID    0
Year        0
Name        0
User_ID     0
Rating      0
dtype: int64

In [70]:
data = data.sample(100000)

In [102]:
# Custom objective function for ordinal regression
def ordinal_objective(y_pred, dataset):
    y_true = dataset.get_label()
    y_pred = y_pred.reshape(-1, 1)
    y_true = y_true.reshape(-1, 1)
    
    grad = np.zeros_like(y_pred)
    hess = np.zeros_like(y_pred)
    
    for k in range(1, 6):  
        p_k = 1 / (1 + np.exp(-(y_pred - k)))
        grad += (y_true >= k) - p_k
        hess += p_k * (1 - p_k)
    
    grad = -grad
    hess = hess
    return grad.flatten(), hess.flatten()

# Function to preprocess movie names
def preprocess_name(name):
    return name.lower().split()

# Function to get word embeddings for a movie name
def get_name_embedding(name, model):
    words = preprocess_name(name)
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)


In [71]:
# Calculate user and movie statistics
user_avg_rating = data.groupby('User_ID')['Rating'].mean().reset_index()
user_avg_rating.columns = ['User_ID', 'User_Avg_Rating']
movie_avg_rating = data.groupby('Movie_ID')['Rating'].mean().reset_index()
movie_avg_rating.columns = ['Movie_ID', 'Movie_Avg_Rating']
movie_popularity = data.groupby('Movie_ID')['Rating'].count().reset_index()
movie_popularity.columns = ['Movie_ID', 'Movie_Popularity']

In [72]:
# Merge statistics back to main dataframe
data = pd.merge(data, user_avg_rating, on='User_ID')
data = pd.merge(data, movie_avg_rating, on='Movie_ID')
data = pd.merge(data, movie_popularity, on='Movie_ID')

In [78]:
# Train Word2Vec model on movie names
movie_names = data['Name'].apply(preprocess_name).tolist()
word2vec_model = Word2Vec(sentences=movie_names, vector_size=100, window=5, min_count=1, workers=4)


In [79]:
# Get embeddings for each movie name
name_embeddings = data['Name'].apply(lambda x: get_name_embedding(x, word2vec_model))
name_embedding_df = pd.DataFrame(name_embeddings.tolist(), columns=[f'name_emb_{i}' for i in range(100)])
data = pd.concat([data.reset_index(drop=True), name_embedding_df], axis=1)


In [80]:
# Encode categorical variables
le = LabelEncoder()
data['User_ID'] = le.fit_transform(data['User_ID'])
data['Movie_ID'] = le.fit_transform(data['Movie_ID'])


In [81]:
# Select features
features = ['User_ID', 'Movie_ID', 'Year', 'User_Avg_Rating', 
            'Movie_Avg_Rating', 'Movie_Popularity'] + [f'name_emb_{i}' for i in range(100)]
X = data[features]
y = data['Rating']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [106]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


In [82]:
# Scale numerical features
scaler = StandardScaler()
numerical_features = ['Year', 'User_Avg_Rating', 'Movie_Avg_Rating', 'Movie_Popularity'] + [f'name_emb_{i}' for i in range(100)]
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])


In [110]:
params = {
    'objective': ordinal_objective,
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'max_depth': 7,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}


In [111]:
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=200,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10)
    ]
)


Training until validation scores don't improve for 10 rounds
[10]	train's rmse: 1.65532	valid's rmse: 1.65395
[20]	train's rmse: 0.840996	valid's rmse: 0.840702
[30]	train's rmse: 0.548304	valid's rmse: 0.547323
[40]	train's rmse: 0.641611	valid's rmse: 0.639119
Early stopping, best iteration is:
[31]	train's rmse: 0.545067	valid's rmse: 0.543901


In [90]:
# Make predictions
y_pred = gbm.predict(X_test)

# Round predictions to nearest integer and clip to valid range
y_pred_rounded = np.round(y_pred).clip(1, 5)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_rounded)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_rounded))

print(f"Mean Absolute Error: {mae:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")

Mean Absolute Error: 0.2853
Root Mean Squared Error: 0.5812


In [114]:
# Analyze prediction distribution
from collections import Counter

y_test_dist = Counter(y_test)
y_pred_dist = Counter(y_pred_rounded)

print("\nActual Rating Distribution:")
for rating in sorted(y_test_dist.keys()):
    print(f"Rating {rating}: {y_test_dist[rating]}")

print("\nPredicted Rating Distribution:")
for rating in sorted(y_pred_dist.keys()):
    print(f"Rating {rating}: {y_pred_dist[rating]}")




Actual Rating Distribution:
Rating 1: 861
Rating 2: 1992
Rating 3: 6004
Rating 4: 6842
Rating 5: 4301

Predicted Rating Distribution:
Rating 1.0: 425
Rating 2.0: 1706
Rating 3.0: 6675
Rating 4.0: 7955
Rating 5.0: 3239
