In [1]:
import pandas as pd

file_path = "/kaggle/input/movielens-1m-dataset/ratings.dat"

columns = ['user_id', 'item_id', 'rating', 'timestamp']

# Load dataset
df = pd.read_csv(file_path, sep='::', names=columns, engine='python')

# Display head and shape
print(df.head())
print(f"Dataset shape: {df.shape}")

   user_id  item_id  rating  timestamp
0        1     1193       5  978300760
1        1      661       3  978302109
2        1      914       3  978301968
3        1     3408       4  978300275
4        1     2355       5  978824291
Dataset shape: (1000209, 4)


In [2]:
# Sort by user_id and timestamp
df_sorted = df.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)

# Save the sorted data (optional, saved in Kaggle working directory)
sorted_file_path = "/kaggle/working/sorted_ratings.csv"
df_sorted.to_csv(sorted_file_path, index=False)

# Confirm the sorting
print(df_sorted.head())
print(f"Sorted dataset saved to: {sorted_file_path}")


   user_id  item_id  rating  timestamp
0        1     3186       4  978300019
1        1     1270       5  978300055
2        1     1721       4  978300055
3        1     1022       5  978300055
4        1     2340       3  978300103
Sorted dataset saved to: /kaggle/working/sorted_ratings.csv


In [3]:
import numpy as np

# Get unique users
unique_users = df_sorted['user_id'].unique()

# Split users into training and test sets (e.g., 80% for training, 20% for testing)
np.random.seed(42)
test_user_count = int(0.2 * len(unique_users))  # 20% users for testing
test_users = np.random.choice(unique_users, size=test_user_count, replace=False)
train_users = np.setdiff1d(unique_users, test_users)

train_data = df_sorted[df_sorted['user_id'].isin(train_users)]
test_data = df_sorted[df_sorted['user_id'].isin(test_users)]

train_file_path = "/kaggle/working/train_ratings.csv"
test_file_path = "/kaggle/working/test_ratings.csv"

train_data.to_csv(train_file_path, index=False)
test_data.to_csv(test_file_path, index=False)

print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(f"Training data saved to: {train_file_path}")
print(f"Test data saved to: {test_file_path}")


Training data shape: (808218, 4)
Test data shape: (191991, 4)
Training data saved to: /kaggle/working/train_ratings.csv
Test data saved to: /kaggle/working/test_ratings.csv


In [4]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load the training data
train_file_path = r"/kaggle/working/train_ratings.csv"
train_data = pd.read_csv(train_file_path)

reader = Reader(rating_scale=(1, 5))  # Ratings range in MovieLens is 1-5
data = Dataset.load_from_df(train_data[['user_id', 'item_id', 'rating']], reader)

trainset, valset = train_test_split(data, test_size=0.2, random_state=42)

# Train an SVD model
svd_model = SVD()
svd_model.fit(trainset)

# Evaluate the model on the validation set
predictions = svd_model.test(valset)
rmse = accuracy.rmse(predictions)
print(f"Validation RMSE: {rmse}")

RMSE: 0.8794
Validation RMSE: 0.8794330740426275


In [5]:
from surprise import SVD, Dataset, Reader
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

def prepare_data_for_cold_start(test_data, n_known=5):
    users = test_data['user_id'].unique()
    cold_start_users = []
    for user in users:
        user_data = test_data[test_data['user_id'] == user]
        if len(user_data) > n_known:
            cold_start_users.append(user)
    
    cold_start_data = []
    for user in cold_start_users:
        user_data = test_data[test_data['user_id'] == user]
        known_ratings = user_data.iloc[:n_known]
        unknown_ratings = user_data.iloc[n_known:]
        cold_start_data.append((known_ratings, unknown_ratings))
    
    return cold_start_data

def evaluate_svd_on_cold_start(svd_model, cold_start_data, num_items=1266):
    all_rmse = []
    
    for known_ratings, unknown_ratings in cold_start_data:
        # Convert known ratings to Surprise dataset format
        reader = Reader(rating_scale=(1, 5))  # Adjust rating scale if necessary
        data = Dataset.load_from_df(known_ratings[['user_id', 'item_id', 'rating']], reader)
        trainset = data.build_full_trainset()
        
        # Train the SVD model using Surprise
        svd_model.fit(trainset)
        
        # Predict ratings for the unknown items
        predicted_ratings = []
        actual_ratings = []
        
        for _, row in unknown_ratings.iterrows():
            pred = svd_model.predict(row['user_id'], row['item_id'])
            predicted_ratings.append(pred.est)
            actual_ratings.append(row['rating'])
        
        # Calculate RMSE for the current user
        mse = mean_squared_error(actual_ratings, predicted_ratings)
        rmse = np.sqrt(mse)
        all_rmse.append(rmse)
    
    # Calculate overall RMSE for all cold-start users
    overall_rmse = np.mean(all_rmse)
    return overall_rmse

# Load the test data and split it into cold-start data
test_file_path = '/kaggle/working/test_ratings.csv'  # Replace with actual file path
test_data = pd.read_csv(test_file_path)
cold_start_data = prepare_data_for_cold_start(test_data, n_known=5)

# Initialize the SVD model from Surprise
svd_model = SVD()

# Evaluate the SVD model on the cold-start task
svd_model_rmse = evaluate_svd_on_cold_start(svd_model, cold_start_data)

print(f"SVD Model RMSE: {svd_model_rmse}")


SVD Model RMSE: 1.1533355987456415


In [6]:


# Ensure the padding works correctly
def prepare_meta_learning_data(df, n_known=5, pad_value=0):
    """
    Prepare data for the meta-learning model with padding.
    
    Parameters:
    - df: DataFrame with user_id, item_id, rating.
    - n_known: Number of known ratings to provide as input.
    - pad_value: Value to pad the sequences for the unknown ratings.
    
    Returns:
    - meta_train_X: Features for training (known ratings).
    - meta_train_y: Targets for training (unknown ratings).
    """
    meta_train_X = []
    meta_train_y = []

    # Group data by user_id
    grouped = df.groupby('user_id')

    max_unknown_ratings = 0

    # First, determine the maximum number of unknown ratings per user for padding
    for user_id, group in grouped:
        group = group.sort_values(by='timestamp')
        if len(group) <= n_known:
            continue
        unknown_data = group.iloc[n_known:]
        max_unknown_ratings = max(max_unknown_ratings, len(unknown_data))

    # Now, prepare the features and targets
    for user_id, group in grouped:
        group = group.sort_values(by='timestamp')
        if len(group) <= n_known:
            continue

        # Split known and unknown ratings
        known_data = group.iloc[:n_known]
        unknown_data = group.iloc[n_known:]

        # Prepare input (known ratings) and output (unknown ratings)
        known_ratings = known_data['rating'].values
        unknown_ratings = unknown_data['rating'].values

        # Pad the sequences to the max_unknown_ratings
        padded_known = np.pad(known_ratings, (0, n_known - len(known_ratings)), mode='constant', constant_values=pad_value)
        padded_unknown = np.pad(unknown_ratings, (0, max_unknown_ratings - len(unknown_ratings)), mode='constant', constant_values=pad_value)

        meta_train_X.append(padded_known)
        meta_train_y.append(padded_unknown)

    return np.array(meta_train_X), np.array(meta_train_y)

# Prepare the data
meta_train_X, meta_train_y = prepare_meta_learning_data(train_data, n_known=5)
meta_test_X, meta_test_y = prepare_meta_learning_data(test_data, n_known=5)

print(f"Training data shape: X={meta_train_X.shape}, y={meta_train_y.shape}")
print(f"Testing data shape: X={meta_test_X.shape}, y={meta_test_y.shape}")


Training data shape: X=(4832, 5), y=(4832, 2309)
Testing data shape: X=(1208, 5), y=(1208, 1266)


In [7]:
import numpy as np
import tensorflow as tf

# Adjust target shape if necessary
def ensure_target_shape(meta_train_y, target_shape=(1208, 1266)):
    if meta_train_y.shape[1] != target_shape[1]:
        meta_train_y = meta_train_y[:, :target_shape[1]]  # Adjust if needed
    return meta_train_y

# Model Creation (adjusted to target shape of 1266)
def create_meta_model(n_known=5, n_unknown=1266, n_hidden=64):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(n_known,)),  # Input layer with the correct shape (5 known ratings)
        tf.keras.layers.Dense(n_hidden, activation='relu'),
        tf.keras.layers.Dense(n_hidden, activation='relu'),
        tf.keras.layers.Dense(n_unknown, activation='linear')  # Predict only the unknown ratings
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Ensure target data is correctly shaped
meta_train_y = ensure_target_shape(meta_train_y, target_shape=(1208, 1266))
meta_test_y = ensure_target_shape(meta_test_y, target_shape=(1208, 1266))

# Create the model with correct output size (1266)
meta_model = create_meta_model(n_known=5, n_unknown=1266)

# Train the model
meta_model.fit(
    meta_train_X, meta_train_y,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

# Evaluate the model
test_loss, test_mae = meta_model.evaluate(meta_test_X, meta_test_y)
print(f"Test Loss (MSE): {test_loss}")
print(f"Test MAE: {test_mae}")

# Predict ratings for unseen users
meta_predictions = meta_model.predict(meta_test_X)


Epoch 1/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 1.4186 - mae: 0.5669 - val_loss: 0.9583 - val_mae: 0.5102
Epoch 2/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.0448 - mae: 0.5288 - val_loss: 0.9584 - val_mae: 0.5432
Epoch 3/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0189 - mae: 0.5247 - val_loss: 0.9549 - val_mae: 0.5128
Epoch 4/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.9912 - mae: 0.5008 - val_loss: 0.9594 - val_mae: 0.5305
Epoch 5/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0406 - mae: 0.5307 - val_loss: 0.9576 - val_mae: 0.5337
Epoch 6/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.0082 - mae: 0.5185 - val_loss: 0.9485 - val_mae: 0.4762
Epoch 7/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - 

In [8]:
# Compare predicted ratings with actual ratings for evaluation
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(meta_test_y.flatten(), meta_predictions.flatten()))
print(f"Meta-Learning Model RMSE: {rmse}")

Meta-Learning Model RMSE: 0.9739220481294046


In [10]:
# Comparing improvement
old_rmse = 1.1533355987456415
new_rmse = 0.9739220481294046

improvement = ((old_rmse - new_rmse) / old_rmse) * 100
print(f"Percentage Improvement: {improvement:.2f}%")

Percentage Improvement: 15.56%
