In [None]:
# Imports
import numpy as np
from sklearn.model_selection import KFold
import random    
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

In [9]:
# Matrix factorization

import numpy as np
from sklearn.model_selection import KFold
import random    
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
import datetime

def parse_date(date):
    return datetime.datetime.fromtimestamp(float(date))

def create_ratings_dataset(input_path):
    column_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    df = pd.read_table(input_path, sep='::', header=None, names=column_names, engine='python')
    df.sort_values(by='timestamp', inplace=True)
    df.reset_index(inplace=True, drop=True)
    return df

def preprocess_data(X):
    copy = X.copy()

    user_ids = X['user_id'].unique().tolist()
    movie_ids = X['movie_id'].unique().tolist()

    num_users = len(user_ids)
    num_movies = len(movie_ids)

    idx_user = range(num_users)
    idx_movie = range(num_movies)

    mapping_user = dict(zip(user_ids, idx_user))
    mapping_movie = dict(zip(movie_ids, idx_movie))

    copy['user_id'] = copy['user_id'].map(mapping_user).astype(np.int32)
    copy['movie_id'] = copy['movie_id'].map(mapping_movie).astype(np.int32)
    
    return copy[['user_id', 'movie_id', 'rating']].values, mapping_user, mapping_movie

def update_weights(train_data, factors, learning_rate, regularization, num_factors, global_mean):
    [train_x, mapping_user, mapping_movie] = train_data
    [user_factors, movie_factors] = factors
    for i in range(train_x.shape[0]):
        user, movie, rating = int(train_x[i, 0]), int(train_x[i, 1]), train_x[i, 2]
        predicted = predict_one_pair([user_factors, movie_factors], global_mean, user, movie, mapping_user, mapping_movie)
        error = abs(rating - predicted)

        for factor in range(num_factors):
            current_user_factor = user_factors[user, factor]
            current_movie_factor = movie_factors[movie, factor]

            user_factors[user, factor] += learning_rate * (2 * error * current_movie_factor - regularization * current_user_factor)
            movie_factors[movie, factor] += learning_rate * (2 * error * current_user_factor - regularization * current_movie_factor)

    return  [user_factors, movie_factors]

def fit_factorization_model(train_data, val_data, learning_rate, iterations, regularization, num_factors):
    [train_x, train_mapping_user, train_mapping_movie] = train_data
    [val_x, val_mapping_user, val_mapping_movie] = val_data
    global_mean = np.mean(train_x[:, 2])
    num_users = len(np.unique(train_x[:, 0]))
    num_movies = len(np.unique(train_x[:, 1]))
    ratings = val_x[:, 2]

    user_factors = np.random.uniform(-1, 1, (num_users, num_factors))
    movie_factors = np.random.uniform(-1, 1, (num_movies, num_factors))
    factors = [user_factors, movie_factors]

    rmse_increased = False
    former_rmse = 999999999

    for epoch in range(iterations):
        factors = update_weights(train_data,factors, learning_rate, regularization, num_factors, global_mean)
        predictions =  predict_with_model(factors, global_mean, val_data)
        rmse = ((ratings - predictions) ** 2).mean() ** .5
        print('Epoch:', epoch, 'Current RMSE:', rmse)
        if rmse_increased and rmse >= former_rmse:
            break
        rmse_increased = rmse >= former_rmse
        former_rmse = rmse
        
    return factors, global_mean

def predict_one_pair( factors, global_mean, user_id, movie_id, mapping_user, mapping_movie):
    num_known = 0
    predicted = global_mean
    [user_factors, movie_factors] = factors
    if user_id in mapping_user:
        user_idx = mapping_user[user_id]
        num_known += 1

    if movie_id in mapping_movie:
        movie_idx = mapping_movie[movie_id]
        num_known += 1
    
    if num_known == 2:
        predicted += np.dot(user_factors[user_idx], movie_factors[movie_idx])
        print(predicted)
    if predicted > 5:
        return 5
    if predicted < 1:
        return 1
    return predicted

def predict_with_model(factors, global_mean, data):
    [X, mapping_user, mapping_movie] = data
    return  [
              predict_one_pair(factors, global_mean, user_id, movie_id, mapping_user, mapping_movie) for user_id, movie_id in zip(X[:, 0], X[:, 1])
            ]

def k_fold_matrix_factorization(input_path, learning_rate, iterations, regularization, num_factors):
    df = create_ratings_dataset(input_path)
    kf = KFold(n_splits = 5, shuffle = True, random_state = 9)
    maes = []
    rmses = []
    for train_index, test_index in kf.split(df):
        train_x = df.iloc[train_index]
        test_x = df.iloc[test_index]
        val_x = train_x.sample(frac=0.25, random_state=9)
        train_x.drop(val_x.index.to_list(), inplace=True)
        train_x, train_mapping_user, train_mapping_movie = preprocess_data(train_x)
        test_x, test_mapping_user, test_mapping_movie = preprocess_data(test_x)
        val_x, val_mapping_user, val_mapping_movie = preprocess_data(val_x)
        train_data = [train_x, train_mapping_user, train_mapping_movie]
        val_data = [val_x, val_mapping_user, val_mapping_movie]
        test_data = [test_x, test_mapping_user, test_mapping_movie]
        factors, global_mean = fit_factorization_model(train_data, val_data, learning_rate, iterations, regularization, num_factors)
        test_predictions = predict_with_model(factors, global_mean, test_data)
        maes.append(mean_absolute_error(test_x[:, 2], test_predictions))
        rmses.append(((test_x[:, 2] - test_predictions) ** 2).mean() ** .5)
    print('Average MAE:', sum(maes) / len(maes))
    print('Average RMSE:', sum(rmses) / len(rmses))
""
k_fold_matrix_factorization(r'./ml-1m/ratings.dat', 0.0005, 75, 0.1, 10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x.drop(val_x.index.to_list(), inplace=True)


TypeError: ufunc 'divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''