In [1]:
import pandas as pd
import numpy as np

In [2]:
train_dataset = "./datas/data_train.csv"
test_dataset = "./datas/sampleSubmission.csv"

In [3]:
def load_dataset(path):
    """Load dataset as a (User, Movie, Rating) pandas dataframe"""
    df = pd.read_csv(path)
    parsed_df = pd.DataFrame()
    # Get all pairs of (r44_c1) -> (44, 1) (user, movie)
    user_movie_indices = df.Id.apply(lambda x: x.split('_'))
    parsed_df['User'] =  [int(i[0][1:]) for i in user_movie_indices]
    parsed_df['Movie'] = [int(i[1][1:]) for i in user_movie_indices]
    parsed_df['Rating'] = df['Prediction']
    num_items = parsed_df.Movie.nunique()
    num_users = parsed_df.User.nunique()

    print("USERS: {} ITEMS: {}".format(num_users, num_items))
    return parsed_df

In [4]:
train_df = load_dataset(train_dataset)
test_df = load_dataset(test_dataset)

USERS: 10000 ITEMS: 1000
USERS: 10000 ITEMS: 1000


In [5]:
def split_dataset(df, p_test=0.2, min_num_ratings = 0):
    """ split dataframe into train and test set """
    # select user and item based on the condition.
    user_counts = df.User.value_counts()
    valid_users = user_counts[user_counts > min_num_ratings].index.values
    movie_counts = df.Movie.value_counts()
    valid_movies = movie_counts[movie_counts > min_num_ratings].index.values

    valid_ratings = df[df.User.isin(valid_users) & df.Movie.isin(valid_movies)].reset_index(drop=True)

    # Split data
    size = df.shape[0]
    indexes = list(range(size))
    np.random.shuffle(indexes)
    
    test_ind = indexes[:int(size*p_test)]
    train_ind = indexes[int(size*p_test):]
    
    test = valid_ratings.loc[test_ind]
    train = valid_ratings.loc[train_ind]

    print("Train: {}, Test: {}".format(test.shape, train.shape))
    
    # Test that the sum of nb rows of splitted dataframes = nb rows of original
    if (train.shape[0] + test.shape[0] == df.shape[0]):
        return train.reset_index(drop=True), test.reset_index(drop=True)
    else:
        raise Exception("[Error] Train: {} + Test {} != Original: {} !!".format(train_tr.shape[0], test_tr.shape[0], df.shape[0]))


In [6]:
train_tr, test_tr = split_dataset(train_df)

Train: (235390, 3), Test: (941562, 3)


In [7]:
def compute_rmse(pred, truth):
    """ compute RMSE for pandas dataframes """
    truth_sorted = truth.sort_values(['User', 'Movie']).reset_index(drop=True)
    pred_sorted = pred.sort_values(['User', 'Movie']).reset_index(drop=True)

    truth_sorted['square_error'] = np.square(truth_sorted['Rating'] - prediction_sorted['Rating'])

    mse = truth_sorted['square_error'].mean()
    rmse = np.sqrt(mse)

    return rmse

### Tensorflow 

In [24]:
import tensorflow as tf
from sklearn import preprocessing
from sklearn.metrics import precision_score

In [8]:
def standardize(df):
    # Normalize in [0, 1]
    r = df['Rating'].values.astype(float)
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(r.reshape(-1,1))
    df_normalized = pd.DataFrame(x_scaled)
    df['Rating'] = df_normalized
    return df

train_tr = standardize(train_tr)
test_tr = standardize(test_tr)

NameError: name 'preprocessing' is not defined

In [146]:
def df_to_matrix(df):
    # Convert DataFrame in user-item matrix
    matrix = df.pivot(index='User', columns='Movie', values='Rating')
    matrix.fillna(0, inplace=True)
    # Convert to numpy matrix
    users = matrix.index.tolist()
    items = matrix.columns.tolist()

    matrix = matrix.as_matrix()
    return users, items, matrix

users, items, matrix_tr = df_to_matrix(train_tr)
num_users = len(users)
num_items = len(items)

print(num_users, num_items)

10000 1000


### DIDN'T WORK!

In [110]:
# from : https://vitobellini.github.io/posts/2018/01/03/how-to-build-a-recommender-system-in-tensorflow.html

# Network Parameters

num_input = num_items
num_hidden_1 = 10
num_hidden_2 = 5

X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

In [111]:
# Building the encoder

def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    # Encoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2


# Building the decoder

def decoder(x):
    # Decoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2


# Construct model

encoder_op = encoder(X)
decoder_op = decoder(encoder_op)


# Prediction

y_pred = decoder_op


# Targets are the input data.

y_true = X

In [112]:
# Define loss and optimizer, minimize the squared error

loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)

predictions = pd.DataFrame()

# Define evaluation metrics

eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

In [113]:
# Initialize the variables (i.e. assign their default value)

init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

In [114]:
matrix = matrix_tr.copy()
matrix

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 1. , 0. , 0.5],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0.5]])

In [115]:
with tf.Session() as session:
    epochs = 100
    batch_size = 250

    session.run(init)
    session.run(local_init)
    
    num_batches = int(matrix.shape[0] / batch_size)
    matrix = np.array_split(matrix, num_batches)

    for i in range(epochs):

        avg_cost = 0

        for batch in matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("Epoch: {} Loss: {}".format(i + 1, avg_cost))

    print("Predictions...")

    matrix = np.concatenate(matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: matrix})

    predictions = predictions.append(pd.DataFrame(preds))

    predictions = predictions.stack().reset_index(name='Rating')
    predictions.columns = ['User', 'Movie', 'Rating']
    predictions['User'] = predictions['User'].map(lambda value: users[value])
    predictions['Movie'] = predictions['Movie'].map(lambda value: items[value])
    
    print("Filtering out items in training set")

    keys = ['User', 'Movie']
    i1 = predictions.set_index(keys).index
    df = train_tr.copy()
    i2 = df.set_index(keys).index

    #recs = predictions[~i1.isin(i2)]
    recs = predictions[i1.isin(i2)]
    recs = recs.sort_values(['User', 'Rating'], ascending=[True, False])
    recs = recs.groupby('User').head(10)
   # recs.to_csv('recs.tsv', index=False, header=False)

Epoch: 1 Loss: 0.33865233287215235
Epoch: 2 Loss: 0.33617846965789794
Epoch: 3 Loss: 0.31951463147997855
Epoch: 4 Loss: 0.2501043420284986
Epoch: 5 Loss: 0.1083067674189806
Epoch: 6 Loss: 0.051446060091257094
Epoch: 7 Loss: 0.04880251819267869
Epoch: 8 Loss: 0.04830874148756266
Epoch: 9 Loss: 0.04717384213581681
Epoch: 10 Loss: 0.04601081358268857
Epoch: 11 Loss: 0.045704937912523745
Epoch: 12 Loss: 0.0455327108502388
Epoch: 13 Loss: 0.045409477315843105
Epoch: 14 Loss: 0.045302273239940405
Epoch: 15 Loss: 0.04522054763510823
Epoch: 16 Loss: 0.045146352518349885
Epoch: 17 Loss: 0.04511649133637548
Epoch: 18 Loss: 0.0450407731346786
Epoch: 19 Loss: 0.04499190943315625
Epoch: 20 Loss: 0.044984130375087264
Epoch: 21 Loss: 0.04493501167744398
Epoch: 22 Loss: 0.04489314304664731
Epoch: 23 Loss: 0.044886811077594756
Epoch: 24 Loss: 0.04482487924396992
Epoch: 25 Loss: 0.04484245739877224
Epoch: 26 Loss: 0.04480467587709427
Epoch: 27 Loss: 0.044766230136156084
Epoch: 28 Loss: 0.044736027158796

In [116]:
r = recs.sort_values(by=['User', 'Movie']).reset_index(drop=True)

In [117]:
t = train_tr.sort_values(by=['User', 'Movie']).reset_index(drop=True)

In [137]:
p = predictions.set_index(keys)

In [144]:
results = t.merge(r, how='inner', left_index=True, right_index=True)

### Surprise

from: https://github.com/NicolasHug/Surprise

In [9]:
from surprise import *
from surprise.model_selection import KFold
from surprise import accuracy

In [10]:
def knn(train, test, **kwargs):
    """
    K Nearest Neighbors with Baseline from library Surprise
    Args:
        train (pandas.DataFrame): train set
        test (pandas.DataFrame): test set
        **kwargs: Arbitrary keyword arguments.
            k (int): Number of nearest neighbor for the algorithm
            sim_options (dict): Dictionary specific for the kNN algorithms in Surprise
    Returns:
        pandas.DataFrame: predictions, sorted by (Movie, User)
    """

    # Get parameters
    k = kwargs['k']
    sim_options = kwargs['sim_options']

    # First, we need to dump the pandas DF into files
    train_file = 'tmp_train.csv'
    test_file = 'tmp_test.csv'
    train.to_csv(train_file, index=False, header=False)
    test.to_csv(test_file, index=False, header=False)

    # Create Reader
    reader = Reader(line_format='user item rating', sep=',')

    # Train and test set for Surprise
    fold = [(train_file, test_file)]

    # Load the data
    data = Dataset.load_from_folds(fold, reader=reader)

    # Algorithm
    algo = KNNBaseline(k=k, sim_options=sim_options)

    # Go through 1 fold
    for trainset, testset in data.folds():
        # Train
        algo.train(trainset)

        # Predict
        predictions = algo.test(testset)

    # Postprocess the predictions
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        if val > 5:
            pred[i] = 5
        elif val < 1:
            pred[i] = 1
        else:
            pred[i] = val

    # Copy the test
    df_return = test.copy()
    df_return.Rating = pred

    return df_return

In [11]:

k_fold = 5
verbose = True

train_full = train_tr.append(test_tr)

# reader with rating scale
reader = Reader(rating_scale=(1, 5))

# load data from df
data = Dataset.load_from_df(train_full, reader)

kf = KFold(n_splits=k_fold)

algo = SVD()

rmses = dict()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    model = algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print RMSE
    rmse_ = accuracy.rmse(predictions, verbose=verbose)

    rmses[rmse_] = model

# Find the model with least RMSE
lowest_rmse = min(rmses.keys())
best_model = rmses[lowest_rmse]


RMSE: 1.0266
RMSE: 1.0249
RMSE: 1.0281
RMSE: 1.0273
RMSE: 1.0288


In [12]:
# Find the model with least RMSE
lowest_rmse = min(rmses.keys())
best_model = rmses[lowest_rmse]

In [13]:
train_df.head()

Unnamed: 0,User,Movie,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [14]:
best_model.predict(72, 1, None).est

3.220898973079219

In [15]:
test = test_df.copy()
for index, row in test.iterrows():
    user = row.User
    movie = row.Movie
    row.Rating = best_model.predict(user, movie).est

In [21]:
def submission_table(original_df, col_userID, col_movie, col_rate):
    """ return table according with Kaggle convention """

    def id(row):
        return 'r' + str(round(row[col_userID])) + '_c' + str(round(row[col_movie]))

    def pred(row):
        return row[col_rate]

    df = pd.DataFrame.copy(original_df)
    df['Id'] = df.apply(id, axis=1)
    df['Prediction'] = df.apply(pred, axis=1)

    return df[['Id', 'Prediction']]

In [22]:
submission = submission_table(test, 'User', 'Movie', 'Rating')

In [24]:
submission.to_csv("suprise_svd.csv")

Unnamed: 0,Id,Prediction
0,r37_c1,3
1,r73_c1,2
2,r156_c1,3
3,r160_c1,3
4,r248_c1,3
5,r256_c1,3
6,r284_c1,2
7,r400_c1,3
8,r416_c1,3
9,r456_c1,3
