In [1]:
import pandas as pd
import numpy as np

In [2]:
train_dataset = "./datas/data_train.csv"
test_dataset = "./datas/sampleSubmission.csv"

train_df_path = "./datas/train.csv"
test_df_path = "./datas/test.csv"

In [3]:
def load_dataset(path, name):
    """Load dataset as a (User, Movie, Rating) pandas dataframe"""
    df = pd.read_csv(path)
    parsed_df = pd.DataFrame()
    # Get all pairs of (r44_c1) -> (44, 1) (user, movie)
    user_movie_indices = df.Id.apply(lambda x: x.split('_'))
    parsed_df['User'] =  [int(i[0][1:]) for i in user_movie_indices]
    parsed_df['Movie'] = [int(i[1][1:]) for i in user_movie_indices]
    parsed_df['Rating'] = df['Prediction']
    num_items = parsed_df.Movie.nunique()
    num_users = parsed_df.User.nunique()
    
    # save to csv for later use
    parsed_df.to_csv(name, index=False, header=False)
    
    print("USERS: {} ITEMS: {}".format(num_users, num_items))
    return parsed_df

In [4]:
train_df = load_dataset(train_dataset, train_df_path)
test_df = load_dataset(test_dataset, test_df_path)

USERS: 10000 ITEMS: 1000
USERS: 10000 ITEMS: 1000


In [9]:
def split_dataset(df, p_test=0.2, min_num_ratings = 0):
    """ split dataframe into train and test set """
    # select user and item based on the condition.
    user_counts = df.User.value_counts()
    valid_users = user_counts[user_counts > min_num_ratings].index.values
    movie_counts = df.Movie.value_counts()
    valid_movies = movie_counts[movie_counts > min_num_ratings].index.values

    valid_ratings = df[df.User.isin(valid_users) & df.Movie.isin(valid_movies)].reset_index(drop=True)

    # Split data
    size = df.shape[0]
    indexes = list(range(size))
    np.random.shuffle(indexes)
    
    test_ind = indexes[:int(size*p_test)]
    train_ind = indexes[int(size*p_test):]
    
    test = valid_ratings.loc[test_ind]
    train = valid_ratings.loc[train_ind]

    print("Train: {}, Test: {}".format(test.shape, train.shape))
    
    # Test that the sum of nb rows of splitted dataframes = nb rows of original
    if (train.shape[0] + test.shape[0] == df.shape[0]):
        return train.reset_index(drop=True), test.reset_index(drop=True)
    else:
        raise Exception("[Error] Train: {} + Test {} != Original: {} !!".format(train_tr.shape[0], test_tr.shape[0], df.shape[0]))


In [10]:
train_tr, test_tr = split_dataset(train_df)

Train: (235390, 3), Test: (941562, 3)


In [13]:
def compute_rmse(pred, truth):
    """ compute RMSE for pandas dataframes """
    truth_sorted = truth.sort_values(['User', 'Movie']).reset_index(drop=True)
    pred_sorted = pred.sort_values(['User', 'Movie']).reset_index(drop=True)

    truth_sorted['square_error'] = np.square(truth_sorted['Rating'] - prediction_sorted['Rating'])

    mse = truth_sorted['square_error'].mean()
    rmse = np.sqrt(mse)

    return rmse

In [17]:
train_tr.User.unique()

array([5637, 6994, 6523, ...,  730, 1482, 5217], dtype=int64)

### Tensorflow 

In [24]:
import tensorflow as tf
from sklearn import preprocessing
from sklearn.metrics import precision_score

In [8]:
def standardize(df):
    # Normalize in [0, 1]
    r = df['Rating'].values.astype(float)
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(r.reshape(-1,1))
    df_normalized = pd.DataFrame(x_scaled)
    df['Rating'] = df_normalized
    return df

train_tr = standardize(train_tr)
test_tr = standardize(test_tr)

NameError: name 'preprocessing' is not defined

In [146]:
def df_to_matrix(df):
    # Convert DataFrame in user-item matrix
    matrix = df.pivot(index='User', columns='Movie', values='Rating')
    matrix.fillna(0, inplace=True)
    # Convert to numpy matrix
    users = matrix.index.tolist()
    items = matrix.columns.tolist()

    matrix = matrix.as_matrix()
    return users, items, matrix

users, items, matrix_tr = df_to_matrix(train_tr)
num_users = len(users)
num_items = len(items)

print(num_users, num_items)

10000 1000


### DIDN'T WORK!

In [110]:
# from : https://vitobellini.github.io/posts/2018/01/03/how-to-build-a-recommender-system-in-tensorflow.html

# Network Parameters

num_input = num_items
num_hidden_1 = 10
num_hidden_2 = 5

X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

In [111]:
# Building the encoder

def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    # Encoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2


# Building the decoder

def decoder(x):
    # Decoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2


# Construct model

encoder_op = encoder(X)
decoder_op = decoder(encoder_op)


# Prediction

y_pred = decoder_op


# Targets are the input data.

y_true = X

In [112]:
# Define loss and optimizer, minimize the squared error

loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)

predictions = pd.DataFrame()

# Define evaluation metrics

eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

In [113]:
# Initialize the variables (i.e. assign their default value)

init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

In [114]:
matrix = matrix_tr.copy()
matrix

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 1. , 0. , 0.5],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0.5]])

In [115]:
with tf.Session() as session:
    epochs = 100
    batch_size = 250

    session.run(init)
    session.run(local_init)
    
    num_batches = int(matrix.shape[0] / batch_size)
    matrix = np.array_split(matrix, num_batches)

    for i in range(epochs):

        avg_cost = 0

        for batch in matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("Epoch: {} Loss: {}".format(i + 1, avg_cost))

    print("Predictions...")

    matrix = np.concatenate(matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: matrix})

    predictions = predictions.append(pd.DataFrame(preds))

    predictions = predictions.stack().reset_index(name='Rating')
    predictions.columns = ['User', 'Movie', 'Rating']
    predictions['User'] = predictions['User'].map(lambda value: users[value])
    predictions['Movie'] = predictions['Movie'].map(lambda value: items[value])
    
    print("Filtering out items in training set")

    keys = ['User', 'Movie']
    i1 = predictions.set_index(keys).index
    df = train_tr.copy()
    i2 = df.set_index(keys).index

    #recs = predictions[~i1.isin(i2)]
    recs = predictions[i1.isin(i2)]
    recs = recs.sort_values(['User', 'Rating'], ascending=[True, False])
    recs = recs.groupby('User').head(10)
   # recs.to_csv('recs.tsv', index=False, header=False)

Epoch: 1 Loss: 0.33865233287215235
Epoch: 2 Loss: 0.33617846965789794
Epoch: 3 Loss: 0.31951463147997855
Epoch: 4 Loss: 0.2501043420284986
Epoch: 5 Loss: 0.1083067674189806
Epoch: 6 Loss: 0.051446060091257094
Epoch: 7 Loss: 0.04880251819267869
Epoch: 8 Loss: 0.04830874148756266
Epoch: 9 Loss: 0.04717384213581681
Epoch: 10 Loss: 0.04601081358268857
Epoch: 11 Loss: 0.045704937912523745
Epoch: 12 Loss: 0.0455327108502388
Epoch: 13 Loss: 0.045409477315843105
Epoch: 14 Loss: 0.045302273239940405
Epoch: 15 Loss: 0.04522054763510823
Epoch: 16 Loss: 0.045146352518349885
Epoch: 17 Loss: 0.04511649133637548
Epoch: 18 Loss: 0.0450407731346786
Epoch: 19 Loss: 0.04499190943315625
Epoch: 20 Loss: 0.044984130375087264
Epoch: 21 Loss: 0.04493501167744398
Epoch: 22 Loss: 0.04489314304664731
Epoch: 23 Loss: 0.044886811077594756
Epoch: 24 Loss: 0.04482487924396992
Epoch: 25 Loss: 0.04484245739877224
Epoch: 26 Loss: 0.04480467587709427
Epoch: 27 Loss: 0.044766230136156084
Epoch: 28 Loss: 0.044736027158796

In [116]:
r = recs.sort_values(by=['User', 'Movie']).reset_index(drop=True)

In [117]:
t = train_tr.sort_values(by=['User', 'Movie']).reset_index(drop=True)

In [137]:
p = predictions.set_index(keys)

In [144]:
results = t.merge(r, how='inner', left_index=True, right_index=True)

### Surprise

from: https://github.com/NicolasHug/Surprise

In [7]:
from surprise import *
from surprise.model_selection import KFold, PredefinedKFold
from surprise import accuracy
from itertools import islice
from helpers import *

In [97]:
def surprise_cv_algo(data, algo, k_fold=5, verbose=True):
    # Split into folds
    kf = KFold(n_splits=k_fold)
    rmse_ = 0
        
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        model = algo.fit(trainset)
        predictions = algo.test(testset)

        # Compute and print RMSE
        rmse_ += accuracy.rmse(predictions, verbose=verbose)
    
    rmse_mean = rmse_/k_fold
    return rmse_mean
    
    
def surprise_knn_best_params(train_path="datas/train.csv", test_path="datas/test.csv", verbose=True, t = Timer()):
    # reader with rating scale
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))
    # load data from df
    data = Dataset.load_from_file(train_path, reader)
    
    #knn parameters
    ks = np.linspace(40, 200, 9, dtype = np.int64)
    names = ['pearson_baseline', 'pearson', 'msd', 'cosine']
    user_baseds = [True, False]
    params = dict()
    rmses = dict()
    
    for k in ks:
        params['k'] = k
        for name in names:
            params['name'] = name
            for user_based in user_baseds:
                params['user_based'] = user_based
                algo = KNNBaseline(k=k, sim_options={'name': name, 'user_based': user_based})
                rmse = surprise_cv_algo(data, algo)
                print("------Time:{}, rmse: {}, k: {}, name: {}, user_based: {}------\n\n".format(t.now(), rmse, k, name, user_based))
                rmses[rmse] = params
    
    # Find the model with least RMSE
    lowest_rmse = min(rmses.keys())
    best_params = rmses[lowest_rmse]
    
    print("Best knn rmse: {}. Params: k: {}, name: {}, user_based: {}".format(lowest_rmse, best_params['k'], best_params['name'], best_params['user_based']))

    
def surprise_svd_best_params(train_path="datas/train.csv", test_path="datas/test.csv", verbose=True, t = Timer()):
    # reader with rating scale
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))
    # load data from df
    data = Dataset.load_from_file(train_path, reader)
    
    #svd parameters
    n_epochss = np.linspace(200, 40, 9, dtype=np.int32)
    reg_alls = np.logspace(-2, -5, 4)
    lr_bus = np.logspace(-10, -2, 9)
    lr_qis = np.logspace(-10, -2, 9)
    params = dict()
    rmses = dict()
    
    t.start()
    
    for n_epoch in n_epochss:
        params['n_epoch'] = k
        for reg_all in reg_alls:
            params['reg_all'] = reg_all
            for lr_bu in lr_bus:
                params['lr_bu'] = lr_bu
                for lr_qi in lr_qis:
                    params['lr_qi'] = lr_qi
                    algo = SVD(n_epoch = n_epoch, reg_all = reg_all, lr_bu = lr_bu, lr_qi = lr_qi)
                    rmse = surprise_cv_algo(data, algo)
                    print("------Time:{}, rmse: {}, k: {}, name: {}, user_based: {}------\n\n".format(t.now(), rmse, k, name, user_based))
                    rmses[rmse] = params
    
    # Find the model with least RMSE
    lowest_rmse = min(rmses.keys())
    best_params = rmses[lowest_rmse]
    
    print("Best svd rmse: {}. Params: k: {}, name: {}, user_based: {}".format(lowest_rmse, best_params['k'], best_params['name'], best_params['user_based']))


In [8]:
def surprise_algo(algo, train_path="datas/train.csv", test_path="datas/test.csv", verbose=True):
    # reader with rating scale
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))
    
    # Specify the training and test dataset
    folds_files = [(train_path, test_path)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    
    print("Start prediction...")
    for trainset, testset in pkf.split(data):
        # train and predict algorithm.
        model = algo.fit(trainset)
        predictions = algo.test(testset)
    
    pred = pd.read_csv(test_path, names = ["User", "Movie", "Rating"])
    
    print("Postprocessing predictions...")
    for index, row in pred.iterrows():
        rating = round(predictions[index].est)
        if rating > 5:
            rating = 5
        elif rating < 1:
            rating = 1
        row.Rating = rating
    
    return pred


In [5]:
class Timer:
    import time
    import datetime 
    
    def __init__(self):
        self.t = 0
        
    def start(self):
        self.t = time.time()
        
    def stop(self):
        print("Time taken: {}".format(datetime.timedelta(seconds=time.time() - self.t).__str__()))
        self.t = 0

In [6]:
t = Timer()

In [179]:
# svd
t.start()
algo = SVD(n_epochs=30, lr_all=0.001, reg_all=0.001)
predictions = surprise_algo(algo)
t.stop()
t.start()
submission = create_csv_submission(predictions, 'User', 'Movie', 'Rating')
submission.to_csv("suprise_svd.csv")
t.stop() # 1.05

Start prediction...
Postprocessing predictions...
Time taken: 0:03:33.136205
Creating submission file...
Time taken: 0:00:44.869094


In [None]:
# ------rmse: 1.0665431544988566, n_factor:50, n_epoch: 200, reg_all: 0.01, lr_bu: 1e-09, lr_qi: 1e-05------
# svd
t.start()
algo = SVD(n_factors=50, n_epochs=200, lr_bu=(1*10**-9) , lr_qi= (1*10**-5), reg_all=0.01)
predictions_2 = surprise_algo(algo)
t.stop()
t.start()
submission_2 = create_csv_submission(predictions)
submission_2.to_csv("surprise_svd_better_param.csv")
t.stop() #

Start prediction...
Postprocessing predictions...


In [182]:
#svd++
t.start()
algo = SVDpp(n_epochs=30, lr_all=0.001, reg_all=0.001)
predictions = surprise_algo(algo)
t.stop()
t.start()
submission = create_csv_submission(predictions, 'User', 'Movie', 'Rating')
submission.to_csv("suprise_svd++.csv")
t.stop() #1.025 slightly better than knn

Start prediction...
Postprocessing predictions...
Time taken: 1:36:23.325838
Creating submission file...
Time taken: 0:00:44.045763


In [183]:
# knn
t.start()
algo = KNNBaseline(k=60, sim_options={'name': 'pearson_baseline', 'user_based': False})
predictions = surprise_algo(algo)
t.stop()
submission = create_csv_submission(predictions, 'User', 'Movie', 'Rating')
submission.to_csv("suprise_knnBaseline.csv") #1.025

Start prediction...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Postprocessing predictions...
Time taken: 0:10:54.984091
Creating submission file...


In [10]:
#knn best params - Best knn rmse: 0.9902320847559991. Params: k: 200, name: cosine, user_based: False
t.start()
algo = KNNBaseline(k=200, sim_options={'name': 'cosine', 'user_based': False})
predictions = surprise_algo(algo)
t.stop()
submission = create_csv_submission(predictions)
submission.to_csv("suprise_knnBaseline_best_params.csv") #1.04 WTF -- got the wrong one...

Start prediction...
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Postprocessing predictions...
Time taken: 0:07:57.490047
Creating submission file...


In [12]:
submission.head()

Unnamed: 0,Id,Prediction
0,r37_c1,3
1,r73_c1,3
2,r156_c1,4
3,r160_c1,3
4,r248_c1,3


In [14]:
# ------Time:17872 days, 12:09:10.825712, rmse: 0.9902320847559991, k: 100, name: pearson_baseline, user_based: False------
t.start()
algo = KNNBaseline(k=100, sim_options={'name': 'pearson_baseline', 'user_based': False})
predictions = surprise_algo(algo)
t.stop()
submission = create_csv_submission(predictions)
submission.to_csv("suprise_knnBaseline_best_params.csv") # still 1.025 ... :(

Start prediction...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Postprocessing predictions...
Time taken: 0:06:55.763423
Creating submission file...


In [17]:
submission.head()

Unnamed: 0,Id,Prediction
0,r37_c1,3
1,r73_c1,3
2,r156_c1,4
3,r160_c1,3
4,r248_c1,3


### PyFM -  1.028

In [5]:
from pyfm import pylibfm
from sklearn.feature_extraction import DictVectorizer

In [29]:
def toPyFMData(df):
    data = []
    users = set(df.User.unique())
    movies = set(df.Movie.unique())
    ratings = df.Rating.astype(float).tolist()
    for row in df.iterrows():
        data.append({"user_id": str(row[1].User), "movie_id": str(row[1].Movie)})
    return (data, np.array(ratings), users, movies)

(train_data, y_train, train_users, train_items) = toPyFMData(train_tr)
(test_data, y_test, test_users, test_items) = toPyFMData(test_tr)

In [30]:
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

In [31]:
# Build and train a Factorization Machine
fm = pylibfm.FM(num_factors=10, num_iter=100, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")

fm.fit(X_train,y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.54774
-- Epoch 2
Training MSE: 0.51386
-- Epoch 3
Training MSE: 0.50497
-- Epoch 4
Training MSE: 0.50006
-- Epoch 5
Training MSE: 0.49690
-- Epoch 6
Training MSE: 0.49467
-- Epoch 7
Training MSE: 0.49310
-- Epoch 8
Training MSE: 0.49181
-- Epoch 9
Training MSE: 0.49084
-- Epoch 10
Training MSE: 0.49001
-- Epoch 11
Training MSE: 0.48929
-- Epoch 12
Training MSE: 0.48865
-- Epoch 13
Training MSE: 0.48797
-- Epoch 14
Training MSE: 0.48735
-- Epoch 15
Training MSE: 0.48667
-- Epoch 16
Training MSE: 0.48603
-- Epoch 17
Training MSE: 0.48531
-- Epoch 18
Training MSE: 0.48459
-- Epoch 19
Training MSE: 0.48387
-- Epoch 20
Training MSE: 0.48313
-- Epoch 21
Training MSE: 0.48236
-- Epoch 22
Training MSE: 0.48157
-- Epoch 23
Training MSE: 0.48083
-- Epoch 24
Training MSE: 0.48006
-- Epoch 25
Training MSE: 0.47930
-- Epoch 26
Training MSE: 0.47859
-- Epoch 27
Training MSE: 0.47789
-- Epoch 28
Tra

In [32]:
# Evaluate
preds = fm.predict(X_test)
from sklearn.metrics import mean_squared_error
print("FM MSE: %.4f" % mean_squared_error(y_test,preds))

FM MSE: 0.9802


In [6]:
from helpers import create_csv_submission, toPyFMData

In [7]:
(train_data, y_train, train_users, train_items) = toPyFMData(train_df)
(test_data, y_test, test_users, test_items) = toPyFMData(test_df)


In [8]:
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

In [9]:
# Build and train a Factorization Machine
fm = pylibfm.FM(num_factors=10, num_iter=100, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")

fm.fit(X_train,y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.54224
-- Epoch 2
Training MSE: 0.51051
-- Epoch 3
Training MSE: 0.50252
-- Epoch 4
Training MSE: 0.49830
-- Epoch 5
Training MSE: 0.49567
-- Epoch 6
Training MSE: 0.49396
-- Epoch 7
Training MSE: 0.49266
-- Epoch 8
Training MSE: 0.49170
-- Epoch 9
Training MSE: 0.49086
-- Epoch 10
Training MSE: 0.49015
-- Epoch 11
Training MSE: 0.48952
-- Epoch 12
Training MSE: 0.48890
-- Epoch 13
Training MSE: 0.48837
-- Epoch 14
Training MSE: 0.48779
-- Epoch 15
Training MSE: 0.48724
-- Epoch 16
Training MSE: 0.48668
-- Epoch 17
Training MSE: 0.48607
-- Epoch 18
Training MSE: 0.48552
-- Epoch 19
Training MSE: 0.48493
-- Epoch 20
Training MSE: 0.48434
-- Epoch 21
Training MSE: 0.48374
-- Epoch 22
Training MSE: 0.48314
-- Epoch 23
Training MSE: 0.48253
-- Epoch 24
Training MSE: 0.48189
-- Epoch 25
Training MSE: 0.48128
-- Epoch 26
Training MSE: 0.48065
-- Epoch 27
Training MSE: 0.47999
-- Epoch 28
Tra

In [10]:
preds = fm.predict(X_test)

In [12]:
predictions = test_df.copy()
predictions.Rating = preds

In [21]:
predictions.head()

Unnamed: 0,User,Movie,Rating
0,37,1,3.292142
1,73,1,3.15712
2,156,1,3.888872
3,160,1,3.408896
4,248,1,3.667776


In [30]:
def create_csv_submission(predictions):
    """Create submission file """
    print("Creating submission file...")
    predictions['Id'] = predictions.apply(lambda x: 'r{}_c{}'.format(int(x.User), int(x.Movie)), axis=1)
    predictions['Prediction'] = predictions.Rating.apply(lambda x: round(x))
    return predictions[['Id', 'Prediction']]

In [31]:
submission = create_csv_submission(predictions)

Creating submission file...


In [33]:
submission.to_csv("pyFM.csv") #1.028

In [32]:
submission.head()

Unnamed: 0,Id,Prediction
0,r37_c1,3
1,r73_c1,3
2,r156_c1,4
3,r160_c1,3
4,r248_c1,4


### Blending