In [1]:
import pandas as pd
import numpy as np

In [2]:
# CSV inputs, given as paths relative to the notebook's working directory.
# Only train_dataset is read in the cells visible here.
train_dataset = "./datas/data_train.csv"
pred_dataset = "./datas/sampleSubmission.csv"

In [3]:
def load_dataset(path):
    """Read a ratings CSV and return it as a (User, Movie, Rating) dataframe.

    The file must provide an ``Id`` column whose entries look like ``r44_c1``
    (user 44, movie 1) and a ``Prediction`` column holding the rating.
    The number of distinct users and movies is printed as a side effect.
    """
    raw = pd.read_csv(path)

    users, movies = [], []
    for identifier in raw.Id:
        # "r44_c1" -> ["r44", "c1"]; drop the one-letter prefix of each part.
        parts = identifier.split('_')
        users.append(int(parts[0][1:]))
        movies.append(int(parts[1][1:]))

    parsed_df = pd.DataFrame()
    parsed_df['User'] = users
    parsed_df['Movie'] = movies
    parsed_df['Rating'] = raw['Prediction']

    print("USERS: {} ITEMS: {}".format(parsed_df.User.nunique(), parsed_df.Movie.nunique()))
    return parsed_df

In [4]:
# Parse the training CSV into a (User, Movie, Rating) frame; load_dataset also prints the user/item counts.
train_df = load_dataset(train_dataset)

USERS: 10000 ITEMS: 1000


In [5]:
def split_dataset(parsed_df, p_test=0.1, min_num_ratings=0, seed=None):
    """Randomly split a (User, Movie, Rating) frame into train and test parts.

    Parameters
    ----------
    parsed_df : pandas.DataFrame
        Ratings with at least the columns ``User`` and ``Movie``.
    p_test : float
        Fraction of the retained rows placed in the test set (0 <= p_test <= 1).
    min_num_ratings : int
        Rows are kept only when both their user and their movie have strictly
        more than this many ratings in ``parsed_df``.
    seed : int or None
        Seed for a private random generator, making the split reproducible.
        With ``None`` (the default) the global NumPy random state is used,
        exactly as before.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Disjoint row subsets, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``p_test`` is outside [0, 1].

    The rating-count statistics and the resulting shapes are printed.
    """
    if not 0 <= p_test <= 1:
        raise ValueError("p_test must be within [0, 1], got {}".format(p_test))

    movies_per_user = parsed_df.User.value_counts()
    users_per_movie = parsed_df.Movie.value_counts()

    valid_users = movies_per_user[movies_per_user > min_num_ratings].index.values
    valid_movies = users_per_movie[users_per_movie > min_num_ratings].index.values
    keep = parsed_df.User.isin(valid_users) & parsed_df.Movie.isin(valid_movies)
    valid_parsed_df = parsed_df[keep].reset_index(drop=True)

    # The statistics describe the unfiltered input, not the retained rows.
    print("movies per user: min[{a}], max[{b}], users per movie: min[{c}], max[{d}].".
          format(a=movies_per_user.min(), b=movies_per_user.max(), c=users_per_movie.min(), d=users_per_movie.max()))

    size = valid_parsed_df.shape[0]
    indexes = list(range(size))
    # A dedicated RandomState keeps a seeded split independent of (and without
    # disturbing) the global generator; without a seed fall back to np.random.
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(indexes)

    n_test = int(size * p_test)
    test_ind = indexes[:n_test]
    train_ind = indexes[n_test:]

    # The index was reset above, so labels and positions coincide.
    test = valid_parsed_df.loc[test_ind].reset_index(drop=True)
    train = valid_parsed_df.loc[train_ind].reset_index(drop=True)
    print("The shape of test_dataset: {test}, train_dataset: {train}".format(test=test.shape, train=train.shape))

    return train, test

# Hold out the default fraction (p_test=0.1) of the ratings for evaluating the baselines below.
# The shuffle is not seeded here, so the split and every RMSE that follows vary between runs.
train, test = split_dataset(train_df)
# print(train.iloc[0,2])
# type(train.iloc[0,2])

movies per user: min[3], max[522], users per movie: min[8], max[4590].
The shape of test_dataset: (117695, 3), train_dataset: (1059257, 3)


In [6]:
def compute_rmse(pred, real):
    """Root-mean-square error between predicted and true ratings.

    Both frames are ordered by (Movie, User) and re-indexed from zero so
    that the ``Rating`` columns line up row by row before being compared.
    The frames are therefore expected to describe the same (User, Movie) pairs.
    """
    order = ['Movie', 'User']
    predicted = pred.sort_values(order).reset_index(drop=True).Rating
    actual = real.sort_values(order).reset_index(drop=True).Rating

    squared_error = np.square(predicted - actual)
    return np.sqrt(squared_error.mean())

In [7]:
def baseline_global_mean(train, test):
    """Predict the overall mean training rating for every test row.

    Returns the prediction frame (``test`` with ``Rating`` replaced by the
    training mean) together with its RMSE against ``test``.
    """
    pred_test = test.assign(Rating=train.Rating.mean())
    return pred_test, compute_rmse(pred_test, test)
# Score the global-mean baseline on the held-out ratings and show its RMSE.
pred_test, rmse = baseline_global_mean(train, test)
print(rmse)
# print(pred_test)
# print(pred_test.iloc[0,2])
# type(pred_test.iloc[0,2])

1.1145304342508455


In [8]:
def baseline_global_median(train, test):
    """Predict the overall median training rating for every test row.

    Returns the prediction frame (``test`` with ``Rating`` replaced by the
    training median) together with its RMSE against ``test``.
    """
    pred_test = test.assign(Rating=train.Rating.median())
    return pred_test, compute_rmse(pred_test, test)
# Score the global-median baseline on the held-out ratings and show its RMSE.
pred_test, rmse = baseline_global_median(train, test)
print(rmse)
# print(pred_test)

1.1237730563387889


In [9]:
def baseline_user_mean(train, test):
    """Predict each test rating as the mean training rating of its user.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the columns ``User``, ``Movie`` and ``Rating``.

    Returns
    -------
    (pred_test, rmse)
        ``pred_test`` is a copy of ``test`` whose ``Rating`` holds the
        predictions; ``rmse`` is its error against ``test``.

    Users that never occur in ``train`` receive the global training mean
    instead of raising ``KeyError``.
    """
    mean_per_user = train.groupby('User')['Rating'].mean()

    pred_test = test.copy()
    # A vectorized lookup keeps the original row index and avoids mutating
    # groups inside groupby.apply, whose result index layout depends on the
    # pandas version.
    pred_test['Rating'] = test['User'].map(mean_per_user).fillna(train['Rating'].mean())

    rmse = compute_rmse(pred_test, test)

    return pred_test, rmse

# Score the per-user-mean baseline on the held-out ratings and show its RMSE.
pred_test, rmse = baseline_user_mean(train, test)
print(rmse)
# print(pred_test)

1.0903067308728722


In [10]:
def baseline_user_median(train, test):
    """Predict each test rating as the median training rating of its user.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the columns ``User``, ``Movie`` and ``Rating``.

    Returns
    -------
    (pred_test, rmse)
        ``pred_test`` is a copy of ``test`` whose ``Rating`` holds the
        predictions; ``rmse`` is its error against ``test``.

    Users that never occur in ``train`` receive the global training median
    instead of raising ``KeyError``.
    """
    median_per_user = train.groupby('User')['Rating'].median()

    pred_test = test.copy()
    # A vectorized lookup keeps the original row index and avoids mutating
    # groups inside groupby.apply, whose result index layout depends on the
    # pandas version.
    pred_test['Rating'] = test['User'].map(median_per_user).fillna(train['Rating'].median())
    rmse = compute_rmse(pred_test, test)

    return pred_test, rmse

# Score the per-user-median baseline on the held-out ratings and show its RMSE.
pred_test, rmse = baseline_user_median(train, test)
print(rmse)
# print(pred_test)

1.146593059753457


In [11]:
def baseline_movie_mean(train, test):
    """Predict each test rating as the mean training rating of its movie.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the columns ``User``, ``Movie`` and ``Rating``.

    Returns
    -------
    (pred_test, rmse)
        ``pred_test`` is a copy of ``test`` whose ``Rating`` holds the
        predictions; ``rmse`` is its error against ``test``.

    Movies that never occur in ``train`` receive the global training mean
    instead of raising ``KeyError``.
    """
    mean_per_movie = train.groupby('Movie')['Rating'].mean()

    pred_test = test.copy()
    # A vectorized lookup keeps the original row index and avoids mutating
    # groups inside groupby.apply, whose result index layout depends on the
    # pandas version.
    pred_test['Rating'] = test['Movie'].map(mean_per_movie).fillna(train['Rating'].mean())
    rmse = compute_rmse(pred_test, test)

    return pred_test, rmse

# Score the per-movie-mean baseline on the held-out ratings and show its RMSE.
pred_test, rmse = baseline_movie_mean(train, test)
print(rmse)
# print(pred_test)

1.0282713452174768


In [12]:
def baseline_movie_median(train, test):
    """Predict each test rating as the median training rating of its movie.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the columns ``User``, ``Movie`` and ``Rating``.

    Returns
    -------
    (pred_test, rmse)
        ``pred_test`` is a copy of ``test`` whose ``Rating`` holds the
        predictions; ``rmse`` is its error against ``test``.

    Movies that never occur in ``train`` receive the global training median
    instead of raising ``KeyError``.
    """
    median_per_movie = train.groupby('Movie')['Rating'].median()

    pred_test = test.copy()
    # A vectorized lookup keeps the original row index and avoids mutating
    # groups inside groupby.apply, whose result index layout depends on the
    # pandas version.
    pred_test['Rating'] = test['Movie'].map(median_per_movie).fillna(train['Rating'].median())
    rmse = compute_rmse(pred_test, test)

    return pred_test, rmse

# Score the per-movie-median baseline on the held-out ratings and show its RMSE.
pred_test, rmse = baseline_movie_median(train, test)
print(rmse)
# print(pred_test)

1.0968752082150792


In [13]:
def user_standardize(df):
    """Center and scale every rating by its user's statistics.

    Each rating becomes ``(rating - user_mean) / user_variance``, where the
    variance is the sample variance (pandas default, ddof=1). Note that the
    divisor is the variance, not the standard deviation;
    ``user_standardize_recover`` multiplies by the same quantity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``User`` and ``Rating``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Rating`` replaced by the scaled values. Users
        with a single rating, or with identical ratings only, yield NaN
        because their variance is undefined or zero.
    """
    ratings_by_user = df.groupby('User')['Rating']

    stand_df = df.copy()
    # transform() broadcasts the per-user statistic back onto every row,
    # replacing one Python-level lookup per rating.
    stand_df['Rating'] = (
        (df['Rating'] - ratings_by_user.transform('mean'))
        / ratings_by_user.transform('var')
    )
    return stand_df

# stand_df = user_standardize(train)
# print(stand_df)

def user_standardize_recover(df, stand_pred_test):
    """Map standardized predictions back onto the original rating scale.

    Inverts ``user_standardize``: every prediction is multiplied by the
    sample variance of its user's ratings in ``df`` and shifted by that
    user's mean rating.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference ratings (columns ``User`` and ``Rating``) providing the
        per-user mean and variance.
    stand_pred_test : pandas.DataFrame
        Standardized predictions with the columns ``User`` and ``Rating``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``stand_pred_test`` with ``Rating`` on the original scale.

    Fallbacks: a user whose variance is undefined (a single rating in ``df``)
    is given variance 0, so the prediction collapses to the user's mean.
    A user absent from ``df`` is additionally given the global mean of ``df``
    instead of raising ``KeyError``.
    """
    ratings_by_user = df.groupby('User')['Rating']
    mean_per_user = ratings_by_user.mean()
    var_per_user = ratings_by_user.var()

    users = stand_pred_test['User']
    user_mean = users.map(mean_per_user).fillna(df['Rating'].mean())
    user_var = users.map(var_per_user).fillna(0.0)

    pred_test = stand_pred_test.copy()
    pred_test['Rating'] = stand_pred_test['Rating'] * user_var + user_mean
    return pred_test

# df = user_standardize_recover(train, stand_df)
# print(df)

In [14]:
def user_habit(df):
    """Return each user's rating offset: their mean rating minus the global mean.

    The result is a Series indexed by ``User``; positive values mark users
    who rate above the overall average of ``df``.
    """
    per_user_mean = df.groupby('User').mean().Rating
    return per_user_mean - df.Rating.mean()

# habit = user_habit(train)
# print(habit)

def user_habit_standardize(df):
    """Remove every user's rating offset from their ratings.

    The offset is the one computed by ``user_habit(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``User`` and ``Rating``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``Rating`` has the user offset subtracted.
    """
    habit = user_habit(df)
    stand_df = df.copy()
    # Vectorized lookup of each row's offset instead of a per-row apply.
    stand_df['Rating'] = df['Rating'] - df['User'].map(habit)
    return stand_df

# stand_df = user_habit_standardize(train)
# print(stand_df)

def user_habit_standardize_recover(df, stand_pred_test):
    """Add the per-user rating offsets back onto offset-free predictions.

    Inverts ``user_habit_standardize`` using the offsets from ``user_habit(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference ratings used to compute the user offsets.
    stand_pred_test : pandas.DataFrame
        Predictions with the columns ``User`` and ``Rating``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``stand_pred_test`` with each user's offset added to ``Rating``.
        Users absent from ``df`` get a zero offset instead of raising ``KeyError``.
    """
    habit = user_habit(df)
    offsets = stand_pred_test['User'].map(habit).fillna(0.0)

    pred_test = stand_pred_test.copy()
    pred_test['Rating'] = stand_pred_test['Rating'] + offsets
    return pred_test

# df = user_habit_standardize_recover(train, stand_df)
# print(df)

In [15]:
def movie_mean_user_standardize(train, test):
    """Movie-mean baseline computed on per-user standardized ratings.

    The training ratings are standardized per user, the movie-mean baseline
    is fitted on them, and its predictions are mapped back to the original
    scale before the RMSE against ``test`` is computed.
    """
    # The RMSE returned by the baseline compares standardized predictions
    # with raw test ratings, so it is discarded.
    stand_pred_test, _ = baseline_movie_mean(user_standardize(train), test)

    pred_test = user_standardize_recover(train, stand_pred_test)
    return pred_test, compute_rmse(pred_test, test)

# Score the movie-mean baseline on per-user standardized ratings and show its RMSE.
pred_test, rmse = movie_mean_user_standardize(train, test)
print(rmse)
# print(pred_test)

1.0040541298054675


In [16]:
def movie_median_user_standardize(train, test):
    """Movie-median baseline computed on per-user standardized ratings.

    The training ratings are standardized per user, the movie-median baseline
    is fitted on them, and its predictions are mapped back to the original
    scale before the RMSE against ``test`` is computed.
    """
    # The RMSE returned by the baseline compares standardized predictions
    # with raw test ratings, so it is discarded.
    stand_pred_test, _ = baseline_movie_median(user_standardize(train), test)

    pred_test = user_standardize_recover(train, stand_pred_test)
    return pred_test, compute_rmse(pred_test, test)

# Score the movie-median baseline on per-user standardized ratings and show its RMSE.
pred_test, rmse = movie_median_user_standardize(train, test)
print(rmse)
# print(pred_test)

1.0251927927569746


In [17]:
def movie_mean_user_habit_standardize(train, test):
    """Movie-mean baseline computed on ratings with user offsets removed.

    The per-user offset is subtracted from the training ratings, the
    movie-mean baseline is fitted on the result, and the offsets are added
    back to its predictions before the RMSE against ``test`` is computed.

    Returns
    -------
    (pred_test, rmse)
        Predictions on the original rating scale and their RMSE.
    """
    # Remove each user's offset from the training ratings.
    stand_train = user_habit_standardize(train)

    # Predict offset-free ratings; the RMSE reported by the baseline mixes
    # scales (offset-free predictions vs. raw test ratings) and is discarded.
    stand_pred_test, _ = baseline_movie_mean(stand_train, test)

    # Bring the predictions back to the original scale.
    pred_test = user_habit_standardize_recover(train, stand_pred_test)

    rmse = compute_rmse(pred_test, test)
    return pred_test, rmse

# Score the movie-mean baseline on ratings with user offsets removed and show its RMSE.
pred_test, rmse = movie_mean_user_habit_standardize(train, test)
print(rmse)
# print(pred_test)

1.0033023344448717


In [18]:
def movie_median_user_habit_standardize(train, test):
    """Movie-median baseline computed on ratings with user offsets removed.

    The per-user offset is subtracted from the training ratings, the
    movie-median baseline is fitted on the result, and the offsets are added
    back to its predictions before the RMSE against ``test`` is computed.
    """
    # The RMSE returned by the baseline compares offset-free predictions
    # with raw test ratings, so it is discarded.
    stand_pred_test, _ = baseline_movie_median(user_habit_standardize(train), test)

    pred_test = user_habit_standardize_recover(train, stand_pred_test)
    return pred_test, compute_rmse(pred_test, test)

# Score the movie-median baseline on ratings with user offsets removed and show its RMSE.
pred_test, rmse = movie_median_user_habit_standardize(train, test)
print(rmse)
# print(pred_test)

1.021270509749067


In [19]:
def movie_mean_user_habit(train, test):
    """Predict ``movie mean + user offset`` for every test row.

    The movie mean is taken over the raw training ratings and the user
    offset comes from ``user_habit(train)``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the columns ``User``, ``Movie`` and ``Rating``.

    Returns
    -------
    (pred_test, rmse)
        ``pred_test`` is a copy of ``test`` whose ``Rating`` holds the
        predictions; ``rmse`` is its error against ``test``.

    Movies absent from ``train`` fall back to the global training mean and
    users absent from ``train`` to a zero offset, instead of raising ``KeyError``.
    """
    habit = user_habit(train)
    mean_per_movie = train.groupby('Movie')['Rating'].mean()

    # Column-wise lookups keep User/Movie as integers; the former row-wise
    # apply up-cast each row to float and had to cast the ids back.
    movie_part = test['Movie'].map(mean_per_movie).fillna(train['Rating'].mean())
    user_part = test['User'].map(habit).fillna(0.0)

    pred_test = test.copy()
    pred_test['Rating'] = movie_part + user_part

    rmse = compute_rmse(pred_test, test)
    return pred_test, rmse

# Score the "movie mean + user offset" predictor on the held-out ratings and show its RMSE.
pred_test, rmse = movie_mean_user_habit(train, test)
print(rmse)
# print(pred_test)

1.0035523727853255


In [20]:
def movie_median_user_habit(train, test):
    """Predict ``movie median + user offset`` for every test row.

    The movie median is taken over the raw training ratings and the user
    offset comes from ``user_habit(train)``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the columns ``User``, ``Movie`` and ``Rating``.

    Returns
    -------
    (pred_test, rmse)
        ``pred_test`` is a copy of ``test`` whose ``Rating`` holds the
        predictions; ``rmse`` is its error against ``test``.

    Movies absent from ``train`` fall back to the global training median and
    users absent from ``train`` to a zero offset, instead of raising ``KeyError``.
    """
    habit = user_habit(train)
    median_per_movie = train.groupby('Movie')['Rating'].median()

    # Column-wise lookups keep User/Movie as integers; the former row-wise
    # apply up-cast each row to float and had to cast the ids back.
    movie_part = test['Movie'].map(median_per_movie).fillna(train['Rating'].median())
    user_part = test['User'].map(habit).fillna(0.0)

    pred_test = test.copy()
    pred_test['Rating'] = movie_part + user_part

    rmse = compute_rmse(pred_test, test)
    return pred_test, rmse

# Score the "movie median + user offset" predictor on the held-out ratings and show its RMSE.
pred_test, rmse = movie_median_user_habit(train, test)
print(rmse)
# NOTE(review): this prints the whole prediction frame; pred_test.head() would keep the output short.
print(pred_test)

1.0736894788559919
        User  Movie    Rating
0       9351    920  3.162866
1       3315    482  4.299222
2        146     14  4.696167
3       9656    347  3.995159
4       3407    105  3.089711
5       2342    350  3.961223
6       3306     90  3.033142
7       4578    245  4.142595
8       9265    674  4.438545
9       5345     66  3.883975
10      5415    231  2.627702
11      8115     37  4.105558
12      2783    322  5.280526
13       259      6  5.340397
14       176    673  3.994208
15      9640    208  3.731636
16      6704    654  3.637332
17      8498    310  5.429552
18      9702    690  4.126291
19      6704    561  2.637332
20      2977     74  2.980540
21      5798    810  3.477674
22      8639    235  4.045034
23      2272    682  4.142595
24      7020    814  4.064817
25      2973    300  3.796441
26      1229    575  4.007460
27      9171    318  5.303054
28      8228    434  4.851550
29      2399    293  3.200149
...      ...    ...       ...
117665    37    247  