In [1]:
# This notebook aims to recommend a possible date that the user will like
# The main idea is for the user to enter his/her iid (unique id in the study), then we will find the people whom
# he/she has dated. We split this matrix into half and half as rated and unrated (pretend they are not rated), and 
# give each of the unrated date a predicted rating. We sort the ratings and recommend the highest 3 to the user.
# Finally, we evaluate this system by computing RMSE between predicted ratings and the real ratings.

In [97]:
# read data
import numpy as np # linear algebra
from numpy import linalg as la
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
data = pd.read_csv("Speed Dating Data.csv", encoding="ISO-8859-1")
from sklearn.preprocessing import StandardScaler 
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [98]:
def preprocess(all_dates, input_vars):
    # preprocessing, dealing with NaN cells
    median_features = all_dates.loc[:, input_vars].dropna().median()
    features = all_dates.loc[:, input_vars].fillna(median_features) # use median to fill the NaN values
    like = features.like_o
    print(features.shape)
    return features, like

In [99]:
def sim(row1, row2): # cos similarity
    num = row1.transpose().dot(row2)
    denom = la.norm(row1)*la.norm(row2)
    sim = 0.5+0.5*(num/denom)
#     print("The similarity is: "+str(sim))
    return sim

In [100]:
def predict(rated_dates, unrated_dates, rated_like, unrated_like):
    m = unrated_dates.shape[0]
    n = rated_dates.shape[0]
    print("m: "+str(m)+"; n: "+str(n))
    predicted_like = np.zeros(m)
    for i in range(m):
        sim_sum = 0
        rat_sim_sum = 0
        for j in range(n):
            unrated = unrated_dates.as_matrix()[i,:]
            rated = rated_dates.as_matrix()[j,:]
            similarity = sim(unrated, rated)
            sim_sum += similarity
            rat_sim_sum += rated_like.values[j]*similarity
        predicted_like[i] = rat_sim_sum/sim_sum
    print("RMSE: "+str(sqrt(mean_squared_error(unrated_like, predicted_like))))
    return predicted_like

In [101]:
def recommend(iid, data):
    all_dates = data.loc[data['pid'] == iid] # raw matrix containing information for all dates
    # vars in interest from the last notebook, including scores of 6 main attributes given by partner, and 17 lifestyle scores
    input_vars1 = ['attr_o', 'sinc_o', 'intel_o','fun_o','amb_o','shar_o', 'sports','tvsports','exercise','dining','museums','art','hiking','gaming','clubbing','reading','tv','theater','movies','concerts','music','shopping','yoga']
    # gender and order
    input_vars2 = ['gender', 'ptn_order']
    # How they value the 6 main attributes during signup stage (stage 1)
    input_vars3 = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
    # How they value the 6 main attributes after the event (stage 2)
    input_vars4 = ['attr1_2', 'sinc1_2', 'intel1_2', 'fun1_2', 'amb1_2', 'shar1_2']
    # How they value the 6 main attributes after being sent their matches (stage 3)
    input_vars5 = ['attr1_3', 'sinc1_3', 'intel1_3', 'fun1_3', 'amb1_3', 'shar1_3']
    input_vars = input_vars1+input_vars2+input_vars3+input_vars4+input_vars5
    # In this system we also need 'like_o' and iid!! (like_o is how this user likes his/her dates from dates data)
    input_vars = input_vars + ['like_o', 'iid']
    
    features, like = preprocess(all_dates, input_vars)
    features = features.drop('like_o', 1)
    
    # random split
    rated_dates, unrated_dates, rated_like, unrated_like = train_test_split(
        features, like, test_size=0.50, random_state=0) # half and half
    # predict
    predicted_like = predict(rated_dates, unrated_dates, rated_like, unrated_like.values)
    
    # sort and recommend top 3
    # print(predicted_like)
    sorted_like = np.sort(predicted_like)[::-1]
    # print(sorted_like)
    for i in range(len(predicted_like)):
        if (predicted_like[i] == sorted_like[0]): 
            print("First recommendation: No. "+ str(int(unrated_dates.iloc[i,:].iid))+"; predicted rating: "+ str(predicted_like[i])+"; real rating: "+ str(unrated_like.values[i]))
        elif (predicted_like[i] == sorted_like[1]): 
            print("Second recommendation: No. "+ str(int(unrated_dates.iloc[i,:].iid))+"; predicted rating: "+ str(predicted_like[i])+"; real rating: "+ str(unrated_like.values[i]))
        elif (predicted_like[i] == sorted_like[2]): 
            print("Third recommendation: No. "+ str(int(unrated_dates.iloc[i,:].iid))+"; predicted rating: "+ str(predicted_like[i])+"; real rating: "+ str(unrated_like.values[i]))

In [102]:
# demo
recommend(45, data)

(19, 45)
m: 10; n: 9
RMSE: 0.9540783187899158
First recommendation: No. 31; predicted rating: 6.67553868596; real rating: 8.0
Second recommendation: No. 29; predicted rating: 6.67065530869; real rating: 7.0
Third recommendation: No. 25; predicted rating: 6.667808211; real rating: 7.0


In [None]:
# Good!! The real ratings fit what we have in the original data

In [None]:
# RMSE < 1, meaning our predicted rating is within an error of 1. Considering 'like' ranges from 1-10, 
# this is pretty good result!!

In [106]:
# another demo
recommend(67, data)

(10, 45)
m: 5; n: 5
RMSE: 0.3949382481857712
Second recommendation: No. 64; predicted rating: 5.68041865239; real rating: 6.0
Third recommendation: No. 60; predicted rating: 5.67976334621; real rating: 6.0
First recommendation: No. 65; predicted rating: 5.69722294408; real rating: 6.0


In [107]:
# another demo
recommend(56, data)

(10, 45)
m: 5; n: 5
RMSE: 2.4968321992268807
First recommendation: No. 74; predicted rating: 4.42647241456; real rating: 3.0
Third recommendation: No. 70; predicted rating: 4.39892108304; real rating: 9.0
Second recommendation: No. 67; predicted rating: 4.40979879981; real rating: 5.0


In [None]:
# RMSE not so stable, due to small date size. (This user only dates 10 people and rates 10 people.) 
# Still, this method works! If given larger dataset, we can do better.