#### Read data from files

In [1]:
import pandas as pd
from math import sqrt
# raw data
df_home = pd.read_csv('./data/home_page_table.csv')
df_search = pd.read_csv('./data/search_page_table.csv')
df_payment = pd.read_csv('./data/payment_page_table.csv')
df_payment_confirmation = pd.read_csv('./data/payment_confirmation_table.csv')

#### Data preprocessing

In [2]:
# drop unnecessary feature
df_home = df_home.drop(['page'], axis=1)
df_search = df_search.drop(['page'], axis=1)
df_payment = df_payment.drop(['page'], axis=1)
df_payment_confirmation = df_payment_confirmation.drop(['page'], axis=1)

In [3]:
# divide users into 2 groups ([home, search]/[payment, payment confirmation])
df_group2 = df_payment
df_group1 = pd.concat([df_home, df_payment]).drop_duplicates(subset='user_id', keep=False)

In [4]:
df_group1.head(5)

Unnamed: 0,user_id
0,313593
1,468315
2,264005
3,290784
4,639104


In [5]:
df_group1.shape

(84370, 1)

#### Generate ratings matrix

In [7]:
# generate movie ids
from random import randint

def random_with_N_digits(n):
    range_start = 10**(n-1)
    range_end = (10**n)-1
    return randint(range_start, range_end)

movie_ids = [random_with_N_digits(6) for i in range(5)]

# generate ratings
import numpy as np

ratings = np.random.randint(low=1, high=5, size=(len(df_home), 5))
ratings = [tuple(r) for r in ratings]

In [8]:
# create ratings matrix
df_ratings = pd.DataFrame(ratings, columns=movie_ids, index=df_home['user_id'].tolist())

#### Attach movie ids to users in group 2

In [9]:
# generate selected movie ids
selected_movie_ids = [random_with_N_digits(6) for i in range(len(df_group2))]

# create new column 'movie_id'
df_group2['movie_id'] = selected_movie_ids

In [51]:
df_group2.head(5)

Unnamed: 0,user_id,movie_id
0,253019,966434
1,310478,835686
2,304081,626248
3,901286,786518
4,195052,144534


In [10]:
# TODO
# 1. find similarity from 'df_ratings'
# 2. for each user in group1, get the most similar user from group2
# 3. from that user in group2, recommend value in 'movie_id' column to user in group1

In [11]:
df_ratings.head(5)

Unnamed: 0,852486,921582,827735,796900,492018
313593,4,2,2,3,4
468315,2,2,2,2,2
264005,2,4,4,2,4
290784,2,2,3,2,3
639104,2,4,4,3,4


In [12]:
df_ratings.shape

(90400, 5)

In [14]:
def sim_cosine(prefs, p1,p2):
    #compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)
    # Get the list of mutually rated items
    # Sums of the squares
    sum1Sq = sum([pow(prefs.loc[p1][it], 2) for it in prefs.loc[p1].index])
    sum2Sq = sum([pow(prefs.loc[p2][it], 2) for it in prefs.loc[p1].index])
    sum12Sq = sum([prefs.loc[p1][it]*prefs.loc[p2][it] for it in prefs.loc[p1].index])

    return sum12Sq/sqrt(sum1Sq*sum2Sq)

In [15]:
df_ratings.head(5)

Unnamed: 0,852486,921582,827735,796900,492018
313593,4,2,2,3,4
468315,2,2,2,2,2
264005,2,4,4,2,4
290784,2,2,3,2,3
639104,2,4,4,3,4


In [16]:
sim_cosine(df_ratings, 313593, 264005)

0.8781440805693944

In [20]:
df_ratings.head(5)

Unnamed: 0,852486,921582,827735,796900,492018
313593,4,2,2,3,4
468315,2,2,2,2,2
264005,2,4,4,2,4
290784,2,2,3,2,3
639104,2,4,4,3,4


In [30]:
def top_similar_users(prefs, group, user, n):
    '''
    To print top similar persons who viewed movies of p interest
    '''
    allpersons = []
    for otherperson in prefs.index:
        if otherperson != user and otherperson in group['user_id']:
            r = sim_cosine(prefs, user, otherperson)
            allpersons.append((r, otherperson))
    allpersons.sort(reverse=True)
    return allpersons[0:n]

In [31]:
lst = top_similar_users(df_ratings, df_group2,313593,5)
for person in lst:
    print(person)

(1.0, 1637)
(0.9926846128175763, 1210)
(0.9919501068991623, 5764)
(0.9914601339836673, 496)
(0.9906801321840805, 4987)
