# DataScience HW4

In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# import data
# `users` is only used further down for its row count: the per-user loops
# iterate over IDs 1..len(users). `items` is loaded but is not referenced
# anywhere in the visible cells.
items = pd.read_csv('./data/items.csv')
users = pd.read_csv('./data/users.csv')

In [3]:
# Pairwise preference records. The cells below read the columns
# 'User ID', 'Item1 ID', 'Item2 ID' and 'Preference' from this frame.
train = pd.read_csv('./data/train.csv')
# Pull out the rows for user 1. `group` is reassigned inside the later
# per-user loops, so this value only serves as a quick inspection here.
group = train.groupby('User ID').get_group(1)

In [4]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    """Cosine similarity between the rows or the columns of ``ratings``.

    Parameters
    ----------
    ratings : array-like
        2-D numeric matrix. It is converted to a float ``numpy.ndarray`` first,
        so object-dtype arrays (e.g. ``DataFrame.values`` of a mixed frame)
        and nested lists are accepted as well.
    kind : {'user', 'item'}
        'user' compares rows with each other, 'item' compares columns.
    epsilon : float
        Small number added to every dot product so that an all-zero
        row/column does not lead to a divide-by-zero when normalising.

    Returns
    -------
    numpy.ndarray
        Square matrix of pairwise similarities: one row/column per row of
        ``ratings`` for 'user', one per column of ``ratings`` for 'item'.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    ratings = np.asarray(ratings, dtype=float)
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Previously fell through and failed later with an unbound `sim`.
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
    # The diagonal holds each vector's squared length (plus epsilon); kept as
    # a 1 x n row so it broadcasts over columns, and its transpose over rows.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

def predict_fast_simple(ratings, similarity, kind='user'):
    """Predict scores as a similarity-weighted average of ``ratings``.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D score matrix.
    similarity : numpy.ndarray
        Square similarity matrix. For 'user' it must match the number of
        rows of ``ratings``; for 'item' the number of columns.
    kind : {'user', 'item'}
        'user' mixes the rows of ``ratings`` using ``similarity``;
        'item' mixes the columns.

    Returns
    -------
    numpy.ndarray
        Matrix with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    if kind == 'user':
        # Each output row is divided by the sum of |similarity| over that
        # user's row of the similarity matrix (column vector via .T).
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        # Each output column is divided by the row-sum of |similarity| for
        # that item (equal to the column-sum when similarity is symmetric).
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    # Previously an unknown kind silently returned None.
    raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))

### 1. Calculate rank from direct preference votes (+1 / -1)

In [5]:
# Calculate rank from direct preference votes: within each training pair the
# chosen item gets +1 and the other item gets -1.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
item_ids = list(range(1, 11))
train_by_user = train.groupby('User ID')  # group once instead of once per user
direct_rows = []
for i in range(1, len(users) + 1):
    group = train_by_user.get_group(i)
    score = pd.Series(np.full(10, 0), index=item_ids)
    pairs = zip(group['Item1 ID'], group['Item2 ID'], group['Preference'])
    for item1, item2, preference in pairs:
        # Preference == 0 is treated as a vote for Item1, anything else as a
        # vote for Item2.
        if preference == 0:
            score.loc[item1] += 1
            score.loc[item2] -= 1
        else:
            score.loc[item1] -= 1
            score.loc[item2] += 1
    # Min-max rescale this user's scores onto 0..10.
    rang = score.max() - score.min()
    shifted = score - score.min()
    if rang != 0:
        normalized = shifted * 10 / rang
    else:
        # All items tied: `shifted` is all zeros, and dividing by the zero
        # range would fill the row with NaN and poison the similarity step.
        normalized = shifted.astype(float)
    direct_rows.append(normalized.tolist())
# One row per user, labelled 1..len(users); one column per item ID.
scores_direct = pd.DataFrame(
    direct_rows,
    columns=item_ids,
    index=pd.RangeIndex(1, len(direct_rows) + 1),
)

In [6]:
# Collaborative filtering on the direct-vote score matrix:
# cosine similarity between users (rows) and between items (columns),
# then similarity-weighted average predictions for each of the two views.
user_similarity_d = fast_similarity(scores_direct.values)
item_similarity_d = fast_similarity(scores_direct.values,kind='item')
user_d = predict_fast_simple(scores_direct.values,user_similarity_d)
item_d = predict_fast_simple(scores_direct.values,item_similarity_d,kind='item')

### 2. Calculate rank from pairwise transitive counts

In [7]:
def transitive_closure(a):
    """Return the transitive closure of a binary relation.

    ``a`` is an iterable of ``(x, y)`` pairs. The result is a new set that
    contains every input pair plus every pair ``(x, w)`` reachable by
    chaining pairs whose end and start elements match.
    """
    reached = set(a)
    grew = True
    while grew:
        # Join the relation with itself: (x, y) and (y, w) yield (x, w).
        derived = {
            (src, dst)
            for src, mid in reached
            for via, dst in reached
            if via == mid
        }
        size_before = len(reached)
        reached |= derived
        # Stop at the fixed point, i.e. once a pass adds nothing new.
        grew = len(reached) != size_before
    return reached

In [8]:
# calculate rank: an item's score is the number of (winner, loser) pairs it
# wins in the transitive closure of the user's stated preferences.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
item_ids = list(range(1, 11))
train_by_user = train.groupby('User ID')  # group once instead of once per user
count_rows = []
for i in range(1, len(users) + 1):
    group = train_by_user.get_group(i)
    score = pd.Series(np.full(10, 0), index=item_ids)
    # Orient every training pair as (winner, loser); Preference == 0 is
    # treated as a vote for Item1, anything else as a vote for Item2.
    temp = [
        (item1, item2) if preference == 0 else (item2, item1)
        for item1, item2, preference in zip(
            group['Item1 ID'], group['Item2 ID'], group['Preference']
        )
    ]
    tran = transitive_closure(temp)
    # NOTE: cyclic preferences put (x, x) pairs into the closure; they are
    # counted as wins here, exactly as before.
    for winner, _loser in tran:
        score.loc[winner] += 1
    # Scores stay raw win counts (as floats); no min-max rescaling is applied.
    count_rows.append((score / 1.0).tolist())
# One row per user, labelled 1..len(users); one column per item ID.
scores = pd.DataFrame(
    count_rows,
    columns=item_ids,
    index=pd.RangeIndex(1, len(count_rows) + 1),
)

In [9]:
# Collaborative filtering on the transitive-count score matrix:
# cosine similarity between users (rows) and between items (columns),
# then similarity-weighted average predictions for each of the two views.
user_similarity_c = fast_similarity(scores.values)
item_similarity_c = fast_similarity(scores.values,kind='item')
user_c = predict_fast_simple(scores.values,user_similarity_c)
item_c = predict_fast_simple(scores.values,item_similarity_c,kind='item')

In [10]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Get SVD components from the transitive-count score matrix, keeping k
# singular values/vectors. svds documents ndarray / sparse matrix /
# LinearOperator input, so pass a float ndarray rather than the DataFrame.
u, s, vt = svds(scores.values.astype(float), k = 6)
s_diag_matrix=np.diag(s)
# Low-rank reconstruction U * diag(s) * V^T, same shape as `scores`.
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

In [11]:
# Display the rank-6 SVD reconstruction (rows: users, columns: items).
X_pred

array([[ 2.6563965 ,  0.90468331,  4.73875951,  4.36081047,  4.53732355,
         1.99253501,  0.19297929, -0.41567399,  3.77341997,  7.284214  ],
       [ 9.4315991 ,  8.8674512 ,  7.43745724,  8.78175059,  3.6694592 ,
         0.5074704 , 10.18973798,  7.68330298,  9.29427555, -0.15803431],
       [ 2.04162724,  7.46392145,  1.09920086,  1.74808247,  0.2695056 ,
         3.58992228,  7.77655069,  4.79420472,  1.69529422, -0.55066401],
       [ 4.39966463,  2.47181569,  6.33842517,  6.18308929,  4.00610839,
        -0.1585567 ,  2.32770486,  0.59416115,  4.82697516,  3.10917878],
       [ 8.78252814,  6.80439342,  8.4510442 , -0.30516585,  7.09423495,
         7.44436136,  9.70020758,  8.66898619,  7.08650642,  0.87785663],
       [ 4.76640936,  1.33078874,  9.51327128,  2.09351112, 10.72939502,
         8.57811276,  0.9697681 ,  0.32664396,  2.44759094, 10.69540226],
       [ 4.94853257,  3.22133821,  8.00460254,  6.14817754,  4.75210982,
         0.13752078,  3.33819433,  0.6008127 

 ## Export data

In [15]:
# Final score matrix: a (near) equal-weight blend of three estimates --
#   1. direct-vote CF      (15% user-based + 85% item-based),
#   2. transitive-count CF (10% user-based + 90% item-based),
#   3. the SVD reconstruction X_pred.
# Earlier alternative without the SVD term:
#rank = 0.5*(0.15*user_d + 0.85*item_d) + 0.5*(0.1*user_c + 0.9*item_c)
rank = 0.33333*(0.15*user_d + 0.85*item_d) + 0.33333*(0.1*user_c + 0.9*item_c) + X_pred*0.33333
# Alternative using the SVD reconstruction alone:
#rank = X_pred

In [16]:
# Predict each test pair: 0 if the blended score ranks Item1 strictly above
# Item2 for that user, otherwise 1 (ties therefore go to Item2).
test = pd.read_csv('./data/test.csv')
User_Item1_Item2 = []
pred = []
# Cast the IDs to int once and use them for both the key string and the array
# lookup. Before, only the key was cast, so float IDs (iterrows upcasts a row
# when any column is float) reached the array index and raised IndexError.
id_triples = zip(
    test['User ID'].astype(int),
    test['Item1 ID'].astype(int),
    test['Item2 ID'].astype(int),
)
for user_id, item1_id, item2_id in id_triples:
    User_Item1_Item2.append(str(user_id)+'-'+str(item1_id)+'-'+str(item2_id))
    # IDs are 1-based; `rank` rows/columns are 0-based.
    user_rank = rank[user_id-1]
    if user_rank[item1_id-1] > user_rank[item2_id-1]:
        pred.append(0)
    else:
        pred.append(1)

In [17]:
# Write the submission file: a header line followed by one
# "<user>-<item1>-<item2>,<prediction>" line per test pair.
with open('preference.csv', 'w') as f:
    f.write('User-Item1-Item2'+','+'Preference\n')
    f.writelines(
        str(pair_key)+','+str(label)+'\n'
        for pair_key, label in zip(User_Item1_Item2, pred)
    )