In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
import random
from scipy.stats import pearsonr
from collections import defaultdict

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings
# that later cells would otherwise raise — consider narrowing the filter.
warnings.simplefilter(action='ignore')

In [152]:
# Load the interaction log into a frame with a default integer index.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv` without `index_col` yields the same frame that
# `from_csv(...).reset_index()` produced (first column kept as a regular column).
data = pd.read_csv('train_Interactions.csv', sep=',')

In [153]:
# Positional split: the first 190000 rows are training data, the remainder is held out.
train, val = data.iloc[:190000], data.iloc[190000:]

In [154]:
# Every held-out interaction is a positive example. `assign` returns a new frame
# instead of writing a column into a slice of `data`, which avoids the
# chained-assignment hazard and keeps the cell idempotent on re-run.
val = val.assign(read=1)


In [155]:
# All distinct books, and each user's list of books, taken from the full `data` frame.
book_list = pd.unique(data['bookID'])
read_list = {user: list(books) for user, books in data.groupby('userID')['bookID']}

In [156]:
# Build one negative ("not read") example per validation row by drawing, for the
# row's user, a book that does not appear in that user's `read_list` entry.
# The candidate list is built once (not per row) and kept in sorted order, and the
# draw uses a dedicated seeded generator, so re-running the cell gives the same sample.
NEGATIVE_SAMPLE_SEED = 0
_negative_rng = random.Random(NEGATIVE_SAMPLE_SEED)
_candidate_books = sorted(set(book_list))


def _sample_unread_book(user):
    """Return a uniformly drawn book that is absent from `read_list[user]`.

    Raises IndexError (from `choice`) if the user has interacted with every book.
    """
    already_read = set(read_list[user])
    unread = [book for book in _candidate_books if book not in already_read]
    return _negative_rng.choice(unread)


neg_val = pd.DataFrame({
    'userID': val['userID'],
    'bookID': val['userID'].apply(_sample_unread_book),
    'read': 0,
})


In [157]:
# Labelled validation set: held-out positives (rating column removed) stacked on the sampled negatives.
new_val = pd.concat([val.drop(columns=['rating']), neg_val])

In [9]:
# Tally how many interactions each book has across `data`.
bookCount = defaultdict(int)
for book in data['bookID']:
    bookCount[book] += 1
totalRead = len(data)

# (count, bookID) pairs, most popular first.
mostPopular = sorted(((n, book) for book, n in bookCount.items()), reverse=True)
len(mostPopular)

7170

In [10]:
### Would-read baseline: just rank which books are popular and which are not, and return '1' if a book is among the top-ranked
from collections import defaultdict
def predict_baseline(threshold, interactions=None, pairs=None):
    """Popularity baseline: predict 1 for books in the most-popular head, else 0.

    Books are ranked by interaction count (ties broken by bookID, descending) and
    added to the "popular" set until their cumulative count exceeds
    ``threshold`` percent of all interactions.

    Parameters
    ----------
    threshold : number
        Percentage (0-100) of total interactions the popular set should cover.
    interactions : DataFrame, optional
        Frame with a ``bookID`` column used to count popularity. Defaults to the
        notebook-level ``data``.
        NOTE(review): ``data`` also contains the validation rows, so the default
        counts include the interactions being evaluated — confirm this is intended.
    pairs : DataFrame, optional
        Frame with a ``bookID`` column to predict for. Defaults to ``new_val``.

    Returns
    -------
    list of int
        One 0/1 prediction per row of ``pairs``, in row order.
    """
    if interactions is None:
        interactions = data
    if pairs is None:
        pairs = new_val

    # Columns are read by name so extra columns on either frame do not break the
    # function (positional unpacking required exactly three columns).
    bookCount = defaultdict(int)
    for book in interactions['bookID']:
        bookCount[book] += 1
    totalRead = len(interactions)

    mostPopular = sorted(((n, book) for book, n in bookCount.items()), reverse=True)

    cutoff = totalRead * threshold / 100
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > cutoff:
            break

    return [1 if book in return1 else 0 for book in pairs['bookID']]

In [11]:
print('prediction accuracy is:')
# Fraction of validation rows where the 50% popularity baseline matches the label.
(predict_baseline(50) == new_val['read']).sum() / len(new_val['read'])

prediction accuracy is:


0.65125

In [12]:
# Same accuracy check with the popularity threshold raised to 60%.
(predict_baseline(60) == new_val['read']).sum() / len(new_val['read'])

0.6589

In [13]:
# Lookup tables built from the training split only:
# user -> list of books, and book -> list of users.
read_train_list = {user: list(books) for user, books in train.groupby('userID')['bookID']}
user_train_list = {book: list(users) for book, users in train.groupby('bookID')['userID']}

In [67]:
def predict_jaccard(threshold, pairs=None, user_books=None, book_users=None):
    """Score each (user, book) pair by its best Jaccard similarity to the user's books.

    For a pair (u, b), the score is the maximum over books b' in u's history of
    |users(b) & users(b')| / |users(b) | users(b')|, with a floor of 0.

    Parameters
    ----------
    threshold : number
        Accepted for interface compatibility; not used. Callers threshold the
        returned scores themselves.
    pairs : DataFrame, optional
        Rows with ``userID`` and ``bookID``. Defaults to ``new_val``.
    user_books : dict, optional
        user -> iterable of books. Defaults to ``read_train_list``.
    book_users : dict, optional
        book -> iterable of users. Defaults to ``user_train_list``.

    Returns
    -------
    Series
        One score per row of ``pairs``. Unknown users or books score 0.
    """
    if pairs is None:
        pairs = new_val
    if user_books is None:
        user_books = read_train_list
    if book_users is None:
        book_users = user_train_list

    def max_jaccard(x):
        # Dict lookups replace rebuilding a list of every training bookID per row,
        # and unknown users/books fall back to empty collections instead of raising.
        U = set(book_users.get(x.bookID, ()))
        jaccards = [0]
        for b in user_books.get(x.userID, ()):
            U_p = set(book_users.get(b, ()))
            union_size = len(U_p | U)
            jaccards.append(len(U_p & U) / union_size if union_size else 0)
        return max(jaccards)

    return pairs.apply(max_jaccard, axis=1)

In [24]:
# Distinct books that occur in the training split.
train_books = train['bookID'].unique()

In [27]:
def predict_pearson(threshold, pairs=None, ratings=None, known_books=None):
    """Score each (user, book) pair by the best Pearson correlation with the book's raters.

    For a pair (u, b), u's rating row is correlated with the rating row of every
    user whose rating for b is > 0; the score is the largest correlation, floored
    at 0. NaN correlations (``pearsonr`` on a constant row) never win the ``max``
    because comparisons against NaN are false.

    Parameters
    ----------
    threshold : number
        Accepted for interface compatibility; not used.
    pairs : DataFrame, optional
        Rows with ``userID`` and ``bookID``. Defaults to ``new_val``.
    ratings : DataFrame, optional
        User-by-book rating matrix (index: users, columns: books). Defaults to ``UBmat``.
    known_books : iterable, optional
        Books eligible for scoring; others score 0. Defaults to ``train_books``.

    Returns
    -------
    Series
        One score per row of ``pairs``.
    """
    if pairs is None:
        pairs = new_val
    if ratings is None:
        ratings = UBmat
    if known_books is None:
        known_books = train_books
    # Hash-based membership instead of a linear array scan on every row.
    eligible_books = set(known_books)

    def max_pearson(x):
        # Guard first so unknown books/users cost nothing and cannot raise KeyError.
        if x.bookID not in eligible_books or x.bookID not in ratings.columns:
            return 0
        if x.userID not in ratings.index:
            return 0
        user_rate = ratings.loc[x.userID, :]
        raters = ratings.index[ratings[x.bookID] > 0]
        pearson = [0]
        for u in raters:
            pearson.append(pearsonr(user_rate, ratings.loc[u, :])[0])
        return max(pearson)

    return pairs.apply(max_pearson, axis=1)

In [28]:
# Pearson scores for every row of new_val. The 0.0015 argument is not used inside
# predict_pearson; the decision threshold is applied to the scores in a later cell.
p_pred = predict_pearson(0.0015)

In [69]:
# Alias for the Pearson scores (same object, no copy).
p_vals = p_pred

In [60]:
# Label a pair as read (1) when its Pearson score is strictly above 0.061.
p_result = p_pred.gt(0.061).astype(int)

In [61]:
# Accuracy of the thresholded Pearson predictions on the validation labels.
(p_result == new_val['read']).sum() / len(p_result)

0.6441

In [68]:
# Jaccard scores for the validation pairs, then a 0/1 label: read when the score exceeds 0.01.
j_vals = predict_jaccard(0.01)
predictions_j = j_vals.gt(0.01).astype(int)


In [147]:
# Rebuild the training-split lookups (user -> books, book -> users).
read_train_list = {user: books.tolist() for user, books in train.groupby('userID')['bookID']}
user_train_list = {book: users.tolist() for book, users in train.groupby('bookID')['userID']}

In [169]:
# Attach to every row the list of validation books of that row's user.
# The grouped result is indexed by userID, so it is mapped through the userID
# column; assigning it directly would align on new_val's integer row index and
# leave the column entirely NaN.
new_val['booklist'] = new_val['userID'].map(new_val.groupby('userID')['bookID'].agg(list))

In [171]:
# Per-user list of validation books, indexed by userID.
booklist = new_val.groupby('userID')['bookID'].agg(list)

In [280]:
# Rebind the lookup tables to the FULL data set (training + held-out rows).
# From here on these names no longer describe the training split only.
read_train_list = {user: list(books) for user, books in data.groupby('userID')['bookID']}
user_train_list = {book: list(users) for book, users in data.groupby('bookID')['userID']}

In [281]:
def union_jaccard(x, user_books=None, book_users=None):
    """Jaccard similarity between a book's readers and the pooled readers of a user's books.

    Parameters
    ----------
    x : row-like
        Object exposing ``userID`` and ``bookID`` attributes (e.g. a DataFrame row).
    user_books : dict, optional
        user -> iterable of books. Defaults to ``read_train_list``.
    book_users : dict, optional
        book -> iterable of users. Defaults to ``user_train_list``.

    Returns
    -------
    float
        |U & U_p| / |U | U_p| where U is the set of users of ``x.bookID`` and U_p
        the union of users over every book of ``x.userID``. Returns 0.0 when both
        sets are empty (unknown user and unknown book).
    """
    if user_books is None:
        user_books = read_train_list
    if book_users is None:
        book_users = user_train_list

    # Dict lookups with empty defaults: no per-row rebuild of the full bookID
    # list, and no KeyError for users or books missing from the tables.
    U = set(book_users.get(x.bookID, ()))
    U_p = set()
    for b in set(user_books.get(x.userID, ())):
        U_p.update(book_users.get(b, ()))

    union_size = len(U_p | U)
    if union_size == 0:
        return 0.0
    return len(U_p & U) / union_size


In [283]:
# Score every row of test_df with union_jaccard.
# NOTE(review): test_df is built by a cell that appears further down in this file,
# so this cell fails on a fresh top-to-bottom run.
j_vals = test_df.apply(union_jaccard, axis = 1)

In [284]:
# Store the Jaccard scores as a column of test_df.
test_df['jaccard'] = j_vals

In [310]:
# User-by-book rating matrix; pairs with no interaction are filled with 0.
UBmat = data.pivot(index='userID', columns='bookID', values='rating').fillna(0)

In [311]:
def sum_pearson(x, ratings=None):
    """Sum of Pearson correlations between a user and every rater of a book.

    Parameters
    ----------
    x : row-like
        Object exposing ``userID`` and ``bookID`` attributes.
    ratings : DataFrame, optional
        User-by-book rating matrix (index: users, columns: books). Defaults to ``UBmat``.

    Returns
    -------
    number
        Sum over users u with ``ratings.loc[u, x.bookID] > 0`` of
        ``pearsonr(row(x.userID), row(u))``. Returns 0 when the book or the user
        is absent from ``ratings``. A NaN correlation (``pearsonr`` on a constant
        row) makes the whole sum NaN; the notebook replaces those with 0 afterwards.
    """
    if ratings is None:
        ratings = UBmat
    # Index lookups replace a linear scan over every bookID value per row; the
    # guard runs before any row is fetched so unknown users cannot raise KeyError.
    if x.bookID not in ratings.columns or x.userID not in ratings.index:
        return 0
    user_rate = ratings.loc[x.userID, :]
    raters = ratings.index[ratings[x.bookID] > 0]
    return sum(pearsonr(user_rate, ratings.loc[u, :])[0] for u in raters)

In [312]:
# Pearson-sum score for every row of test_df (one sum_pearson call per row).
test_df['pearson'] = test_df.apply(sum_pearson, axis = 1)

In [327]:
# Replace missing (NaN) Pearson scores with 0.
test_df['pearson'] = test_df['pearson'].fillna(0)

In [328]:
# Z-score the Pearson scores. Mean and standard deviation are computed once for
# the whole column instead of being re-evaluated inside a per-element lambda.
test_df['pnorm'] = (test_df['pearson'] - test_df['pearson'].mean()) / test_df['pearson'].std()

In [319]:
# Z-score the Jaccard scores. Mean and standard deviation are computed once for
# the whole column instead of being re-evaluated inside a per-element lambda.
test_df['jnorm'] = (test_df['jaccard'] - test_df['jaccard'].mean()) / test_df['jaccard'].std()

In [329]:
# Combined score: sum of the two normalised features.
test_df['sum_norm'] = test_df['pnorm'] + test_df['jnorm']

In [285]:
# Number of test pairs per user, indexed by userID.
len_list = test_df.groupby('userID').size()

In [330]:
# Rank each pair within its user by combined score (1 = lowest; ties broken by
# row order). `GroupBy.rank` returns a Series aligned to test_df's own index,
# whereas `apply` with a lambda prepends the group key to the index on newer
# pandas and then no longer aligns on assignment.
test_df['rank'] = test_df.groupby('userID')['sum_norm'].rank(method='first')

In [322]:
def det_pred(x, group_sizes=None):
    """Label a pair 0 or 1 according to which half of its user's ranking it falls in.

    Parameters
    ----------
    x : row-like
        Row exposing ``userID`` and a 1-based ``'rank'`` within that user's pairs.
    group_sizes : Series, optional
        Number of pairs per user, indexed by userID. Defaults to ``len_list``.

    Returns
    -------
    int
        0 when the rank is within the first ``length // 2`` positions, otherwise 1.
        For an odd number of pairs the extra (top-ranked) pair is labelled 1;
        building a label list of only ``2 * (length // 2)`` entries raised
        IndexError for that pair.
    """
    if group_sizes is None:
        group_sizes = len_list
    length = group_sizes.loc[x.userID]
    # x['rank'] (not x.rank): `rank` as an attribute is the Series method.
    return int(int(x['rank']) > length // 2)
    
    

In [331]:
# Count missing values per column of test_df.
test_df.isna().sum()

userID      0
bookID      0
jaccard     0
rank        0
pred        0
pearson     0
pnorm       0
jnorm       0
sum_norm    0
dtype: int64

In [332]:
# Final 0/1 label for every test pair, derived from its per-user rank by det_pred.
test_df['pred'] = test_df.apply(det_pred, axis = 1)

In [333]:
# Preview the first rows only rather than rendering the whole frame.
test_df.head(10)

Unnamed: 0,userID,bookID,jaccard,rank,pred,pearson,pnorm,jnorm,sum_norm
0,u65407115,b69897799,0.000000,1.0,0,-0.049593,-0.742242,-0.878833,-1.621076
1,u53740605,b39436893,0.002604,1.0,0,0.001530,-0.557184,-0.394511,-0.951695
2,u88031275,b83889575,0.000000,1.0,0,-0.061727,-0.786165,-0.878833,-1.664998
3,u99759913,b39270822,0.014599,4.0,1,0.379083,0.809500,1.836198,2.645699
4,u20090895,b47380623,0.012346,6.0,1,0.421032,0.961348,1.417212,2.378560
5,u24294545,b66431497,0.006438,1.0,0,0.122132,-0.120625,0.318461,0.197836
6,u55512157,b82932751,0.001419,1.0,0,0.040601,-0.415754,-0.614845,-1.030599
7,u49944817,b08859581,0.002635,2.0,0,0.098241,-0.207106,-0.388768,-0.595874
8,u25738103,b40864764,0.000000,1.0,0,-0.009422,-0.596829,-0.878833,-1.475663
9,u60899187,b27621138,0.004545,2.0,1,0.102473,-0.191788,-0.033471,-0.225260


In [237]:
# Label a pair True when its Jaccard score is at or above the upper-median entry
# of j_list for the same user.
# NOTE(review): j_list is not defined in any visible cell; presumably a per-user
# collection of Jaccard scores indexed by userID — confirm before re-running.
pred = test_df.apply(lambda x: x.jaccard >= sorted(j_list.loc[x.userID])[len(j_list.loc[x.userID])//2], axis = 1)

In [241]:
# Store the boolean predictions as the 'pred' column of test_df.
test_df['pred'] = pred

In [213]:
# Share of predictions equal to the validation labels.
# NOTE(review): in the cell directly above, `pred` is derived from test_df, not
# new_val; this comparison presumably dates from an earlier run where `pred` was
# computed on new_val — confirm.
sum(pred == new_val.read)/len(pred)

0.6417

In [282]:
# Read the (user, book) pairs to predict from pairs_Read.txt, one "user-book"
# entry per line, skipping the header line that starts with "userID".
# Rows are collected in a list and the frame is built once: `DataFrame.append`
# in a loop copies the frame on every row and was removed in pandas 2.0.
# The `with` block also guarantees the file handle is closed.
pair_rows = []
with open("pairs_Read.txt") as pairs_file:
    for l in pairs_file:
        if l.startswith("userID"):
            continue
        u,b = l.strip().split('-')
        pair_rows.append((u, b))
test_df = pd.DataFrame(pair_rows, columns=['userID', 'bookID'])



In [176]:
# Books to predict for each user, as one list per userID.
test_df.groupby('userID')['bookID'].agg(list)

userID
u00013729                               [b30189933, b65050854]
u00015033                               [b65310214, b75687427]
u00045712                               [b10451434, b97330601]
u00066827                               [b35841877, b18393772]
u00068766                               [b99899820, b46363613]
u00072947                               [b08606376, b79048149]
u00087472         [b18741581, b62734207, b71027077, b37028746]
u00098349    [b61418655, b96403431, b48178542, b87040093, b...
u00100195                               [b33483708, b38208557]
u00119080         [b63964479, b57458967, b86137861, b74344779]
u00129022                               [b40464871, b84230046]
u00141989                               [b10586979, b27525165]
u00151341         [b13058481, b22366895, b47086160, b32207996]
u00169633                               [b50180436, b16744002]
u00181613    [b75377845, b03439601, b14213635, b86041841, b...
u00229956                               [b024168

In [334]:
# Write predictions_Read.txt: the header line copied from pairs_Read.txt, then
# one "user-book,label" line per requested pair with label 1 when the pair's
# 'pred' value in test_df is truthy, else 0.
# The (user, book) -> pred lookup is built once instead of filtering the whole
# frame for every line; for duplicated pairs the first row wins, as with a
# first-match lookup. A pair missing from test_df raises KeyError.
pred_by_pair = (
    test_df.drop_duplicates(subset=['userID', 'bookID'])
    .set_index(['userID', 'bookID'])['pred']
    .to_dict()
)
count = 0
# `with` closes both files even if a line fails to parse or a pair is missing.
with open("pairs_Read.txt") as pairs_file, open("predictions_Read.txt", 'w') as predictions:
    for l in pairs_file:
        if l.startswith("userID"):
            #header
            predictions.write(l)
            continue
        u,b = l.strip().split('-')
        if pred_by_pair[(u, b)]:
            predictions.write(u + '-' + b + ",1\n")
        else:
            predictions.write(u + '-' + b + ",0\n")
        count+=1

Kaggle username: winniecyc