In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

def getSummaries(ratings, matrix, index, impute=True):
    """Collapse long-format rating rows into one row per ``matrix[index]`` value.

    Every column of ``matrix`` from position 2 onward is treated as an item
    indicator. Each row's rating is multiplied into those indicator columns,
    the rows are grouped by ``matrix[index]``, and the per-column maximum of
    each group is kept.

    Parameters
    ----------
    ratings : array-like
        Ratings aligned row-for-row with ``matrix``; must broadcast against
        ``matrix.iloc[:, 2:]`` (the visible caller passes a one-column frame).
    matrix : pandas.DataFrame
        Frame whose columns from position 2 onward are the indicators and
        which contains the grouping column named by ``index``.
    index : str
        Name of the column in ``matrix`` to group by.
    impute : bool, default True
        When true, missing cells are replaced by the mean of their column and
        the first five rows of the filled table are displayed.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``matrix[index]`` value, one column per
        indicator column.

    Notes
    -----
    The mean fill is done with pandas rather than scikit-learn's ``Imputer``
    so the function does not depend on an estimator that newer scikit-learn
    releases no longer ship. A column with no observed value stays NaN
    instead of being dropped, so the result always has the same columns as
    the un-imputed table.
    """
    indicators = matrix.iloc[:, 2:]
    data = pd.DataFrame(np.array(ratings) * np.array(indicators),
                        index=matrix[index], columns=indicators.columns)

    # Several rows can map to the same key; keep the highest value per column.
    summed = data.groupby(level=0).max()

    if not impute:
        return summed

    # Column-mean imputation is pretty naive. A KNN imputer would be much better.
    X_imputed = summed.fillna(summed.mean())
    display(X_imputed.head(5))
    return X_imputed

def findNeighbors(data, data_ibs, n_neighbors=20):
    """Fill ``data_ibs`` with column-to-column cosine similarities and rank neighbours.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are the vectors to compare. Column ``i`` of
        ``data`` corresponds to row/column ``i`` of ``data_ibs``.
    data_ibs : pandas.DataFrame
        Placeholder frame indexed and labelled by the items being compared.
        It is filled in place with ``1 - cosine distance`` for every pair.
    n_neighbors : int, default 20
        Number of neighbour slots in the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``data_ibs.columns`` with columns ``1..n_neighbors``. Row
        ``i`` holds the labels of the items most similar to item ``i`` in
        descending similarity. When there are fewer items than
        ``n_neighbors`` the remaining slots are left empty (NaN) instead of
        raising a shape error.

    Notes
    -----
    The running column position is displayed every 100 columns as a progress
    indicator.
    """
    n_items = len(data_ibs.columns)

    # Extract every column once instead of re-slicing the frame per pair.
    vectors = [np.asarray(data.iloc[:, i]) for i in range(n_items)]

    # Collect similarities in a plain array; per-cell DataFrame writes are slow.
    sims = np.empty((n_items, n_items), dtype=float)
    for i in range(n_items):
        if i % 100 == 0:
            display(i)
        # Cosine distance is symmetric, so each unordered pair is computed once.
        for j in range(i, n_items):
            sims[i, j] = sims[j, i] = 1 - cosine(vectors[i], vectors[j])

    if n_items:
        data_ibs.iloc[:n_items, :n_items] = sims

    # Placeholder for the closest neighbours of each item.
    data_neighbours = pd.DataFrame(index=data_ibs.columns,
                                   columns=range(1, n_neighbors + 1))

    # Never try to store more labels than there are items or slots.
    k = max(0, min(n_neighbors, n_items))
    if k:
        for i in range(n_items):
            ranked = data_ibs.iloc[0:, i].sort_values(ascending=False)
            data_neighbours.iloc[i, :k] = np.asarray(ranked.index[:k])

    return data_neighbours


# Load the long-format ratings export.
X = pd.read_csv('boardgame-frequent-users.csv')
#X = pd.read_csv('boardgame-elite-users.csv')
# One column ships with an attribution string as its header; rename it to 'userID'.
X = X.rename(columns = {"Compiled from boardgamegeek.com by Matt Borthwick":'userID'})

# Hold out 30% of the rows (fixed seed) and persist them for later evaluation.
X, X_test, y_train, y_test = train_test_split(X, X['rating'], test_size=0.3, random_state=0)

X_test.to_csv("test_data.csv", sep='\t')

# One indicator column per game. Only those indicator columns get 0 -> NaN, so
# a legitimate 0 in userID or rating is left alone. The float cast makes the
# substitution apply even when get_dummies returns boolean indicators
# (the default in newer pandas releases).
X = pd.get_dummies(X, columns=['gameID'])
game_cols = X.columns[X.columns.str.startswith('gameID_')]
X[game_cols] = X[game_cols].astype(float).replace(0, np.nan)
# getSummaries expects the rating in column position 1 and indicators from position 2 on.
ratings = X.iloc[:,1:2].copy()

# User x game tables: mean-imputed (gsum) and raw with gaps (gsum2).
gsum = getSummaries(ratings, X,'userID')
gsum2 = getSummaries(ratings, X,'userID', impute=False)
display(gsum2.head(6))

gsum.to_csv('gsum.csv', sep='\t')
gsum.transpose().to_csv('usum.csv', sep='\t')

gsum2.to_csv('gsum2.csv', sep='\t')
gsum2.transpose().to_csv('usum2.csv', sep='\t')


# Game-to-game similarities and nearest neighbours, computed on the imputed table.
data_ibs = pd.DataFrame(index=gsum.columns,columns=gsum.columns)
gneighbors = findNeighbors(gsum, data_ibs)
display(gneighbors.head(13))

gneighbors.to_csv('gneighbors.csv', sep='\t')


Unnamed: 0_level_0,gameID_3,gameID_5,gameID_10,gameID_11,gameID_12,gameID_13,gameID_18,gameID_41,gameID_42,gameID_45,...,gameID_187645,gameID_192291,gameID_193738,gameID_194655,gameID_198773,gameID_201808,gameID_204583,gameID_205059,gameID_205637,gameID_209010
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83,7.246037,7.0,6.534479,6.741042,8.0,8.0,8.0,6.669259,8.0,7.0,...,7.824654,7.172176,7.871343,7.330525,7.239211,7.391946,6.962018,7.463619,7.764696,7.626677
119,7.0,7.0,6.534479,6.741042,7.448207,7.0,6.658037,6.0,7.4,7.0,...,7.824654,7.172176,7.871343,7.330525,7.239211,7.391946,6.962018,7.463619,7.764696,7.626677
144,7.246037,7.029337,6.534479,7.0,7.448207,6.711557,6.0,7.0,7.421703,6.756149,...,9.0,7.172176,8.5,7.330525,7.239211,8.5,7.0,7.463619,7.764696,8.0
156,7.5,6.5,6.534479,7.0,8.0,4.0,7.0,6.669259,7.421703,6.756149,...,7.5,7.172176,9.5,8.0,7.239211,7.0,6.962018,7.463619,7.764696,7.0
186,7.246037,7.029337,6.534479,6.0,8.0,7.0,6.658037,6.669259,8.0,6.756149,...,7.824654,7.172176,7.871343,8.0,7.239211,8.0,6.962018,7.463619,7.764696,8.0


Unnamed: 0_level_0,gameID_3,gameID_5,gameID_10,gameID_11,gameID_12,gameID_13,gameID_18,gameID_41,gameID_42,gameID_45,...,gameID_187645,gameID_192291,gameID_193738,gameID_194655,gameID_198773,gameID_201808,gameID_204583,gameID_205059,gameID_205637,gameID_209010
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83,,7.0,,,8.0,8.0,8.0,,8.0,7.0,...,,,,,,,,,,
119,7.0,7.0,,,,7.0,,6.0,7.4,7.0,...,,,,,,,,,,
144,,,,7.0,,,6.0,7.0,,,...,9.0,,8.5,,,8.5,7.0,,,8.0
156,7.5,6.5,,7.0,8.0,4.0,7.0,,,,...,7.5,,9.5,8.0,,7.0,,,,7.0
186,,,,6.0,8.0,7.0,,,8.0,,...,,,,8.0,,8.0,,,,8.0
225,8.0,7.0,7.5,,,9.0,,,8.0,8.0,...,,,,,,,,,,


0

100

200

300

400

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
gameID_3,gameID_3,gameID_192291,gameID_55660,gameID_134342,gameID_24439,gameID_21059,gameID_2163,gameID_118048,gameID_204583,gameID_72287,gameID_205637,gameID_176396,gameID_111661,gameID_13291,gameID_40849,gameID_176189,gameID_201808,gameID_127067,gameID_155362,gameID_181279
gameID_5,gameID_5,gameID_192291,gameID_13291,gameID_24439,gameID_55660,gameID_134342,gameID_205637,gameID_127067,gameID_201808,gameID_209010,gameID_21059,gameID_2163,gameID_181279,gameID_172818,gameID_176189,gameID_118048,gameID_111661,gameID_72287,gameID_204583,gameID_40849
gameID_10,gameID_10,gameID_192291,gameID_55660,gameID_21059,gameID_24439,gameID_134342,gameID_13291,gameID_40849,gameID_204583,gameID_176396,gameID_205637,gameID_2163,gameID_118048,gameID_111661,gameID_127067,gameID_72287,gameID_209010,gameID_3943,gameID_1219,gameID_176189
gameID_11,gameID_11,gameID_192291,gameID_55660,gameID_24439,gameID_13291,gameID_118048,gameID_134342,gameID_2993,gameID_155362,gameID_21059,gameID_204583,gameID_127067,gameID_201808,gameID_72287,gameID_205637,gameID_111661,gameID_176396,gameID_155987,gameID_1219,gameID_40849
gameID_12,gameID_12,gameID_192291,gameID_134342,gameID_24439,gameID_201808,gameID_204583,gameID_205637,gameID_118048,gameID_181279,gameID_150658,gameID_111661,gameID_55660,gameID_21059,gameID_176396,gameID_172818,gameID_13291,gameID_172386,gameID_72287,gameID_31627,gameID_209010
gameID_13,gameID_13,gameID_192291,gameID_134342,gameID_24439,gameID_325,gameID_13291,gameID_176396,gameID_55660,gameID_118048,gameID_111661,gameID_21059,gameID_127067,gameID_2163,gameID_204583,gameID_155362,gameID_205637,gameID_31627,gameID_201808,gameID_40849,gameID_926
gameID_18,gameID_18,gameID_192291,gameID_127067,gameID_13291,gameID_209010,gameID_55660,gameID_24439,gameID_72287,gameID_134342,gameID_2163,gameID_3943,gameID_15364,gameID_181279,gameID_176189,gameID_118048,gameID_21059,gameID_172818,gameID_205637,gameID_204583,gameID_111661
gameID_41,gameID_41,gameID_192291,gameID_24439,gameID_55660,gameID_134342,gameID_21059,gameID_13291,gameID_118048,gameID_111661,gameID_204583,gameID_201808,gameID_127067,gameID_176189,gameID_1219,gameID_205637,gameID_40849,gameID_176396,gameID_72287,gameID_150658,gameID_156009
gameID_42,gameID_42,gameID_192291,gameID_55660,gameID_24439,gameID_13291,gameID_134342,gameID_118048,gameID_72287,gameID_21059,gameID_7854,gameID_181279,gameID_209010,gameID_182028,gameID_205637,gameID_31627,gameID_3943,gameID_176189,gameID_172386,gameID_66690,gameID_204583
gameID_45,gameID_45,gameID_192291,gameID_55660,gameID_134342,gameID_204583,gameID_24439,gameID_21059,gameID_127067,gameID_205637,gameID_13291,gameID_209010,gameID_72287,gameID_40849,gameID_118048,gameID_155362,gameID_156009,gameID_194655,gameID_176189,gameID_1219,gameID_201808


In [2]:
# Flip the imputed table so that users become the columns being compared.
usum = gsum.T
display(usum.iloc[:6])

# Empty user-by-user frame; findNeighbors fills it with similarities in place.
data_ibs = pd.DataFrame(columns=usum.columns, index=usum.columns)
uneighbors = findNeighbors(data=usum, data_ibs=data_ibs)
display(uneighbors.iloc[:6])
uneighbors.to_csv('uneighbors.csv', sep='\t')

userID,83,119,144,156,186,225,238,272,319,387,...,192302,192640,192681,193034,193103,193129,193184,193266,193339,193491
gameID_3,7.246037,7.0,7.246037,7.5,7.246037,8.0,6.0,1.0,8.0,6.0,...,7.246037,7.246037,7.246037,8.0,7.5,8.0,7.5,9.5,6.0,9.0
gameID_5,7.0,7.0,7.029337,6.5,7.029337,7.0,9.0,1.0,7.029337,6.0,...,7.029337,7.029337,7.029337,7.029337,7.029337,7.0,7.0,7.029337,7.0,8.5
gameID_10,6.534479,6.534479,6.534479,6.534479,6.534479,7.5,6.534479,6.534479,6.534479,7.0,...,6.534479,6.534479,6.534479,7.0,6.534479,6.534479,6.534479,6.534479,6.0,6.534479
gameID_11,6.741042,6.741042,7.0,7.0,6.0,6.741042,6.741042,6.741042,6.741042,9.0,...,7.5,6.0,7.0,6.741042,7.5,7.0,4.0,8.0,3.0,6.741042
gameID_12,8.0,7.448207,7.448207,8.0,8.0,7.448207,7.448207,1.0,7.448207,10.0,...,8.0,8.0,7.448207,7.0,7.448207,8.0,8.0,7.448207,8.5,7.448207
gameID_13,8.0,7.0,6.711557,4.0,7.0,9.0,6.711557,6.711557,7.0,6.711557,...,6.711557,6.711557,6.711557,7.0,5.4,7.0,7.0,9.0,5.5,7.0


0

100

200

300

400

500

600

700

800

900

1000

1100

1200

1300

1400

1500

1600

1700

1800

1900

2000

2100

2200

2300

2400

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
83,83,106942,171584,52450,174527,162188,71449,113993,188749,167619,188568,41916,6988,12037,16057,4882,50957,53681,37723,6908
119,119,188749,52450,106942,145331,123511,113993,142671,171584,85153,16057,4882,45239,125097,38271,176263,32814,71449,82906,160442
144,144,110607,12037,171584,16057,4882,52450,41916,142671,59120,113993,160442,188749,45146,53681,106942,139651,71449,176263,48059
156,156,106942,142671,188568,171584,188749,113993,16057,52450,4882,109309,45239,12037,67321,85612,174527,121301,123511,173604,167619
186,186,123511,52450,4882,12037,188749,171584,106942,85153,160442,110607,113993,25815,167619,137983,179189,105819,122273,85612,53681
225,225,52450,171584,160442,188749,123511,67321,12037,99801,59014,106942,4882,113993,30908,122895,109309,181250,53681,16057,167619
