In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_profiling
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn import preprocessing as pp

from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM

  import pandas.util.testing as tm


In [2]:
train = pd.read_csv('train.csv')
train['date'] = train['timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))
train.drop('timestamp', axis=1, inplace=True)
print(train.shape)
train.head(3)

(8674, 4)


Unnamed: 0,user_id,item_id,like,date
0,140,342,0,2017-03-31 08:03:42
1,378,172,1,2017-03-31 08:03:48
2,150,182,0,2017-03-31 08:04:10


In [3]:
test = pd.read_csv('test.csv')
test['date'] = test['timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))
test.drop('timestamp', axis=1, inplace=True)
print(test.shape)
test.head(3)

(497, 2)


Unnamed: 0,user_id,date
0,166,2017-03-31 10:13:51
1,26,2017-03-31 13:49:31
2,41,2017-03-31 14:02:27


In [4]:
item_features = pd.read_csv('item-features.csv')
print(item_features.shape)
item_features.head(3)

(444, 33)


Unnamed: 0,item_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,35,0.005646,-0.01278,-0.011941,0.016942,-0.004044,0.005566,0.006587,0.005411,-0.024627,...,-0.03409,-0.005673,0.009111,-0.00877,-0.016189,-0.016189,0.003432,-0.00949,-0.00949,0.002416
1,19,0.00253,-0.005726,-0.00535,0.007591,-0.001812,0.002494,0.002951,0.002424,-0.011035,...,-0.015274,-0.002542,0.004082,-0.003929,-0.007254,-0.007254,0.001538,-0.004252,-0.004252,0.001082
2,145,0.001592,-0.003604,-0.003368,0.004778,-0.001141,0.00157,0.001858,0.001526,-0.006946,...,-0.009615,-0.0016,0.00257,-0.002473,-0.004566,-0.004566,0.000968,-0.002677,-0.002677,0.000681


In [5]:
user_features = pd.read_csv('user-features.csv')
print(user_features.shape)
user_features.head(3)

(497, 33)


Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,0,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,0.000811,0.000666,-0.003031,...,-0.004196,-0.000698,0.001121,-0.001079,-0.001993,-0.001993,0.000422,-0.001168,-0.001168,0.000297
1,1,0.001204,-0.002725,-0.002546,0.003612,-0.000862,0.001187,0.001404,0.001154,-0.005251,...,-0.007268,-0.001209,0.001942,-0.00187,-0.003451,-0.003451,0.000732,-0.002023,-0.002023,0.000515
2,2,0.000491,-0.001112,-0.001039,0.001475,-0.000352,0.000484,0.000573,0.000471,-0.002144,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.00021


In [6]:
corr_matrix = user_features.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
user_features.drop(to_drop, axis=1, inplace=True)
user_features.columns = [col + '_user' if col != 'user_id' else col for col in user_features.columns ]
user_features.head(3)

Unnamed: 0,user_id,0_user
0,0,0.000695
1,1,0.001204
2,2,0.000491


In [7]:
user_features['tmp'] = 1/user_features['0_user']

In [8]:
user_features

Unnamed: 0,user_id,0_user,tmp
0,0,0.000695,1438.810320
1,1,0.001204,830.697526
2,2,0.000491,2034.785068
3,3,0.000777,1286.911073
4,4,0.000695,1438.810320
...,...,...,...
492,492,0.000983,1017.392534
493,493,0.001300,769.076466
494,494,0.000491,2034.785068
495,495,0.000983,1017.392534


In [9]:
corr_matrix = item_features.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]
item_features.drop(to_drop, axis=1, inplace=True)
item_features.columns = [col + '_item' if col != 'item_id' else col for col in item_features.columns]
item_features.head(3)

Unnamed: 0,item_id,0_item,1_item,2_item,3_item,4_item,5_item,6_item,8_item,10_item,...,15_item,16_item,17_item,18_item,20_item,21_item,22_item,25_item,26_item,29_item
0,35,0.005646,-0.01278,-0.011941,0.016942,-0.004044,0.005566,0.006587,-0.024627,0.027773,...,-0.017058,-0.001453,-0.01362,-0.007217,-0.006034,-0.025846,-0.03409,-0.00877,-0.016189,-0.00949
1,19,0.00253,-0.005726,-0.00535,0.007591,-0.001812,0.002494,0.002951,-0.011035,0.012444,...,-0.007643,-0.000651,-0.006103,-0.003234,-0.002703,-0.01158,-0.015274,-0.003929,-0.007254,-0.004252
2,145,0.001592,-0.003604,-0.003368,0.004778,-0.001141,0.00157,0.001858,-0.006946,0.007833,...,-0.004811,-0.00041,-0.003841,-0.002036,-0.001702,-0.007289,-0.009615,-0.002473,-0.004566,-0.002677


In [10]:
for col in item_features.columns:
    if col != 'item_id':
        item_features[col] = 1/item_features[col]

In [11]:
corr_matrix = item_features.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
item_features.drop(to_drop, axis=1, inplace=True)
#item_features.columns = [col + '_item' if col != 'item_id' else col for col in item_features.columns]
item_features.head(3)

Unnamed: 0,item_id,0_item,8_item,10_item,17_item
0,35,177.105308,-40.605295,36.006768,-73.421905
1,19,395.271594,-90.624725,80.361524,-163.86631
2,145,627.948305,-143.970989,127.666353,-260.32625


In [12]:
#item_features.to_excel('items_excel.xlsx')
#user_features.to_excel('users_excel.xlsx')

## 1 находим группы

In [13]:
item_features['0_item_round'] = item_features['0_item'].round(4)
item_features['8_item_round'] = item_features['8_item'].round(4)
item_features['10_item_round'] = item_features['10_item'].round(4)
item_features['17_item_round'] = item_features['17_item'].round(4)
item_features.drop(['0_item','8_item','10_item','17_item'], axis=1, inplace=True)

In [14]:
item_features.drop('item_id', axis=1).drop_duplicates().shape

(77, 4)

In [15]:
item_features = pd.read_excel('items_excel.xlsx')
item_features.head()

Unnamed: 0,item_id,0_item,8_item,10_item,17_item,group
0,282,2877.62064,-659.757957,585.040725,-1192.964746,1
1,271,2877.62064,-659.757957,585.040725,-1192.964746,1
2,268,2877.62064,-659.757957,585.040725,-1192.964746,1
3,273,2877.62064,-659.757957,585.040725,-1192.964746,1
4,283,2877.62064,-659.757957,585.040725,-1192.964746,1


In [16]:
user_features = pd.read_excel('users_excel.xlsx')
user_features.head()

Unnamed: 0,user_id,0_user,group_user
0,14,0.000491,1
1,106,0.000491,1
2,119,0.000491,1
3,40,0.000491,1
4,209,0.000491,1


In [17]:
ans = []
for k,v in test.iterrows():
    user_group = user_features[user_features['user_id']==v['user_id']]['group_user'].values[0]
    user_list = user_features[user_features['group_user']==user_group]['user_id'].unique()
    like_items = list(train[train['user_id'].isin(user_list) & (train['like']==1)]['item_id'].unique())
    ans.append({'user_id':v['user_id'], 'items':like_items})
ans = pd.DataFrame(ans)
ans['len'] = ans['items'].apply(len)

In [18]:
ans.head()

Unnamed: 0,user_id,items,len
0,166,"[67, 119, 22, 35, 66, 59, 72, 40, 84, 37, 78, ...",28
1,26,"[18, 1, 143, 45, 83, 63, 14, 76, 152, 22, 84, ...",19
2,41,"[40, 37, 129, 76, 22, 7, 90, 103, 147, 101, 60...",29
3,286,"[15, 22, 9, 65, 180, 37, 172, 40, 90, 20, 23, ...",18
4,108,"[15, 22, 9, 65, 180, 37, 172, 40, 90, 20, 23, ...",18


In [19]:
best_answer = pd.read_csv('answer2020-08-13 22:11:37.122287.csv')
best_answer.head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,166,60,35,37,76,22,65,72,66,80,...,40,148,200,146,11,147,36,7,87,59
1,26,60,35,37,22,76,65,72,67,78,...,32,49,66,1,33,19,21,59,34,58
2,41,60,35,37,22,76,65,66,72,67,...,59,32,33,36,19,21,40,58,87,49
3,286,60,35,37,22,76,65,66,72,80,...,67,59,40,32,78,33,36,146,87,19
4,108,60,35,37,22,76,65,66,72,78,...,33,59,32,36,87,19,49,21,80,58


In [21]:
for k,v in ans.iterrows():
    print(ans.at[k,'len'])
    if ans.at[k,'len'] < 20:
        row = best_answer[best_answer['user_id'] == v['user_id']]
        for i in range(20):
            if (row[str(i)].values[0] not in set(v['items'])) & (ans.at[k,'len'] < 20):
                ans.at[k,'items'].append(row[str(i)].values[0])
                ans.at[k,'len'] = len(ans.at[k,'items'])

28
19
29
18
18
30
27
17
39
29
29
19
39
51
51
19
29
17
26
24
30
37
29
20
18
33
37
23
27
19
19
19
24
31
33
17
41
20
29
28
31
19
22
27
20
36
33
36
39
19
27
18
29
33
26
19
29
36
31
30
24
19
33
29
18
18
30
33
27
24
19
19
29
37
41
20
41
18
31
33
27
19
29
39
18
39
24
30
19
37
19
39
27
33
29
29
24
36
26
39
41
37
29
19
18
31
18
29
26
27
36
19
27
39
18
29
19
19
36
41
27
29
31
24
24
19
36
19
31
18
28
27
41
23
17
37
36
19
27
20
41
19
19
29
17
33
41
27
18
33
17
39
41
39
23
19
29
23
41
31
18
27
33
17
33
19
37
31
22
27
19
36
41
31
23
26
33
29
27
39
51
17
36
17
37
36
27
26
19
41
27
39
18
36
41
18
19
39
41
27
41
51
29
18
18
39
51
18
51
41
29
18
30
28
18
51
51
39
41
24
26
41
51
36
19
33
19
51
26
37
26
27
19
41
39
41
15
41
41
41
12
26
15
31
33
33
33
39
37
37
19
27
15
31
22
31
26
36
33
19
19
26
19
36
33
39
26
41
17
36
22
36
19
33
19
31
31
39
31
30
17
17
41
31
33
31
30
33
30
13
41
12
36
56
36
27
22
17
30
18
41
41
20
36
26
29
41
22
17
20
29
29
33
22
56
18
33
41
19
41
27
39
22
27
28
37
31
18
19
20
27
56
33
3

In [25]:
res = []
for k,v in ans.iterrows():
    res.append({
        'user_id': int(v['user_id']),
        '0': int(v['items'][0]),
        '1': int(v['items'][1]),
        '2': int(v['items'][2]),
        '3': int(v['items'][3]),
        '4': int(v['items'][4]),
        '5': int(v['items'][5]),
        '6': int(v['items'][6]),
        '7': int(v['items'][7]),
        '8': int(v['items'][8]),
        '9': int(v['items'][9]),
        '10': int(v['items'][10]),
        '11': int(v['items'][11]),
        '12': int(v['items'][12]),
        '13': int(v['items'][13]),
        '14': int(v['items'][14]),
        '15': int(v['items'][15]),
        '16': int(v['items'][16]),
        '17': int(v['items'][17]),
        '18': int(v['items'][18]),
        '19': int(v['items'][19])
    })   
answer = pd.DataFrame(res)

In [26]:
answer.to_csv('new_answer'+str(datetime.datetime.now())+'.csv', index=False)
answer.head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,166,67,119,22,35,66,59,72,40,84,...,78,87,76,149,155,5,39,90,47,21
1,26,18,1,143,45,83,63,14,76,152,...,84,6,24,21,134,35,113,17,48,60
2,41,40,37,129,76,22,7,90,103,147,...,60,51,14,66,26,49,44,54,97,12
3,286,15,22,9,65,180,37,172,40,90,...,23,39,8,76,58,87,13,19,60,35
4,108,15,22,9,65,180,37,172,40,90,...,23,39,8,76,58,87,13,19,60,35


# Теперь попробуем обратное со стороны пользователей

In [None]:
ans = []
for k,v in test.iterrows():
    train[(train['user_id'] == v['user_id']) & (train['like'] == 1)]['item_id'].
    

In [None]:
user_group = user_features[user_features['user_id'] == v['user_id']]['group_user'].values[0]
    like_list = user_features[user_features['group_user'] == user_group]['user_id'].unique()
    like_items = list(train[train['user_id'].isin(user_list) & (train['like']==1)]['item_id'].unique())
    ans.append({'user_id':v['user_id'], 'items':like_items})
ans = pd.DataFrame(ans)
ans['len'] = ans['items'].apply(len)