In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from scipy.linalg import svd
from tqdm.notebook import tqdm

%matplotlib inline

### Загрузим данные

In [2]:
users = pd.read_csv('data/users.csv', low_memory=False)
organisations = pd.read_csv('data/organisations.csv', low_memory=False)
features = pd.read_csv('data/features.csv', low_memory=False)
aspects = pd.read_csv('data/aspects.csv', low_memory=False)
rubricks = pd.read_csv('data/rubrics.csv', low_memory=False)
reviews = pd.read_csv('data/reviews.csv', low_memory=False)


to_list = lambda rubrics: [int(rubric) for rubric in str(rubrics).split(' ')]


def apply_to_columns(df, columns, func=to_list):
    for column in columns:
        df.loc[~df[column].isnull(), column] = df.loc[~df[column].isnull(), column].apply(func)

columns_to_int = ['rubrics_id', 'features_id']
apply_to_columns(organisations, columns_to_int)

columns_to_int = ['aspects']
apply_to_columns(reviews, columns_to_int)

### Закодируем расположение города пользователя и организации методом one-hot-encoding

In [3]:
users = pd.get_dummies(users, columns=['city'], prefix='user')
organisations = pd.get_dummies(organisations, columns=['city'], prefix='org')

### Попробуем реализовать алгоритм сингулярного разложения матриц
(SVD - Singular-Value Decomposition)
Для этого сначала преобразуем данные к матричному виду, где строка - пользователи,
а столбцы - продукты, с котороыми пользователи взаимодействовали

In [4]:
aspects_columns = [id for id, aspect in aspects.values]
features_columns = [id for id, feature in features.values]
rubricks_columns = [id for id, rubrick in rubricks.values]
columns = aspects_columns + features_columns + rubricks_columns

data = users.merge(reviews, on="user_id")
data = data.merge(organisations, on="org_id")
data = data.rename({'rating_x': 'user_rating', 'rating_y': 'org_rating'}, axis=1)

for column_name in columns:
    data[column_name] = float(0)

data.head(2)

  data[column_name] = float(0)


Unnamed: 0,user_id,user_msk,user_spb,org_id,user_rating,ts,aspects,average_bill,org_rating,rubrics_id,...,30776,30777,31286,31350,31375,31401,31495,3108292683,3501514558,3501750896
0,523295021912509756,1,0,5145242920031317950,5.0,819,,500.0,4.934783,[31495],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,362006428924147790,1,0,5145242920031317950,5.0,839,"[10, 274]",500.0,4.934783,[31495],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Получили датасет размерностью (3640835, 223)
Теперь необходимо заполнить новые колонки данными

In [5]:
for row in tqdm(range(0, len(data))):
    if data.iloc[row, 6] is not np.nan:
        for aspect in data.iloc[row, 6]:
            data.at[row, aspect] = 1
    if data.iloc[row, 10] is not np.nan:
        for feature in data.iloc[row, 10]:
            data.at[row, feature] = 1
    for rubrick in data.iloc[row, 9]:
        data.at[row, rubrick] = 1

data.head(2)

  0%|          | 0/3640835 [00:00<?, ?it/s]

Unnamed: 0,user_id,user_msk,user_spb,org_id,user_rating,ts,aspects,average_bill,org_rating,rubrics_id,...,30776,30777,31286,31350,31375,31401,31495,3108292683,3501514558,3501750896
0,523295021912509756,1,0,5145242920031317950,5.0,819,,500.0,4.934783,[31495],...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,362006428924147790,1,0,5145242920031317950,5.0,839,"[10, 274]",500.0,4.934783,[31495],...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Отбросим более не нужные колонки и заменим nan на 0

In [35]:
data = data.drop(data.columns[[6, 9, 10]], axis=1)
data.head(10)

Unnamed: 0,user_id,user_msk,user_spb,org_id,user_rating,ts,org_rating,org_msk,2,3,...,30776,30777,31286,31350,31375,31401,31495,3108292683,3501514558,3501750896
0,35829,1,0,18640,5.0,819,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,24759,1,0,18640,5.0,839,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,725867,1,0,18640,5.0,975,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,672972,1,0,18640,5.0,1145,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,950809,1,0,18640,5.0,1093,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,625213,1,0,18640,4.0,876,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,715505,1,0,18640,5.0,923,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,140319,1,0,18640,4.0,1095,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,622888,1,0,18640,5.0,920,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,594987,1,0,18640,5.0,1001,4.934783,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Сохраним полученный датасет

In [7]:
data.to_csv('data.csv')

Собстевнно SVD

In [78]:
data = pd.read_csv('data.csv')


MemoryError: Unable to allocate 5.99 GiB for an array with shape (221, 3640835) and data type float64

In [15]:
from sklearn.preprocessing import LabelEncoder

user_enc = LabelEncoder()
org_enc = LabelEncoder()

user_enc = user_enc.fit(data.user_id.values)
org_enc = org_enc.fit(data.org_id.values)


data.loc[:, 'user_id'] = user_enc.transform(data.loc[:, 'user_id'].values)
data.loc[:, 'org_id'] = org_enc.transform(data.loc[:, 'org_id'].values)


In [32]:
from scipy.sparse import coo_matrix, csr_matrix

r = coo_matrix((data.user_rating.values, (data.user_id.values, data.org_id.values)))
r

<1252801x66405 sparse matrix of type '<class 'numpy.float64'>'
	with 3640835 stored elements in COOrdinate format>

In [33]:
from scipy.sparse.linalg import svds

u, s ,vt = svds(r, k=20)
u.shape, vt.shape

((1252801, 20), (20, 66405))

In [34]:
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=20)
v = vt.T
nn.fit(v)
_, ind = nn.kneighbors(v, n_neighbors=20)

ind[:20]

array([[    0, 46056, 10433, 30300, 46643, 31535, 30772, 15313, 22398,
        45000, 57332, 22164, 63009, 27993,  5091,  2668,  5094, 33932,
        25680, 62900],
       [    1, 38535, 14876, 32550, 58601, 38470, 47634, 24804, 40871,
        42983, 62667,  9180, 23608, 34273,  4722,  8693, 46592, 10982,
         2960,  2823],
       [    2, 49095, 47762, 25328, 62942, 44995, 48993, 50992, 26932,
         3407,  6047, 40409, 13638, 11289, 54717, 36397, 50023, 22866,
        29786, 56633],
       [    3, 33482, 10071, 13219, 39763, 10044, 52771, 59892, 14007,
          391, 47332, 26491, 28675, 64163, 25831, 49736,  4357, 38870,
         8182, 46067],
       [    4, 23346, 35369, 27247, 34875, 14255, 11444,  3193, 63355,
        64114, 63148, 29817, 17550, 50169, 20689, 30747, 17293,  3052,
        58258, 14933],
       [    5,  3507, 26090,  1288, 23633,  4679, 21128, 21130, 14007,
        14993, 38941, 38386, 37306, 16288, 63833, 34295,  2423, 54062,
        49974, 20684],
       [  

In [31]:
orgs = data.sort_values('org_id').loc[:, 'org_id'].values
columns = ['org'] + ['nn_{}'.format(i) for i in range(1,20)]
df_nn = pd.DataFrame(data=orgs[ind], columns=columns)
df_nn.head(5)

Unnamed: 0,org,nn_1,nn_2,nn_3,nn_4,nn_5,nn_6,nn_7,nn_8,nn_9,nn_10,nn_11,nn_12,nn_13,nn_14,nn_15,nn_16,nn_17,nn_18,nn_19
0,0,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,0,2,2,2
1,0,2,2,2,2,0,2,2,2,2,2,2,2,2,2,1,2,2,2,2
2,1,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,0,2,2,2
3,2,2,2,2,2,0,2,2,2,2,2,2,2,2,1,2,2,0,2,2
4,2,2,2,2,2,0,2,2,2,2,2,2,2,2,1,2,0,2,2,2


In [29]:
nn = NearestNeighbors(n_neighbors=20)

nn.fit(u)
_, ind = nn.kneighbors(u, n_neighbors=20)

users_nn = data.sort_values('user_id').loc[:, 'user_id'].values
columns = ['user'] + ['nn_{}'.format(i) for i in range(1,20)]
df_nn = pd.DataFrame(data=users_nn[ind], columns=columns)
df_nn.head(5)

Unnamed: 0,user,nn_1,nn_2,nn_3,nn_4,nn_5,nn_6,nn_7,nn_8,nn_9,nn_10,nn_11,nn_12,nn_13,nn_14,nn_15,nn_16,nn_17,nn_18,nn_19
0,0,10,8,2,5,5,7,6,3,2,6,9,4,1,4,7,1,1,2,7
1,1,10,8,5,2,0,5,7,6,3,2,6,4,9,4,1,7,1,2,7
2,1,10,8,5,2,0,5,7,6,3,2,6,9,4,4,7,1,1,2,7
3,1,10,8,2,5,0,5,7,6,3,2,6,9,4,1,4,7,1,2,7
4,2,10,8,5,2,0,5,7,6,3,6,4,9,4,1,7,1,1,2,7


In [54]:
organisations.head(3)
org = organisations.sort_values('rating', ascending=False).loc[:, "org_id"].values

org = pd.DataFrame(data=org)
rate = organisations.loc[organisations['org_id'] == 7154086032495001965]
rate.head()

Unnamed: 0,org_id,average_bill,rating,rubrics_id,features_id,org_msk,org_spb
55731,7154086032495001965,,5.0,[30776],[20422],1,0
