In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from scipy.linalg import svd
from tqdm.notebook import tqdm

%matplotlib inline

### Загрузим данные

In [6]:
users = pd.read_csv('data/users.csv', low_memory=False)
organisations = pd.read_csv('data/organisations.csv', low_memory=False)
features = pd.read_csv('data/features.csv', low_memory=False)
aspects = pd.read_csv('data/aspects.csv', low_memory=False)
rubricks = pd.read_csv('data/rubrics.csv', low_memory=False)
reviews = pd.read_csv('data/reviews.csv', low_memory=False)


to_list = lambda rubrics: [int(rubric) for rubric in str(rubrics).split(' ')]


def apply_to_columns(df, columns, func=to_list):
    for column in columns:
        df.loc[~df[column].isnull(), column] = df.loc[~df[column].isnull(), column].apply(func)

columns_to_int = ['rubrics_id', 'features_id']
apply_to_columns(organisations, columns_to_int)

columns_to_int = ['aspects']
apply_to_columns(reviews, columns_to_int)

### Закодируем расположение города пользователя и организации методом one-hot-encoding

In [7]:
users = pd.get_dummies(users, columns=['city'], prefix='user')
organisations = pd.get_dummies(organisations, columns=['city'], prefix='org')

### Попробуем реализовать алгоритм сингулярного разложения матриц
(SVD - Singular-Value Decomposition)
Для этого сначала преобразуем данные к матричному виду, где строка - пользователи,
а столбцы - продукты, с котороыми пользователи взаимодействовали

In [8]:
aspects_columns = [id for id, aspect in aspects.values]
features_columns = [id for id, feature in features.values]
rubricks_columns = [id for id, rubrick in rubricks.values]
columns = aspects_columns + features_columns + rubricks_columns

data = users.merge(reviews, on="user_id")
data = data.merge(organisations, on="org_id")
data = data.rename({'rating_x': 'user_rating', 'rating_y': 'org_rating'}, axis=1)

for column_name in columns:
    data[column_name] = float(0)

data.head(2)

  data[column_name] = float(0)


Unnamed: 0,user_id,user_msk,user_spb,org_id,user_rating,ts,aspects,average_bill,org_rating,rubrics_id,...,30776,30777,31286,31350,31375,31401,31495,3108292683,3501514558,3501750896
0,523295021912509756,1,0,5145242920031317950,5.0,819,,500.0,4.934783,[31495],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,362006428924147790,1,0,5145242920031317950,5.0,839,"[10, 274]",500.0,4.934783,[31495],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Получили датасет размерностью (3640835, 223)
Теперь необходимо заполнить новые колонки данными

In [9]:
for row in tqdm(range(0, len(data))):
    if data.iloc[row, 6] is not np.nan:
        for aspect in data.iloc[row, 6]:
            data.at[row, aspect] = 1
    if data.iloc[row, 10] is not np.nan:
        for feature in data.iloc[row, 10]:
            data.at[row, feature] = 1
    for rubrick in data.iloc[row, 9]:
        data.at[row, rubrick] = 1
    break

data.head(2)

  0%|          | 0/3640835 [00:00<?, ?it/s]

Unnamed: 0,user_id,user_msk,user_spb,org_id,user_rating,ts,aspects,average_bill,org_rating,rubrics_id,...,30776,30777,31286,31350,31375,31401,31495,3108292683,3501514558,3501750896
0,523295021912509756,1,0,5145242920031317950,5.0,819,,500.0,4.934783,[31495],...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,362006428924147790,1,0,5145242920031317950,5.0,839,"[10, 274]",500.0,4.934783,[31495],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Отбросим более не нужные колонки и заменим nan на 0

In [6]:
data = data.drop(data.columns[[6, 9, 10]], axis=1)
data.head(10)

Unnamed: 0,user_id,user_msk,user_spb,org_id,rating_x,aspects,average_bill,rating_y,org_msk,org_spb,...,0,Unnamed: 13,2,7,4,6,9,3,8,5
0,523295021912509756,1,0,5145242920031317950,5.0,0,500.0,4.934783,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,362006428924147790,1,0,5145242920031317950,5.0,10 274,500.0,4.934783,1,0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,10686793557064657689,1,0,5145242920031317950,5.0,0,500.0,4.934783,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9907863350447728102,1,0,5145242920031317950,5.0,0,500.0,4.934783,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13999966736615242178,1,0,5145242920031317950,5.0,6 9 10,500.0,4.934783,1,0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
5,9199988279729242755,1,0,5145242920031317950,4.0,9 307,500.0,4.934783,1,0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
6,10533787511801499419,1,0,5145242920031317950,5.0,10,500.0,4.934783,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2065536097155165002,1,0,5145242920031317950,4.0,0,500.0,4.934783,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9165256876644519976,1,0,5145242920031317950,5.0,307,500.0,4.934783,1,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
9,8752067030694028029,1,0,5145242920031317950,5.0,10,500.0,4.934783,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
data.to_csv('data.csv')