In [1]:
import os
import json
import pandas as pd
import numpy as np
import tqdm
import scipy.sparse as sp

from pprint import pprint

In [2]:
pd.set_option('display.max_columns',100)

In [3]:
DATA_PATH = './orig_data'

In [4]:
with open(os.path.join(DATA_PATH, 'catalogue.json'), 'r') as f:
    catalogue = json.load(f)
    
catalogue = {int(k): v for k, v in catalogue.items()}

In [5]:
pprint(catalogue[100])

{'attributes': [18441,
                16300,
                16580,
                18770,
                18771,
                18643,
                396,
                18772,
                3771,
                18773,
                910,
                18774,
                16364,
                3277],
 'availability': ['purchase', 'rent'],
 'duration': 80,
 'feature_1': 6064738.740195342,
 'feature_2': 0.752750538,
 'feature_3': 4,
 'feature_4': 0.9537104605,
 'feature_5': 0.0,
 'type': 'movie'}


 - `attributes` — мешок атрибутов
 - `availability` — доступность (может содержать значения `purchase`, `rent` и `subscription`)
 - `duration` — длительность в минутах, округлённая до десятков (продолжительность серии для сериалов и многосерийных фильмов)
 - `feature_1,2,4,5` —  анонимизированные вещественные признаки, `feature_3` - порядковый
 - `type` — принимает значения `movie`, `multipart_movie` или `series`

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
s = [','.join([str(i) for i in catalogue[100]['attributes']])]
s1 = [','.join([str(i) for i in catalogue[10]['attributes']])]
print(s,s1)
cv = CountVectorizer(token_pattern='\d+')
print(cv.fit_transform([*s,*s1]).todense())


['18441,16300,16580,18770,18771,18643,396,18772,3771,18773,910,18774,16364,3277'] ['10511,2752,9076,1373,472,7,10512,42,43,10513,25']
[[0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 0 0 0 1]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 1 1 0]]


In [8]:
# Сделаем мешок атрибутов для каждого фильма
bag_of_attr = {}
map_type = {'movie':2,'multipart_movie':2,'series':3}
buf = []
ind = []
bag = []
for key,cat in tqdm.tqdm(catalogue.items()):
    
    bag_of_attr[int(key)]= ','.join([str(i) for i in cat['attributes']])
    bag.append((','.join([str(i) for i in cat['attributes']])))
    buf.append([1*('purchase' in cat['availability']), 1*('rent' in cat['availability']),1*('subscription' in cat['availability']),
               cat['duration'],cat['feature_1'],cat['feature_2'],cat['feature_3'],cat['feature_4'],cat['feature_5'],
               cat['type']])
    ind.append(int(key))
data_catalogue = pd.DataFrame(buf,columns = ['is_purchase','is_rent','is_subscription','duration','feature_1',
                                            'feature_2','feature_3','feature_4','feature_5','type'],index = ind)
data_catalogue['type'] = data_catalogue['type'].map(map_type)
cv = CountVectorizer(token_pattern='\d+',)
X = cv.fit_transform(bag)
#bag_of_attr = pd.DataFrame(X.todense(),columns = cv.get_feature_names(),index = ind)

100%|████████████████████████████████████████████████████████████████████████| 10200/10200 [00:00<00:00, 132703.97it/s]


In [9]:
data_catalogue.head()

Unnamed: 0,is_purchase,is_rent,is_subscription,duration,feature_1,feature_2,feature_3,feature_4,feature_5,type
1983,1,1,1,140,1657223.0,0.75361,39,1.119409,0.0,2
3783,1,1,1,110,35565210.0,0.766254,41,1.138604,0.654707,2
5208,1,1,1,90,13270680.0,0.765425,27,1.131807,0.592716,2
9744,1,1,1,120,21749920.0,0.757874,26,1.133525,0.654707,2
1912,1,1,0,110,9212964.0,0.759566,7,1.110127,0.654707,2


In [10]:
#bag_of_attr.describe()
#bag_of_attr.shape

In [11]:
%%time
transactions = pd.read_csv(
    os.path.join(DATA_PATH, 'transactions.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'consumption_mode': 'category',
        'ts': np.float64,
        'watched_time': np.uint64,
        'device_type': np.uint8,
        'device_manufacturer': np.uint8
    }
)

Wall time: 4.7 s


In [12]:
%%time
ratings = pd.read_csv(
    os.path.join(DATA_PATH, 'ratings.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'ts': np.float64,
        'rating': np.uint8
    }
)

Wall time: 194 ms


In [13]:
ratings.iloc[100]

user_uid       2.078610e+05
element_uid    2.714000e+03
rating         1.000000e+01
ts             4.430376e+07
Name: 100, dtype: float64

In [14]:
%%time
bookmarks = pd.read_csv(
    os.path.join(DATA_PATH, 'bookmarks.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'ts': np.float64
    }
)

Wall time: 306 ms


In [15]:
#Попробуем для каждого пользователя сделать некоторый путь потребления контента
# Для этого объединим все действия пользователей
transactions['action'] = 'watch'
ratings['action'] = 'rate'
bookmarks['action'] = 'bookmate'

In [16]:
bookmarks.iloc[100]

user_uid            559099
element_uid            943
ts             4.43041e+07
action            bookmate
Name: 100, dtype: object

In [17]:
actions = pd.concat((transactions,ratings,bookmarks))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [18]:
actions.head()

Unnamed: 0,action,consumption_mode,device_manufacturer,device_type,element_uid,rating,ts,user_uid,watched_time
0,watch,S,50.0,0.0,3336,,44305180.0,5177,4282.0
1,watch,S,11.0,0.0,481,,44305180.0,593316,2989.0
2,watch,S,50.0,0.0,4128,,44305180.0,262355,833.0
3,watch,S,99.0,0.0,6272,,44305180.0,74296,2530.0
4,watch,P,50.0,0.0,5543,,44305180.0,340623,6282.0


`consumption_mode` — тип потребления (переменная принимает следующие значения: P — покупка, R — аренда, S — просмотр по подписке)

In [None]:
actions.set_index(['user_uid','element_uid','ts'],inplace = True)

In [None]:
actions.sort_index(inplace = True)

In [None]:
one_user = actions.loc[207861]

In [None]:
one_user

Unnamed: 0_level_0,Unnamed: 1_level_0,action,consumption_mode,device_manufacturer,device_type,rating,watched_time
element_uid,ts,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1570,43575720.0,rate,,,,10.0,
1570,43576280.0,watch,R,50.0,0.0,,6229.0
2714,44303460.0,watch,R,50.0,0.0,,8088.0
2714,44303760.0,rate,,,,10.0,
3336,43351580.0,rate,,,,10.0,
3336,43352400.0,watch,P,50.0,0.0,,5839.0


In [None]:
actions.rating.value_counts()

10.0    167504
8.0      90764
9.0      62810
7.0      35343
6.0      32938
4.0      17264
5.0      15711
2.0       9289
3.0       4647
1.0       1685
0.0        835
Name: rating, dtype: int64

In [None]:
with open(os.path.join(DATA_PATH, 'test_users.json'), 'r') as f:
    test_users = set(json.load(f)['users'])

In [None]:
len(test_users & set(actions.index.get_level_values(0)))

50000

In [None]:
# Приделаем каждому чуваку атрибуты просмотренных фильмов. ну или вообще по всем действиям - они все позитивные
ind_user = []
buf = []
for i in tqdm.tqdm(np.unique(actions.index.get_level_values(0))):
    temp = np.unique(actions.loc[i].index.get_level_values(0))
    ind_user.append(i)
    
    s = ''
    for ii in temp:
        s+=bag_of_attr[ii]
        
        s+=','
    if  '26846' in s:
            break
    #assert X.shape[1] == len(a)
    buf.append(s)
    
    

 23%|████████████████▊                                                       | 116758/500000 [01:01<03:24, 1875.88it/s]

In [None]:
s

In [None]:
cv1 = CountVectorizer(token_pattern='\d+',)
X_user = cv1.fit_transform(buf)

In [None]:
X_user.shape

In [None]:
X.shape

In [None]:
# Дальше уже совершенно ненужная херь, запускать до сюда


In [None]:
len(cv1.get_feature_names())

In [None]:
len(cv.get_feature_names())

In [None]:
len(set(cv1.get_feature_names())^set(cv.get_feature_names()))

In [None]:
set(cv.get_feature_names())^set(cv1.get_feature_names())