## Przygotowanie danych

In [2]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pickle
from dask_ml import preprocessing
import datetime

In [3]:
from dask.distributed import Client, LocalCluster

In [4]:
# Create a LocalCluster and attach a distributed Client to it.
cluster = LocalCluster()
client = Client(cluster)

In [4]:
# Display the client summary (scheduler address, dashboard URL, worker/core/memory counts).
client

0,1
Client  Scheduler: tcp://127.0.0.1:40489  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 16.19 GB


In [6]:
# Column names, in file order, for the input CSV.
df_headers = [
    'Sale', 'SalesAmountInEuro', 'time_delay_for_conversion', 'click_timestamp',
    'nb_clicks_1week', 'product_price', 'product_age_group', 'device_type',
    'audience_id', 'product_gender', 'product_brand',
    *['prod_category{}'.format(level) for level in range(1, 8)],
    'product_country', 'product_id', 'product_title', 'partner_id', 'user_id',
]

# Per-column dtype overrides: the seven prod_category* columns are read as
# pandas categoricals. (An earlier, now-removed variant read them as 'object'
# and 'Sale' as 'bool'.)
dtypes = {'prod_category{}'.format(level): 'category' for level in range(1, 8)}
# NOTE(review): 'product_category' is not one of df_headers — presumably this
# entry has no effect on the load; confirm and drop it if so.
dtypes['product_category'] = 'str'

# Columns that get a LabelEncoder: everything from 'product_age_group' to the
# end of df_headers.
encoder_labels = df_headers[df_headers.index('product_age_group'):]

def dateparse(time_in_secs):
    """Convert a Unix timestamp in seconds to a naive local-time datetime.

    Args:
        time_in_secs: Seconds since the epoch, as a number or a numeric string.

    Returns:
        datetime.datetime: The matching timestamp in the machine's local
        timezone (no tzinfo attached).

    Raises:
        ValueError: If ``time_in_secs`` cannot be converted to ``float``.
    """
    epoch_seconds = float(time_in_secs)
    return datetime.datetime.fromtimestamp(epoch_seconds)

# Read the tab-separated, headerless CSV into a pandas DataFrame, naming the
# columns from df_headers, applying the dtype overrides, and converting
# 'click_timestamp' through dateparse.
df = pd.read_csv(
    "../../../data/CriteoSearchData.csv",
    delimiter='\t',
    header=None,
    names=df_headers,
    dtype=dtypes,
    parse_dates=['click_timestamp'],
    date_parser=dateparse,
)

In [7]:
# Display the loaded frame (the rendering is truncated to the leading and trailing rows).
df

Unnamed: 0,Sale,SalesAmountInEuro,time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price,product_age_group,device_type,audience_id,product_gender,...,prod_category3,prod_category4,prod_category5,prod_category6,prod_category7,product_country,product_id,product_title,partner_id,user_id
0,0,-1.0,-1,2020-08-31 18:37:00,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,A66DB02AC1726A8D79C518B7F7AB79F0,-1,E3DDEB04F8AFF944B11943BB57D2F620,493CFB4A87C50804C94C0CF76ABD19CD
1,0,-1.0,-1,2020-09-01 03:54:44,0,0.0,4C90FD52FC53D2C1C205844CB69575AB,D7D1FB49049702BF6338894757E0D959,-1,1B491180398E2F0390E6A588B3BCE291,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,4C6C62203B4CE5AA6DFCF17F2604DC37,3CE3D1623D32FB2A901822910AF72A0B C83584C673752...,BD01BAFAE73CF38C403978BBB458300C,D0EBCD4402172AE3AA6FD21FB77BDE84
2,0,-1.0,-1,2020-08-31 18:49:28,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,C4D189327BD87FEB3BF896DA716C6995,E8A8BEA257B19574C36108D0F0CDADE5
3,0,-1.0,-1,2020-08-31 18:26:14,-1,0.0,-1,D7D1FB49049702BF6338894757E0D959,B0E6BD3ACC0707BAB31CA2C2B08E652F,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,440255DF62CFD36FBC0206828FC488E0,9703077BD18F430BD232A6F74931928E
4,0,-1.0,-1,2020-08-31 22:20:45,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,5F4AFF2693601C2EFFE9AF2C1ED4222E,-1,E3DDEB04F8AFF944B11943BB57D2F620,703B8CFC8D65A67BDF96595CCF992D27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995629,0,-1.0,-1,2020-08-09 20:35:17,14,0.0,88460293E9517D21E39AA889BD5AADC8,7E56C27BFF0305E788DA55A029EC4988,-1,1B491180398E2F0390E6A588B3BCE291,...,-1,-1,-1,-1,-1,F0C08A161AA0404193A7EF78F5C3E06C,81075F8D35AD6A054547EEDBF9C3C845,202165BDC8B614AC3B7F292214B455AD 8859F5D2DCBB5...,CC14152DD0806823ED88BA55D9A5240B,91AAEAC11C0A82339C4413F51EF40873
15995630,0,-1.0,-1,2020-08-10 02:21:25,0,0.0,4C90FD52FC53D2C1C205844CB69575AB,7E56C27BFF0305E788DA55A029EC4988,-1,1B491180398E2F0390E6A588B3BCE291,...,7A6CEBB58995350B3B174F41DA99FBE7,61B7D8AAF123E545037AA95782986E1D,-1,-1,-1,F0C08A161AA0404193A7EF78F5C3E06C,DE8B7BA7EF523912230CB0F52C38061E,1DBBC0561EC13267859FFEEC908945FA 66418DFAB96BE...,4691EA5C1795A32C20FCEF274CC28BE5,E9B8E33F395DB60C9B2A1AA01C8044AE
15995631,0,-1.0,-1,2020-08-09 10:07:46,5,0.0,4C90FD52FC53D2C1C205844CB69575AB,7E56C27BFF0305E788DA55A029EC4988,-1,C45A9AC6D102ACAEEDF0D6F78636D84A,...,-1,-1,-1,-1,-1,F0C08A161AA0404193A7EF78F5C3E06C,7169291F07C0D3D301AF763B67FF6A4F,A6846C722EEB95A9DF0F2FC5B2B04CE2 8EED363963B19...,CC14152DD0806823ED88BA55D9A5240B,1DB53816522BE08D341C0BB19189527E
15995632,0,-1.0,-1,2020-08-09 10:08:01,5,0.0,4C90FD52FC53D2C1C205844CB69575AB,7E56C27BFF0305E788DA55A029EC4988,-1,C45A9AC6D102ACAEEDF0D6F78636D84A,...,-1,-1,-1,-1,-1,F0C08A161AA0404193A7EF78F5C3E06C,7169291F07C0D3D301AF763B67FF6A4F,A6846C722EEB95A9DF0F2FC5B2B04CE2 8EED363963B19...,CC14152DD0806823ED88BA55D9A5240B,1DB53816522BE08D341C0BB19189527E


In [8]:
# Replace missing product titles with the sentinel 'BROKEN', then cast the
# column to str. Filling before the cast avoids relying on astype(str) turning
# NaN into the literal text 'nan' and then string-matching that text.
df['product_title'] = df['product_title'].fillna('BROKEN').astype(str)

In [9]:
# Cast the product_title and user_id columns to str.
for text_column in ('product_title', 'user_id'):
    df[text_column] = df[text_column].astype(str)


### Praca z dask_ml.preprocessing.LabelEncoder

#### Przygotowanie LabelEncodera

In [None]:
# Not needed anymore: load the pickled encoders instead of refitting.
# Fits one LabelEncoder per column listed in encoder_labels and overwrites that
# column of df with its encoded values.
# NOTE(review): because df is overwritten here, running the later transform
# cell afterwards would feed already-encoded values to the encoders — confirm
# only one of the two paths is meant to be run.
labelEncoders = {}
for column in encoder_labels:
    print('starting {}'.format(column))
    encoder = preprocessing.LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    labelEncoders[column] = encoder
    print('done')

#### Zapis labelEncodera do pliku

In [31]:
# Serialize the dict of label encoders to disk with the highest pickle protocol.
# `pickle` is already imported in the notebook's first cell, so it is not
# re-imported here.
# The original author acknowledged the typo in the file name ("lablencoder");
# it is kept as is.
with open('../lablencoder_new.pickle', 'wb') as handle:
    pickle.dump(labelEncoders, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Wykorzystanie labelEncodera

Załaduj przygotowany labelEncoder za pomocą:

In [10]:
# Load the pickled dict of label encoders (column name -> LabelEncoder).
# The context manager closes the file handle, which the bare open() call
# previously left open.
# NOTE(security): pickle.load can execute arbitrary code — only load files
# from a trusted source.
with open("../../../data/lablencoder.pickle", "rb") as handle:
    labelEncoders = pickle.load(handle)

#### Badanie zawartości labelEncodera

In [11]:
# Display the loaded mapping of column name -> LabelEncoder.
labelEncoders

{'product_age_group': LabelEncoder(),
 'device_type': LabelEncoder(),
 'audience_id': LabelEncoder(),
 'product_gender': LabelEncoder(),
 'product_brand': LabelEncoder(),
 'prod_category1': LabelEncoder(),
 'prod_category2': LabelEncoder(),
 'prod_category3': LabelEncoder(),
 'prod_category4': LabelEncoder(),
 'prod_category5': LabelEncoder(),
 'prod_category6': LabelEncoder(),
 'prod_category7': LabelEncoder(),
 'product_country': LabelEncoder(),
 'product_id': LabelEncoder(),
 'product_title': LabelEncoder(),
 'partner_id': LabelEncoder(),
 'user_id': LabelEncoder()}

In [12]:
# Encode every column that has an encoder, overwriting the column in df with
# the transformed values and reporting progress per column.
for column, encoder in labelEncoders.items():
    raw_values = df[column].values
    df[column] = encoder.transform(raw_values)
    print('{} done'.format(column))

product_age_group done
device_type done
audience_id done
product_gender done
product_brand done
prod_category1 done
prod_category2 done
prod_category3 done
prod_category4 done
prod_category5 done
prod_category6 done
prod_category7 done
product_country done
product_id done
product_title done
partner_id done
user_id done


In [None]:
# Print, for each encoded column, how many classes its encoder knows and the
# classes themselves.
for column, encoder in labelEncoders.items():
    known_classes = encoder.classes_
    print("{} table lookup".format(column))
    print("showing {} classes:".format(len(known_classes)))
    print(known_classes)
    print("\n")

Kolumna product_title zawiera wartość BROKEN, która zastępuje pojawiające się wcześniej wartości NaN.

In [37]:
# Check that the 'BROKEN' placeholder is among the classes of the product_title encoder.
'BROKEN' in labelEncoders.get('product_title').classes_

True

In [38]:
# Look up the integer code the product_title encoder assigns to 'BROKEN'.
labelEncoders.get('product_title').transform(['BROKEN'])

array([596334])

In [39]:
# Map the code 596334 back to its label via the product_title encoder.
# NOTE(review): 596334 is hard-coded; presumably it is the code returned for
# 'BROKEN' by the currently pickled encoder — it will differ if the encoder is refit.
labelEncoders.get('product_title').inverse_transform([596334])

array(['BROKEN'], dtype=object)

tornado.application - ERROR - Uncaught exception GET /status/ws (127.0.0.1)
HTTPServerRequest(protocol='http', host='127.0.0.1:8787', method='GET', uri='/status/ws', version='HTTP/1.1', remote_ip='127.0.0.1')
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tornado/websocket.py", line 954, in _accept_connection
    open_result = handler.open(*handler.open_args, **handler.open_kwargs)
  File "/usr/local/lib/python3.8/dist-packages/tornado/web.py", line 3173, in wrapper
    return method(self, *args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/bokeh/server/views/ws.py", line 137, in open
    raise ProtocolError("Token is expired.")
bokeh.protocol.exceptions.ProtocolError: Token is expired.


In [15]:
# Write the frame to a Parquet file at a path relative to the working directory.
# NOTE(review): 'AAAAAAAAAAA.parquet' looks like a placeholder file name —
# confirm the intended output path before other code depends on it.
df.to_parquet('AAAAAAAAAAA.parquet')