## Przygotowanie danych

In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pickle
from dask_ml import preprocessing
import datetime

In [2]:
from dask.distributed import Client, LocalCluster

In [3]:
cluster = LocalCluster()
client = Client(cluster)

In [4]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:40489  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 16.19 GB


In [2]:
df_headers = ['Sale','SalesAmountInEuro','time_delay_for_conversion','click_timestamp',
              'nb_clicks_1week','product_price','product_age_group','device_type',
              'audience_id','product_gender','product_brand',
              'prod_category1','prod_category2','prod_category3','prod_category4',
              'prod_category5','prod_category6','prod_category7','product_country',
              'product_id','product_title','partner_id','user_id']

# dtypes={'Sale':'bool','prod_category1': 'object','prod_category2': 'object',
#         'prod_category3': 'object','prod_category4': 'object',
#         'prod_category5': 'object','prod_category6': 'object',
#         'prod_category7': 'object', 'product_category': 'str'}

dtypes={'prod_category1': 'category','prod_category2': 'category',
        'prod_category3': 'category','prod_category4': 'category',
        'prod_category5': 'category','prod_category6': 'category',
        'prod_category7': 'category', 'product_category': 'str'}

encoder_labels = ['product_age_group','device_type','audience_id',
                  'product_gender', 'product_brand','prod_category1','prod_category2',
                  'prod_category3','prod_category4','prod_category5',
                  'prod_category6','prod_category7','product_country',
                  'product_id','product_title','partner_id','user_id']

def dateparse (time_in_secs):
    return datetime.datetime.fromtimestamp(float(time_in_secs))

df = pd.read_csv("../../../data/CriteoSearchData.csv", header=None,
                 names=df_headers, delimiter='\t', dtype=dtypes, parse_dates=['click_timestamp'], date_parser= dateparse)

In [3]:
df

Unnamed: 0,Sale,SalesAmountInEuro,time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price,product_age_group,device_type,audience_id,product_gender,...,prod_category3,prod_category4,prod_category5,prod_category6,prod_category7,product_country,product_id,product_title,partner_id,user_id
0,0,-1.0000,-1,2020-08-31 18:37:00,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,A66DB02AC1726A8D79C518B7F7AB79F0,-1,E3DDEB04F8AFF944B11943BB57D2F620,493CFB4A87C50804C94C0CF76ABD19CD
1,0,-1.0000,-1,2020-09-01 03:54:44,0,0.0,4C90FD52FC53D2C1C205844CB69575AB,D7D1FB49049702BF6338894757E0D959,-1,1B491180398E2F0390E6A588B3BCE291,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,4C6C62203B4CE5AA6DFCF17F2604DC37,3CE3D1623D32FB2A901822910AF72A0B C83584C673752...,BD01BAFAE73CF38C403978BBB458300C,D0EBCD4402172AE3AA6FD21FB77BDE84
2,0,-1.0000,-1,2020-08-31 18:49:28,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,C4D189327BD87FEB3BF896DA716C6995,E8A8BEA257B19574C36108D0F0CDADE5
3,0,-1.0000,-1,2020-08-31 18:26:14,-1,0.0,-1,D7D1FB49049702BF6338894757E0D959,B0E6BD3ACC0707BAB31CA2C2B08E652F,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,440255DF62CFD36FBC0206828FC488E0,9703077BD18F430BD232A6F74931928E
4,0,-1.0000,-1,2020-08-31 22:20:45,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,5F4AFF2693601C2EFFE9AF2C1ED4222E,-1,E3DDEB04F8AFF944B11943BB57D2F620,703B8CFC8D65A67BDF96595CCF992D27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5214279,0,-1.0000,-1,2020-08-18 07:03:10,-1,0.0,-1,D7D1FB49049702BF6338894757E0D959,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,A9AD79D7E99CB08B4DA34A6B1A35557D,7980BC37E89BFC233A6790C5CA096F53
5214280,0,-1.0000,-1,2020-08-18 05:02:50,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,344404F6582BD73C10ED6584A64C2C2C,427BAA921C3C34D10407A8A5703913E7
5214281,0,-1.0000,-1,2020-08-18 07:22:19,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,A227180952EEB783728B193330A5E850,-1,-1,-1,-1,2AC62132FBCFA093B9426894A4BC6278,D8190CDFD2277B28F35B430D5B0852D7,30A5E0DBF8C085C415C4B8313820ED64 44FECE1789723...,699FD8CE44479877E1A1907032BA901A,95ECDAACE379D59384A9DFACDF3E6942
5214282,1,84.5554,908,2020-08-18 05:54:11,-1,0.0,-1,D7D1FB49049702BF6338894757E0D959,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,95F6B7BA0BA90E5D5DF1CC063390BA51,517793D32F22CDE816007AE553FC8247


In [4]:
df['product_title'] = df.product_title.astype(str)
df['product_title'] = df['product_title'].replace('nan', 'BROKEN')

In [24]:
df.head()

Unnamed: 0,Sale,SalesAmountInEuro,time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price,product_age_group,device_type,audience_id,product_gender,...,prod_category3,prod_category4,prod_category5,prod_category6,prod_category7,product_country,product_id,product_title,partner_id,user_id
0,False,-1.0,-1,2020-08-31 16:37:00,-1,0.0,0,3,0,0,...,0,0,0,0,0,8,1058198,0,277,3924600
1,False,-1.0,-1,2020-09-01 01:54:44,0,0.0,4,7,0,2,...,0,0,0,0,0,8,486749,188114,230,11188022
2,False,-1.0,-1,2020-08-31 16:49:28,-1,0.0,0,3,0,0,...,0,0,0,0,0,0,0,0,245,12458754
3,False,-1.0,-1,2020-08-31 16:26:14,-1,0.0,0,7,12657,0,...,0,0,0,0,0,0,0,0,85,8086037
4,False,-1.0,-1,2020-08-31 20:20:45,-1,0.0,0,3,0,0,...,0,0,0,0,0,8,606352,0,277,6010702


### Praca z dask_ml.preprocessing.LabelEncoder

#### Przygotowanie LabelEncodera

In [None]:
# not needed anymore, load pickle file instead
labelEncoders = {x:preprocessing.LabelEncoder() for x in encoder_labels}
for k,v in labelEncoders.items():
    print('starting {}'.format(k))
    df[k] = v.fit_transform(df[k])
    print('done')

#### Zapis labelEncodera do pliku

In [31]:
import pickle
with open('../lablencoder_new.pickle', 'wb') as handle: #wiem, że tu jest literówka, trudno
    pickle.dump(labelEncoders, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Wykorzystanie labelEncodera

Załaduj przygotowany labelEncoder za pomocą:

In [6]:
labelEncoders = pickle.load(open("../../../data/lablencoder.pickle","rb"))

#### Badanie zawartości labelEncodera

In [7]:
labelEncoders

{'product_age_group': LabelEncoder(),
 'device_type': LabelEncoder(),
 'audience_id': LabelEncoder(),
 'product_gender': LabelEncoder(),
 'product_brand': LabelEncoder(),
 'prod_category1': LabelEncoder(),
 'prod_category2': LabelEncoder(),
 'prod_category3': LabelEncoder(),
 'prod_category4': LabelEncoder(),
 'prod_category5': LabelEncoder(),
 'prod_category6': LabelEncoder(),
 'prod_category7': LabelEncoder(),
 'product_country': LabelEncoder(),
 'product_id': LabelEncoder(),
 'product_title': LabelEncoder(),
 'partner_id': LabelEncoder(),
 'user_id': LabelEncoder()}

In [10]:
for k,v in labelEncoders.items():
    df[k] = v.transform(df[k])
    print('{} done'.format(k))

[9]
product_age_group done
[5]
device_type done
[13804]
audience_id done
[14]
product_gender done
[42303]
product_brand done
[18]
prod_category1 done
[101]
prod_category2 done
[473]
prod_category3 done
[1553]
prod_category4 done
[733]
prod_category5 done
[168]
prod_category6 done
[1]
prod_category7 done
[16]
product_country done
[1226495]
product_id done
[600039]
product_title done
[235]
partner_id done
[10333150]
user_id done


In [None]:
for k,v in labelEncoders.items():
    print("{} table lookup".format(k))
    print("showing {} classes:".format(len(v.classes_)))
    print(v.classes_)
    print("\n")

Kolumna product_title zawiera wartość BROKEN zastępującą wcześniej pojawiający sie NaN

In [37]:
'BROKEN' in labelEncoders.get('product_title').classes_

True

In [38]:
labelEncoders.get('product_title').transform(['BROKEN'])

array([596334])

In [39]:
labelEncoders.get('product_title').inverse_transform([596334])

array(['BROKEN'], dtype=object)

tornado.application - ERROR - Uncaught exception GET /status/ws (127.0.0.1)
HTTPServerRequest(protocol='http', host='127.0.0.1:8787', method='GET', uri='/status/ws', version='HTTP/1.1', remote_ip='127.0.0.1')
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tornado/websocket.py", line 954, in _accept_connection
    open_result = handler.open(*handler.open_args, **handler.open_kwargs)
  File "/usr/local/lib/python3.8/dist-packages/tornado/web.py", line 3173, in wrapper
    return method(self, *args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/bokeh/server/views/ws.py", line 137, in open
    raise ProtocolError("Token is expired.")
bokeh.protocol.exceptions.ProtocolError: Token is expired.
