# Data Exploration

### Imports


In [30]:
import pandas as pd
import numpy as np
from tqdm import tqdm


<div class="alert alert-block alert-warning">
<b>Dask:</b> Multiprocessing package that can be used for (almost all) Pandas & Scikit-learn functionality.
 Ships with Anaconda, but not with MiniConda.
</div>

In [31]:
import dask
import dask.dataframe as dd   # denk dat we het wel redden met pandas sequential 
# from sklearn.grid_search import GridSearchCV
#from dklearn.grid_search import GridSearchCV   # voorbeeld van dklearn => dit gaan we zeker nodig hebben bij hyperopt etc.
import dask
from dask.distributed import Client
client = Client() # often http://localhost:8787/status

Perhaps you already have a cluster running?
Hosting the HTTP server on port 55503 instead
  http_address["port"], self.http_server.port


### Load Data In

In [32]:
%%timeit -n 1 -r 1

df = pd.read_csv('2nd-assignment-dmt-2021/training_set_VU_DM.csv')
display(df)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,,,219,77700,3,4.0,...,,,,,,,,0,,0
4958343,332785,2013-06-30 19:55:18,5,219,,,219,88083,3,4.0,...,,,,,,,,0,,0
4958344,332785,2013-06-30 19:55:18,5,219,,,219,94508,3,3.5,...,,,,,,,,0,,0
4958345,332785,2013-06-30 19:55:18,5,219,,,219,128360,3,5.0,...,,,,,,,,1,157.84,1


34.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [33]:
mem_size = df.memory_usage(index=True).sum() / 1_024**2

print(f"Size in Memory: {mem_size:.0f} MB")

Size in Memory: 2043 MB


In [34]:
# Inspect inferred types
display(df.dtypes)

# Inspect numerical ranges
print('Min. Values')
print(df.min())

print('Max. Values')
print(df.max())

srch_id                          int64
date_time                       object
site_id                          int64
visitor_location_country_id      int64
visitor_hist_starrating        float64
visitor_hist_adr_usd           float64
prop_country_id                  int64
prop_id                          int64
prop_starrating                  int64
prop_review_score              float64
prop_brand_bool                  int64
prop_location_score1           float64
prop_location_score2           float64
prop_log_historical_price      float64
position                         int64
price_usd                      float64
promotion_flag                   int64
srch_destination_id              int64
srch_length_of_stay              int64
srch_booking_window              int64
srch_adults_count                int64
srch_children_count              int64
srch_room_count                  int64
srch_saturday_night_bool         int64
srch_query_affinity_score      float64
orig_destination_distance

Min. Values
srch_id                                          1
date_time                      2012-11-01 00:08:29
site_id                                          1
visitor_location_country_id                      1
visitor_hist_starrating                       1.41
visitor_hist_adr_usd                           0.0
prop_country_id                                  1
prop_id                                          1
prop_starrating                                  0
prop_review_score                              0.0
prop_brand_bool                                  0
prop_location_score1                           0.0
prop_location_score2                           0.0
prop_log_historical_price                      0.0
position                                         1
price_usd                                      0.0
promotion_flag                                   0
srch_destination_id                              2
srch_length_of_stay                              1
srch_booking_window

In [42]:
# # memory + speed improvements
# optimal_dtypes = {'srch_id': (np.uint32, np.nan),
#                   'site_id': (np.uint8, np.nan),
#                   'visitor_location_country_id': (np.uint8, np.nan),
#                   'prop_country_id': (np.uint8, np.nan),
#                   'prop_id': "uint32"(np.uint32, np.nan),
#                   'prop_starrating': (np.uint8, np.nan),
#                   'prop_brand_bool': (np.uint8, np.nan),
#                   'promotion_flag': (np.uint8, np.nan),
#                   'srch_destination_id': (np.uint16, np.nan),
#                   'srch_length_of_stay': (np.uint16, np.nan),
#                   'srch_booking_window': (np.uint16, np.nan),
#                   'srch_adults_count': (np.uint8, np.nan),
#                   'srch_children_count': (np.uint8, np.nan),  # 255 kids max should suffice
#                   'srch_room_count': (np.uint8, np.nan),
#                   'srch_saturday_night_bool': (np.uint8, np.nan),
#                   'random_bool': (np.uint8, np.nan),
#                   'position': (np.uint8, np.nan),
#                   'click_bool': (np.uint8, np.nan),
#                   'booking_bool': (np.uint8, np.nan)}
#
# for i in range(1, 9):
#     optimal_dtypes[f'comp{i}_rate'] = (np.int8, np.nan)
#     optimal_dtypes[f'comp{i}_inv'] = (np.int8 , np.nan)    # assignment does not specify -1 but is exists??
#     # competitor_dtypes[f'comp{i}_rate_percent_diff'] = np.uint

# # memory + speed improvements
# optimal_dtypes = {'srch_id': (np.uint32, np.nan),
#                   'site_id': (np.uint8, np.nan),
#                   'visitor_location_country_id': (np.uint8, np.nan),
#                   'prop_country_id': (np.uint8, np.nan),
#                   'prop_id': "uint32"(np.uint32, np.nan),
#                   'prop_starrating': (np.uint8, np.nan),
#                   'prop_brand_bool': (np.uint8, np.nan),
#                   'promotion_flag': (np.uint8, np.nan),
#                   'srch_destination_id': (np.uint16, np.nan),
#                   'srch_length_of_stay': (np.uint16, np.nan),
#                   'srch_booking_window': (np.uint16, np.nan),
#                   'srch_adults_count': (np.uint8, np.nan),
#                   'srch_children_count': (np.uint8, np.nan),  # 255 kids max should suffice
#                   'srch_room_count': (np.uint8, np.nan),
#                   'srch_saturday_night_bool': (np.uint8, np.nan),
#                   'random_bool': (np.uint8, np.nan),
#                   'position': (np.uint8, np.nan),
#                   'click_bool': (np.uint8, np.nan),
#                   'booking_bool': (np.uint8, np.nan)}
#
# for i in range(1, 9):
#     optimal_dtypes[f'comp{i}_rate'] = (np.int8, np.nan)
#     optimal_dtypes[f'comp{i}_inv'] = (np.int8 , np.nan)    # assignment does not specify -1 but is exists??
#     # competitor_dtypes[f'comp{i}_rate_percent_diff'] = np.uint
#
# dfn = df.dtypes()
# dfn = df.convert_dtypes(convert_integer=True)
# print(dfn.dtypes)
