# Exploratory data analysis - First understanding of the data

## Consume the pre-calculated metadata

In [2]:
!pip install pandas-profiling

Collecting pandas-profiling
  Downloading pandas_profiling-3.2.0-py2.py3-none-any.whl (262 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.6/262.6 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting visions[type_image_path]==0.7.4
  Downloading visions-0.7.4-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting htmlmin>=0.1.12
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting multimethod>=1.4
  Downloading multimethod-1.8-py3-none-any.whl (9.8 kB)
Collecting missingno>=0.4.2
  Downloading missingno-0.5.1-py3-none-any.whl (8.7 kB)
Collecting tangled-up-in-unicode==0.2.0
  Downloading tangled_up_in_unicode-0.2.0-py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting phik>=0.11.1
  Downl

In [31]:
import os

from pickle import load
import pandas as pd

from ydata.metadata import Metadata

def load_sets(path: str):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in binary mode and closed again before returning.

    NOTE(review): unpickling can execute arbitrary code, so only point this at
    files produced by a trusted step.
    """
    with open(path, mode='rb') as pickle_file:
        unpickled = load(pickle_file)
    return unpickled

In [32]:
# Location of the pickled (data, y) pair. It can be overridden through the
# SET_PATH environment variable and falls back to 'train.pkl' when unset.
# Using .get() avoids a bare `except:` that would also hide unrelated errors.
set_path = os.environ.get('SET_PATH', 'train.pkl')

# The pickle is expected to unpack into exactly two objects: features and labels.
data, y = load_sets(set_path)

In [33]:
# Remove the 'Unnamed: 0' column when it exists (presumably a leftover index
# column from a CSV round-trip - confirm with the producing step).
# errors='ignore' makes the drop a no-op when the column is absent, without
# silencing any other failure the way a bare `except:` would.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [34]:
# Load the pre-calculated metadata object from disk and print its text summary.
meta_path = 'metadata.pkl'
meta = Metadata.load(meta_path)
print(meta)

[1mMetadata Summary 
 
[0m[1mDataset type: [0mTABULAR
[1mDataset attributes: [0m
[1mNumber of columns: [0m32
[1m% of duplicate rows: [0m26
[1mTarget column: [0m

[1mColumn detail: [0m
                            Column    Data type Variable type
0                            hotel  categorical        string
1                      is_canceled  categorical           int
2                        lead_time  categorical           int
3                arrival_date_year  categorical           int
4               arrival_date_month  categorical        string
5         arrival_date_week_number  categorical           int
6        arrival_date_day_of_month  categorical           int
7          stays_in_weekend_nights  categorical           int
8             stays_in_week_nights  categorical           int
9                           adults  categorical           int
10                        children  categorical         float
11                          babies  categorical           

## Generating the full data profile

In [7]:
# Name of the data split being profiled; used below to name the report file.
# Read from the DATA_SPLIT environment variable, defaulting to 'train' when
# it is not set (no bare `except:` needed for a simple default).
data_split = os.environ.get('DATA_SPLIT', 'train')

In [8]:
# Imported in this cell, so pandas-profiling must already be installed in the kernel.
from pandas_profiling import ProfileReport

# Announce the name under which the report is saved.
print(f'Profile Name: {data_split}_profile')
# Build the profiling report for the loaded dataset.
profile = ProfileReport(df=data, title='Hotel bookings demand')
# Turn off the navigation bar in the rendered HTML.
profile.config.html.navbar_show = False

# Write the report to '<data_split>_profile.html' (relative to the working
# directory). `profile` is kept around: a later cell calls profile.to_html().
profile.to_file(f'{data_split}_profile.html')

Profile Name: train_profile


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Export the label ratios and the HTML profile as pipeline UI artifacts

In [37]:
# Relative frequency of each distinct value in y (normalize=True makes the
# counts sum to 1), held as a one-column DataFrame indexed by the value.
ratio_labels = y.value_counts(normalize=True).to_frame()

In [39]:
# Display the normalised label frequencies as the cell output.
ratio_labels

Unnamed: 0,is_canceled
0,0.627481
1,0.372519


In [43]:
import json

# The CSV rows below carry two fields (label value, ratio). Move the label
# values out of the index into a regular column so the header list names
# both fields; previously the header only covered the ratio column.
# Keep the index name when pandas provides one, otherwise call it 'label'.
ratio_table = ratio_labels.rename_axis(ratio_labels.index.name or 'label').reset_index()

# Output-viewer description written to 'mlpipeline-ui-metadata.json'
# (the file name follows the Kubeflow Pipelines UI-metadata convention):
#   1. an inline CSV table with the label ratios,
#   2. the full profiling report inlined as a web-app (HTML) artifact.
metadata = {
    'outputs': [
        {
            'type': 'table',
            'storage': 'inline',
            'format': 'csv',
            # Header entries must be strings, one per CSV field.
            'header': [str(column) for column in ratio_table.columns],
            # Same rows as before: "<label>,<ratio>" without a header line.
            'source': ratio_table.to_csv(header=False, index=False),
        },
        {
            'type': 'web-app',
            'storage': 'inline',
            'source': profile.to_html(),
        },
    ]
}

with open('mlpipeline-ui-metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file)