# Data Quality Assurance

In [144]:
import pandas as pd
pd.set_option('display.max_columns', 500)

import numpy as np

from common import display_dtypes
from utils.common_transformers import DateHandler, DTypeTransformer

In [137]:
# Load only the columns this QA pass inspects from the raw bookings file.
df = pd.read_csv("../data/hotel_bookings.csv",
                 usecols=["hotel","is_canceled","arrival_date_year","arrival_date_month",
                          "arrival_date_week_number","meal","is_repeated_guest"]
                )
# Snapshot of the frame as loaded; `df` itself is overwritten in later cells.
# NOTE(review): `copy` is not referenced again in the visible cells, and the
# name would shadow the stdlib `copy` module if it were ever imported here —
# consider a more specific name.
copy = df.copy()

In [138]:
# Preview the first rows to confirm that only the selected columns were loaded.
df.head()

Unnamed: 0,hotel,is_canceled,arrival_date_year,arrival_date_month,arrival_date_week_number,meal,is_repeated_guest
0,Resort Hotel,0,2015,July,27,BB,0
1,Resort Hotel,0,2015,July,27,BB,0
2,Resort Hotel,0,2015,July,27,BB,0
3,Resort Hotel,0,2015,July,27,BB,0
4,Resort Hotel,0,2015,July,27,BB,0


### Performing Quality Assurance

In [139]:
# Summary statistics for the numeric columns; the min/max rows give a first
# sanity check on value ranges before the explicit rules further down.
df.describe()

Unnamed: 0,is_canceled,arrival_date_year,arrival_date_week_number,is_repeated_guest
count,119390.0,119390.0,119390.0,119390.0
mean,0.370416,2016.156554,27.165173,0.031912
std,0.482918,0.707476,13.605138,0.175767
min,0.0,2015.0,1.0,0.0
25%,0.0,2016.0,16.0,0.0
50%,0.0,2016.0,28.0,0.0
75%,1.0,2017.0,38.0,0.0
max,1.0,2017.0,53.0,1.0


In [142]:
# Show the dtype pandas inferred for every column.
# NOTE(review): num_rows_per_column=3 appears to split the listing into tables
# of three rows each (see the output below) — confirm against the helper.
display_dtypes(df,num_rows_per_column=3)

DTypes


Unnamed: 0,0
hotel,object
is_canceled,int64
arrival_date_year,int64

Unnamed: 0,0
arrival_date_month,object
arrival_date_week_number,int64
meal,object

Unnamed: 0,0
is_repeated_guest,int64


In [143]:
# Distinct values of the "month" part that DateHandler derives from the month
# names (parsed with "%B"); it is exposed as "arrival_date_month_month".
# NOTE(review): presumably month numbers 1-12 — confirm against DateHandler.
arrival_date_month_month = DateHandler(
                                ["arrival_date_month"],
                                date_format="%B",include=["month"],
                                return_whole_df=False
                            ).fit_transform(df)["arrival_date_month_month"].unique()


def _has_exactly(column, expected):
    """Return True when the distinct values of ``df[column]`` are exactly ``expected``.

    Values are compared as sets rather than as sorted lists: sorting a column
    that mixes strings with NaN raises ``TypeError``, whereas a set comparison
    simply reports the rule as failed.
    """
    return set(df[column].unique()) == set(expected)


# One entry per data-quality rule: description -> whether the data satisfies it.
# Insertion order defines the rule numbers printed below.
rules = {
    "hotel is one of City Hotel / Resort Hotel":
        _has_exactly("hotel", ["City Hotel", "Resort Hotel"]),
    "is_canceled takes exactly the values 0 and 1":
        _has_exactly("is_canceled", [0, 1]),
    "arrival_date_year takes exactly the values 2015, 2016, 2017":
        _has_exactly("arrival_date_year", [2015, 2016, 2017]),
    "arrival_date_month parses to a month in 1..12":
        all(arrival_date_month_month <= 12) and all(arrival_date_month_month >= 1),
    "meal is one of BB / FB / HB / SC / Undefined":
        _has_exactly("meal", ['BB', 'FB', 'HB', 'SC', 'Undefined']),
    "is_repeated_guest takes exactly the values 0 and 1":
        _has_exactly("is_repeated_guest", [0, 1]),
}

# Kept as a plain list of booleans, in rule order, as before.
assertions = list(rules.values())

# Report every violated rule without stopping the notebook. A plain `if` is
# used instead of try/assert/except because `assert` statements are removed
# when Python runs with -O, which would silently hide every failure.
for pos, (rule, passed) in enumerate(rules.items(), 1):
    if not passed:
        print(f"Not as per rules Assertion no: {pos} ({rule})")

### Fixing Problematic Data

### DType Handling

In [155]:
# Quick Datetime Transformation
df["arrival_date_month"] = DateHandler(
                                ["arrival_date_month"],
                                date_format="%B",include=["month"],
                                return_whole_df=False
                            ).fit_transform(df)["arrival_date_month_month"]

In [158]:
# Target dtype for each loaded column, keyed by column name.
# Keyword order is preserved deliberately: it becomes the key order of the dict.
mapping = dict(
    hotel="category",
    is_canceled="bool",
    arrival_date_year="category",
    arrival_date_month="int64",
    meal="category",
    is_repeated_guest="bool",
    arrival_date_week_number="int64",
)
# Build the project's dtype transformer from the column -> dtype mapping above.
dttf = DTypeTransformer(mapping)

In [160]:
# Apply the dtype mapping; `df` is rebound to the transformed frame, so the
# earlier displays of `df` no longer reflect its current dtypes.
df = dttf.fit_transform(df)
# Preview the result (the 0/1 flag columns now display as False/True).
df.head()

Unnamed: 0,hotel,is_canceled,arrival_date_year,arrival_date_month,arrival_date_week_number,meal,is_repeated_guest
0,Resort Hotel,False,2015,7,27,BB,False
1,Resort Hotel,False,2015,7,27,BB,False
2,Resort Hotel,False,2015,7,27,BB,False
3,Resort Hotel,False,2015,7,27,BB,False
4,Resort Hotel,False,2015,7,27,BB,False


In [161]:
from json import dump

In [162]:
# Persist the column -> dtype mapping as JSON next to the data.
with open("../data/hotel_booking_dtypes.json","w") as f:
    dump(mapping,f)

In [163]:
# Write the cleaned frame without the index column. CSV does not carry dtype
# information; the mapping is saved separately in hotel_booking_dtypes.json.
df.to_csv("../data/hotel_booking_data.csv",index=False)

### Loading DType Mapping

In [164]:
from json import load

In [167]:
# Drop the in-memory mapping first so the value displayed below can only have
# come from the JSON file.
# NOTE(review): this raises NameError if `mapping` is not currently defined
# (e.g. when this cell is run on its own in a fresh kernel).
del mapping
with open("../data/hotel_booking_dtypes.json","r") as f:
    mapping = load(f)
# Display the reloaded mapping.
mapping

{'hotel': 'category',
 'is_canceled': 'bool',
 'arrival_date_year': 'category',
 'arrival_date_month': 'int64',
 'meal': 'category',
 'is_repeated_guest': 'bool',
 'arrival_date_week_number': 'int64'}