# Data Quality Assurance

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)

import numpy as np

from utils.common import * 
from utils.common_transformers import DateHandler, DTypeTransformer

In [2]:
# Read only the columns this QA pass inspects.
df = pd.read_csv(
    "../data/hotel_bookings.csv",
    usecols=[
        "hotel",
        "is_canceled",
        "arrival_date_year",
        "arrival_date_month",
        "arrival_date_week_number",
        "meal",
        "is_repeated_guest",
    ],
)
# Snapshot of the frame exactly as loaded, taken before any cell modifies `df`.
copy = df.copy()

### Performing Quality Assurance

In [4]:
# Summary statistics of the numeric columns, flipped so each column is one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
is_canceled,119390.0,0.370416,0.482918,0.0,0.0,0.0,1.0,1.0
arrival_date_year,119390.0,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_week_number,119390.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
is_repeated_guest,119390.0,0.031912,0.175767,0.0,0.0,0.0,0.0,1.0


In [5]:
# Show the dtype of every column. The rendered output is split into tables of
# at most three rows each, which matches num_rows_per_column=3.
display_dtypes(df,num_rows_per_column=3)

DTypes


Unnamed: 0,0
hotel,object
is_canceled,int64
arrival_date_year,int64

Unnamed: 0,0
arrival_date_month,object
arrival_date_week_number,int64
meal,object

Unnamed: 0,0
is_repeated_guest,int64


In [6]:
# Parse the month column with the "%B" format and keep the distinct month
# numbers, so their range can be validated below.
arrival_date_month_month = DateHandler(
    ["arrival_date_month"],
    date_format="%B",
    include=["month"],
    return_whole_df=False,
).fit_transform(df)["arrival_date_month_month"].unique()

# One boolean per data-quality rule; the position in this list is the rule
# number reported on failure. Sets are compared instead of sorted lists so that
# an unexpected missing value makes the rule fail rather than raising a
# TypeError while sorting mixed str/float values.
assertions = [
    set(df["hotel"].unique()) == {"City Hotel", "Resort Hotel"},
    set(df["is_canceled"].unique()) == {0, 1},
    set(df["arrival_date_year"].unique()) == {2015, 2016, 2017},
    all(arrival_date_month_month <= 12) and all(arrival_date_month_month >= 1),
    set(df["meal"].unique()) == {"BB", "FB", "HB", "SC", "Undefined"},
    set(df["is_repeated_guest"].unique()) == {0, 1},
]

# Report every violated rule. A plain `if` is used rather than
# `assert` + `except AssertionError`, because assert statements are removed
# when Python runs with -O, which would make every rule silently pass.
for pos, assertion in enumerate(assertions, 1):
    if not assertion:
        print(f"Not as per rules Assertion no: {pos}")

### Fixing Problematic Data

### DType Handling

In [7]:
# Quick Datetime Transformation
df["arrival_date_month"] = DateHandler(
                                ["arrival_date_month"],
                                date_format="%B",include=["month"],
                                return_whole_df=False
                            ).fit_transform(df)["arrival_date_month_month"]

In [8]:
# Target dtype for each column. The same dict is written to JSON further down,
# so key order is kept as-is.
mapping = {
    "hotel": "category",
    "is_canceled": "bool",
    "arrival_date_year": "category",
    "arrival_date_month": "int64",
    "meal": "category",
    "is_repeated_guest": "bool",
    "arrival_date_week_number": "int64",
}
# Transformer configured with the column -> dtype mapping above.
dttf = DTypeTransformer(mapping)

In [10]:
# Run the frame through the transformer built from `mapping`, then preview the
# result (the rendered output shows the first five rows, one column per row).
df = dttf.fit_transform(df)
head(df)

Unnamed: 0,0,1,2,3,4
hotel,Resort Hotel,Resort Hotel,Resort Hotel,Resort Hotel,Resort Hotel
is_canceled,False,False,False,False,False
arrival_date_year,2015,2015,2015,2015,2015
arrival_date_month,7,7,7,7,7
arrival_date_week_number,27,27,27,27,27
meal,BB,BB,BB,BB,BB
is_repeated_guest,False,False,False,False,False


In [11]:
from json import dump

In [12]:
# Persist the column -> dtype mapping as JSON.
with open("../data/hotel_booking_dtypes.json", mode="w") as dtype_file:
    dump(mapping, fp=dtype_file)

In [13]:
# Write the converted frame without the index column. The column dtypes are
# stored separately in hotel_booking_dtypes.json.
df.to_csv("../data/hotel_booking_data.csv",index=False)

### Loading DType Mapping

In [14]:
from json import load

In [15]:
# Drop the in-memory dict first, so the value displayed below can only have
# come from the JSON file.
del mapping
with open("../data/hotel_booking_dtypes.json", mode="r") as dtype_file:
    mapping = load(fp=dtype_file)
mapping

{'hotel': 'category',
 'is_canceled': 'bool',
 'arrival_date_year': 'category',
 'arrival_date_month': 'int64',
 'meal': 'category',
 'is_repeated_guest': 'bool',
 'arrival_date_week_number': 'int64'}