In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import parquet
import pyarrow

In [15]:
# Load the training split from the local parquet file into a DataFrame.
dataset = pd.read_parquet(path="lumen_dataset/train.parquet")

# Preview the first five rows to sanity-check the load.
dataset.head(n=5)

Unnamed: 0,reservation_id,night_number,stay_date,guest_id,guest_country_id,reservation_status,reservation_date,date_from,date_to,resort_id,...,price,price_tax,total_price_tax,total_price,food_price,food_price_tax,other_price,other_price_tax,room_category_id,sales_channel_id
0,73710,1.0,2007-12-13,22897,HR,Checked-out,2007-11-28,2007-12-13,2007-12-15,1,...,4255.462,425.517,452.089,4564.69,265.428,26.572,43.8,0.0,3,10.0
1,73710,2.0,2007-12-14,22897,HR,Checked-out,2007-11-28,2007-12-13,2007-12-15,1,...,4243.709,424.349,450.921,4552.937,265.428,26.572,43.8,0.0,3,10.0
2,74464,1.0,2008-01-01,106278,HR,Checked-out,2007-12-29,2008-01-01,2008-01-02,1,...,4336.857,433.693,3806.147,19764.823,530.929,53.071,14897.037,3319.383,4,4.0
3,74461,1.0,2008-01-01,38936,GB,Cancelled,2007-12-29,2008-01-01,2008-01-02,1,...,8536.766,853.662,1012.948,10392.28,1592.714,159.286,262.8,0.0,5,3.0
4,74466,1.0,2008-01-01,106279,HR,Cancelled,2007-12-29,2008-01-01,2008-01-03,1,...,,,,,,,,,6,4.0


## Filtering out cancelled reservations

In [16]:
# Keep every reservation row whose status is anything other than "Cancelled".
# The explicit .copy() makes the result an independent DataFrame, so assigning
# into it (e.g. via .loc) is unambiguous and does not raise
# SettingWithCopyWarning or depend on whether pandas returned a view of `dataset`.
not_cancelled = dataset["reservation_status"] != "Cancelled"
dataset_without_cancelled = dataset[not_cancelled].copy()

In [9]:
# Show the first five rows of the filtered frame.
dataset_without_cancelled.head(n=5)

Unnamed: 0,reservation_id,night_number,stay_date,guest_id,guest_country_id,reservation_status,reservation_date,date_from,date_to,resort_id,...,price,price_tax,total_price_tax,total_price,food_price,food_price_tax,other_price,other_price_tax,room_category_id,sales_channel_id
0,73710,1.0,2007-12-13,22897,HR,Checked-out,2007-11-28,2007-12-13,2007-12-15,1,...,4255.462,425.517,452.089,4564.69,265.428,26.572,43.8,0.0,3,10.0
1,73710,2.0,2007-12-14,22897,HR,Checked-out,2007-11-28,2007-12-13,2007-12-15,1,...,4243.709,424.349,450.921,4552.937,265.428,26.572,43.8,0.0,3,10.0
2,74464,1.0,2008-01-01,106278,HR,Checked-out,2007-12-29,2008-01-01,2008-01-02,1,...,4336.857,433.693,3806.147,19764.823,530.929,53.071,14897.037,3319.383,4,4.0
6,74470,1.0,2008-01-01,38936,GB,Checked-out,2007-12-29,2008-01-01,2008-01-02,1,...,8536.766,853.662,1012.948,10392.28,1592.714,159.286,262.8,0.0,4,3.0
7,74460,1.0,2008-01-01,38936,GB,Checked-out,2007-12-29,2008-01-01,2008-01-02,1,...,8536.766,853.662,1012.948,10392.28,1592.714,159.286,262.8,0.0,5,3.0


In [8]:
# Print the column dtypes and non-null counts of the filtered frame.
dataset_without_cancelled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25596 entries, 0 to 31633
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   reservation_id      25596 non-null  int64         
 1   night_number        25596 non-null  float64       
 2   stay_date           25596 non-null  datetime64[ns]
 3   guest_id            25596 non-null  int64         
 4   guest_country_id    25596 non-null  object        
 5   reservation_status  25596 non-null  object        
 6   reservation_date    25596 non-null  object        
 7   date_from           25596 non-null  object        
 8   date_to             25596 non-null  object        
 9   resort_id           25596 non-null  int64         
 10  cancel_date         113 non-null    object        
 11  room_cnt            25596 non-null  int64         
 12  adult_cnt           25596 non-null  int64         
 13  children_cnt        25596 non-null  int64         


## Price Estimation

### Price Calculation
- In the full dataset there are ~200 rows where the individual prices are known but the total price is missing, so here we compute it from the individual prices.
- After dropping the cancelled reservations, however, no such rows remain (the count below is 0).

In [17]:
# Count rows that have no total_price even though all three price components
# (price, food_price, other_price) are present.
total_missing = dataset_without_cancelled['total_price'].isna()
has_all_components = (
    dataset_without_cancelled[['price', 'food_price', 'other_price']]
    .notna()
    .all(axis=1)
)
count_missing_total_price = len(
    dataset_without_cancelled.loc[total_missing & has_all_components]
)

count_missing_total_price

0

In [11]:
# Fill in total_price for rows where it is missing but at least one of
# food_price / other_price is known.
#
# The mask is built entirely from `dataset_without_cancelled`: combining it
# with Series taken from the unfiltered `dataset` would mix two different
# indexes and rely on implicit alignment inside the `&`.
mask = dataset_without_cancelled['total_price'].isna() & (
    dataset_without_cancelled['food_price'].notna()
    | dataset_without_cancelled['other_price'].notna()
)

# Sum of the three components, treating a missing component as 0.
component_sum = (
    dataset_without_cancelled['price'].fillna(0)
    + dataset_without_cancelled['food_price'].fillna(0)
    + dataset_without_cancelled['other_price'].fillna(0)
)

# Build a new frame instead of writing into a filtered slice in place: only
# the rows selected by `mask` take the computed sum, every other row keeps its
# existing total_price. Re-running the cell is a no-op once the gaps are filled.
dataset_without_cancelled = dataset_without_cancelled.assign(
    total_price=dataset_without_cancelled['total_price'].mask(mask, component_sum)
)

### Estimating Missing Prices

In [23]:
# Number of non-cancelled rows that still have no `price` value.
price_is_missing = dataset_without_cancelled['price'].isna()
missing_prices_count = int(price_is_missing.sum())
missing_prices_count

152