In [72]:
import numpy as np
import pandas as pd

In [73]:
# Load the raw room base-price export into a DataFrame.
# NOTE(review): the absolute "/content/..." path only resolves where that
# directory exists; confirm before running in another environment.
df = pd.read_csv(filepath_or_buffer="/content/RoomBasePrice_2024_06_11.csv")

Drop rows where `ac` is NaN: the one-hot amenity features (e.g. `ac`, `balcony`) are usually missing together, so when one of them is NaN the rest are NaN as well.

In [74]:
# Keep only the rows that have a value in every one of these required columns.
# A single dropna over the whole subset removes a row as soon as any of the
# listed columns is NaN.
data = df.dropna(subset=['ac', 'bedroom', 'room_id', 'lat'])

In [75]:
# Relabel the 'name' column as 'unit_type_name'; `data` is modified in place.
data.rename(mapper={'name': 'unit_type_name'}, axis='columns', inplace=True)

In [76]:
def impute_with_mode(df, features):
    """
    Impute missing values in the specified features of a DataFrame with the most common (mode) values.

    The DataFrame is modified in place and the same object is returned.
    Columns that are not present in the DataFrame, or that contain no
    non-missing values (so no mode exists), are left untouched and a warning
    is printed for each of them.

    Parameters:
    df (pd.DataFrame): The input DataFrame (modified in place).
    features (list): List of column names to be imputed.

    Returns:
    pd.DataFrame: The same DataFrame object with imputed values.
    """
    for feature in features:
        if feature not in df.columns:
            print(f"Warning: Column '{feature}' does not exist in the DataFrame.")
            continue

        # mode() ignores NaN by default and returns the modes sorted, so the
        # first entry is the smallest of the most frequent values.
        modes = df[feature].mode()
        if modes.empty:
            # Every value is missing: there is nothing to impute with.
            print(f"Warning: Column '{feature}' has no non-missing values; skipping imputation.")
            continue

        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on df[feature] operates on an intermediate
        # object, which is deprecated and does not update df under
        # pandas Copy-on-Write.
        df[feature] = df[feature].fillna(modes.iloc[0])
    return df

In [77]:
# Columns whose missing values are replaced by the column's most frequent value.
featuresToImpute = [
    'beds', 'capacity', 'bathroom',
    'ac', 'balcony', 'beachfront', 'breakfast', 'building_staff',
    'cable_tv', 'essentials', 'garden', 'gym', 'hair_dryer',
    'hanger', 'heating', 'hot_water', 'kitchen', 'linens',
    'lock', 'luggage_drop_off', 'parking', 'pool', 'private_entrance',
    'shampoo', 'tv', 'washer', 'wifi', 'workspace',
]
data = impute_with_mode(df=data, features=featuresToImpute)

In [78]:
# Impute missing coastline distances with the column mean.
mean_value = data['distance_to_coastline'].mean()
# Assign the filled column back to the frame: fillna(inplace=True) on the
# selected column acts on an intermediate object, which is deprecated and
# leaves `data` unchanged under pandas Copy-on-Write.
data['distance_to_coastline'] = data['distance_to_coastline'].fillna(mean_value)

In [79]:
# Print the column dtypes and non-null counts of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 1 to 309
Data columns (total 43 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   room_id                 142 non-null    float64
 1   average_baseline_price  142 non-null    float64
 2   number_of_bookings      142 non-null    int64  
 3   unit_id                 142 non-null    object 
 4   bathroom                142 non-null    float64
 5   bedroom                 142 non-null    float64
 6   beds                    142 non-null    float64
 7   capacity                142 non-null    float64
 8   unit_type_id            142 non-null    float64
 9   property_id             142 non-null    object 
 10  unit_type_name          142 non-null    object 
 11  ac                      142 non-null    float64
 12  balcony                 142 non-null    float64
 13  beachfront              142 non-null    float64
 14  breakfast               142 non-null    float64

In [80]:
#For FE and training prep
column_to_drop = ['unit_id', 'unit_type_id', 'property_id']
clean_data = data.drop(columns=column_to_drop)

In [82]:
# Final column order for the cleaned dataset.
dataProperOrder = [
    # identifiers and descriptors
    'room_id', 'unit_type_name', 'property_design', 'property_type', 'area_id',
    # booking volume and room size
    'number_of_bookings', 'bedroom', 'bathroom', 'beds', 'capacity',
    # amenity flags
    'ac', 'balcony', 'beachfront', 'breakfast', 'building_staff',
    'cable_tv', 'essentials', 'garden', 'gym', 'hair_dryer',
    'hanger', 'heating', 'hot_water', 'kitchen', 'linens',
    'lock', 'luggage_drop_off', 'parking', 'pool', 'private_entrance',
    'shampoo', 'tv', 'washer', 'wifi', 'workspace',
    # location and property-level fields
    'lat', 'lng', 'distance_to_coastline', 'property_bedrooms',
    # listed last
    'average_baseline_price',
]


In [83]:
# Select the columns named in dataProperOrder, in that order; any column not
# listed there is left out of clean_data.
clean_data = clean_data.loc[:, dataProperOrder]

In [84]:
# Print the column dtypes and non-null counts of the reordered frame.
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 1 to 309
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   room_id                 142 non-null    float64
 1   unit_type_name          142 non-null    object 
 2   property_design         142 non-null    float64
 3   property_type           142 non-null    float64
 4   area_id                 142 non-null    object 
 5   number_of_bookings      142 non-null    int64  
 6   bedroom                 142 non-null    float64
 7   bathroom                142 non-null    float64
 8   beds                    142 non-null    float64
 9   capacity                142 non-null    float64
 10  ac                      142 non-null    float64
 11  balcony                 142 non-null    float64
 12  beachfront              142 non-null    float64
 13  breakfast               142 non-null    float64
 14  building_staff          142 non-null    float64

In [85]:
# Write the cleaned frame to a CSV file in the current working directory.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (to_csv default); confirm downstream readers expect that.
filepath = 'CLEAN_RoomBasePrice_11_06_2024.csv'
clean_data.to_csv(path_or_buf=filepath)

In [71]:
# Summary statistics for the numeric columns of clean_data.
# NOTE(review): this cell's execution count (71) precedes the cells above it,
# so the output shown may come from an earlier run — re-run after a restart to confirm.
clean_data.describe()

Unnamed: 0,room_id,property_design,property_type,number_of_bookings,bedroom,bathroom,beds,capacity,ac,balcony,...,shampoo,tv,washer,wifi,workspace,lat,lng,distance_to_coastline,property_bedrooms,average_baseline_price
count,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,...,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
mean,402840.626761,1.514085,3.598592,116.528169,1.316901,1.401408,1.323944,2.598592,0.978873,0.521127,...,0.71831,0.535211,0.084507,0.985915,0.880282,-8.654587,114.781158,2259.288484,7.34507,1035580.0
std,17173.842897,1.52862,0.825651,136.874519,0.836827,1.065646,0.838348,1.52548,0.144316,0.501322,...,0.451415,0.500524,0.279131,0.118257,0.325781,0.287746,1.414937,3532.922615,5.860126,1006192.0
min,378260.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-8.837896,110.360967,12.9244,1.0,253792.6
25%,394914.25,0.0,4.0,20.0,1.0,1.0,1.0,2.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,-8.81394,115.110537,458.63825,4.0,438770.4
50%,397695.5,1.0,4.0,67.0,1.0,1.0,1.0,2.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,-8.709399,115.130054,1133.15,6.0,774304.9
75%,399751.5,3.0,4.0,158.0,1.0,1.0,1.0,2.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,-8.655257,115.206654,2259.288484,7.0,1220445.0
max,472846.0,5.0,4.0,674.0,5.0,5.0,5.0,9.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,-7.728722,115.586846,14402.1,27.0,6651338.0
