In [10]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from scipy import stats

Data required to run this notebook:
- Cleaned base price data (`CLEAN_RoomBasePrice_11_06_2024.csv`)
- area_names.csv
- property_desings.csv
- property_types.csv

In [2]:
# Load the cleaned room base-price table that the later cells transform.
# NOTE(review): the absolute '/content/...' path ties the notebook to one
# environment (presumably Google Colab) — TODO confirm / make configurable.
# NOTE(review): the data.info() output further down lists an 'Unnamed: 0'
# column, which looks like a row index saved by an earlier to_csv — confirm
# whether it should be read with index_col=0 or dropped.
data = pd.read_csv('/content/CLEAN_RoomBasePrice_11_06_2024.csv')

In [4]:
def _as_lookup_key(value):
  """Return `value` as the string key used for the reference lookup.

  Missing values (None / NaN / NA) become None. Floats that hold a whole
  number are rendered without the trailing '.0' (3.0 -> '3') so that ids
  stored as float in one table and as int or str in the other still agree.
  """
  if pd.isna(value):
    return None
  if isinstance(value, float) and value.is_integer():
    return str(int(value))
  return str(value)


def convert_column_with_reference(df, reference, source_column, source_reference, target_column):
  """Replace `source_column` of `df` with values looked up in `reference`.

  Args:
    df: frame holding the column to convert. It is not modified.
    reference: lookup frame containing `source_reference` and `target_column`.
    source_column: column of `df` whose values are looked up.
    source_reference: column of `reference` holding the lookup keys.
    target_column: column of `reference` holding the replacement values; also
      the name of the column added to the result.

  Returns:
    A new frame equal to `df` without `source_column` and with
    `target_column` appended as the last column. Rows whose key is missing
    or has no match in `reference` get a missing value (None).
  """
  # Normalize the reference keys exactly like the looked-up values; otherwise
  # integer ids in `reference` never match the stringified ids from `df`.
  lookup = {}
  for raw_key, target_value in zip(reference[source_reference], reference[target_column]):
    key = _as_lookup_key(raw_key)
    if key is not None:
      lookup[key] = target_value  # on duplicate keys the last row wins

  converted = df[source_column].apply(lambda value: lookup.get(_as_lookup_key(value)))

  # Build the result on a new frame so the caller's `df` keeps its columns.
  result = df.drop(columns=[source_column])
  result[target_column] = converted
  return result

## Convert `area_id` into `area_name`

In [6]:
# Reference table used by the next cell to translate each 'area_id' into
# its 'area_name' (the two columns that call reads from this frame).
area_names = pd.read_csv('/content/area_names.csv')

In [7]:
# Swap the 'area_id' column of `data` for the matching 'area_name'.
# Re-running this cell on its own fails with a KeyError: the first run drops
# 'area_id', so reload `data` before running it again.
data = convert_column_with_reference(
    df=data,
    reference=area_names,
    source_column='area_id',
    source_reference='area_id',
    target_column='area_name',
)

## Feature engineering: amenity count, room ratios and price-per-unit features


In [None]:
# Amenity indicator columns; their row-wise sum becomes `total_fas`.
amenity_cols = ['ac', 'balcony', 'beachfront', 'breakfast', 'building_staff',
                'cable_tv', 'essentials', 'garden', 'gym', 'hair_dryer',
                'hanger', 'heating', 'hot_water', 'kitchen', 'linens',
                'lock', 'luggage_drop_off', 'parking', 'pool',
                'private_entrance', 'shampoo', 'tv', 'washer', 'wifi',
                'workspace']


def _ratio_or_nan(numerator, denominator):
  """Divide two Series element-wise, giving NaN where the denominator is 0.

  Plain pandas division by 0 yields +/-inf, which distorts summary statistics
  and is rejected by most estimators and imputers; NaN marks the ratio as
  undefined instead.
  """
  return numerator / denominator.where(denominator != 0)


data['total_fas'] = data[amenity_cols].sum(axis=1)

# Room-layout ratios.
data['ratio_bedroom_bathroom'] = _ratio_or_nan(data['bedroom'], data['bathroom'])
# NOTE: despite its name this is capacity per bedroom (capacity / bedroom);
# the column name is kept so existing references to it keep working.
data['ratio_bedroom_cap'] = _ratio_or_nan(data['capacity'], data['bedroom'])

# Baseline price divided by a location or size measure.
# NOTE(review): these columns are derived from average_baseline_price; if that
# column is the prediction target they leak it — confirm before modelling.
price = data['average_baseline_price']
data['avg_price_distance_to_coast'] = _ratio_or_nan(price, data['distance_to_coastline'])
data['avg_price_distance_to_airport'] = _ratio_or_nan(price, data['area_distance_to_airport'])
data['avg_price_bedroom'] = _ratio_or_nan(price, data['bedroom'])
data['avg_price_beds'] = _ratio_or_nan(price, data['beds'])
data['avg_price_bathroom'] = _ratio_or_nan(price, data['bathroom'])
data['avg_price_total_fas'] = _ratio_or_nan(price, data['total_fas'])

In [8]:
# Print the frame's column names, non-null counts and dtypes as a sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                142 non-null    int64  
 1   room_id                   142 non-null    float64
 2   unit_type_name            142 non-null    object 
 3   property_design           142 non-null    float64
 4   property_type             142 non-null    float64
 5   number_of_bookings        142 non-null    int64  
 6   bedroom                   142 non-null    float64
 7   bathroom                  142 non-null    float64
 8   beds                      142 non-null    float64
 9   capacity                  142 non-null    float64
 10  ac                        142 non-null    float64
 11  balcony                   142 non-null    float64
 12  beachfront                142 non-null    float64
 13  breakfast                 142 non-null    float64
 14  building_s

## Remove Outliers

In [11]:
# Columns screened for extreme values.
numerical_cols = ['bedroom','bathroom','beds','capacity','lat','lng','distance_to_coastline','area_distance_to_airport','average_baseline_price']

# Column-wise z-scores. nan_policy='omit' keeps a single missing value from
# turning its whole column's scores into NaN, which would silently switch off
# outlier detection for that column; the missing cells themselves stay NaN.
z_scores = stats.zscore(data[numerical_cols], nan_policy='omit')

# A row is dropped when any screened column lies more than `threshold`
# standard deviations from that column's mean. NaN scores compare as False,
# so rows are never dropped because of a missing value.
threshold = 5
outlier_mask = (abs(z_scores) > threshold).any(axis=1)

# Copy so later edits to `fe_data` do not operate on a slice of `data`.
fe_data = data[~outlier_mask].copy()

## Save dataset

In [12]:
# Persist the outlier-filtered frame built in the previous cell. Writing
# `data` here would discard the outlier removal, because `data` still
# contains every row.
# The row index is still written as the first, unnamed CSV column (the
# to_csv default), so the file layout is unchanged for existing readers.
filepath = 'FE_RoomBasePrice.csv'
fe_data.to_csv(filepath)