# Cleaning and Preprocessing Data

In [1]:
# import necessary packages
import pandas as pd
import wandb

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.impute import SimpleImputer

import category_encoders as ce

In [2]:
# Start a W&B run and fetch the training split stored as the `train_set` artifact.
run = wandb.init(project='rental-prices-ny', entity='vitorabdo')
local_path = wandb.use_artifact('train_set:latest').file()

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output shows a warning raised on this read_csv line,
# presumably a mixed-type DtypeWarning -- confirm that it is gone after this change.
train_set = pd.read_csv(local_path, low_memory=False)
train_set.head()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvitorbeltraoo[0m ([33mvitorabdo[0m). Use [1m`wandb login --relogin`[0m to force relogin


  train_set = pd.read_csv(local_path)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,718031653455200639,Cozy Work from Home Studio in Upper East Side,2867137,Avi,Manhattan,Upper East Side,40.76939,-73.95498,Entire home/apt,150,30,0,,,80,180,0,
1,53570786,NEW Renovated room and bathroom. (2FL),19303369,Hiroki,Queens,Elmhurst,40.74515,-73.87187,Private room,39,30,1,2022-10-09,0.52,196,0,1,
2,2142092,"Furnished room - W. 181 St. by A, 1",8280182,Alejandro,Manhattan,Washington Heights,40.85098,-73.93664,Private room,300,30,0,,,1,0,0,
3,26916746,Brooklyn Home,193502084,Linda,Brooklyn,Borough Park,40.64045,-74.00404,Private room,40,30,26,2019-03-20,0.49,8,0,0,
4,74333,Alcove Studio w/ outdoor Patio Deck,331328,Amir,Manhattan,East Harlem,40.80834,-73.94075,Entire home/apt,100,30,39,2022-10-04,0.29,2,347,3,


## Cleaning Step

In [3]:
# Keep only listings whose price and minimum stay fall inside the accepted
# ranges (both bounds inclusive): 10 <= price <= 5944, 1 <= minimum_nights <= 370.
df_clean = train_set[
    train_set['price'].between(10, 5944)
    & train_set['minimum_nights'].between(1, 370)
]

## Preprocessing Step

In [4]:
# Categorical features, split by whether their levels have a natural order.
ordinal_categorical = [
    'room_type',
]
non_ordinal_categorical = [
    'neighbourhood_group',
]

# Numerical features whose missing values are filled with zero.
zero_imputed = [
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

In [5]:
# Separate the target (`price`) from the remaining columns. X still holds every
# non-target column here; the ColumnTransformer defined below picks the ones it
# needs and drops the rest.
y = df_clean['price']
X = df_clean.drop(columns=['price'])

In [6]:
# Ordinal categorical features: encode `room_type` with an explicit integer
# for each level instead of letting the encoder assign codes on its own.
room_type_levels = {
    'Shared room': 0,
    'Private room': 1,
    'Entire home/apt': 2,
    'Hotel room': 3,
}
ordinal_categorical_preproc = ce.OrdinalEncoder(
    cols=ordinal_categorical,
    mapping=[{'col': 'room_type', 'mapping': room_type_levels}],
)

# Non-ordinal categorical features: fill gaps with the most frequent value,
# then one-hot encode, dropping the first level of each feature.
non_ordinal_categorical_preproc = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='first'),
)

In [7]:
# Numerical features: replace missing values with the constant 0.
zero_imputer = SimpleImputer(
    fill_value=0,
    strategy='constant',
)

In [8]:
# Route each feature group to its transformer. Columns that belong to no
# group are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal_cat', ordinal_categorical_preproc, ordinal_categorical),
        ('non_ordinal_cat', non_ordinal_categorical_preproc, non_ordinal_categorical),
        ('impute_zero', zero_imputer, zero_imputed),
    ],
    remainder='drop',
)

In [9]:
# Fit the preprocessor on X and keep the transformed feature matrix.
# fit() alone returns the fitted ColumnTransformer itself, not the data, so
# fit_transform() is needed for X_transformed to actually hold the encoded features.
X_transformed = preprocessor.fit_transform(X)

# Preview the first rows only, rather than dumping the whole matrix.
X_transformed[:5]

In [10]:
# Names of the input columns consumed by the preprocessor, in transformer order.
# NOTE(review): these are the *input* names. The one-hot step can expand
# `neighbourhood_group` into several columns, so this list may not match the
# columns of the transformed matrix one-to-one -- confirm before using it as
# output column labels.
processed_features = [
    *ordinal_categorical,
    *non_ordinal_categorical,
    *zero_imputed,
]

In [11]:
# Show the names of the input columns handled by the preprocessor.
print(processed_features)

['room_type', 'neighbourhood_group', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']


In [1]:
# Finish the W&B run. `run` is the object returned by wandb.init() in the
# data-loading cell, so that cell must already have been executed in the current
# kernel; running this cell on its own in a fresh kernel raises NameError.
run.finish()

NameError: name 'run' is not defined