In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# menyimpan dataset 'air_bnb.csv' dalam bentuk dataframe
data = pd.read_csv('air_bnb.csv')

In [2]:
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22552 entries, 0 to 22551
Data columns (total 16 columns):
id                                22552 non-null int64
name                              22493 non-null object
host_id                           22552 non-null int64
host_name                         22526 non-null object
neighbourhood_group               22552 non-null object
neighbourhood                     22552 non-null object
latitude                          22552 non-null float64
longitude                         22552 non-null float64
room_type                         22552 non-null object
price                             22552 non-null int64
minimum_nights                    22552 non-null int64
number_of_reviews                 22552 non-null int64
last_review                       18644 non-null object
reviews_per_month                 18638 non-null float64
calculated_host_listings_count    22552 non-null int64
availability_365                  22552 non-null int64

In [3]:
# mengecek apakah terdapat missing values
print(data.isna().any().any())

True


In [4]:
# karena terdapat missing values, maka cek jumlahnya
print(data.isna().sum())

id                                   0
name                                59
host_id                              0
host_name                           26
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       3908
reviews_per_month                 3914
calculated_host_listings_count       0
availability_365                     0
dtype: int64


In [5]:
# atribut "nama", dan "host_name" harus valid, bila data yang kosong tidak bisa didapatkan, maka data tersebut akan didrop
# index direset kembali karena telah terjadi drop baris/record
data.dropna(subset=['name', 'host_name'],inplace=True)
data.reset_index(drop=True)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2015,Berlin-Mitte Value! Quiet courtyard/very central,2217,Ian,Mitte,Brunnenstr. Süd,52.534537,13.402557,Entire home/apt,60,4,118,2018-10-28,3.76,4,141
1,2695,Prenzlauer Berg close to Mauerpark,2986,Michael,Pankow,Prenzlauer Berg Nordwest,52.548513,13.404553,Private room,17,2,6,2018-10-01,1.42,1,0
2,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.534996,13.417579,Entire home/apt,90,62,143,2017-03-20,1.25,1,220
3,3309,BerlinSpot Schöneberg near KaDeWe,4108,Jana,Tempelhof - Schöneberg,Schöneberg-Nord,52.498855,13.349065,Private room,26,5,25,2018-08-16,0.39,1,297
4,7071,BrightRoom with sunny greenview!,17391,Bright,Pankow,Helmholtzplatz,52.543157,13.415091,Private room,42,2,197,2018-11-04,1.75,1,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22462,29856708,Cozy Apartment right in the center of Berlin,87555909,Ulisses,Mitte,Brunnenstr. Süd,52.533865,13.400731,Entire home/apt,60,2,0,,,1,314
22463,29857108,Altbau/ Schöneberger Kiez / Schlafsofa,67537363,Jörg,Tempelhof - Schöneberg,Schöneberg-Nord,52.496211,13.341738,Shared room,20,1,0,,,6,78
22464,29864272,Artists loft with garden in the center of Berlin,3146923,Martin,Pankow,Prenzlauer Berg Südwest,52.531800,13.411999,Entire home/apt,85,3,0,,,2,15
22465,29866805,Room for two with private shower / WC,36961901,Arte Luise,Mitte,Alexanderplatz,52.520802,13.378688,Private room,99,1,0,,,3,6


In [6]:
# atribut 'last_review' tidak begitu perlukan, isi dengan forward fill
data[['last_review']] = data[['last_review']].fillna(method='ffill')

In [7]:
# untuk mengisi missing values pada reviews_per_month, digunakan library SimpleImputer dengan strategy median
from sklearn.impute import SimpleImputer
imp=SimpleImputer(missing_values=np.nan, strategy="median" )
imp.fit(data[["reviews_per_month"]])
data["reviews_per_month"]=imp.transform(data[["reviews_per_month"]]).ravel()

In [8]:
# mengubah data kategorikal yang akan diperlukan menjadi numerik dengan library LabelEncoder
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()
data[["room_type"]] = encode.fit_transform(data[["room_type"]]).ravel()

  y = column_or_1d(y, warn=True)


In [9]:
# dilakukan feature scaling (Standard Scaling) dengan menggunakan library StandardScaler
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
data['latitude'] = s.fit_transform(data[['latitude']])
data['longitude'] = s.fit_transform(data[['longitude']])
data['price'] = s.fit_transform(data[['price']])
data['number_of_reviews'] = s.fit_transform(data[['number_of_reviews']])
data['reviews_per_month'] = s.fit_transform(data[['reviews_per_month']])
data['availability_365'] = s.fit_transform(data[['availability_365']])

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22467 entries, 0 to 22551
Data columns (total 16 columns):
id                                22467 non-null int64
name                              22467 non-null object
host_id                           22467 non-null int64
host_name                         22467 non-null object
neighbourhood_group               22467 non-null object
neighbourhood                     22467 non-null object
latitude                          22467 non-null float64
longitude                         22467 non-null float64
room_type                         22467 non-null int32
price                             22467 non-null float64
minimum_nights                    22467 non-null int64
number_of_reviews                 22467 non-null float64
last_review                       22467 non-null object
reviews_per_month                 22467 non-null float64
calculated_host_listings_count    22467 non-null int64
availability_365                  22467 non-null fl

In [11]:
# Menyimpan data hasil preprocessing
data.to_csv('fix_air_bnb.csv', index=False)