<a href="https://colab.research.google.com/github/cboyda/MachineLearning/blob/main/PA5_Team1_W23_ipynd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Project Assignment #4: Decision Tree**

Team member names:

*  Brett Adams
*  Cailenys Leslie
*  Clinton Boyda 
*  Tanvir Hossain
*  Ram Dershan

Dataset: 
[New York City Airbnb Open Data](https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data)

In [709]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
from  sklearn import neighbors
import plotly.graph_objects as go
import math
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore") # disable warnings when making remote calls

In [710]:
# Connect to Dataset

#original filename = "https://raw.githubusercontent.com/cboyda/MachineLearning/main/AB_NYC_2019.csv"
#df = pd.read_csv(filename)

# Pull both source tables directly from the project's GitHub repository.
original = "https://raw.githubusercontent.com/cboyda/MachineLearning/main/AB_NYC_2019.csv"
additional = "https://raw.githubusercontent.com/cboyda/MachineLearning/main/full_nyc_dataset_cleaned_table-1.csv"
df_original, df_additional = (pd.read_csv(url) for url in (original, additional))

In [711]:
# Inner-join the two tables on the listing id. validate="one_to_one" asks pandas
# to check that id is unique on both sides, and the suffixes label any column
# name that appears in both tables.
df = df_original.merge(
    df_additional,
    on="id",
    how="inner",
    validate="one_to_one",
    suffixes=("_original", "_additional"),
)
df.shape

(16005, 22)

In [712]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type_original', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'property_type', 'room_type_additional',
       'accommodates', 'bathrooms_text', 'bedrooms', 'beds'],
      dtype='object')

In [713]:
df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type_original,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,property_type,room_type_additional,accommodates,bathrooms_text,bedrooms,beds
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,...,2019-05-21,0.38,2,355,Entire rental unit,Entire home/apt,1,1 bath,,1.0
1,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,...,2017-10-05,0.40,1,0,Private room in rental unit,Private room,2,,1.0,1.0
2,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,...,2019-06-24,3.47,1,220,Private room in rental unit,Private room,2,1 bath,1.0,1.0
3,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,...,2017-07-21,0.99,1,0,Private room in rental unit,Private room,1,1 shared bath,1.0,1.0
4,5803,"Lovely Room 1, Garden, Best Area, Legal rental",9744,Laurie,Brooklyn,South Slope,40.66829,-73.98779,Private room,89,...,2019-06-24,1.34,3,314,Private room in townhouse,Private room,2,1.5 baths,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16000,36457832,"❥NYC Apt: 4min/subway, 25m/city, 20m/LGA,JFK❥",63272360,Annie Lawrence,Queens,Woodhaven,40.69482,-73.86618,Entire home/apt,85,...,,,6,300,Entire home,Entire home/apt,2,1 bath,1.0,3.0
16001,36471896,Private Bedroom & PRIVATE BATHROOM in Manhattan,23548340,Sarah,Manhattan,Upper East Side,40.77192,-73.95369,Private room,95,...,,,1,2,Private room in rental unit,Private room,2,1 private bath,1.0,1.0
16002,36477307,Brooklyn paradise,241945355,Clement & Rose,Brooklyn,Flatlands,40.63116,-73.92616,Entire home/apt,170,...,,,2,363,Entire rental unit,Entire home/apt,6,1 bath,2.0,2.0
16003,36481615,"Peaceful space in Greenpoint, BK",274298453,Adrien,Brooklyn,Greenpoint,40.72585,-73.94001,Private room,54,...,,,1,15,Private room in rental unit,Private room,2,1 shared bath,1.0,1.0


# **Data Cleaning**

In [714]:
# check value counts for property_type
df['property_type'].value_counts()

Entire rental unit                    6975
Private room in rental unit           5153
Private room in home                   844
Entire home                            513
Entire condo                           418
Private room in townhouse              352
Entire loft                            326
Entire townhouse                       297
Private room in condo                  180
Shared room in rental unit             178
Private room in loft                   149
Entire guest suite                     133
Entire serviced apartment               98
Room in boutique hotel                  68
Room in hotel                           56
Private room in guest suite             37
Entire place                            33
Room in serviced apartment              24
Shared room in loft                     19
Entire guesthouse                       19
Private room                            18
Private room in resort                  17
Private room in bed and breakfast       14
Shared room

There are property types that we do not want to consider in our analysis (for example boats, caves and villas), so we will remove these examples.

In [715]:
# Check shape before dropping examples
df.shape

(16005, 22)

In [716]:
# Property types that we do not want to consider in the analysis.
excluded_property_types = [
    'Cave', 'Boat', 'Floor', 'Private room in farm stay', 'Entire villa',
    'Private room in houseboat', 'Private room in villa', 'Private room in tent',
    'Houseboat',
]
# isin() replaces nine chained equality tests; the matching rows are still
# removed by their index labels, exactly as before.
df = df.drop(df[df['property_type'].isin(excluded_property_types)].index)

In [717]:
# Check shape after dropping examples
df.shape

(15986, 22)

In [718]:
# assess new value counts for property_type
df['property_type'].value_counts()

Entire rental unit                    6975
Private room in rental unit           5153
Private room in home                   844
Entire home                            513
Entire condo                           418
Private room in townhouse              352
Entire loft                            326
Entire townhouse                       297
Private room in condo                  180
Shared room in rental unit             178
Private room in loft                   149
Entire guest suite                     133
Entire serviced apartment               98
Room in boutique hotel                  68
Room in hotel                           56
Private room in guest suite             37
Entire place                            33
Room in serviced apartment              24
Entire guesthouse                       19
Shared room in loft                     19
Private room                            18
Private room in resort                  17
Private room in bed and breakfast       14
Shared room

In [719]:
# Extract the numeric bathroom count from the free-text bathrooms_text column.
# The half-bath labels contain no digits, so they are first mapped to the text
# '0.5'. They must stay strings: the .str accessor yields NaN for non-string
# values, so writing the float 0.5 here would lose these listings at the
# extract step (and they would later be filled with 0 bathrooms).
# Plain assignment is used instead of a chained inplace mask() on a column
# selection, which is not guaranteed to write back to df.
half_bath_labels = ['Half-bath', 'Shared half-bath', 'Private half-bath']
df['bathrooms_text'] = df['bathrooms_text'].replace(half_bath_labels, '0.5')
# expand=False returns a Series (one capture group) holding the matched number as text.
df['bathrooms'] = df['bathrooms_text'].str.extract(r'\b([\d.]+)\b', expand=False)

In [720]:
# let's look closer at the property_type values, perhaps this can be simplified
print(df['property_type'].unique())
print("Number of property_type unique values:",df['property_type'].nunique())

['Entire rental unit' 'Private room in rental unit'
 'Private room in townhouse' 'Entire guest suite' 'Entire loft'
 'Private room in home' 'Entire condo' 'Private room in condo'
 'Private room in loft' 'Entire home' 'Entire townhouse'
 'Private room in bed and breakfast' 'Entire guesthouse'
 'Private room in guest suite' 'Room in boutique hotel'
 'Shared room in rental unit' 'Shared room in home' 'Private room'
 'Entire place' 'Entire serviced apartment' 'Private room in guesthouse'
 'Room in serviced apartment' 'Entire cottage' 'Shared room in loft'
 'Private room in serviced apartment' 'Entire bungalow' 'Room in hotel'
 'Shared room in townhouse' 'Private room in hostel'
 'Private room in bungalow' 'Shared room in condo'
 'Private room in resort' 'Shared room in floor' 'Private room in floor'
 'Tiny home' 'Entire home/apt' 'Shared room in guest suite'
 'Room in resort' 'Room in aparthotel' 'Shared room in guesthouse'
 'Room in bed and breakfast']
Number of property_type unique value

In [721]:
# Collapse every property type that mentions "Private room" into one category.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every value unchanged.
df['property_type'] = df['property_type'].str.replace(r'(^.*Private room.*$)', 'Private Room', regex=True)

In [722]:
# Collapse every property type that mentions "Entire" into one category.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every value unchanged.
df['property_type'] = df['property_type'].str.replace(r'(^.*Entire.*$)', 'Entire Unit', regex=True)

In [723]:
# Collapse every property type that mentions "Shared room" into one category.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every value unchanged.
df['property_type'] = df['property_type'].str.replace(r'(^.*Shared room.*$)', 'Shared Room', regex=True)

In [724]:
# Collapse the "Room in ..." property types (e.g. 'Room in hotel') into one category.
# The match is case-sensitive, so values containing lowercase "room in" are not affected.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every value unchanged.
df['property_type'] = df['property_type'].str.replace(r'(^.*Room in.*$)', 'Room In', regex=True)

In [725]:
df['property_type'].value_counts()

Entire Unit     8826
Private Room    6780
Shared Room      220
Room In          154
Tiny home          6
Name: property_type, dtype: int64

In [726]:
print(df['property_type'].unique())
print("Number of property_type unique values:",df['property_type'].nunique())

['Entire Unit' 'Private Room' 'Room In' 'Shared Room' 'Tiny home']
Number of property_type unique values: 5


In [727]:
# Convert bathroom to float type
df['bathrooms'] = df['bathrooms'].astype(float)

In [728]:
# drop bathroom_text, beds, and duplicated room_type column
df.drop(['bathrooms_text', 'room_type_additional', 'beds'], axis = 1, inplace = True)

In [729]:
# drop suffix from room_type_original
df = df.rename(columns = {'room_type_original' : 'room_type'})

In [730]:
df['room_type'].value_counts()

Entire home/apt    8803
Private room       6963
Shared room         220
Name: room_type, dtype: int64

In [731]:
# check for null values
df.isnull().sum()

id                                   0
name                                11
host_id                              0
host_name                           10
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       3010
reviews_per_month                 3010
calculated_host_listings_count       0
availability_365                     0
property_type                        0
accommodates                         0
bedrooms                          1562
bathrooms                           52
dtype: int64

For bedrooms and bathrooms with null values, fill with zero as properties can have no bedrooms or bathrooms

In [732]:
df[['bedrooms', 'bathrooms']] = df[['bedrooms', 'bathrooms']].fillna(value=0)

In [733]:
# Check null values again to confirm
df.isnull().sum()

id                                   0
name                                11
host_id                              0
host_name                           10
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       3010
reviews_per_month                 3010
calculated_host_listings_count       0
availability_365                     0
property_type                        0
accommodates                         0
bedrooms                             0
bathrooms                            0
dtype: int64

All other columns with null values are not important for this analysis as these columns will be dropped.

In [734]:
df.duplicated().any()

False

In [735]:
# Flag fully duplicated rows and build a frame with those rows filtered out.
duplicate_rows = df.duplicated()
df_no_dups = df.loc[~duplicate_rows]
print(f"There are {duplicate_rows.sum()} duplicate rows in our dataframe that need to be considered.")

There are 0 duplicate rows in our dataframe that need to be considered.


In [736]:
df.shape

(15986, 20)

In [737]:
# really only needed if duplicate_rows > 0
# NOTE: after this assignment df and df_no_dups are the same object, so the
# in-place reset below is visible through both names.
df = df_no_dups
# Without drop=True the previous index is kept as a new 'index' column,
# which is why the frame grows from 20 to 21 columns here.
df.reset_index(inplace=True)

In [738]:
df.shape

(15986, 21)

In [739]:
df_no_dups

Unnamed: 0,index,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,...,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,property_type,accommodates,bedrooms,bathrooms
0,0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,...,1,45,2019-05-21,0.38,2,355,Entire Unit,1,0.0,1.0
1,1,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,...,45,49,2017-10-05,0.40,1,0,Private Room,2,1.0,0.0
2,2,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,...,2,430,2019-06-24,3.47,1,220,Private Room,2,1.0,1.0
3,3,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,...,2,118,2017-07-21,0.99,1,0,Private Room,1,1.0,1.0
4,4,5803,"Lovely Room 1, Garden, Best Area, Legal rental",9744,Laurie,Brooklyn,South Slope,40.66829,-73.98779,Private room,...,4,167,2019-06-24,1.34,3,314,Private Room,2,1.0,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15981,16000,36457832,"❥NYC Apt: 4min/subway, 25m/city, 20m/LGA,JFK❥",63272360,Annie Lawrence,Queens,Woodhaven,40.69482,-73.86618,Entire home/apt,...,3,0,,,6,300,Entire Unit,2,1.0,1.0
15982,16001,36471896,Private Bedroom & PRIVATE BATHROOM in Manhattan,23548340,Sarah,Manhattan,Upper East Side,40.77192,-73.95369,Private room,...,1,0,,,1,2,Private Room,2,1.0,1.0
15983,16002,36477307,Brooklyn paradise,241945355,Clement & Rose,Brooklyn,Flatlands,40.63116,-73.92616,Entire home/apt,...,1,0,,,2,363,Entire Unit,6,2.0,1.0
15984,16003,36481615,"Peaceful space in Greenpoint, BK",274298453,Adrien,Brooklyn,Greenpoint,40.72585,-73.94001,Private room,...,6,0,,,1,15,Private Room,2,1.0,1.0


# **Feature Scaling**


In [740]:
df.columns

Index(['index', 'id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'property_type', 'accommodates', 'bedrooms',
       'bathrooms'],
      dtype='object')

In [741]:
# Keep only the columns used in this first, deliberately over-simplified iteration.
unused_columns = [
    'index', 'neighbourhood', 'name', 'host_name', 'number_of_reviews',
    'last_review', 'reviews_per_month', 'calculated_host_listings_count',
    'id', 'host_id', 'latitude', 'longitude',
]
df.drop(columns=unused_columns, inplace=True)

In [742]:
#define clean as duplicate
df_clean = df.copy()

In [743]:
df_clean

Unnamed: 0,neighbourhood_group,room_type,price,minimum_nights,availability_365,property_type,accommodates,bedrooms,bathrooms
0,Manhattan,Entire home/apt,225,1,355,Entire Unit,1,0.0,1.0
1,Brooklyn,Private room,60,45,0,Private Room,2,1.0,0.0
2,Manhattan,Private room,79,2,220,Private Room,2,1.0,1.0
3,Manhattan,Private room,79,2,0,Private Room,1,1.0,1.0
4,Brooklyn,Private room,89,4,314,Private Room,2,1.0,1.5
...,...,...,...,...,...,...,...,...,...
15981,Queens,Entire home/apt,85,3,300,Entire Unit,2,1.0,1.0
15982,Manhattan,Private room,95,1,2,Private Room,2,1.0,1.0
15983,Brooklyn,Entire home/apt,170,1,363,Entire Unit,6,2.0,1.0
15984,Brooklyn,Private room,54,6,15,Private Room,2,1.0,1.0


In [744]:
df_clean.shape

(15986, 9)

In [745]:
# availability_365 == 0 means the unit is NOT available, so those rows are
# removed from the modelling data.
zero_availability = df_clean[df_clean['availability_365'] == 0].index
df_clean.drop(index=zero_availability, inplace=True)

Drop units that simply cannot be rented. This includes listings with `availability_365` = 0.

In [746]:
df_clean.shape

(8624, 9)

In [747]:
# dropping availability_365 feature at this stage since it was a filter not a feature
df_clean.drop(['availability_365'], axis=1, inplace = True)

In [748]:
df_clean.shape

(8624, 8)

In [749]:
# Split the cleaned columns by dtype: numeric columns vs. everything else.
numeric_data = df_clean.select_dtypes(include='number')
categorical_data = df_clean.select_dtypes(exclude='number')

In [750]:
df_clean['neighbourhood_group'] = df_clean['neighbourhood_group'].astype('category')

In [751]:
numeric_data

Unnamed: 0,price,minimum_nights,accommodates,bedrooms,bathrooms
0,225,1,1,0.0,1.0
2,79,2,2,1.0,1.0
4,89,4,2,1.0,1.5
5,140,2,3,0.0,1.0
6,215,2,4,1.0,1.0
...,...,...,...,...,...
15981,85,3,2,1.0,1.0
15982,95,1,2,1.0,1.0
15983,170,1,6,2.0,1.0
15984,54,6,2,1.0,1.0


In [752]:
categorical_data

Unnamed: 0,neighbourhood_group,room_type,property_type
0,Manhattan,Entire home/apt,Entire Unit
2,Manhattan,Private room,Private Room
4,Brooklyn,Private room,Private Room
5,Brooklyn,Entire home/apt,Entire Unit
6,Brooklyn,Entire home/apt,Entire Unit
...,...,...,...
15981,Queens,Entire home/apt,Entire Unit
15982,Manhattan,Private room,Private Room
15983,Brooklyn,Entire home/apt,Entire Unit
15984,Brooklyn,Private room,Private Room


In [753]:
# any null values? 0 means none found == no need to fix nulls
df_clean.isna().sum()

neighbourhood_group    0
room_type              0
price                  0
minimum_nights         0
property_type          0
accommodates           0
bedrooms               0
bathrooms              0
dtype: int64

In [754]:
# List the distinct values (and how many there are) for every column.
# The label can stay categorical; the other columns should be binary for simplicity.
for col, values in df_clean.items():
    print(col, values.unique(), values.nunique())

neighbourhood_group ['Manhattan', 'Brooklyn', 'Queens', 'Staten Island', 'Bronx']
Categories (5, object): ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'] 5
room_type ['Entire home/apt' 'Private room' 'Shared room'] 3
price [  225    79    89   140   215   120   150    52    70    68   130   110
    80   228   144   180   375   200    99   230    65   105    98   175
   500   220   100   170   185   115    77    76   135   195    69   125
   475   165   350   265    64   159   250   305   155    60    92   285
    90   390    95    75   190   212   124   122   575   229    59   113
   179    71   349   249   169   599    55   189   260    97   495   259
   451   129   300    72    88   450    37    85    91   255    50   160
   248   145   199    42   400    96   299   325    45    34    56   402
   800   275   219   178   119    87   395    49   142   174   235   311
    39   102   209   104    82   118    36    93   295   107   151   700
   331   149   128   136  1000   

In [755]:
# Frequency of each distinct value, column by column, in the cleaned data.
for col, values in df_clean.items():
  print("\nFor column", col)
  print(values.value_counts(sort=True))



For column neighbourhood_group
Brooklyn         3556
Manhattan        3308
Queens           1353
Bronx             303
Staten Island     104
Name: neighbourhood_group, dtype: int64

For column room_type
Entire home/apt    4980
Private room       3523
Shared room         121
Name: room_type, dtype: int64

For column price
150     361
100     349
50      222
200     220
125     212
       ... 
995       1
337       1
429       1
2800      1
393       1
Name: price, Length: 440, dtype: int64

For column minimum_nights
2      2144
1      1811
3      1497
30     1134
4       562
       ... 
23        1
62        1
265       1
185       1
85        1
Name: minimum_nights, Length: 80, dtype: int64

For column property_type
Entire Unit     4990
Private Room    3364
Room In          144
Shared Room      120
Tiny home          6
Name: property_type, dtype: int64

For column accommodates
2     3884
4     1358
1     1193
3      893
6      473
5      418
8      154
7      119
10      41
9       24

In [756]:
df_clean.dtypes

neighbourhood_group    category
room_type                object
price                     int64
minimum_nights            int64
property_type            object
accommodates              int64
bedrooms                float64
bathrooms               float64
dtype: object

In [757]:
# Rows whose bathroom count is zero.
# NOTE(review): despite the name, zero_beds filters on 'bathrooms', not on
# 'bedrooms' — confirm which column was intended.
zero_beds = df_clean[df_clean['bathrooms'] == 0]
zero_beds


Unnamed: 0,neighbourhood_group,room_type,price,minimum_nights,property_type,accommodates,bedrooms,bathrooms
171,Brooklyn,Entire home/apt,260,30,Entire Unit,4,2.0,0.0
179,Manhattan,Entire home/apt,185,2,Entire Unit,2,2.0,0.0
344,Manhattan,Private room,130,1,Private Room,2,1.0,0.0
366,Queens,Private room,55,1,Private Room,1,1.0,0.0
656,Manhattan,Entire home/apt,73,30,Entire Unit,2,1.0,0.0
781,Manhattan,Entire home/apt,121,2,Entire Unit,2,1.0,0.0
833,Brooklyn,Private room,62,2,Private Room,3,0.0,0.0
885,Brooklyn,Private room,99,1,Private Room,2,1.0,0.0
923,Brooklyn,Entire home/apt,100,365,Entire Unit,2,1.0,0.0
1081,Brooklyn,Private room,55,1,Private Room,1,1.0,0.0


In [758]:
# Listings with a minimum stay of fewer than 3 nights.
# NOTE(review): the name suggests a 7-night cutoff, but the threshold used is 3.
min_nights_7 = df_clean[df_clean['minimum_nights'] < 3]
min_nights_7

Unnamed: 0,neighbourhood_group,room_type,price,minimum_nights,property_type,accommodates,bedrooms,bathrooms
0,Manhattan,Entire home/apt,225,1,Entire Unit,1,0.0,1.0
2,Manhattan,Private room,79,2,Private Room,2,1.0,1.0
5,Brooklyn,Entire home/apt,140,2,Entire Unit,3,0.0,1.0
6,Brooklyn,Entire home/apt,215,2,Entire Unit,4,1.0,1.0
8,Brooklyn,Entire home/apt,120,2,Entire Unit,5,1.0,1.0
...,...,...,...,...,...,...,...,...
15978,Queens,Private room,34,1,Private Room,2,1.0,2.0
15979,Queens,Private room,35,1,Private Room,2,1.0,2.0
15980,Manhattan,Shared room,55,2,Shared Room,2,1.0,1.0
15982,Manhattan,Private room,95,1,Private Room,2,1.0,1.0


In [759]:
# Listings with a minimum stay of 3 nights or more.
# NOTE(review): the name suggests a 7-night cutoff, but the threshold used is 3.
min_nights_more_7 = df_clean[df_clean['minimum_nights'] >= 3]
min_nights_more_7

Unnamed: 0,neighbourhood_group,room_type,price,minimum_nights,property_type,accommodates,bedrooms,bathrooms
4,Brooklyn,Private room,89,4,Private Room,2,1.0,1.5
9,Manhattan,Entire home/apt,150,10,Entire Unit,2,0.0,1.0
12,Brooklyn,Private room,89,4,Private Room,2,1.0,1.0
15,Queens,Private room,130,3,Private Room,4,1.0,1.0
16,Brooklyn,Entire home/apt,110,7,Entire Unit,2,1.0,1.0
...,...,...,...,...,...,...,...,...
15975,Brooklyn,Entire home/apt,150,7,Entire Unit,2,0.0,1.0
15976,Queens,Entire home/apt,140,3,Entire Unit,6,3.0,1.0
15981,Queens,Entire home/apt,85,3,Entire Unit,2,1.0,1.0
15984,Brooklyn,Private room,54,6,Private Room,2,1.0,1.0


In [760]:
#for column in features:
for column in df_clean.columns:
  fig = px.histogram(df_clean, x=column, marginal="box")
  fig.show()

Consider how to manage extreme values.

In [761]:
# Compute outlier fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for every numeric column.
# Each entry appended to extreme_values is [column, bottom_fence, upper_fence].
# NOTE(review): the quantiles are taken from df, while the fences are later
# applied to df_clean — confirm that this is intended.
extreme_values = []
for column in numeric_data.columns:
  q1 = df[column].quantile(0.25)  # first quartile
  q3 = df[column].quantile(0.75)  # third quartile
  IQR = q3 - q1                   # interquartile range

  # Largest observed value. Named column_max so that the builtin max() is not
  # overwritten for the rest of the notebook.
  column_max = df[column].quantile(1)

  # Clamp the fences: never below 0 and never above the largest observed value.
  bottom_fence = 0 if (q1 - 1.5 * IQR) < 0 else q1 - 1.5 * IQR
  upper_fence = column_max if (q3 + 1.5 * IQR) > column_max else (q3 + 1.5 * IQR)
  extreme_values.append([column, bottom_fence, upper_fence])


In [762]:
  extreme_values

[['price', 0, 332.5],
 ['minimum_nights', 0, 11.0],
 ['accommodates', 0, 7.0],
 ['bedrooms', 1.0, 1.0],
 ['bathrooms', 1.0, 1.0]]

In [763]:
# lookup in extreme_values UPPER/LOWER FENCE values
def get_upperfence(name=''):
  """Return the upper fence stored for column ``name``.

  Scans the notebook-level ``extreme_values`` list, whose entries are
  ``[column, bottom_fence, upper_fence]``, and returns the upper fence of the
  first entry whose column equals ``name``.

  Returns ``None`` when no entry matches, so callers should make sure the
  column was included when ``extreme_values`` was built.
  """
  for entry in extreme_values:
    if entry[0] == name:
      return entry[2]
  return None

def get_lowerfence(name=''):
  """Return the lower (bottom) fence stored for column ``name``.

  Scans the notebook-level ``extreme_values`` list, whose entries are
  ``[column, bottom_fence, upper_fence]``, and returns the bottom fence of the
  first entry whose column equals ``name``.

  Returns ``None`` when no entry matches, so callers should make sure the
  column was included when ``extreme_values`` was built.
  """
  for entry in extreme_values:
    if entry[0] == name:
      return entry[1]
  return None

In [764]:
# Share (in %) of listings priced above the upper fence; if it is under 5%, consider dropping them.
display('Pricing percentage over extreme:')
(df_clean['price'] > get_upperfence('price')).sum() / df_clean['price'].count() * 100

'Pricing percentage over extreme:'

6.8181818181818175

In [765]:
# Remove every listing priced above the upper fence for price.
price_ceiling = get_upperfence('price')
over_price_ceiling = df_clean.loc[df_clean['price'] > price_ceiling].index
df_clean.drop(index=over_price_ceiling, inplace=True)


In [766]:
# Share (in %) of listings whose minimum_nights exceeds the upper fence; if it is under 5%, consider dropping them.
display('Minimum nights percentage over extreme:')
(df_clean['minimum_nights'] > get_upperfence('minimum_nights')).sum() / df_clean['minimum_nights'].count() * 100

'Minimum nights percentage over extreme:'

19.051767048282727

In [767]:
# NOT DROPPING minimum_nights because of high percentage
# drop upperfence extreme minimum nights
# df_clean.drop(df_clean[df_clean['minimum_nights'] > get_upperfence('minimum_nights')].index, inplace = True)

In [768]:
# Share (in %) of listings whose accommodates value exceeds the upper fence; if it is under 5%, consider dropping them.
display('Accommodates percentage over extreme:')
(df_clean['accommodates'] > get_upperfence('accommodates')).sum() / df_clean['accommodates'].count() * 100

'Accommodates percentage over extreme:'

1.991040318566451

In [769]:
# Remove every listing that accommodates more guests than the upper fence allows.
accommodates_ceiling = get_upperfence('accommodates')
over_accommodates_ceiling = df_clean.loc[df_clean['accommodates'] > accommodates_ceiling].index
df_clean.drop(index=over_accommodates_ceiling, inplace=True)

In [770]:
# after extreme values dropped, how do histograms look now?
for column in df_clean.columns:
  fig = px.histogram(df_clean, x=column, marginal="box")
  fig.show()

In [771]:
# log of zero fails so we count how many have zero, if small, then drop
df_clean.loc[df_clean.price  == 0, 'price'].count()

3

In [772]:
zero_price = df_clean.loc[df_clean.price  == 0, 'price'].index

In [773]:
df_clean.shape

(7876, 8)

In [774]:
# zero price rows is low, dropping
df_clean.drop(zero_price,axis=0,inplace=True)

In [775]:
df_clean.shape

(7873, 8)

In [776]:
# add log of price to dataframe
df_clean['log_price'] = np.log(df_clean['price'])

`minimum_nights` needs a log transform to get an approximately Gaussian distribution.

In [777]:
# log of zero fails so we count how many have zero, if small, then drop
df_clean.loc[df_clean.minimum_nights  == 0, 'minimum_nights'].count()

0

In [778]:
zero_minimum_nights = df_clean.loc[df_clean.minimum_nights  == 0, 'minimum_nights'].index

In [779]:
# drop rows whose minimum_nights is zero (the count in the earlier cell was 0,
# so nothing is actually removed here)
df_clean.drop(zero_minimum_nights,axis=0,inplace=True)

In [780]:
# log transform of minimum_nights (currently disabled)
#df_clean['log_minimum_nights'] = np.log(df_clean['minimum_nights'])

# after FILTERING rows with zero minimum nights, we don't need minimum nights any longer, just drop that entire feature
df_clean.drop(['minimum_nights'], axis=1, inplace = True)

In [781]:
# after price and minimum_nights LOGGED, how do histograms look now?
for column in df_clean.columns:
  fig = px.histogram(df_clean, x=column, marginal="box")
  fig.show()

Consider dropping the original `minimum_nights` feature now, then choose between `price_group`, `price` or `log_price`.

Now lets create the price_group instead of different individual prices.

In [782]:
df_clean.dtypes

neighbourhood_group    category
room_type                object
price                     int64
property_type            object
accommodates              int64
bedrooms                float64
bathrooms               float64
log_price               float64
dtype: object

In [783]:
df_clean.head()

Unnamed: 0,neighbourhood_group,room_type,price,property_type,accommodates,bedrooms,bathrooms,log_price
0,Manhattan,Entire home/apt,225,Entire Unit,1,0.0,1.0,5.4161
2,Manhattan,Private room,79,Private Room,2,1.0,1.0,4.369448
4,Brooklyn,Private room,89,Private Room,2,1.0,1.5,4.488636
5,Brooklyn,Entire home/apt,140,Entire Unit,3,0.0,1.0,4.941642
6,Brooklyn,Entire home/apt,215,Entire Unit,4,1.0,1.0,5.370638


In [784]:
df_clean.dtypes

neighbourhood_group    category
room_type                object
price                     int64
property_type            object
accommodates              int64
bedrooms                float64
bathrooms               float64
log_price               float64
dtype: object

In [785]:
q1 = df_clean['price'].quantile(0.25)
q1

70.0

In [786]:
# NOTE: quantile(0.5) is the median of price, even though the variable is named `mean`.
mean = df_clean['price'].quantile(0.5)
mean

101.0

In [787]:
q3 = df_clean['price'].quantile(0.75)
q3

159.0

**Note:** as a model **variation**, it might be worth deriving `price_group` from `log_price` instead of `price` here.

In [788]:
# create method to compare results for these models and variations of data inputted
#v1 = Variant 1 = using price_group by log_price for input data
#v2 = Variant 2 = using price_group by price for input data
#v3 = Variant 3 = using log_price instead of price_group
#v4 = Variant 4 = using price instead of price_group
#v5 = Variant 5 = same as v2 but without stratify condition in test_train_split

recording = 'accuracy_v5'
# Parse everything after the final "_v" so that multi-digit variants work too;
# reading only the last character would turn 'accuracy_v10' into variant 0.
variant_number = int(recording.rpartition('_v')[2])

In [789]:
# Variant 1 works from the log-transformed price; every other variant uses the raw price.
feature_name = 'log_price' if variant_number == 1 else 'price'

In [790]:
df_clean['price'].value_counts()

150    349
100    348
50     222
200    217
125    208
      ... 
256      1
277      1
22       1
323      1
223      1
Name: price, Length: 280, dtype: int64

In [791]:
# Distinct values (and their count) for every column of the filtered data.
for col, values in df_clean.items():
    print(col, values.unique(), values.nunique())

neighbourhood_group ['Manhattan', 'Brooklyn', 'Queens', 'Staten Island', 'Bronx']
Categories (5, object): ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'] 5
room_type ['Entire home/apt' 'Private room' 'Shared room'] 3
price [225  79  89 140 215 120 150  52  70  68 130 110  80 228 144 180 200  99
 230  65 105  98 175 220 100 170 185 115  77  76 135 195  69 125 165  64
 159 250 305 155  60  92 285  90  95  75 190 212 124 122 229  59 113 179
  71 249 169  55 260  97 259 129  72  88  37  85  91 189 300 255  50 160
 145 199  42  96 299 325  45  34  56 275 219 178 265 119  87  49 142 174
 235 311  39 102 209 104  82 118  36  93 295 107 151 331 149 128 136 263
  61 234 109 197 127 167  54 134  62  73 240 210 171 103  81  57 121  51
 131 166  44 108  35  53  78 191 187 172  38  46 139  83  40 182 158 133
  47  94 152  41 290 147 269 188  67 111 217 112  66  84  31 226  74  29
 143 184 193 106 320 221 162  63 176 117 218 116 288 316 146 318 148 216
  58  30  86 198 245 239 247 205 

In [792]:
column_names = df_clean.columns
# Every column except log_price.
features = column_names[column_names != 'log_price']
# Select the label by name rather than by position (previously column_names[7]),
# so adding, dropping or reordering columns cannot silently change it.
# NOTE(review): the notebook's stated classification target is
# neighbourhood_group, while label is log_price — confirm which is intended.
label = 'log_price'
display(features, label)
# keep neighbourhood_group as an explicit category dtype
df_clean['neighbourhood_group'] = df_clean['neighbourhood_group'].astype('category')

Index(['neighbourhood_group', 'room_type', 'price', 'property_type',
       'accommodates', 'bedrooms', 'bathrooms'],
      dtype='object')

'log_price'

In [793]:
df_clean.dtypes

neighbourhood_group    category
room_type                object
price                     int64
property_type            object
accommodates              int64
bedrooms                float64
bathrooms               float64
log_price               float64
dtype: object

## Normalization and Scaling of Data

In [794]:
# Rows are examples; one column is set aside when counting features.
Example_Count, column_count = df_clean.shape
Feature_Count = column_count - 1

print(f"Number of Examples: {Example_Count}")
print(f"Number Features: {Feature_Count}")

Number of Examples: 7873
Number Features: 7


In [795]:
# Pairwise scatter plots of the feature columns, coloured by the label column,
# drawn on a square canvas of (Feature_Count + 1) * 200 pixels with no margins.
canvas_side = (Feature_Count + 1) * 200
fig = px.scatter_matrix(df_clean, dimensions=features, color=label)
fig.update_layout(
    width=canvas_side,
    height=canvas_side,
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)
fig.show()

## Convert Strings to Numerical 

In [796]:
df_clean.dtypes

neighbourhood_group    category
room_type                object
price                     int64
property_type            object
accommodates              int64
bedrooms                float64
bathrooms               float64
log_price               float64
dtype: object

Convert `room_type`, `price_group` and `property_type` from object (string) columns to numeric `mode_*` indicator columns.

In [797]:
# features need to be numerical for decision trees, only really needed for correlation graph/comparison
# but maybe NOT needed for decision tree?
# df_clean = pd.get_dummies(df_clean, columns=["room_type","property_type","price_group"], prefix='mode')
# df_clean = pd.get_dummies(df_clean, columns=["room_type","property_type"], prefix='mode')


In [798]:
# leaving features as objects is a problem when it comes to calculating precision & classification_reports
# if get_dummies is not used ERROR is
#ValueError                                Traceback (most recent call last)
#
#<ipython-input-565-2fe0e0137cb5> in <module>
#      2 print(classification_report(y_train, yhat_train))
#      3 print()
#----> 4 yhat_test = dtree.predict(X_test)
#      5 
#      6 print("Results on test data:")
#
#4 frames
#
#/usr/local/lib/python3.8/dist-packages/sklearn/utils/_array_api.py in _asarray_with_order(array, dtype, order, copy, xp)
#    183     if xp.__name__ in {"numpy", "numpy.array_api"}:
#    184         # Use NumPy API to support order
#--> 185         array = numpy.asarray(array, order=order, dtype=dtype)
#    186         return xp.asarray(array, copy=copy)
#    187     else:
#ValueError: could not convert string to float: 'Private room'

In [799]:
# dtypes are unchanged at this point because the get_dummies calls above are commented out.
df_clean.dtypes

neighbourhood_group    category
room_type                object
price                     int64
property_type            object
accommodates              int64
bedrooms                float64
bathrooms               float64
log_price               float64
dtype: object

In [800]:
# Candidate feature columns: every column of df_clean except the raw 'price'.
column_names = df_clean.columns
is_not_price = column_names != 'price'
features = column_names[is_not_price]
# label is not reassigned here; it keeps the value set in an earlier cell.
display(features, label)

Index(['neighbourhood_group', 'room_type', 'property_type', 'accommodates',
       'bedrooms', 'bathrooms', 'log_price'],
      dtype='object')

'log_price'

### So our business question for this classification question (this assignment): tell the user which area (neighbourhood_group) has a unit with the price/bedrooms/accommodates/bathrooms/property_type/room_type that they are looking for.

**You should not use a preprocessing method that is fitted on the whole dataset, to transform the test or train data.**

In [801]:
# Last look at the dtypes while the raw 'price' column is still present.
df_clean.dtypes

neighbourhood_group    category
room_type                object
price                     int64
property_type            object
accommodates              int64
bedrooms                float64
bathrooms               float64
log_price               float64
dtype: object

In [802]:
# Remove the raw 'price' column (log_price is kept).
# errors="ignore" makes the cell safe to re-run: once 'price' is gone, a second
# execution is a no-op instead of raising KeyError. Rebinding the name avoids
# mutating the frame in place.
df_clean = df_clean.drop(columns='price', errors='ignore')

In [803]:
# List the int64/float64 columns of df_clean.
df_clean.select_dtypes(include=['int64','float64']).columns

Index(['accommodates', 'bedrooms', 'bathrooms', 'log_price'], dtype='object')

In [804]:
# Rebuild the feature list now that 'price' is gone: keep every column except 'log_price'.
column_names = df_clean.columns
is_not_log_price = column_names != 'log_price'
features = column_names[is_not_log_price]
# label is not reassigned here; it keeps the value set in an earlier cell.
display(features, label)

Index(['neighbourhood_group', 'room_type', 'property_type', 'accommodates',
       'bedrooms', 'bathrooms'],
      dtype='object')

'log_price'

In [805]:
# Split the columns by dtype instead of hard-coding their names.
# numerical_features: all int64/float64 columns of df_clean.
numeric_column_index = df_clean.select_dtypes(include=['int64', 'float64']).columns
numerical_features = np.array(numeric_column_index)

# non_numerical_features: the remaining columns, excluding the label itself.
non_numerical_features = np.array([
    column
    for column in df_clean.columns
    if not (column in numerical_features or column == label)
])

In [806]:
# Remove the label from the numeric feature list by name rather than by position.
# A positional slice ([:-1]) silently drops a real feature if the label is not the
# last numeric column, and drops one more feature every time the cell is re-run;
# filtering on the label name is order-independent and idempotent.
numerical_features = numerical_features[numerical_features != label]
numerical_features

array(['accommodates', 'bedrooms', 'bathrooms'], dtype=object)

In [807]:
# Show the non-numeric feature columns (these are passed to the categorical transformer below).
non_numerical_features

array(['neighbourhood_group', 'room_type', 'property_type'], dtype='<U19')

In [808]:
# from sklearn.compose import make_column_transformer
# from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

# #OrdinalEncoder assumes EVERYTHING is categorical
# # purpose here is convert strings (categorical) to numbers

# X_preprocess = make_column_transformer((OneHotEncoder(), non_numerical_features), remainder='passthrough')

In [809]:
# X = X_preprocess.fit_transform(df_clean[features])
# y = df_clean[label]

In [810]:
# X

In [811]:
# y

In [812]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42) 

In [813]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# X_train = scaler.fit_transform(X_train)
# X_train

In [814]:
# X_test = scaler.transform(X_test)

In [815]:
# from sklearn.linear_model import LinearRegression
# linreg = LinearRegression()
# linreg.fit(X_train, y_train)

In [816]:
# linreg.score(X_test, y_test)

In [817]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# Imported here explicitly: the other imports of these two classes in this part
# of the notebook are commented out, so a fresh kernel would otherwise hit NameError.
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Numeric columns are rescaled with MinMaxScaler.
numeric_transformer = Pipeline(
    steps=[("scaler", MinMaxScaler())]
)

# Categorical columns are one-hot encoded.
# handle_unknown="ignore": a category that only occurs in the test split is encoded
# as all zeros instead of raising ValueError when the pipeline predicts or scores.
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))])

# Route each column group to its transformer. Because the preprocessor is used as a
# pipeline step, it is fitted only on the data passed to the pipeline's fit().
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric features", numeric_transformer, numerical_features),
        ("categorical features", categorical_transformer, non_numerical_features),
    ]
)

In [818]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score

# Feature matrix / target vector, then an 80/20 split with a fixed seed.
# The later model cells reuse this same split.
X, y = df_clean[features], df_clean[label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing + ordinary least-squares regression, fitted on the training split only.
linear_steps = [("preprocessor", preprocessor), ("Regressor", LinearRegression())]
clf = Pipeline(steps=linear_steps)
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

yhat_test = clf.predict(X_test)

# Test-set error metrics, collected later in the comparison table.
MAE_Lin_reg = metrics.mean_absolute_error(y_test, yhat_test)
MSE_Lin_reg = metrics.mean_squared_error(y_test, yhat_test)
RMS_Lin_reg = np.sqrt(MSE_Lin_reg)
R2_Lin_reg = r2_score(y_test, yhat_test)

model score: 0.544


In [819]:
# Same preprocessing and train/test split, with a Ridge regressor at its default settings.
ridge_steps = [("preprocessor", preprocessor), ("Regressor", Ridge())]
clf = Pipeline(steps=ridge_steps)
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

yhat_test = clf.predict(X_test)

# Test-set error metrics for the default Ridge model.
MAE_Ridge_reg = metrics.mean_absolute_error(y_test, yhat_test)
MSE_Ridge_reg = metrics.mean_squared_error(y_test, yhat_test)
RMS_Ridge_reg = np.sqrt(MSE_Ridge_reg)
R2_Ridge_reg = r2_score(y_test, yhat_test)

model score: 0.544


In [820]:
# Ridge again, this time with an explicit penalty strength of alpha = 0.1.
ridge_low_alpha = Ridge(alpha=0.1)
clf = Pipeline(steps=[("preprocessor", preprocessor), ("Regressor", ridge_low_alpha)])
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

yhat_test = clf.predict(X_test)

# Test-set error metrics for Ridge(alpha=0.1).
MAE_Ridge_reg2 = metrics.mean_absolute_error(y_test, yhat_test)
MSE_Ridge_reg2 = metrics.mean_squared_error(y_test, yhat_test)
RMS_Ridge_reg2 = np.sqrt(MSE_Ridge_reg2)
R2_Ridge_reg2 = r2_score(y_test, yhat_test)

model score: 0.544


In [821]:
# Same preprocessing and split, with a Lasso regressor at its default settings.
# NOTE(review): the recorded test score for this cell is about 0, which suggests the
# default penalty is too strong for this target — confirm before drawing conclusions.
lasso_steps = [("preprocessor", preprocessor), ("Regressor", Lasso())]
clf = Pipeline(steps=lasso_steps)
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

yhat_test = clf.predict(X_test)

# Test-set error metrics for the default Lasso model.
MAE_Lasso_reg = metrics.mean_absolute_error(y_test, yhat_test)
MSE_Lasso_reg = metrics.mean_squared_error(y_test, yhat_test)
RMS_Lasso_reg = np.sqrt(MSE_Lasso_reg)
R2_Lasso_reg = r2_score(y_test, yhat_test)

model score: -0.000


In [822]:
# Lasso with a much weaker penalty (alpha = 0.01), using the same preprocessing and split.
clf = Pipeline(
     steps=[("preprocessor", preprocessor), ("Regressor", Lasso(alpha = 0.01))]
 )

clf.fit(X_train, y_train)

print("model score: %.3f" % clf.score(X_test, y_test))

# Predict once; the original cell repeated this call and discarded the first result.
yhat_test = clf.predict(X_test)

# Test-set error metrics for Lasso(alpha=0.01).
MAE_Lasso_reg2 =  metrics.mean_absolute_error(y_test, yhat_test)
MSE_Lasso_reg2 = metrics.mean_squared_error(y_test,yhat_test)
RMS_Lasso_reg2 = np.sqrt(metrics.mean_squared_error(y_test, yhat_test))
R2_Lasso_reg2 = r2_score (y_test,yhat_test)

model score: 0.522


In [823]:
from sklearn import metrics
from sklearn.metrics import r2_score

# yhat_test holds the predictions of whichever model cell ran last
# (the Lasso(alpha=0.01) pipeline when the notebook is run top to bottom).
last_mae = metrics.mean_absolute_error(y_test, yhat_test)
last_mse = metrics.mean_squared_error(y_test, yhat_test)

print('Mean Absolute Error:', last_mae)
print('Mean Squared Error:', last_mse)
print('Root Mean Squared Error:', np.sqrt(last_mse))
print('Coefficient of determination: %.2f' % r2_score(y_test, yhat_test))

Mean Absolute Error: 0.30454558509884067
Mean Squared Error: 0.14766065910981285
Root Mean Squared Error: 0.3842663908147743
Coefficient of determination: 0.52


In [824]:
# Side-by-side comparison of the test-set metrics collected by the model cells above.
column_names = ["MAE", "MSE", "RMSE", "R2 Score"]

# One entry per model, keyed by the row label shown in the table.
metrics_by_model = {
    "Linear regression": [MAE_Lin_reg, MSE_Lin_reg, RMS_Lin_reg, R2_Lin_reg],
    "Ridge regression": [MAE_Ridge_reg, MSE_Ridge_reg, RMS_Ridge_reg, R2_Ridge_reg],
    "Ridge regression 2": [MAE_Ridge_reg2, MSE_Ridge_reg2, RMS_Ridge_reg2, R2_Ridge_reg2],
    "Lasso regression": [MAE_Lasso_reg, MSE_Lasso_reg, RMS_Lasso_reg, R2_Lasso_reg],
    "Lasso regression 2": [MAE_Lasso_reg2, MSE_Lasso_reg2, RMS_Lasso_reg2, R2_Lasso_reg2],
}
row_names = list(metrics_by_model)
losses_list = list(metrics_by_model.values())

display(pd.DataFrame(losses_list, index=row_names, columns=column_names))

Unnamed: 0,MAE,MSE,RMSE,R2 Score
Linear regression,0.297008,0.140849,0.375299,0.543903
Ridge regression,0.296888,0.140857,0.375309,0.543877
Ridge regression 2,0.296894,0.140901,0.375368,0.543734
Lasso regression,0.458142,0.308881,0.55577,-0.000217
Lasso regression 2,0.304546,0.147661,0.384266,0.521846
