# Approach to a Solution Exercise Week 5

This is a draft for an approach towards a possible solution. Of course, you should also test additional models, compare multiple approaches, add visualizations, etc.

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import bioinfokit.visuz  
import numpy as np
import matplotlib.pyplot as plt

## 0 Data exploration

In [2]:
# Read the Airbnb data file into a DataFrame; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv("airbnb-datafile.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,room_id,survey_id,host_id,room_type,country,city,borough,neighborhood,reviews,...,accommodates,bedrooms,bathrooms,price,minstay,name,last_modified,latitude,longitude,location
0,33,10201214,1476,8497487,Entire home/apt,,Amsterdam,,Ijburg / Eiland Zeeburg,19,...,8,4.0,,763.0,,Beautiful Watervilla in Amsterdam,2017-07-23 13:02:10.468528,52.348254,5.001477,0101000020E6100000E7C8CA2F83011440C0594A96932C...
1,34,3119298,1476,15847782,Entire home/apt,,Amsterdam,,Westerpark,1,...,8,3.0,,445.0,,Modern 5-8 person apartment,2017-07-23 12:58:15.945759,52.377581,4.873119,0101000020E61000009D103AE8127E1340A54BFF925430...
2,35,5372074,1476,14390964,Entire home/apt,,Amsterdam,,Centrum West,10,...,4,3.0,,721.0,,Prinsengracht Appartement,2017-07-23 12:52:40.980471,52.373078,4.884269,0101000020E61000006C770FD07D891340B56D1805C12F...
3,36,10511291,1476,5421078,Entire home/apt,,Amsterdam,,Watergraafsmeer,1,...,8,3.0,,1412.0,,Whole house 200m2 Frankendael Park,2017-07-23 12:30:42.249974,52.350968,4.929821,0101000020E61000003B8908FF22B81340A5F8F884EC2C...
4,37,18047871,1476,9302267,Entire home/apt,,Amsterdam,,Centrum West,0,...,4,4.0,,541.0,,Great 4 bedroom apartment at the flower market,2017-07-23 12:30:27.637777,52.367890,4.889273,0101000020E6100000FDDCD0949D8E13404243FF04172F...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18653,18718,17789893,1476,47501089,Private room,,Amsterdam,,Bijlmer Centrum,10,...,3,1.0,,32.0,,"1-3 pers. Cozy Rm AFAS Live, ArenA, ZIGGODOME",2017-07-22 16:05:14.158963,52.319794,4.955638,0101000020E6100000684293C492D2134080BA8102EF28...
18654,18719,16877166,1476,67093870,Private room,,Amsterdam,,Bijlmer Centrum,6,...,4,1.0,,24.0,,"Modern Room by Arena, ZIGGO, HmH",2017-07-22 16:05:14.151986,52.319080,4.954822,0101000020E61000005801BEDBBCD1134062670A9DD728...
18655,18720,19859427,1476,29724632,Private room,,Amsterdam,,Geuzenveld / Slotermeer,0,...,1,1.0,,38.0,,Private single room,2017-07-22 16:05:14.149610,52.384028,4.838403,0101000020E61000002079E750865A1340C85F5AD42731...
18656,18721,17132164,1476,115156569,Private room,,Amsterdam,,Centrum West,13,...,2,1.0,,36.0,,City Center studio in Touristic Amsterdam 1,2017-07-22 16:05:14.146183,52.372120,4.890982,0101000020E6100000774CDD955D9013400118CFA0A12F...


In [3]:
# List all column labels to see which variables are available for the
# clustering and classification steps.
df.columns

Index(['Unnamed: 0', 'room_id', 'survey_id', 'host_id', 'room_type', 'country',
       'city', 'borough', 'neighborhood', 'reviews', 'overall_satisfaction',
       'accommodates', 'bedrooms', 'bathrooms', 'price', 'minstay', 'name',
       'last_modified', 'latitude', 'longitude', 'location'],
      dtype='object')

In [4]:
# Here, you could inspect the data further, e.g. with .describe() for numeric columns and .value_counts() for categorical ones.

In [5]:
# You could also add some visualizations here (e.g., histograms) - see last week.

## 1 Unsupervised ML

In [6]:
# of course, use a loop like in example 7.20 to figure out which k to use
# below, we are just assuming 3 clusters for demonstration purposes.

In [7]:
# Cluster the rows on three numeric features.
# NOTE(review): the features are passed to k-means unscaled, so `price`
# (values in the hundreds) presumably dominates the distance over
# `bedrooms`/`accommodates` (single digits) - consider StandardScaler first.
_tmp = df[['bedrooms', 'price', 'accommodates']]

# k-means starts from random centroids. Fixing random_state keeps the cluster
# labels and sizes identical across Restart & Run All; n_init is spelled out
# because its default value differs between scikit-learn versions.
mykm = KMeans(n_clusters=3, n_init=10, random_state=42).fit(_tmp)

# Store each row's cluster label next to the original data.
df['cluster'] = mykm.labels_

In [8]:
# Count how many rows were assigned to each cluster label.
df['cluster'].value_counts()

0    13818
1     4327
2      513
Name: cluster, dtype: int64

In [9]:
# Summary statistics of overall_satisfaction within each cluster; .T transposes
# the result so that every cluster becomes one column.
df.groupby('cluster')['overall_satisfaction'].describe().T

cluster,0,1,2
count,13818.0,4327.0,513.0
mean,3.365248,3.154611,2.919103
std,2.175608,2.299951,2.359693
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,4.5,4.5,4.5
75%,5.0,5.0,5.0
max,5.0,5.0,5.0


## 2 Supervised ML

First, some recoding:

In [10]:
# Frequency of each distinct rating value - used to pick a sensible cut-off
# for the binary target created in the recoding step.
df['overall_satisfaction'].value_counts()

5.0    7697
0.0    5719
4.5    4543
4.0     573
3.5     105
3.0      19
2.5       1
1.0       1
Name: overall_satisfaction, dtype: int64

In [11]:
# Binary target: ratings of 4.5 or higher become "top", everything else
# (including missing values, which compare as False) becomes "not top".
# NOTE(review): 0.0 is the second most frequent value in this column -
# presumably it marks listings without a rating, not a bad score;
# confirm, because those rows are all labelled "not top" here.
is_top = df['overall_satisfaction'] >= 4.5
df['toprating'] = np.where(is_top, "top", "not top")

### Create a train and a test dataset

In [12]:
# Predictor columns, defined once so the train and test matrices always use
# exactly the same features.
FEATURE_COLS = ['bedrooms', 'price', 'accommodates']

# Hold out 20% of the rows for evaluation. random_state fixes the shuffle so
# the split - and every score computed from it - is the same after
# Restart & Run All.
df_train, df_test = train_test_split(df, test_size=.20, random_state=42)

X_train = df_train[FEATURE_COLS]
y_train = df_train['toprating']

X_test = df_test[FEATURE_COLS]
y_test = df_test['toprating']

### Naive Bayes

In [13]:
# Gaussian Naive Bayes: fit on the training split (fit returns the estimator
# itself), predict the held-out split and print per-class precision/recall/F1.
model01 = GaussianNB().fit(X_train, y_train)
y_pred = model01.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

     not top       0.45      0.11      0.17      1326
         top       0.65      0.93      0.77      2406

    accuracy                           0.64      3732
   macro avg       0.55      0.52      0.47      3732
weighted avg       0.58      0.64      0.56      3732



### Logistic Regression

In [14]:
# Logistic regression with default settings: fit on the training split (fit
# returns the estimator itself), predict the held-out split and print the
# per-class precision/recall/F1 report.
model02 = LogisticRegression().fit(X_train, y_train)
y_pred = model02.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

     not top       0.30      0.00      0.00      1326
         top       0.64      1.00      0.78      2406

    accuracy                           0.64      3732
   macro avg       0.47      0.50      0.39      3732
weighted avg       0.52      0.64      0.51      3732



# PAY ATTENTION: Best coding practices

**Of course, to systematically compare multiple models, it's better to use functions and for loops rather than copy-pasting code!!!**