In [1]:
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Load the dataset straight from its GitHub URL and preview the first rows.
DATA_URL = "https://raw.githubusercontent.com/wooihaw/datasets/main/ips_basic.csv"
df = read_csv(DATA_URL)
df.head()

Unnamed: 0,bedroom,foyer,kitchen,living,soho,location
0,-59,-90,-76,-67,-80,bedroom
1,-59,-95,-79,-77,-70,bedroom
2,-52,-81,-86,-90,-76,bedroom
3,-66,-83,-81,-69,-70,bedroom
4,-71,-91,-90,-77,-71,bedroom


In [3]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   bedroom   629 non-null    int64 
 1   foyer     629 non-null    int64 
 2   kitchen   629 non-null    int64 
 3   living    629 non-null    int64 
 4   soho      629 non-null    int64 
 5   location  629 non-null    object
dtypes: int64(5), object(1)
memory usage: 29.6+ KB


In [4]:
# Count missing values per column.
df.isna().sum()

bedroom     0
foyer       0
kitchen     0
living      0
soho        0
location    0
dtype: int64

In [5]:
# Baseline feature matrix and target.
# The five original numeric columns are selected by name rather than as
# "everything except location": `df` gains extra columns further down the
# notebook, and a by-name selection keeps this baseline limited to the original
# columns even if the cell is re-run after that point.
RAW_FEATURES = ["bedroom", "foyer", "kitchen", "living", "soho"]
X = df[RAW_FEATURES]
y = df["location"]

In [6]:
# Candidate classifiers, keyed by a short label used when printing results.
# Every estimator that draws random numbers during fitting is given a fixed
# `random_state` so the cross-validation scores are reproducible across
# kernel restarts; the remaining estimators keep their defaults.
SEED = 42
models = {
    'lgr': LogisticRegression(),
    'knn': KNeighborsClassifier(),
    'gnb': GaussianNB(),
    'svc': SVC(),
    'dtc': DecisionTreeClassifier(random_state=SEED),
    'rfc': RandomForestClassifier(random_state=SEED),
    'gbc': GradientBoostingClassifier(random_state=SEED),
    'mlp': MLPClassifier(random_state=SEED),
}

# 3-fold shuffled cross-validation; the same splitter is shared by every model
# so all of them are scored on identical folds.
kf = KFold(n_splits=3, shuffle=True, random_state=SEED)
for m, clf in models.items():
    scores = cross_val_score(clf, X, y, cv=kf, n_jobs=-1)
    # Report mean accuracy and its standard deviation across folds.
    print(f"{m}: {scores.mean():.3%}, {scores.std():.3%}")


lgr: 81.715%, 1.818%
knn: 81.080%, 0.628%
gnb: 81.240%, 0.975%
svc: 81.874%, 1.072%
dtc: 74.085%, 1.200%
rfc: 80.763%, 0.473%
gbc: 77.900%, 0.643%
mlp: 80.758%, 3.008%


In [7]:
# Add a trailing moving-average column (`mean_<col>`) for each of the five
# original numeric columns. The first `win_size - 1` rows have no complete
# window, so their rolling means are NaN.
# NOTE(review): the window runs over consecutive rows regardless of `location`,
# so windows that straddle a change of label average readings from two
# locations -- confirm this is intended.
win_size = 5
for col in ("bedroom", "foyer", "kitchen", "living", "soho"):
    df[f"mean_{col}"] = df[col].rolling(win_size).mean()
print(df.shape)
print(df.head(10))

(629, 11)
   bedroom  foyer  kitchen  living  soho location  mean_bedroom  mean_foyer  \
0      -59    -90      -76     -67   -80  bedroom           NaN         NaN   
1      -59    -95      -79     -77   -70  bedroom           NaN         NaN   
2      -52    -81      -86     -90   -76  bedroom           NaN         NaN   
3      -66    -83      -81     -69   -70  bedroom           NaN         NaN   
4      -71    -91      -90     -77   -71  bedroom         -61.4       -88.0   
5      -68    -94      -74     -77   -71  bedroom         -63.2       -88.8   
6      -83    -88      -75     -74   -76  bedroom         -68.0       -87.4   
7      -67    -78      -76     -81   -66  bedroom         -71.0       -86.8   
8      -61    -85      -84     -81   -84  bedroom         -70.0       -87.2   
9      -68    -94      -88     -92   -87  bedroom         -69.4       -87.8   

   mean_kitchen  mean_living  mean_soho  
0           NaN          NaN        NaN  
1           NaN          NaN        

In [8]:
# Re-inspect the frame after adding the rolling-mean columns (dtypes and non-null counts).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   bedroom       629 non-null    int64  
 1   foyer         629 non-null    int64  
 2   kitchen       629 non-null    int64  
 3   living        629 non-null    int64  
 4   soho          629 non-null    int64  
 5   location      629 non-null    object 
 6   mean_bedroom  625 non-null    float64
 7   mean_foyer    625 non-null    float64
 8   mean_kitchen  625 non-null    float64
 9   mean_living   625 non-null    float64
 10  mean_soho     625 non-null    float64
dtypes: float64(5), int64(5), object(1)
memory usage: 54.2+ KB


In [9]:
# Count missing values per column; the rolling-mean columns are NaN where no full window exists.
df.isna().sum()

bedroom         0
foyer           0
kitchen         0
living          0
soho            0
location        0
mean_bedroom    4
mean_foyer      4
mean_kitchen    4
mean_living     4
mean_soho       4
dtype: int64

In [10]:
# Discard every row that contains at least one NaN, then confirm that no
# missing values remain in any column.
df = df.dropna(axis=0, how="any")
df.isna().sum()

bedroom         0
foyer           0
kitchen         0
living          0
soho            0
location        0
mean_bedroom    0
mean_foyer      0
mean_kitchen    0
mean_living     0
mean_soho       0
dtype: int64

In [11]:
# Augmented feature matrix: every column of `df` except the label.
X2 = df.drop("location", axis=1)
# Target labels, aligned row-for-row with X2.
y2 = df["location"]

In [12]:
# Re-score every model on the augmented feature set, using the same folds as before.
# NOTE(review): the folds are shuffled while each rolling mean is built from a
# window of consecutive rows, so neighbouring rows that share raw readings can
# land on opposite sides of a train/test split -- these scores may be
# optimistic; confirm with a split that keeps neighbouring rows together.
for m, clf in models.items():
    scores = cross_val_score(clf, X2, y2, cv=kf, n_jobs=-1)
    print(f"{m}: {scores.mean():.3%}, {scores.std():.3%}")

lgr: 96.960%, 0.230%
knn: 98.400%, 0.228%
gnb: 98.079%, 0.395%
svc: 98.240%, 0.224%
dtc: 96.957%, 1.488%
rfc: 98.400%, 0.600%
gbc: 98.080%, 0.393%
mlp: 94.880%, 0.221%
