# Libraries

In [147]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')

# Dataset

In [63]:
dataset1 = pd.read_csv("Occupancy_Estimation.csv")
dataset1 = dataset1.iloc[:, 2:].values
dataset1
dataset2 = pd.read_csv("Occupancy_Estimation.csv")

# Preprocessing

### Using Standard Scaler

In [164]:
dataset1[:, :-1] = StandardScaler().fit_transform(dataset1[:, :-1])
dataset1

array([[-1.46303347, -1.35777517, -1.16233429, ..., -0.31474885,
        -0.29402867,  1.        ],
       [-1.46303347, -1.35777517, -1.16233429, ..., -0.31474885,
        -0.29402867,  1.        ],
       [-1.29225542, -1.35777517, -1.30276354, ..., -0.31474885,
        -0.29402867,  1.        ],
       ...,
       [-0.92223631, -0.82903247, -0.85807092, ..., -0.31474885,
        -0.29402867,  0.        ],
       [-0.92223631, -0.82903247, -0.85807092, ..., -0.31474885,
        -0.29402867,  0.        ],
       [-0.92223631, -0.82903247, -0.85807092, ..., -0.31474885,
        -0.29402867,  0.        ]])

### Manually by standardization

#### Keeping all the features

In [73]:
dataset2

array([[-1.46303347, -1.35777517, -1.16233429, ..., -0.31474885,
        -0.29402867,  1.        ],
       [-1.46303347, -1.35777517, -1.16233429, ..., -0.31474885,
        -0.29402867,  1.        ],
       [-1.29225542, -1.35777517, -1.30276354, ..., -0.31474885,
        -0.29402867,  1.        ],
       ...,
       [-0.92223631, -0.82903247, -0.85807092, ..., -0.31474885,
        -0.29402867,  0.        ],
       [-0.92223631, -0.82903247, -0.85807092, ..., -0.31474885,
        -0.29402867,  0.        ],
       [-0.92223631, -0.82903247, -0.85807092, ..., -0.31474885,
        -0.29402867,  0.        ]])

#### Combining same features into one

In [159]:
dataset3

Unnamed: 0,Date,Time,S5_CO2,S5_CO2_Slope,Room_Occupancy_Count,avg_temp,avg_light,avg_sound,avg_pir
0,2017/12/22,10:49:41,-0.354364,0.664436,1,-1.365738,0.850633,-0.172809,-0.343706
1,2017/12/22,10:50:12,-0.354364,0.558789,1,-1.328163,0.844927,0.593059,-0.343706
2,2017/12/22,10:50:42,-0.354364,0.449842,1,-1.328163,0.850633,0.140010,-0.343706
3,2017/12/22,10:51:13,-0.354364,0.337592,1,-1.290588,0.850633,0.161584,-0.343706
4,2017/12/22,10:51:44,-0.354364,0.222042,1,-1.290588,0.856339,-0.205170,-0.343706
...,...,...,...,...,...,...,...,...,...
10124,2018/01/11,08:58:07,-0.579404,0.004146,0,-1.015038,-0.176495,-0.302252,-0.343706
10125,2018/01/11,08:58:37,-0.579404,0.004146,0,-1.096450,-0.170789,-0.323826,-0.343706
10126,2018/01/11,08:59:08,-0.579404,0.004146,0,-1.052613,-0.170789,-0.269891,-0.343706
10127,2018/01/11,08:59:39,-0.579404,0.004146,0,-1.052613,-0.170789,-0.226744,-0.343706


In [102]:
x = dataset3[['avg_temp', 'avg_light', 'avg_sound', 'avg_pir', 'S5_CO2', 'S5_CO2_Slope']].iloc[:].values
y = dataset3[['Room_Occupancy_Count']].iloc[:].values

#### First std then mean

In [116]:
dataset4 = pd.read_csv("Occupancy_Estimation.csv")
dataset4 = dataset4.iloc[:, 2:19]
dataset4.iloc[:, 0:16]

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR
0,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0
1,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0
2,25.00,24.75,24.50,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0
3,25.00,24.75,24.56,25.44,121,34,53,40,0.41,0.10,0.10,0.09,390,0.388462,0,0
4,25.00,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10124,25.06,25.13,24.69,25.31,6,7,33,22,0.09,0.04,0.06,0.08,345,0.000000,0,0
10125,25.06,25.06,24.69,25.25,6,7,34,22,0.07,0.05,0.05,0.08,345,0.000000,0,0
10126,25.13,25.06,24.69,25.25,6,7,34,22,0.11,0.05,0.06,0.08,345,0.000000,0,0
10127,25.13,25.06,24.69,25.25,6,7,34,22,0.08,0.08,0.10,0.08,345,0.000000,0,0


In [117]:
for i in range(0, 16):
    column = dataset4.columns[i]
    mean = dataset4[column].mean()
    std = dataset4[column].std()
    dataset4[column] = (dataset4[column] - mean) / std

In [118]:
dataset4

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,-1.462961,-1.357708,-1.162277,-1.049633,1.873213,0.118621,0.321083,1.366159,-0.278418,0.262413,-0.237211,-0.363270,-0.354364,0.664436,-0.314733,-0.294014,1
1,-1.462961,-1.357708,-1.162277,-0.881298,1.873213,0.103763,0.321083,1.366159,2.405433,-0.262910,-0.237211,-0.363270,-0.354364,0.558789,-0.314733,-0.294014,1
2,-1.292192,-1.357708,-1.302699,-0.881298,1.873213,0.118621,0.321083,1.366159,0.826697,-0.037771,-0.188860,-0.363270,-0.354364,0.449842,-0.314733,-0.294014,1
3,-1.292192,-1.357708,-1.162277,-0.881298,1.873213,0.118621,0.321083,1.366159,0.763548,-0.075294,-0.140508,-0.114685,-0.354364,0.337592,-0.314733,-0.294014,1
4,-1.292192,-1.357708,-1.162277,-0.881298,1.873213,0.118621,0.338206,1.366159,0.037329,-0.225387,-0.237211,-0.363270,-0.354364,0.222042,-0.314733,-0.294014,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10124,-1.121422,-0.709604,-0.858029,-1.246022,-0.381191,-0.282543,-0.021378,0.447895,-0.246843,-0.300433,-0.237211,-0.197546,-0.579404,0.004146,-0.314733,-0.294014,0
10125,-1.121422,-0.828992,-0.858029,-1.414356,-0.381191,-0.282543,-0.004255,0.447895,-0.309993,-0.262910,-0.261387,-0.197546,-0.579404,0.004146,-0.314733,-0.294014,0
10126,-0.922191,-0.828992,-0.858029,-1.414356,-0.381191,-0.282543,-0.004255,0.447895,-0.183694,-0.262910,-0.237211,-0.197546,-0.579404,0.004146,-0.314733,-0.294014,0
10127,-0.922191,-0.828992,-0.858029,-1.414356,-0.381191,-0.282543,-0.004255,0.447895,-0.278418,-0.150341,-0.140508,-0.197546,-0.579404,0.004146,-0.314733,-0.294014,0


In [119]:
temp = ['S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp']
light = ['S1_Light', 'S2_Light', 'S3_Light', 'S4_Light']
sound = ['S1_Sound', 'S2_Sound', 'S3_Sound', 'S4_Sound']
pir = ['S6_PIR', 'S7_PIR']
dataset4['avg_temp'] = dataset4[temp].mean(axis=1)
dataset4['avg_sound'] = dataset4[sound].mean(axis=1)
dataset4['avg_light'] = dataset4[light].mean(axis=1)
dataset4['avg_pir'] = dataset4[pir].mean(axis=1)
dataset4.drop(columns = temp + light + sound + pir, inplace = True)

In [120]:
x1 = dataset4[['avg_temp', 'avg_sound', 'avg_light', 'avg_pir', 'S5_CO2', 'S5_CO2_Slope']]
y1 = dataset4[['Room_Occupancy_Count']]

## Training Testing Split

In [106]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(dataset1[:, :-1], dataset1[:, -1], test_size = 0.2, random_state = 42)
x_train2, x_test2, y_train2, y_test2 = train_test_split(dataset2[:, :-1], dataset2[:, -1], test_size = 0.2, random_state = 42)
x_train3, x_test3, y_train3, y_test3 = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [121]:
x_train4, x_test4, y_train4, y_test4 = train_test_split(x1, y1, test_size = 0.2, random_state = 42)

## Testing Models

### KMeans - 0.8123 for combining features, 0.1841 for keeping all features

In [130]:
kmeans4 = KMeans(n_clusters = 4, init = "k-means++", random_state = 42)
y_pred4 = kmeans4.fit_predict(dataset1[:, :-1])
accuracy_score(dataset1[:, -1], y_pred4)

0.18412479020633823

### Logistic Regression 
#### newton-cg ~ lbfgs
#### Dataset 1

In [132]:
lr1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
lr2 = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=10000, random_state=42)
scores = cross_val_score(lr1, x_train1, y_train1, cv = 10, scoring = 'accuracy')
lr1.fit(x_train1, y_train1)
y_test_pred1 = lr1.predict(x_test1)
scores = cross_val_score(lr2, x_train1, y_train1, cv = 10, scoring = 'accuracy')
lr2.fit(x_train1, y_train1)
y_test_pred2 = lr2.predict(x_test1)
accuracy_score(y_test_pred1, y_test1), accuracy_score(y_test_pred2, y_test1)

(0.9925962487660415, 0.9921026653504442)

#### Dataset 2

In [133]:
lr3 = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
lr4 = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=10000, random_state=42)
scores = cross_val_score(lr3, x_train2, y_train2, cv = 10, scoring = 'accuracy')
lr3.fit(x_train2, y_train2)
y_test_pred3 = lr3.predict(x_test2)
scores = cross_val_score(lr4, x_train2, y_train2, cv = 10, scoring = 'accuracy')
lr4.fit(x_train2, y_train2)
y_test_pred4 = lr4.predict(x_test2)
accuracy_score(y_test_pred3, y_test2), accuracy_score(y_test_pred4, y_test2)

(0.9925962487660415, 0.9921026653504442)

#### Dataset 3

In [137]:
lr5 = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
lr6 = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=10000, random_state=42)
scores = cross_val_score(lr5, x_train3, y_train3, cv = 10, scoring = 'accuracy')
lr5.fit(x_train3, y_train3)
y_test_pred5 = lr5.predict(x_test3)
scores = cross_val_score(lr6, x_train3, y_train3, cv = 10, scoring = 'accuracy')
lr6.fit(x_train3, y_train3)
y_test_pred6 = lr6.predict(x_test3)
accuracy_score(y_test_pred5, y_test3), accuracy_score(y_test_pred6, y_test3)

(0.9521224086870681, 0.9526159921026653)

#### Dataset 4

In [138]:
lr7 = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
lr8 = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=10000, random_state=42)
scores = cross_val_score(lr7, x_train4, y_train4, cv = 10, scoring = 'accuracy')
lr7.fit(x_train4, y_train4)
y_test_pred7 = lr7.predict(x_test4)
scores = cross_val_score(lr8, x_train4, y_train4, cv = 10, scoring = 'accuracy')
lr8.fit(x_train4, y_train4)
y_test_pred8 = lr8.predict(x_test4)
accuracy_score(y_test_pred7, y_test4), accuracy_score(y_test_pred8, y_test4)

(0.9427443237907206, 0.9427443237907206)

### KNeighbors
#### Dataset 1

In [139]:
knn1 = KNeighborsClassifier()
scores = cross_val_score(knn1, x_train1, y_train1, cv = 10, scoring = 'accuracy')
knn1.fit(x_train1, y_train1)
y_test_pred1 = knn1.predict(x_test1)
accuracy_score(y_test_pred1, y_test1)

0.9901283316880553

#### Dataset 2

In [140]:
knn2 = KNeighborsClassifier()
scores = cross_val_score(knn2, x_train2, y_train2, cv = 10, scoring = 'accuracy')
knn2.fit(x_train2, y_train2)
y_test_pred2 = knn2.predict(x_test2)
accuracy_score(y_test_pred2, y_test2)

0.9901283316880553

#### Dataset 3

In [141]:
knn3 = KNeighborsClassifier()
scores = cross_val_score(knn3, x_train3, y_train3, cv = 10, scoring = 'accuracy')
knn3.fit(x_train3, y_train3)
y_test_pred3 = knn3.predict(x_test3)
accuracy_score(y_test_pred3, y_test3)

0.9846989141164857

#### Dataset 4

In [142]:
knn4 = KNeighborsClassifier()
scores = cross_val_score(knn4, x_train4, y_train4, cv = 10, scoring = 'accuracy')
knn4.fit(x_train4, y_train4)
y_test_pred4 = knn4.predict(x_test4)
accuracy_score(y_test_pred4, y_test4)

0.9837117472852912

### Gradient Boosting
#### Dataset 1

In [143]:
gsb1 = GradientBoostingClassifier()
scores = cross_val_score(gsb1, x_train1, y_train1, cv = 10, scoring = 'accuracy')
gsb1.fit(x_train1, y_train1)
y_test_pred1 = gsb1.predict(x_test1)
accuracy_score(y_test_pred1, y_test1)

0.9950641658440277

#### Dataset 2

In [144]:
gsb2 = GradientBoostingClassifier()
scores = cross_val_score(gsb2, x_train2, y_train2, cv = 10, scoring = 'accuracy')
gsb2.fit(x_train2, y_train2)
y_test_pred2 = gsb2.predict(x_test2)
accuracy_score(y_test_pred2, y_test2)

0.9950641658440277

#### Dataset 3

In [145]:
gsb3 = GradientBoostingClassifier()
scores = cross_val_score(gsb3, x_train3, y_train3, cv = 10, scoring = 'accuracy')
gsb3.fit(x_train3, y_train3)
y_test_pred3 = gsb3.predict(x_test3)
accuracy_score(y_test_pred3, y_test3)

0.9965449160908193

#### Dataset 4

In [146]:
gsb4 = GradientBoostingClassifier()
scores = cross_val_score(gsb4, x_train4, y_train4, cv = 10, scoring = 'accuracy')
gsb4.fit(x_train4, y_train4)
y_test_pred4 = gsb4.predict(x_test4)
accuracy_score(y_test_pred4, y_test4)

0.9960513326752222

### Random Forest
#### Dataset 1

In [148]:
rf1 = RandomForestClassifier(random_state = 42)
scores = cross_val_score(rf1, x_train1, y_train1, cv = 10, scoring = 'accuracy')
rf1.fit(x_train1, y_train1)
y_test_pred1 = rf1.predict(x_test1)
accuracy_score(y_test_pred1, y_test1)

0.9985192497532083

#### Dataset 2

In [149]:
rf2 = RandomForestClassifier(random_state = 42)
scores = cross_val_score(rf2, x_train2, y_train2, cv = 10, scoring = 'accuracy')
rf2.fit(x_train2, y_train2)
y_test_pred2 = rf2.predict(x_test2)
accuracy_score(y_test_pred2, y_test2)

0.9985192497532083

#### Dataset 3

In [150]:
rf3 = RandomForestClassifier(random_state = 42)
scores = cross_val_score(rf3, x_train3, y_train3, cv = 10, scoring = 'accuracy')
rf3.fit(x_train3, y_train3)
y_test_pred3 = rf3.predict(x_test3)
accuracy_score(y_test_pred3, y_test3)

0.9970384995064165

#### Dataset 4

In [151]:
rf4 = RandomForestClassifier(random_state = 42)
scores = cross_val_score(rf4, x_train4, y_train4, cv = 10, scoring = 'accuracy')
rf4.fit(x_train4, y_train4)
y_test_pred4 = rf4.predict(x_test4)
accuracy_score(y_test_pred4, y_test4)

0.9960513326752222