In [111]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch
from sklearn.datasets import load_iris
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from bayes_opt import BayesianOptimization
import shap


In [86]:
# Load the raw table from disk.
sample_data = pd.read_csv("sample.csv")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
sample_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14733 entries, 0 to 14732
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  14733 non-null  int64  
 1   co2         14733 non-null  int64  
 2   temp        14733 non-null  float64
 3   humid       14733 non-null  float64
 4   csd         14733 non-null  int64  
 5   sound       14733 non-null  int64  
 6   time        14733 non-null  int64  
 7   state       14733 non-null  object 
 8   age         14733 non-null  int64  
 9   gender      14733 non-null  object 
 10  height      14733 non-null  int64  
 11  weight      14733 non-null  int64  
 12  disease     14733 non-null  object 
 13  depressive  14733 non-null  int64  
 14  disorder    14733 non-null  object 
 15  media       14733 non-null  int64  
 16  liquor      14733 non-null  int64  
 17  smoke       14733 non-null  int64  
 18  caffeine    14733 non-null  int64  
 19  exercise    14733 non-nul

In [35]:
# Keep only the rows whose co2 value lies between its 10% and 90% quantiles,
# then one-hot encode the categorical (object) columns. get_dummies keeps the
# non-encoded columns first, in their original order, and appends the
# indicator columns after them.
sample_data = remove_outliers(sample_data, "co2", 0.1, 0.9)
sample_data = pd.get_dummies(sample_data)                       # One-hot encoding

# Train - Test Split
# Columns are picked by name instead of by position so the split cannot
# silently select the wrong columns if the CSV layout changes. "Unnamed: 0" is
# excluded from the features, exactly as the positional slice did.
target_columns = ["co2", "temp", "humid", "csd", "sound"]
x_data = sample_data.drop(columns=["Unnamed: 0"] + target_columns)
y_data = sample_data[target_columns]
train_x, valid_x, train_y, valid_y = train_test_split(x_data, y_data, test_size=0.2, shuffle=True, random_state=1)

11964/14733 data points remain.


In [52]:
def remove_outliers(df, column_name, lower, upper):
    """Return the rows of ``df`` whose ``column_name`` value lies between two quantiles.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: Column whose values decide which rows are kept.
        lower: Lower quantile (0..1); rows below this quantile's value are removed.
        upper: Upper quantile (0..1); rows above this quantile's value are removed.

    Returns:
        A new DataFrame holding only the rows inside the (inclusive) quantile band.

    Side effects:
        Prints ``"<kept>/<total> data points remain."``.
    """
    values = df[column_name]
    keep_mask = values.between(values.quantile(lower), values.quantile(upper))

    # The total is taken from `df` itself rather than from a notebook-level
    # variable, so the function also works on frames other than `sample_data`.
    print(str(int(keep_mask.sum())) + "/" + str(values.size) + " data points remain.")

    # Boolean selection keeps exactly the rows flagged above; dropping by index
    # label would also discard kept rows that share a label with an outlier.
    return df[keep_mask]


def PlotMultiplePie(df, categorical_features=None, dropna=False, threshold=30):
    """Show one plotly pie chart per categorical column of ``df``.

    Args:
        df: DataFrame holding the columns to plot.
        categorical_features: Column names to plot. When None, every column of
            dtype ``object`` or ``category`` is used.
        dropna: Forwarded to ``nunique`` and ``value_counts``; when False,
            missing values are counted as their own category.
        threshold: Maximum number of unique values a column may have and still
            get a pie chart; columns above it only get a text summary.

    Returns:
        None. Charts are displayed with ``fig.show()`` and summaries are printed.
    """
    # if user did not set categorical_features
    if categorical_features is None:
        categorical_features = df.select_dtypes(['object', 'category']).columns.to_list()

    print("The Categorical Features are:", categorical_features)

    # loop through the list of categorical_features
    for cat_feature in categorical_features:
        num_unique = df[cat_feature].nunique(dropna=dropna)
        num_missing = df[cat_feature].isna().sum()
        # prints pie chart and info if unique values are at or below the threshold
        if num_unique <= threshold:
            print('Pie Chart for: ', cat_feature)
            print('Number of Unique Values: ', num_unique)
            print('Number of Missing Values: ', num_missing)
            # Count once and hand plotly the raw counts and labels. Referring
            # to the counts by column name breaks when the counts Series is
            # not named after the source column (pandas 2.x names it "count").
            counts = df[cat_feature].value_counts(dropna=dropna)
            fig = px.pie(values=counts.to_numpy(), names=counts.index,
                         title=cat_feature, template='ggplot2')
            fig.show()
        else:
            print('Pie Chart for ', cat_feature, ' is unavailable due high number of Unique Values ')
            print('Number of Unique Values: ', num_unique)
            print('Number of Missing Values: ', num_missing)
            print('\n')


def evaluateRegressor(true, predicted, message="    Test Set"):
    """Print MSE, MAE, RMSE and R-squared for a set of predictions.

    Args:
        true: Ground-truth target values (single- or multi-output).
        predicted: Predicted values with the same layout as ``true``.
        message: Heading printed before the metrics.

    Returns:
        None. The metrics are only printed.
    """
    # Ask for one MSE per output column and average here, instead of using the
    # `squared=` keyword, which newer scikit-learn releases no longer accept.
    per_output_mse = mean_squared_error(true, predicted, multioutput="raw_values")
    MSE = np.mean(per_output_mse)
    MAE = mean_absolute_error(true, predicted)
    # Root first, then average across outputs: this matches what
    # `squared=False` reported for multi-output targets.
    RMSE = np.mean(np.sqrt(per_output_mse))
    R_squared = r2_score(true, predicted)

    print(message)
    print("MSE :", MSE)
    print("MAE :", MAE)
    print("RMSE :", RMSE)
    print("R-Squared :", R_squared)
    
    
def evaluateKFold(model, x=None, y=None):
    """Print the mean RMSE of ``model`` under repeated 5-fold cross-validation.

    Args:
        model: Estimator passed to ``cross_val_score``.
        x: Feature matrix. When None, the notebook-level ``x_data`` is used.
        y: Targets. When None, the notebook-level ``y_data`` is used.

    Returns:
        None. The score is printed as ``RMSE: <value>`` with three decimals.
    """
    # Fall back to the notebook globals so existing evaluateKFold(model) calls
    # behave as before, while still allowing explicit data to be passed in.
    if x is None:
        x = x_data
    if y is None:
        y = y_data

    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    n_scores = cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    # The scorer returns negated MSE per fold; flip the sign, then take the root.
    rmse = np.sqrt(-1 * n_scores)
    print('RMSE: %.3f' % np.mean(rmse))
    
    
def 

In [129]:
# Rebuild the modelling table from the CSV: drop rows whose co2 value falls
# outside its 10%-90% quantile band, then one-hot encode the object columns.
sample_data = pd.read_csv("sample.csv")
sample_data = remove_outliers(sample_data, "co2", 0.1, 0.9)
sample_data = pd.get_dummies(sample_data)                       # One-hot encoding

# Select columns by name instead of by position so the split cannot silently
# pick the wrong columns if the CSV layout changes. "Unnamed: 0" is excluded
# from the features, exactly as the positional slice did.
target_columns = ["co2", "temp", "humid", "csd", "sound"]
x_data = sample_data.drop(columns=["Unnamed: 0"] + target_columns)
y_data = sample_data[target_columns]
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=42)
x_data.info(verbose=True, show_counts=True)

# Features and targets each get their own scaler. Refitting a single scaler on
# the targets would discard the feature statistics and make the code depend on
# the exact order of the fit/transform calls.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
x_train_scaled = x_scaler.fit_transform(x_train)
x_test_scaled = x_scaler.transform(x_test)
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)
# `sc` stays bound to the target scaler: that is what it held after the
# original refit, and cells that call sc.inverse_transform(...) on predictions
# rely on it.
sc = y_scaler

# float32 copies of the scaled arrays, wrapped as torch tensors.
x_train_scaled = x_train_scaled.astype(np.float32)
y_train_scaled = y_train_scaled.astype(np.float32)
x_test_scaled = x_test_scaled.astype(np.float32)
y_test_scaled = y_test_scaled.astype(np.float32)

inputs = torch.from_numpy(x_train_scaled)
targets = torch.from_numpy(y_train_scaled)

test_inputs = torch.from_numpy(x_test_scaled)
test_targets = torch.from_numpy(y_test_scaled)

11964/14733 data points remain.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11964 entries, 233 to 14732
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   time           11964 non-null  int64
 1   age            11964 non-null  int64
 2   height         11964 non-null  int64
 3   weight         11964 non-null  int64
 4   depressive     11964 non-null  int64
 5   media          11964 non-null  int64
 6   liquor         11964 non-null  int64
 7   smoke          11964 non-null  int64
 8   caffeine       11964 non-null  int64
 9   exercise       11964 non-null  int64
 10  stress         11964 non-null  int64
 11  nap            11964 non-null  int64
 12  state_asleep   11964 non-null  uint8
 13  state_awake    11964 non-null  uint8
 14  gender_female  11964 non-null  uint8
 15  gender_male    11964 non-null  uint8
 16  disease_none   11964 non-null  uint8
 17  disorder_no    11964 non-null  uint8
 18  disorder_yes

In [130]:
# Random forest fitted on the scaled training tensors.
print("Random Forest Regressor")
RFRegModel = RandomForestRegressor(random_state=0)
RFRegModel.fit(inputs, targets)

# Metrics in scaled target space: training split first, then the held-out split.
predict_train_y = RFRegModel.predict(inputs)
evaluateRegressor(targets, predict_train_y, message="    Training Set")
predict_valid_y = RFRegModel.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

# Map the held-out predictions back through `sc` and score them against y_test.
predict_valid_y = sc.inverse_transform(predict_valid_y)
evaluateRegressor(y_test, predict_valid_y, message="    Valid Set")

Random Forest Regressor
    Training Set
MSE : 0.21282695962886905
MAE : 0.17481054727271722
RMSE : 0.45810987846040235
R-Squared : 0.7871730385051497
    Test Set
MSE : 0.5324825903698143
MAE : 0.2874270507676731
RMSE : 0.6862615546984424
R-Squared : 0.543861072193424
    Valid Set
MSE : 30640.823163754092
MAE : 49.36049442623866
RMSE : 82.86436243854753
R-Squared : 0.5438610932717622


In [131]:
# Ordinary least squares fitted on the scaled training tensors.
print("Linear Regression")
LinearModel = LinearRegression()
LinearModel.fit(inputs, targets)

# Metrics in scaled target space: training split first, then the held-out split.
predict_train_y = LinearModel.predict(inputs)
evaluateRegressor(targets, predict_train_y, message="    Training Set")
predict_valid_y = LinearModel.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

# Map the held-out predictions back through `sc` and score them against y_test.
predict_valid_y = sc.inverse_transform(predict_valid_y)
evaluateRegressor(y_test, predict_valid_y, message="    Valid Set")

Linear Regression
    Training Set
MSE : 0.7465827
MAE : 0.3963996
RMSE : 0.85488796
R-Squared : 0.25341426251852184
    Test Set
MSE : 1.1032459
MAE : 0.41297412
RMSE : 1.0085735
R-Squared : 0.2445805607796104
    Valid Set
MSE : 53098.35658385869
MAE : 87.83183923683536
RMSE : 109.97615236260947
R-Squared : 0.24458057395398886


In [133]:
# Decision tree fitted on the scaled training tensors.
print("Decision Tree")
# A fixed random_state makes the fitted tree, and therefore the printed
# metrics, reproducible between runs (the random forest cell is seeded too).
DTModel = DecisionTreeRegressor(random_state=0).fit(inputs, targets)

# Metrics in scaled target space: training split first, then the held-out split.
predict_train_y = DTModel.predict(inputs)
evaluateRegressor(targets, predict_train_y, "    Training Set")
predict_valid_y = DTModel.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

# Map the held-out predictions back through `sc` and score them against y_test.
predict_valid_y = sc.inverse_transform(predict_valid_y)
evaluateRegressor(y_test, predict_valid_y,"    Valid Set")

Decision Tree
    Training Set
MSE : 0.1844346406583246
MAE : 0.14337311493214291
RMSE : 0.42686504038713535
R-Squared : 0.8155653579447601
    Test Set
MSE : 0.6413739521228625
MAE : 0.31504017149974173
RMSE : 0.7526414566399733
R-Squared : 0.44336268449380345
    Valid Set
MSE : 38596.23070366688
MAE : 55.05880725116339
RMSE : 92.91951924263375
R-Squared : 0.4433627117138833


In [134]:
# k-nearest-neighbours regressor (library defaults) on the scaled training tensors.
print("K-Nearest Neighbors")
KNNModel = KNeighborsRegressor()
KNNModel.fit(inputs, targets)

# Metrics in scaled target space: training split first, then the held-out split.
predict_train_y = KNNModel.predict(inputs)
evaluateRegressor(targets, predict_train_y, message="    Training Set")
predict_valid_y = KNNModel.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

# Map the held-out predictions back through `sc` and score them against y_test.
predict_valid_y = sc.inverse_transform(predict_valid_y)
evaluateRegressor(y_test, predict_valid_y, message="    Valid Set")

K-Nearest Neighbors
    Training Set
MSE : 0.3265744
MAE : 0.22061
RMSE : 0.5609566
R-Squared : 0.673424550724245
    Test Set
MSE : 0.54549974
MAE : 0.26752687
RMSE : 0.71626014
R-Squared : 0.5622341696635311
    Valid Set
MSE : 22383.92792640093
MAE : 42.29160953146603
RMSE : 71.84273833729908
R-Squared : 0.5622341883403782


In [136]:
# Linear SVR wrapped in MultiOutputRegressor to handle the multi-column targets.
print("Support Vector Machine - Direct Multioutput")
model = MultiOutputRegressor(
    LinearSVR(C=0.1, random_state=1, max_iter=10000000)
)
model.fit(inputs, targets)

# Metrics in scaled target space: training split first, then the held-out split.
predict_train_y = model.predict(inputs)
evaluateRegressor(targets, predict_train_y, message="    Training Set")
predict_valid_y = model.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

# Map the held-out predictions back through `sc` and score them against y_test.
predict_valid_y = sc.inverse_transform(predict_valid_y)
evaluateRegressor(y_test, predict_valid_y, message="    Valid Set")

Support Vector Machine - Direct Multioutput
    Training Set
MSE : 0.7837528988616091
MAE : 0.36431455077677805
RMSE : 0.8779997259875788
R-Squared : 0.21624709553414484
    Test Set
MSE : 1.1519434875432242
MAE : 0.3790461291792468
RMSE : 1.034740184466337
R-Squared : 0.20541327801521633
    Valid Set
MSE : 62484.80885698184
MAE : 83.68494066662603
RMSE : 118.8481415476241
R-Squared : 0.20541332094093398


In [137]:
# Linear SVR wrapped in RegressorChain to handle the multi-column targets.
print("Support Vector Machine - Chained Multioutput")
model = RegressorChain(
    LinearSVR(C=0.1, random_state=1, max_iter=10000000)
)
model.fit(inputs, targets)

# Metrics in scaled target space: training split first, then the held-out split.
predict_train_y = model.predict(inputs)
evaluateRegressor(targets, predict_train_y, message="    Training Set")
predict_valid_y = model.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

# Map the held-out predictions back through `sc` and score them against y_test.
predict_valid_y = sc.inverse_transform(predict_valid_y)
evaluateRegressor(y_test, predict_valid_y, message="    Valid Set")

Support Vector Machine - Chained Multioutput
    Training Set
MSE : 0.7796906753483435
MAE : 0.3700710829137071
RMSE : 0.8757549688692958
R-Squared : 0.22030931903716602
    Test Set
MSE : 1.1493596411834028
MAE : 0.38566303700126797
RMSE : 1.0333806532205085
R-Squared : 0.2077046131376465
    Valid Set
MSE : 62484.52650616178
MAE : 83.74716605107486
RMSE : 118.83362555424824
R-Squared : 0.20770465600389007


In [138]:
# Rebuild the modelling table from the CSV: drop rows whose co2 value falls
# outside its 10%-90% quantile band, then one-hot encode the object columns.
sample_data = pd.read_csv("sample.csv")
sample_data = remove_outliers(sample_data, "co2", 0.1, 0.9)
sample_data = pd.get_dummies(sample_data)                       # One-hot encoding

# Select columns by name instead of by position so the split cannot silently
# pick the wrong columns if the CSV layout changes. "Unnamed: 0" is excluded
# from the features, exactly as the positional slice did.
target_columns = ["co2", "temp", "humid", "csd", "sound"]
x_data = sample_data.drop(columns=["Unnamed: 0"] + target_columns)
y_data = sample_data[target_columns]
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=42)
x_data.info(verbose=True, show_counts=True)

# Features and targets each get their own scaler. Refitting a single scaler on
# the targets would discard the feature ranges and make the code depend on the
# exact order of the fit/transform calls.
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
x_train_scaled = x_scaler.fit_transform(x_train)
x_test_scaled = x_scaler.transform(x_test)
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)
# `sc` stays bound to the target scaler, which is what it held after the
# original refit.
sc = y_scaler

# float32 copies of the scaled arrays, wrapped as torch tensors.
x_train_scaled = x_train_scaled.astype(np.float32)
y_train_scaled = y_train_scaled.astype(np.float32)
x_test_scaled = x_test_scaled.astype(np.float32)
y_test_scaled = y_test_scaled.astype(np.float32)

inputs = torch.from_numpy(x_train_scaled)
targets = torch.from_numpy(y_train_scaled)

test_inputs = torch.from_numpy(x_test_scaled)
test_targets = torch.from_numpy(y_test_scaled)

11964/14733 data points remain.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11964 entries, 233 to 14732
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   time           11964 non-null  int64
 1   age            11964 non-null  int64
 2   height         11964 non-null  int64
 3   weight         11964 non-null  int64
 4   depressive     11964 non-null  int64
 5   media          11964 non-null  int64
 6   liquor         11964 non-null  int64
 7   smoke          11964 non-null  int64
 8   caffeine       11964 non-null  int64
 9   exercise       11964 non-null  int64
 10  stress         11964 non-null  int64
 11  nap            11964 non-null  int64
 12  state_asleep   11964 non-null  uint8
 13  state_awake    11964 non-null  uint8
 14  gender_female  11964 non-null  uint8
 15  gender_male    11964 non-null  uint8
 16  disease_none   11964 non-null  uint8
 17  disorder_no    11964 non-null  uint8
 18  disorder_yes

In [113]:
# Ordinary least squares fitted on the scaled training tensors.
print("Linear Regression")
LinearModel = LinearRegression()
LinearModel.fit(inputs, targets)

# Metrics are reported in scaled target space for both splits.
predict_train_y = LinearModel.predict(inputs)
evaluateRegressor(targets, predict_train_y, message="    Training Set")
predict_valid_y = LinearModel.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

Linear Regression
    Training Set
MSE : 0.019822303
MAE : 0.0790712
RMSE : 0.12488464
R-Squared : 0.2533967113057486
    Test Set
MSE : 0.021200707
MAE : 0.08029294
RMSE : 0.13335395
R-Squared : 0.24416215362859636


In [114]:
# Decision tree fitted on the scaled training tensors.
print("Decision Tree")
# A fixed random_state makes the fitted tree, and therefore the printed
# metrics, reproducible between runs.
DTModel = DecisionTreeRegressor(random_state=0).fit(inputs, targets)

# Metrics are reported in scaled target space for both splits.
predict_train_y = DTModel.predict(inputs)
evaluateRegressor(targets, predict_train_y, "    Training Set")
predict_valid_y = DTModel.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

Decision Tree
    Training Set
MSE : 0.005396248611602863
MAE : 0.028314099432858324
RMSE : 0.06452840056054732
R-Squared : 0.8155653493613697
    Test Set
MSE : 0.02309544121435432
MAE : 0.06484064444410063
RMSE : 0.12830083910558016
R-Squared : 0.44575200184624447


In [116]:
# k-nearest-neighbours regressor (library defaults) on the scaled training tensors.
print("K-Nearest Neighbors")
KNNModel = KNeighborsRegressor()
KNNModel.fit(inputs, targets)

# Metrics are reported in scaled target space for both splits.
predict_train_y = KNNModel.predict(inputs)
evaluateRegressor(targets, predict_train_y, message="    Training Set")
predict_valid_y = KNNModel.predict(test_inputs)
evaluateRegressor(test_targets, predict_valid_y)

K-Nearest Neighbors
    Training Set
MSE : 0.00995218
MAE : 0.044617157
RMSE : 0.08639766
R-Squared : 0.6739333354876149
    Test Set
MSE : 0.015262583
MAE : 0.053751815
RMSE : 0.105503485
R-Squared : 0.5924820974157642
