In [85]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import torch

from sklearn.datasets import load_iris
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from bayes_opt import BayesianOptimization
import shap


In [86]:
# Load the CSV into the notebook-level frame used by the cells below.
sample_data = pd.read_csv("sample.csv")
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line after the table.
sample_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14733 entries, 0 to 14732
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  14733 non-null  int64  
 1   co2         14733 non-null  int64  
 2   temp        14733 non-null  float64
 3   humid       14733 non-null  float64
 4   csd         14733 non-null  int64  
 5   sound       14733 non-null  int64  
 6   time        14733 non-null  int64  
 7   state       14733 non-null  object 
 8   age         14733 non-null  int64  
 9   gender      14733 non-null  object 
 10  height      14733 non-null  int64  
 11  weight      14733 non-null  int64  
 12  disease     14733 non-null  object 
 13  depressive  14733 non-null  int64  
 14  disorder    14733 non-null  object 
 15  media       14733 non-null  int64  
 16  liquor      14733 non-null  int64  
 17  smoke       14733 non-null  int64  
 18  caffeine    14733 non-null  int64  
 19  exercise    14733 non-nul

In [35]:
# Keep only rows whose co2 value lies between its 10th and 90th percentile,
# then turn the text columns into 0/1 indicator columns.
sample_data = remove_outliers(sample_data, "co2", 0.1, 0.9)
sample_data = pd.get_dummies(sample_data)

# Targets are positional columns 1-5; features are everything from
# positional column 6 onwards (column 0 is left out of both).
y_data = sample_data.iloc[:, 1:6]
x_data = sample_data.iloc[:, 6:]

# Shuffled 80/20 hold-out split with a fixed seed.
train_x, valid_x, train_y, valid_y = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

11964/14733 data points remain.


In [52]:
def remove_outliers(df, column_name, lower, upper):
    """Return the rows of ``df`` whose ``column_name`` value lies between two quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column_name : str
        Column whose quantiles define the accepted range.
    lower, upper : float
        Quantile levels handed to ``Series.quantile``. Both resulting bounds
        are inclusive (``Series.between`` default).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the in-range rows, with their original
        index labels.

    Side effects
    ------------
    Prints ``"<kept>/<total> data points remain."`` where ``<total>`` is the
    number of rows in ``df``.
    """
    values = df[column_name]
    in_range = values.between(values.quantile(lower), values.quantile(upper))

    # Report against the frame that was passed in. The total used to be read
    # from the notebook global `sample_data`, which is wrong (or a NameError)
    # whenever the function is called on any other frame.
    print(str(int(in_range.sum())) + "/" + str(len(values)) + " data points remain.")

    # Select by position with the boolean mask instead of dropping by index
    # label: dropping by label also removes in-range rows that happen to share
    # a label with an out-of-range row.
    return df[in_range.to_numpy()]


def PlotMultiplePie(df, categorical_features=None, dropna=False, threshold=30):
    """Show one plotly pie chart per categorical column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    categorical_features : list of str, optional
        Columns to plot. When omitted, every ``object`` / ``category`` column
        of ``df`` is used.
    dropna : bool, default False
        Forwarded to ``nunique`` and ``value_counts``; with ``False`` missing
        values are counted as their own category.
    threshold : int, default 30
        Maximum number of distinct values a column may have and still get a
        chart. Columns above it only get their counts printed, because a pie
        with that many slices is unreadable.

    Returns
    -------
    None
        Charts are displayed with ``fig.show()`` and the summary is printed.
    """
    # if user did not set categorical_features
    if categorical_features is None:
        categorical_features = df.select_dtypes(['object', 'category']).columns.to_list()

    print("The Categorical Features are:", categorical_features)

    for cat_feature in categorical_features:
        column = df[cat_feature]
        num_unique = column.nunique(dropna=dropna)
        num_missing = column.isna().sum()

        if num_unique <= threshold:
            print('Pie Chart for: ', cat_feature)
            print('Number of Unique Values: ', num_unique)
            print('Number of Missing Values: ', num_missing)
            # Count once and hand plotly plain arrays. Referring to the counts
            # by the column's name only works on pandas < 2.0; from 2.0 on the
            # value_counts() result is named "count", so that lookup fails.
            counts = column.value_counts(dropna=dropna)
            fig = px.pie(names=counts.index, values=counts.to_numpy(),
                         title=cat_feature, template='ggplot2')
            fig.show()
        else:
            print('Pie Chart for ', cat_feature, ' is unavailable due high number of Unique Values ')
            print('Number of Unique Values: ', num_unique)
            print('Number of Missing Values: ', num_missing)
            print('\n')


def evaluateRegressor(true, predicted, message="    Test Set"):
    """Print MSE, MAE, RMSE and R-squared for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth targets, single- or multi-output.
    predicted : array-like
        Model predictions with the same shape as ``true``.
    message : str, default "    Test Set"
        Header line printed before the metrics.

    Returns
    -------
    None
        The metrics are only printed.

    Notes
    -----
    For multi-output targets every metric is the plain average over the
    targets. In particular RMSE is the mean of the per-target RMSEs, which is
    not the square root of the printed MSE.
    """
    # Ask for the per-target errors and average them here. This avoids the
    # `squared=` keyword, which newer scikit-learn releases deprecate and then
    # drop, while giving the same values as squared=True / squared=False.
    per_target_mse = mean_squared_error(true, predicted, multioutput="raw_values")
    MSE = np.mean(per_target_mse)
    RMSE = np.mean(np.sqrt(per_target_mse))
    MAE = mean_absolute_error(true, predicted)
    R_squared = r2_score(true, predicted)

    print(message)
    print("MSE :", MSE)
    print("MAE :", MAE)
    print("RMSE :", RMSE)
    print("R-Squared :", R_squared)
    
    
def evaluateKFold(model):
    """Cross-validate ``model`` and print its mean RMSE.

    The data comes from the notebook-level ``x_data`` and ``y_data``, not from
    arguments, so whichever cell assigned them last decides what is evaluated.
    Splitting is 5-fold, repeated 3 times, with a fixed seed.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.

    Returns
    -------
    None
        The result is printed as ``RMSE: <value>`` with three decimals.
    """
    splitter = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    neg_mse_per_fold = cross_val_score(
        model,
        x_data,
        y_data,
        scoring='neg_mean_squared_error',
        cv=splitter,
        n_jobs=-1,
    )
    # The scorer returns negated MSE; flip the sign before taking the root.
    fold_rmse = np.sqrt(-neg_mse_per_fold)
    print('RMSE: %.3f' % fold_rmse.mean())
    

In [9]:
# Baseline: seeded random forest on the unscaled hold-out split.
print("Random Forest Regressor")
RFRegModel = RandomForestRegressor(random_state=0)
RFRegModel.fit(train_x, train_y)

# Predict on both partitions, then report fit quality followed by
# held-out quality.
predict_train_y = RFRegModel.predict(train_x)
predict_valid_y = RFRegModel.predict(valid_x)
evaluateRegressor(train_y, predict_train_y, "    Training Set")
evaluateRegressor(valid_y, predict_valid_y)

Random Forest Regressor
    Training Set
MSE : 11220.805026085482
MAE : 26.540365094686575
RMSE : 50.37367219658843
R-Squared : 0.8234099125127645
    Test Set
MSE : 22554.46964063669
MAE : 40.020219152364646
RMSE : 71.47945783825769
R-Squared : 0.5033426403891046


In [11]:
# Baseline: ordinary least squares on the unscaled hold-out split.
print("Linear Regression")
LinearModel = LinearRegression()
LinearModel.fit(train_x, train_y)

# Predict on both partitions, then report fit quality followed by
# held-out quality.
predict_train_y = LinearModel.predict(train_x)
predict_valid_y = LinearModel.predict(valid_x)
evaluateRegressor(train_y, predict_train_y, "    Training Set")
evaluateRegressor(valid_y, predict_valid_y)

Linear Regression
    Training Set
MSE : 42227.40739074471
MAE : 78.69411648569721
RMSE : 98.13463580045149
R-Squared : 0.231980148517252
    Test Set
MSE : 44805.73367211919
MAE : 80.68214482199919
RMSE : 100.25197430896894
R-Squared : 0.2189286764034044


In [14]:
# Baseline: k-nearest-neighbours with library defaults on the unscaled
# hold-out split.
print("K-Nearest Neighbors")
KNNModel = KNeighborsRegressor()
KNNModel.fit(train_x, train_y)

# Predict on both partitions, then report fit quality followed by
# held-out quality.
predict_train_y = KNNModel.predict(train_x)
predict_valid_y = KNNModel.predict(valid_x)
evaluateRegressor(train_y, predict_train_y, "    Training Set")
evaluateRegressor(valid_y, predict_valid_y)

K-Nearest Neighbors
    Training Set
MSE : 17744.023845028394
MAE : 40.806171915332996
RMSE : 63.57574106562765
R-Squared : 0.6728434000154995
    Test Set
MSE : 26812.53876662881
MAE : 49.592807434176564
RMSE : 78.37401441075362
R-Squared : 0.388076176682229


In [17]:
# Baseline: single decision tree on the unscaled hold-out split.
print("Decision Tree")
# Seeded like the random forest cell: the tree breaks ties between equally
# good splits at random, so without a seed the scores drift between runs.
DTModel = DecisionTreeRegressor(random_state=0).fit(train_x, train_y)
predict_train_y = DTModel.predict(train_x)
evaluateRegressor(train_y, predict_train_y, "    Training Set")
predict_valid_y = DTModel.predict(valid_x)
evaluateRegressor(valid_y, predict_valid_y)

Decision Tree
    Training Set
MSE : 10932.62268883695
MAE : 23.79475082095386
RMSE : 49.43537462113815
R-Squared : 0.8549891675019519
    Test Set
MSE : 27310.35023391786
MAE : 43.56721525044679
RMSE : 78.76822797563337
R-Squared : 0.3740313102278825


In [53]:
# Fresh, unfitted copies of each baseline for repeated k-fold evaluation.
# Note this rebinds the model names used by the hold-out cells above.
RFRegModel = RandomForestRegressor(random_state=0)
KNNModel = KNeighborsRegressor()
LinearModel = LinearRegression()
# Seeded so the cross-validated score is the same on every run; the tree
# picks between equally good splits at random otherwise.
DTModel = DecisionTreeRegressor(random_state=0)

# One header + one RMSE line per model, in this order.
for label, estimator in (
    ("Random Forest Regressor", RFRegModel),
    ("K-Nearest Neighbors", KNNModel),
    ("Linear Regression", LinearModel),
    ("Decision Tree ", DTModel),
):
    print(label)
    evaluateKFold(estimator)

Random Forest Regressor
RMSE: 177.052
Decision Tree
RMSE: 198.555
K-Nearest Neighbors
RMSE: 185.799
Linear Regression
RMSE: 231.237


In [55]:
# LinearSVR fits one target at a time, so it needs a multi-output wrapper.
# Both variants are seeded: the solver shuffles the data with a random
# generator, so unseeded runs give slightly different scores each time.

# Direct: an independent LinearSVR per target.
print("Support Vector Machine - Direct Multioutput")
model = LinearSVR(random_state=0)
wrapper = MultiOutputRegressor(model)
evaluateKFold(wrapper)

# Chained: each target's model also sees the predictions for earlier targets.
print("Support Vector Machine - Chained Multioutput")
model = LinearSVR(random_state=0)
wrapper = RegressorChain(model)
evaluateKFold(wrapper)

Support Vector Machine - Direct Multioutput
RMSE: 257.493
Support Vector Machine - Chained Multioutput
RMSE: 254.937


In [87]:
# Rebuild the modelling table straight from the CSV so this cell gives the
# same result no matter which earlier cells have been run.
sample_data = pd.read_csv("sample.csv")
sample_data = remove_outliers(sample_data, "co2", 0.1, 0.9)
sample_data = pd.get_dummies(sample_data)   # text columns -> 0/1 indicator columns

# Features: positional column 6 onwards; targets: positional columns 1-5.
x_data = sample_data.iloc[:, 6:]
y_data = sample_data.iloc[:, [1, 2, 3, 4, 5]]
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=42)
x_data.info(verbose=True, show_counts=True)

# Scalers are fitted on the training rows only and then applied to the test
# rows, so no test-set statistics leak into training. Results are stored as
# float32 arrays, the dtype the tensors below are created with.
sc_x = StandardScaler()
x_train_scaled = np.asarray(sc_x.fit_transform(x_train), dtype=np.float32)
x_test_scaled = np.asarray(sc_x.transform(x_test), dtype=np.float32)

sc_y = StandardScaler()
y_train_scaled = np.asarray(sc_y.fit_transform(y_train), dtype=np.float32)
y_test_scaled = np.asarray(sc_y.transform(y_test), dtype=np.float32)

# torch must be imported in the notebook's import cell; this cell used it
# without any visible import, which fails on a fresh kernel.
inputs = torch.from_numpy(x_train_scaled)
targets = torch.from_numpy(y_train_scaled)

test_inputs = torch.from_numpy(x_test_scaled)
test_targets = torch.from_numpy(y_test_scaled)

11964/14733 data points remain.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11964 entries, 233 to 14732
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   time           11964 non-null  int64
 1   age            11964 non-null  int64
 2   height         11964 non-null  int64
 3   weight         11964 non-null  int64
 4   depressive     11964 non-null  int64
 5   media          11964 non-null  int64
 6   liquor         11964 non-null  int64
 7   smoke          11964 non-null  int64
 8   caffeine       11964 non-null  int64
 9   exercise       11964 non-null  int64
 10  stress         11964 non-null  int64
 11  nap            11964 non-null  int64
 12  state_asleep   11964 non-null  uint8
 13  state_awake    11964 non-null  uint8
 14  gender_female  11964 non-null  uint8
 15  gender_male    11964 non-null  uint8
 16  disease_none   11964 non-null  uint8
 17  disorder_no    11964 non-null  uint8
 18  disorder_yes

In [88]:
# Seeded random forest on the standardised features and targets, so the
# errors below are in scaled target units.
print("Random Forest Regressor")
RFRegModel = RandomForestRegressor(random_state=0)
RFRegModel.fit(inputs, targets)

# Predict on both partitions, then report fit quality followed by
# held-out quality.
predict_train_y = RFRegModel.predict(inputs)
predict_valid_y = RFRegModel.predict(test_inputs)
evaluateRegressor(targets, predict_train_y, "    Training Set")
evaluateRegressor(test_targets, predict_valid_y)

Random Forest Regressor
    Training Set
MSE : 0.21282695962886905
MAE : 0.17481054727271722
RMSE : 0.45810987846040235
R-Squared : 0.7871730385051497
    Test Set
MSE : 0.5324825903698143
MAE : 0.2874270507676731
RMSE : 0.6862615546984424
R-Squared : 0.543861072193424


In [90]:
# Ordinary least squares on the standardised features and targets, so the
# errors below are in scaled target units.
print("Linear Regression")
LinearModel = LinearRegression()
LinearModel.fit(inputs, targets)

# Predict on both partitions, then report fit quality followed by
# held-out quality.
predict_train_y = LinearModel.predict(inputs)
predict_valid_y = LinearModel.predict(test_inputs)
evaluateRegressor(targets, predict_train_y, "    Training Set")
evaluateRegressor(test_targets, predict_valid_y)

Linear Regression
    Training Set
MSE : 0.7465827
MAE : 0.3963996
RMSE : 0.85488796
R-Squared : 0.25341426251852184
    Test Set
MSE : 1.1032459
MAE : 0.41297412
RMSE : 1.0085735
R-Squared : 0.2445805607796104
