In [15]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
# from bayes_opt import BayesianOptimization
import shap


In [4]:
# Load the raw sensor/survey table that every later cell works from.
sample_data = pd.read_csv("sample.csv")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
sample_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14733 entries, 0 to 14732
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  14733 non-null  int64  
 1   co2         14733 non-null  int64  
 2   temp        14733 non-null  float64
 3   humid       14733 non-null  float64
 4   csd         14733 non-null  int64  
 5   sound       14733 non-null  int64  
 6   time        14733 non-null  int64  
 7   state       14733 non-null  object 
 8   age         14733 non-null  int64  
 9   gender      14733 non-null  object 
 10  height      14733 non-null  int64  
 11  weight      14733 non-null  int64  
 12  disease     14733 non-null  int64  
 13  depressive  14733 non-null  int64  
 14  disorder    14733 non-null  object 
 15  media       14733 non-null  int64  
 16  liquor      14733 non-null  int64  
 17  smoke       14733 non-null  int64  
 18  caffeine    14733 non-null  int64  
 19  exercise    14733 non-nul

In [5]:
def remove_outliers(df, column_name, lower, upper):
    """Keep only the rows whose `column_name` value lies between two quantiles.

    Args:
        df: DataFrame to filter. It is not modified.
        column_name: Column whose quantiles define the accepted range.
        lower: Lower quantile (0..1); smaller values are discarded.
        upper: Upper quantile (0..1); larger values are discarded.

    Returns:
        A new DataFrame containing the rows whose value is inside the
        inclusive [lower quantile, upper quantile] range.

    Side effects:
        Prints "<kept>/<total> data points remain." for the frame passed in.
    """
    values = df[column_name]
    in_range = values.between(values.quantile(lower), values.quantile(upper))

    # Count against the frame that was passed in, not a notebook-level global,
    # so the message is right for any DataFrame and the function is reusable.
    print(str(int(in_range.sum())) + "/" + str(values.size) + " data points remain.")

    # Select with the boolean mask rather than dropping by label: label-based
    # drop would also remove in-range rows when index labels repeat.
    return df.loc[in_range].copy()


def PlotMultiplePie(df, categorical_features=None, dropna=False):
    """Show one plotly pie chart per low-cardinality categorical column.

    Args:
        df: DataFrame holding the columns to plot.
        categorical_features: Column names to plot. When None, every column
            of dtype ``object`` or ``category`` is used.
        dropna: Forwarded to ``nunique`` and ``value_counts``; when False,
            missing values are counted as a category of their own.

    Returns:
        None. Charts are displayed with ``fig.show()`` and a short summary is
        printed for every column.
    """
    # Columns with more distinct values than this are summarised but not
    # plotted, because the resulting pie chart would be unreadable.
    threshold = 30

    # if user did not set categorical_features
    if categorical_features is None:
        categorical_features = df.select_dtypes(['object', 'category']).columns.to_list()

    print("The Categorical Features are:", categorical_features)

    # loop through the list of categorical_features
    for cat_feature in categorical_features:
        column = df[cat_feature]
        num_unique = column.nunique(dropna=dropna)
        num_missing = column.isna().sum()
        # prints pie chart and info if unique values below threshold
        if num_unique <= threshold:
            print('Pie Chart for: ', cat_feature)
            print('Number of Unique Values: ', num_unique)
            print('Number of Missing Values: ', num_missing)
            # Compute the counts once and hand plotly plain sequences. The
            # name of the Series returned by value_counts() differs between
            # pandas versions, so it must not be used as a column reference.
            counts = column.value_counts(dropna=dropna)
            fig = px.pie(values=counts.tolist(), names=counts.index.tolist(),
                         title=cat_feature, template='ggplot2')
            fig.show()
        else:
            print('Pie Chart for ', cat_feature, ' is unavailable due high number of Unique Values ')
            print('Number of Unique Values: ', num_unique)
            print('Number of Missing Values: ', num_missing)
            print('\n')


def evaluateRegressor(true, predicted, message="    Test Set"):
    """Print MSE, MAE, RMSE and R-squared for a set of regression predictions.

    Args:
        true: Ground-truth targets (single- or multi-output).
        predicted: Model predictions with the same shape as ``true``.
        message: Heading printed before the metrics.

    Returns:
        None. The metrics are written to stdout.

    Notes:
        For multi-output targets, MSE is the mean of the per-output MSEs and
        RMSE is the mean of the per-output RMSEs (not the square root of the
        averaged MSE). This matches scikit-learn's uniform-average behaviour
        for ``squared=True`` / ``squared=False``.
    """
    # The `squared=` keyword of mean_squared_error was deprecated and later
    # removed from scikit-learn, so both metrics are derived from the
    # per-output errors, which works on old and new releases alike.
    per_output_mse = mean_squared_error(true, predicted, multioutput="raw_values")
    MSE = per_output_mse.mean()
    MAE = mean_absolute_error(true, predicted)
    RMSE = (per_output_mse ** 0.5).mean()
    R_squared = r2_score(true, predicted)

    print(message)
    print("MSE :", MSE)
    print("MAE :", MAE)
    print("RMSE :", RMSE)
    print("R-Squared :", R_squared)

In [7]:
# NOTE: this cell rebinds `sample_data`. Running it again without re-loading
# the CSV trims the co2 tails a second time, so re-run the load cell first.
sample_data = remove_outliers(sample_data, "co2", 0.1, 0.9)
sample_data = pd.get_dummies(sample_data)                       # One-hot encode the object columns

# Train - Test Split
# Select by column name rather than position so the split does not silently
# change if the column order of the CSV does.
target_columns = ["co2", "temp", "humid", "csd", "sound"]
x_data = sample_data.drop(columns=["Unnamed: 0"] + target_columns)
y_data = sample_data[target_columns]
train_x, valid_x, train_y, valid_y = train_test_split(x_data, y_data, test_size=0.2, shuffle=True, random_state=1)

9685/11964 data points remain.


In [9]:
# Random forest baseline: fit on the training split, then score both splits.
print("Random Forest Regressor")
RFRegModel = RandomForestRegressor(random_state=0)
RFRegModel.fit(train_x, train_y)

predict_train_y = RFRegModel.predict(train_x)
predict_valid_y = RFRegModel.predict(valid_x)

evaluateRegressor(train_y, predict_train_y, message="    Training Set")
evaluateRegressor(valid_y, predict_valid_y)

Random Forest Regressor
    Training Set
MSE : 11220.805026085482
MAE : 26.540365094686575
RMSE : 50.37367219658843
R-Squared : 0.8234099125127645
    Test Set
MSE : 22554.46964063669
MAE : 40.020219152364646
RMSE : 71.47945783825769
R-Squared : 0.5033426403891046


In [11]:
# Ordinary least-squares baseline: fit on the training split, then score both splits.
print("Linear Regression")
LinearModel = LinearRegression()
LinearModel.fit(train_x, train_y)

predict_train_y = LinearModel.predict(train_x)
predict_valid_y = LinearModel.predict(valid_x)

evaluateRegressor(train_y, predict_train_y, message="    Training Set")
evaluateRegressor(valid_y, predict_valid_y)

Linear Regression
    Training Set
MSE : 42227.40739074471
MAE : 78.69411648569721
RMSE : 98.13463580045149
R-Squared : 0.231980148517252
    Test Set
MSE : 44805.73367211919
MAE : 80.68214482199919
RMSE : 100.25197430896894
R-Squared : 0.2189286764034044


In [14]:
# k-nearest-neighbours baseline with default settings: fit, then score both splits.
print("K-Nearest Neighbors")
KNNModel = KNeighborsRegressor()
KNNModel.fit(train_x, train_y)

predict_train_y = KNNModel.predict(train_x)
predict_valid_y = KNNModel.predict(valid_x)

evaluateRegressor(train_y, predict_train_y, message="    Training Set")
evaluateRegressor(valid_y, predict_valid_y)

K-Nearest Neighbors
    Training Set
MSE : 17744.023845028394
MAE : 40.806171915332996
RMSE : 63.57574106562765
R-Squared : 0.6728434000154995
    Test Set
MSE : 26812.53876662881
MAE : 49.592807434176564
RMSE : 78.37401441075362
R-Squared : 0.388076176682229


In [17]:
# Single decision tree baseline: fit on the training split, then score both splits.
print("Decision Tree")
# Seed the tree so repeated runs give the same metrics; this matches the
# seeded random forest and train/validation split used elsewhere.
DTModel = DecisionTreeRegressor(random_state=0).fit(train_x, train_y)
predict_train_y = DTModel.predict(train_x)
evaluateRegressor(train_y, predict_train_y, "    Training Set")
predict_valid_y = DTModel.predict(valid_x)
evaluateRegressor(valid_y, predict_valid_y)

Decision Tree
    Training Set
MSE : 10932.62268883695
MAE : 23.79475082095386
RMSE : 49.43537462113815
R-Squared : 0.8549891675019519
    Test Set
MSE : 27310.35023391786
MAE : 43.56721525044679
RMSE : 78.76822797563337
R-Squared : 0.3740313102278825
