In [None]:
import pandas as pd
import sklearn.ensemble as skle
import sklearn.svm as svm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import r2_score

In [None]:
df = pd.read_csv('V5_final_dataset.csv', index_col=0)

In [None]:
df.columns

In [None]:
norm_all_features = preprocessing.normalize(df)

In [None]:
norm_df = pd.DataFrame({df.columns[i]: norm_all_features.T[i] for i in range(len(df.columns))})

In [None]:
corr = df.corr()
corr_norm = norm_df.corr()

In [None]:
norm_df.shape

In [None]:
df_w_out_zeros_in_osm_and_na = df[(df['Na + (vein)'] != 0) & (df['Osmolarity (vein)'] != 0)]

In [None]:
na, osm, glu = preprocessing.normalize(df_w_out_zeros_in_osm_and_na[['Na + (vein)', 
                                                                     'Osmolarity (vein)', 
                                                                     'Glu (vein)']]).T

In [None]:
df_norm_na_osm = df_w_out_zeros_in_osm_and_na.drop(columns=['Na + (vein)', 'Osmolarity (vein)'])
df_norm_na_osm['Na + (vein) norm'] = na
df_norm_na_osm['Osmolarity (vein) norm'] = osm


In [None]:
df_norm_na_osm['Glu (vein) norm'] = glu

In [None]:
corr1 = df_norm_na_osm.corr()

In [None]:
df_norm_na_osm['Na + (vein) norm']

In [None]:
norm_df['Na + (vein)']

In [None]:
sns.set_theme(style="dark")
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
sns.set_theme(style="dark")
f1, ax1 = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap1 = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr1, cmap=cmap1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
sns.set_theme(style="dark")
f1, ax1 = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap1 = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr_norm, cmap=cmap1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# there are some strong dependencies, so i'll start with models which are ok with multicollinearity problem
corr

In [None]:
X = df.drop(columns=['Glu (vein)'])
y = df['Glu (vein)']

In [None]:
y

In [None]:
min(y), max(y), np.mean(y), np.std(y)

In [None]:
plt.hist(y, bins=50)
plt.title('Distribution of Glucose level test results')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Standardize the features with statistics learned from the training split only.
# Scaling the test split with its own mean/std (as a separate scale() call does)
# puts train and test in different coordinate systems and leaks test-set
# statistics into the evaluation.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

In [None]:
# ensemble methods

In [None]:
grad_boosting = skle.GradientBoostingRegressor(random_state=42)
grad_boosting.fit(X_train, y_train)
grad_boosting_y = grad_boosting.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(grad_boosting_y, label='GBR')

In [None]:
bagging = skle.BaggingRegressor(random_state=42)
bagging.fit(X_train, y_train)
bagging_y = bagging.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(bagging_y, label='Bagging')

In [None]:
rand_forest = skle.RandomForestRegressor(random_state=42)
rand_forest.fit(X_train, y_train)
rand_forest_y = rand_forest.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(rand_forest_y, label='Random Forest')

In [None]:
Ada_boost = skle.AdaBoostRegressor(random_state=42)
Ada_boost.fit(X_train, y_train)
Ada_boost_y = Ada_boost.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(Ada_boost_y, label='AdaBoost')

In [None]:
HGBR = skle.HistGradientBoostingRegressor(random_state=42)
HGBR.fit(X_train, y_train)
HGBR_y = HGBR.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(HGBR_y, label='HGBR')

In [None]:
ereg = skle.VotingRegressor(estimators=[('gb', grad_boosting), ('bg', bagging), 
                                        ('rf', rand_forest), ('adab', Ada_boost)])
ereg.fit(X_train, y_train)
ereg_y = ereg.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(ereg_y, label='HGBR')

In [None]:
reg = skle.StackingRegressor(estimators=[('bg', bagging), 
                                    ('rf', rand_forest), ('adab', Ada_boost)],
                        final_estimator=grad_boosting)
reg.fit(X_train, y_train)
reg_y = reg.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(reg_y, label='SVM')

In [None]:
# linear SVM failed to converge (but if we do GS it might be succsessful)

In [None]:
supp_vec_machine2 = svm.LinearSVR()
supp_vec_machine2.fit(X_train_s, y_train)
svm2_y = supp_vec_machine2.predict(X_test_s)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(svm2_y, label='linear SVM')

In [None]:
supp_vec_machine = svm.NuSVR()
supp_vec_machine.fit(X_train, y_train)
svm_y = supp_vec_machine.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(svm_y, label='SVM')

In [None]:
supp_vec_machine_s = svm.NuSVR()
supp_vec_machine_s.fit(X_train_s, y_train)
svm_s_y = supp_vec_machine_s.predict(X_test_s)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(svm_s_y, label='SVM')

In [None]:
supp_vec_machine1 = svm.SVR()
supp_vec_machine1.fit(X_train, y_train)
svm1_y = supp_vec_machine1.predict(X_test)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(svm1_y, label='SVM')

In [None]:
supp_vec_machine1_s = svm.SVR()
supp_vec_machine1_s.fit(X_train_s, y_train)
svm1_s_y = supp_vec_machine1_s.predict(X_test_s)

In [None]:
plt.figure(figsize=(15,5)) 
plt.plot(y_test.tolist(), label='Y test')
plt.plot(svm1_s_y, label='SVM')

In [None]:
df.shape

In [None]:
df.drop_duplicates().shape

In [None]:
df_plots = df.drop_duplicates()

In [None]:
df_plots.columns

In [None]:
df_plots['Na + (vein)']

In [None]:
from sklearn import preprocessing
df_plt = preprocessing.normalize(df_plots)

In [None]:
df_plots.shape

In [None]:
dd.shape

In [None]:
dd = pd.DataFrame({df_plots.columns[i]: df_plt.T[i] for i in range(len(df_plots.columns)) })

In [None]:
# Importing Axes3D makes the '3d' projection available on older matplotlib
# releases; newer releases register it automatically.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the normalized Na+ / osmolarity / glucose values.
ax3d = plt.figure().add_subplot(111, projection='3d')
ax3d.scatter(df_norm_na_osm['Na + (vein) norm'],
             df_norm_na_osm['Osmolarity (vein) norm'],
             df_norm_na_osm['Glu (vein) norm'])
# Label all three axes on the 3D axes object itself (the z axis was unlabelled).
ax3d.set_xlabel("Na +")
ax3d.set_ylabel("Osmolarity")
ax3d.set_zlabel("Glu");


In [None]:
plt.scatter(df_norm_na_osm['Na + (vein) norm'],  df_norm_na_osm['Osmolarity (vein) norm'])
plt.xlabel("Na +")
plt.ylabel("Osmolarity")


In [None]:
plt.scatter(df_norm_na_osm['Na + (vein) norm'],  df_norm_na_osm['Glu (vein) norm'])
plt.xlabel("Na +")
plt.ylabel("Glu")


In [None]:
plt.scatter(df_norm_na_osm['Osmolarity (vein) norm'],  df_norm_na_osm['Glu (vein) norm'])
plt.xlabel("Osmolarity")
plt.ylabel("Glu")

In [None]:
df.columns

In [None]:
df_norm_na_osm['Glu (vein) norm']

In [None]:
# Load an earlier version of the dataset for comparison.
df1 = pd.read_csv('V3_redundant_columns_removed.csv')

In [None]:
# Displays df1 without these three columns; the result is not assigned, so
# df1 itself still contains them.
# NOTE(review): if the duplicate check below was meant to run on the reduced
# frame, the result of this drop needs to be stored -- confirm intent.
df1.drop(columns=['EventCode', 'Date', 'HbA1c'])

In [None]:
# Shape of df1 (all columns) after removing exact duplicate rows.
df1.drop_duplicates().shape

In [None]:
# Show the glucose column as a one-column DataFrame (the original expression
# had unbalanced brackets and a stray [''] index, i.e. a SyntaxError).
df[['Glu (vein)']]

In [None]:
# Bundle everything in the working directory into a gzip-compressed archive
# (tar flags: c = create, h = follow symlinks, v = verbose, f = archive file, z = gzip).
!tar chvfz notebook.tar.gz *