# Goal: Find drivers of cost of care.

In [None]:
!pip3 install dataprep

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dataprep
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn import metrics
from statsmodels.formula.api import glm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from dataprep.eda import create_report

In [None]:
# Folder that holds the four input CSV files. The trailing "/" is required
# because the loading cell builds each path as rootdir + "<file>.csv".
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
rootdir = "C:/Users/young/Dropbox/2 - Projects/Data Science/Holmusk/Healthcare Data Challenge Data/"

In [None]:
# Load the four raw input tables from ``rootdir`` in one pass.
df_bill_amount, df_bill, df_clinical_data, df_demo = (
    pd.read_csv(rootdir + csv_name)
    for csv_name in ("bill_amount.csv", "bill_id.csv", "clinical_data.csv", "demographics.csv")
)

In [None]:
# Quicklook Function
def ql(df, rows=5, part='head'):
    """
    DataFrame quicklook.

    Prints the shape of the dataframe, then returns either its first or its
    last ``rows`` rows.

    Inputs:
    - df  : dataframe
    - rows: number of rows to return, default = 5
    - part: "head" for the first rows; any other value returns the last rows.
            Default = "head"

    Outputs:
    - the first or the last ``rows`` rows of ``df``
    """
    # Dimensions are printed as a side effect, before anything is returned.
    print(df.shape)

    if part == 'head':
        return df.head(rows)
    return df.tail(rows)

In [None]:
# Initial merging
# 1) join bill headers to bill amounts on the columns the two tables share,
# 2) sum the amounts per (patient_id, date_of_admission), i.e. per admission,
# 3) inner-join the clinical record of that admission (its patient key is "id"),
# 4) left-join the patient's demographics.
df = (
    df_bill.merge(df_bill_amount)
    .groupby(["patient_id", "date_of_admission"])
    .agg({"amount": 'sum'})
    .reset_index()
    .merge(
        df_clinical_data,
        left_on=["patient_id", "date_of_admission"],
        right_on=["id", "date_of_admission"],
        how="inner",
    )
    .merge(df_demo, left_on="patient_id", right_on="patient_id", how="left")
)

In [None]:
# Aggregated down by hospital visits. 
ql(df)

In [None]:
create_report(df)

# Descriptive statistics

In [None]:
df.info()

In [None]:
# Column groups used by the descriptive-statistics and plotting cells.
# The numbered column families are generated once and shared between groups.
_lab_result_cols = [f'lab_result_{i}' for i in range(1, 4)]
_medical_history_cols = [f'medical_history_{i}' for i in range(1, 8)]
_preop_medication_cols = [f'preop_medication_{i}' for i in range(1, 7)]
_symptom_cols = [f'symptom_{i}' for i in range(1, 6)]
_binary_flag_cols = _medical_history_cols + _preop_medication_cols + _symptom_cols

dates = ['date_of_birth', 'date_of_admission', 'date_of_discharge']

# Continuous predictors, and the same list with the cost column appended.
continuous_independent = ['weight', 'height'] + _lab_result_cols
continuous = continuous_independent + ['amount']

categorical = ['gender', 'race', 'resident_status'] + _binary_flag_cols

# Columns summarised by describe().
descriptive = ['weight', 'height'] + _binary_flag_cols + ['amount']

In [None]:
df[descriptive].describe()

In [None]:
df[descriptive].describe().to_csv(rootdir + "descriptive_statistics.csv")


In [None]:
# Side-by-side box plots of the continuous predictors, outliers shown.
n_palette_colours = len(continuous_independent) + 1
husl_palette = sns.color_palette("husl", n_palette_colours)

plt.figure(figsize=(10, 8), dpi=80)
sns.boxplot(data=df[continuous_independent], palette=husl_palette, showfliers=True)
plt.show()

# Data Cleansing

In [None]:
# Parse the three date columns (date_of_admission was previously converted twice).
for date_col in ("date_of_admission", "date_of_discharge", "date_of_birth"):
    df[date_col] = pd.to_datetime(df[date_col])

# Harmonise inconsistent category spellings.
# Each result is assigned back to the column: calling replace(..., inplace=True)
# on df["col"] is a chained in-place operation, which recent pandas deprecates
# and which no longer updates df once Copy-on-Write is enabled.
df["medical_history_3"] = df["medical_history_3"].replace({"Yes": 1, "No": 0}).astype(int)

df["gender"] = df["gender"].replace({"f": "Female", "m": "Male"})

df["race"] = df["race"].replace({"India": "Indian", "chinese": "Chinese"})

df["resident_status"] = df["resident_status"].replace({"Singapore citizen": "Singaporean"})

In [None]:
create_report(df)

# Missing values

Little's MCAR test for Python:
https://www.kaggle.com/yassirarezki/handling-missing-data-mcar-mar-and-mnar-part-i

In [None]:
# Heat map of missing cells: one heat-map row per column of df, one heat-map
# column per record. The figure is also written to a PNG in the working directory.
missing_mask = df.isna().T

plt.figure(figsize=(10, 8))
sns.heatmap(missing_mask, cmap="YlGnBu", cbar_kws={'label': 'Missing Data'})
plt.savefig("visualizing_missing_data_with_heatmap_Seaborn_Python.png", dpi=100)

#Missing values + descriptive statistics after

# Data Cleaning

In [None]:
#Start with "medical_history_2"
# Split the visits by whether medical_history_2 was recorded.
mh2_is_missing = df["medical_history_2"].isnull()
df_medical_history_2_missing = df[mh2_is_missing]
df_medical_history_2_present = df[~mh2_is_missing]

print("# of missing medical_history_2:", len(df_medical_history_2_missing))
print("# of present medical_history_2:", len(df_medical_history_2_present))
# Formatted as a real percentage; previously the raw fraction was printed
# under the "%" label.
print("% of missing medical_history_2:", f"{len(df_medical_history_2_missing) / len(df):.2%}")

In [None]:
mh2_missing_ids = df_medical_history_2_missing["id"]
mh2_present_ids = df_medical_history_2_present["id"]

# Visits with a missing flag whose patient id also occurs on a visit where the
# flag is recorded (53 when the author ran this).
mh2_recoverable = df_medical_history_2_missing[mh2_missing_ids.isin(mh2_present_ids)]
print("# of missing medical_history_2 can be found in existing record:", len(mh2_recoverable) )

# The reverse count: visits with a recorded flag for those same patients
# (56 when the author ran this). It exceeds the count above, so some patients
# have more than one recorded visit.
mh2_donor_rows = df_medical_history_2_present[mh2_present_ids.isin(mh2_missing_ids)]
print("# of missing record has more than 1 existing record:", len(mh2_donor_rows) )


In [None]:
# Donor rows: visits with a recorded medical_history_2 belonging to patients who
# also have a visit where it is missing.
# .copy() makes this an independent frame, because the following cells mutate it
# with drop(..., inplace=True); mutating a filtered slice in place raises
# SettingWithCopyWarning in pandas.
df_medical_2_present_to_be_imputed = df_medical_history_2_present[
    df_medical_history_2_present["id"].isin(df_medical_history_2_missing["id"])
].copy()
# Show the donor rows whose id has already appeared on an earlier donor row.
df_medical_2_present_to_be_imputed.loc[df_medical_2_present_to_be_imputed.duplicated(subset=["id"])]

In [None]:
df_medical_2_present_to_be_imputed[df_medical_2_present_to_be_imputed["id"]=="0eacfb2daed1f3ba2adf32e293bc05a6"]

In [None]:
#Find out the index
df_medical_2_present_to_be_imputed.index[df_medical_2_present_to_be_imputed["id"] == "0eacfb2daed1f3ba2adf32e293bc05a6"]

In [None]:
#Keep the record with medical_history_2 = 1
# NOTE(review): 202 is a hard-coded index label, presumably read off the index
# lookup in the previous cell; it is only valid for this exact input data and
# row order - verify before re-running on refreshed data.
df_medical_2_present_to_be_imputed.drop(202, inplace=True)

In [None]:
df_medical_2_present_to_be_imputed[df_medical_2_present_to_be_imputed["id"]=="cebd42e84733dae9898687cfb750fbaf"]

In [None]:
# Drop this so that it will not inflate the records during the merge later
# (the later left merge on "id" emits one output row per matching donor row).
# NOTE(review): 2732 and 2733 are hard-coded index labels tied to this exact
# input data and row order - verify before re-running on refreshed data.
df_medical_2_present_to_be_imputed.drop([2732,2733], inplace=True)

In [None]:
df_medical_2_present_to_be_imputed[df_medical_2_present_to_be_imputed["id"]=="cebd42e84733dae9898687cfb750fbaf"]

In [None]:
df_medical_2_present_to_be_imputed[df_medical_2_present_to_be_imputed["id"]=="5e9e8508e8098fc220a12db23c698ec6"]

In [None]:
#Keep the record with medical_history_2 = 1
# NOTE(review): 1237 is a hard-coded index label tied to this exact input data
# and row order - verify before re-running on refreshed data.
df_medical_2_present_to_be_imputed.drop(1237, inplace=True)

In [None]:
df_medical_2_present_to_be_imputed[df_medical_2_present_to_be_imputed["id"]=="5e9e8508e8098fc220a12db23c698ec6"]

In [None]:
# Replace the "medical_history_2"
# Fill the flag on the visits where it is missing with the value kept for the
# same patient id in the donor table. (An alternative that discarded the ids
# with disagreeing records outright was tried and left disabled.)
mh2_missing_without_flag = df_medical_history_2_missing.drop(columns="medical_history_2")
mh2_donor_values = df_medical_2_present_to_be_imputed[["id", "medical_history_2"]]
df_medical_history_2_missing_imputed = mh2_missing_without_flag.merge(mh2_donor_values, on="id", how="left")

# Row count after the merge, then the number of rows still without a value.
print(len(df_medical_history_2_missing_imputed))
print(int(df_medical_history_2_missing_imputed["medical_history_2"].isnull().sum()))

In [None]:
# Recombine the imputed visits with the originally complete ones under a fresh index.
df_2 = pd.concat([df_medical_history_2_missing_imputed, df_medical_history_2_present], ignore_index=True)

print(len(df_2))

# Remaining gaps before the fallback fill.
print(int(df_2["medical_history_2"].isnull().sum()))

# Impute the rest with the modal class "0"
df_2["medical_history_2"] = df_2["medical_history_2"].fillna(0)

# Remaining gaps after the fill.
print(int(df_2["medical_history_2"].isnull().sum()))

In [None]:
#Start with "medical_history_5"
# Split the visits by whether medical_history_5 was recorded.
mh5_is_missing = df_2["medical_history_5"].isnull()
df_medical_history_5_missing = df_2[mh5_is_missing]
df_medical_history_5_present = df_2[~mh5_is_missing]

print("# of missing medical_history_5:", len(df_medical_history_5_missing))
print("# of present medical_history_5:", len(df_medical_history_5_present))
# Formatted as a real percentage; previously the raw fraction was printed
# under the "%" label.
print("% of missing medical_history_5:", f"{len(df_medical_history_5_missing) / len(df_2):.2%}")


In [None]:
#66 of it can be found from existing records.
print("# of missing medical_history_5 can be found in existing record:", len(df_medical_history_5_missing[df_medical_history_5_missing["id"].isin(df_medical_history_5_present["id"])]) )

#72 of it can be found in the existing records. Meaning there's duplicate values. 
print("# of missing record has more than 1 existing record:", len(df_medical_history_5_present[df_medical_history_5_present["id"].isin(df_medical_history_5_missing["id"])]) )

# Donor rows: visits with a recorded medical_history_5 belonging to patients who
# also have a visit where it is missing.
# .copy() makes this an independent frame, because the following cells mutate it
# with drop(..., inplace=True); mutating a filtered slice in place raises
# SettingWithCopyWarning in pandas.
df_medical_5_present_to_be_imputed = df_medical_history_5_present[
    df_medical_history_5_present["id"].isin(df_medical_history_5_missing["id"])
].copy()

# Show the donor rows whose id has already appeared on an earlier donor row.
df_medical_5_present_to_be_imputed.loc[df_medical_5_present_to_be_imputed.duplicated(subset=["id"])]

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="b2d15cda8c4e1f86ba43356434df6718"]

In [None]:
#Keep the record with medical_history_5 = 1
# NOTE(review): 2407 is a hard-coded index label tied to this exact input data
# and row order - verify before re-running on refreshed data.
df_medical_5_present_to_be_imputed.drop(2407, inplace=True)

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="b2d15cda8c4e1f86ba43356434df6718"]

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="d01386ff66ee5ecef47c5ef7980ff10a"]

In [None]:
# Drop this so that it will not inflate the records during the merge later
# (the later left merge on "id" emits one output row per matching donor row).
# NOTE(review): hard-coded index labels tied to this exact input data and row
# order - verify before re-running on refreshed data.
df_medical_5_present_to_be_imputed.drop([2787, 2788], inplace=True)

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="d01386ff66ee5ecef47c5ef7980ff10a"]

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="8239986dfdf8f4e0bb351ace4742ef95"]

In [None]:
# Drop this so that it will not inflate the records during the merge later
# (the later left merge on "id" emits one output row per matching donor row).
# NOTE(review): hard-coded index labels tied to this exact input data and row
# order - verify before re-running on refreshed data.
df_medical_5_present_to_be_imputed.drop([1797, 1798], inplace=True)

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="8239986dfdf8f4e0bb351ace4742ef95"]

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="0eacfb2daed1f3ba2adf32e293bc05a6"]

In [None]:
#Keep the record with medical_history_5 = 1
# NOTE(review): 423 is a hard-coded index label tied to this exact input data
# and row order - verify before re-running on refreshed data.
df_medical_5_present_to_be_imputed.drop(423, inplace=True)

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="0eacfb2daed1f3ba2adf32e293bc05a6"]

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="e3270f450ee164e596ca933a25bab61d"]

In [None]:
# Drop this so that it will not inflate the records during the merge later
# (the later left merge on "id" emits one output row per matching donor row).
# NOTE(review): hard-coded index labels tied to this exact input data and row
# order - verify before re-running on refreshed data.
df_medical_5_present_to_be_imputed.drop([3031, 3032], inplace=True)

# Re-display this patient's donor rows to confirm the drop.
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="e3270f450ee164e596ca933a25bab61d"]

In [None]:
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="cebd42e84733dae9898687cfb750fbaf"]

In [None]:
# Drop this so that it will not inflate the records during the merge later
# (the later left merge on "id" emits one output row per matching donor row).
# NOTE(review): hard-coded index labels tied to this exact input data and row
# order - verify before re-running on refreshed data.
df_medical_5_present_to_be_imputed.drop([193, 2771], inplace=True)

# Re-display this patient's donor rows to confirm the drop.
df_medical_5_present_to_be_imputed[df_medical_5_present_to_be_imputed["id"]=="cebd42e84733dae9898687cfb750fbaf"]

In [None]:
len(df_medical_5_present_to_be_imputed)

In [None]:
# Fill medical_history_5 on the visits where it is missing with the value kept
# for the same patient id in the donor table.
mh5_missing_without_flag = df_medical_history_5_missing.drop(columns="medical_history_5")
mh5_donor_values = df_medical_5_present_to_be_imputed[["id", "medical_history_5"]]
df_medical_history_5_missing_imputed = mh5_missing_without_flag.merge(mh5_donor_values, on="id", how="left")

# Row count after the merge, then the number of rows still without a value.
print(len(df_medical_history_5_missing_imputed))

print(int(df_medical_history_5_missing_imputed["medical_history_5"].isnull().sum()))

# Recombine the imputed visits with the originally complete ones under a fresh index.
df_3 = pd.concat([df_medical_history_5_missing_imputed, df_medical_history_5_present], ignore_index=True)

print( len(df_3) )

print(int(df_3["medical_history_5"].isnull().sum()))

In [None]:
# Impute the rest with the modal class "0"
df_3["medical_history_5"] = df_3["medical_history_5"].fillna(0)

# Remaining gaps in medical_history_5, then the total row count.
print(int(df_3["medical_history_5"].isnull().sum()))

print(len(df_3))

# Store the two imputed flags as integers.
for flag_col in ("medical_history_2", "medical_history_5"):
    df_3[flag_col] = df_3[flag_col].astype(int)

df_3.dtypes

In [None]:
df = df_3.copy()

# Add features

1. ✔️ age = year of admission - year of birth (age at the time of each admission, in calendar years)
2. ✔️ hospitalisation_duration = date_of_discharge - date_of_admission
3. ✔️ hospitalisations_to_date
4. 🚧 Lab_result -> Comorbidity.

In [None]:
# Calendar year of each admission.
df["date_of_admission_year"] = df["date_of_admission"].dt.year

In [None]:
# Age at admission as a difference of calendar years. This can overstate the
# completed age by one year when the admission falls before that year's birthday.
df["age"] = df["date_of_admission_year"] - df["date_of_birth"].dt.year

In [None]:
# Length of stay: discharge date minus admission date, in days.
df["hospitalisation_duration"] = (df["date_of_discharge"] - df["date_of_admission"]).dt.days

In [None]:
# Number each patient's admissions in chronological order: 1 for the earliest
# admission of a patient, 2 for the next, and so on.
df = df.sort_values(by=['date_of_admission'], ascending=True)
df["hospitalisations_to_date"] = df.groupby("patient_id").cumcount().add(1)

# One Hot Encoding to prepare the data

In [None]:
#Gender
# Append one indicator column per gender value (gender_<value>) to df.
# DataFrame.iteritems() was removed in pandas 2.0, and the loop only ever acted
# on this single column, so the column is encoded directly.
# dtype=int keeps the indicators as 0/1 integers; newer pandas would otherwise
# produce booleans, which mix badly with the float columns in the later
# statsmodels cells.
df = df.join(pd.get_dummies(df["gender"], prefix="gender", dtype=int))

In [None]:
#Race
# Append one indicator column per race value (race_<value>) to df.
# DataFrame.iteritems() was removed in pandas 2.0, and the loop only ever acted
# on this single column, so the column is encoded directly.
# dtype=int keeps the indicators as 0/1 integers; newer pandas would otherwise
# produce booleans, which mix badly with the float columns in the later
# statsmodels cells.
df = df.join(pd.get_dummies(df["race"], prefix="race", dtype=int))

In [None]:
#resident_status
# Append one indicator column per resident status (resident_status_<value>) to df.
# DataFrame.iteritems() was removed in pandas 2.0, and the loop only ever acted
# on this single column, so the column is encoded directly.
# dtype=int keeps the indicators as 0/1 integers; newer pandas would otherwise
# produce booleans, which mix badly with the float columns in the later
# statsmodels cells.
df = df.join(pd.get_dummies(df["resident_status"], prefix="resident_status", dtype=int))

In [None]:
# Natural log of the per-admission bill amount.
# NOTE(review): assumes every amount is strictly positive - confirm.
df["log_amount"] = np.log(df["amount"])

In [None]:
# Column order of the modelling frame: all predictors first, log_amount last.
# Later cells take data.columns[:-1] as the predictor set, so the response
# must stay in the final position.
dataframe_order = (
    ['date_of_admission_year', 'age', 'weight', 'height',
     'hospitalisation_duration', 'hospitalisations_to_date']
    + ['gender_Female', 'gender_Male']
    + ['race_Chinese', 'race_Indian', 'race_Malay', 'race_Others']
    + ['resident_status_Foreigner', 'resident_status_PR', 'resident_status_Singaporean']
    + [f'medical_history_{i}' for i in range(1, 8)]
    + [f'preop_medication_{i}' for i in range(1, 7)]
    + [f'symptom_{i}' for i in range(1, 6)]
    + [f'lab_result_{i}' for i in range(1, 4)]
    + ['log_amount']
)
data = df.loc[:, dataframe_order].copy()

# Preliminary Analysis
0. Distribution of the cost on an annual basis.
1. Average annual cost per person. 
- To examine the growth of cost.


In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a
# density-scaled histplot with a KDE overlay is the supported equivalent.
ax = sns.histplot(df["age"], kde=True, stat="density")

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a
# density-scaled histplot with a KDE overlay is the supported equivalent.
ax = sns.histplot(df["weight"], kde=True, stat="density")

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a
# density-scaled histplot with a KDE overlay is the supported equivalent.
ax = sns.histplot(df["height"], kde=True, stat="density")

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a
# density-scaled histplot with a KDE overlay is the supported equivalent.
ax = sns.histplot(df["hospitalisation_duration"], kde=True, stat="density")

In [None]:
ax = sns.displot( df["hospitalisations_to_date"])

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a
# density-scaled histplot with a KDE overlay is the supported equivalent.
ax = sns.histplot(df["lab_result_1"], kde=True, stat="density")

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a
# density-scaled histplot with a KDE overlay is the supported equivalent.
ax = sns.histplot(df["lab_result_2"], kde=True, stat="density")

In [None]:
# sns.distplot is deprecated and scheduled for removal from seaborn; a
# density-scaled histplot with a KDE overlay is the supported equivalent.
ax = sns.histplot(df["lab_result_3"], kde=True, stat="density")

In [None]:
ax = sns.displot( df["amount"])

In [None]:
ax = sns.displot( np.log( df["amount"] ) )

In [None]:
# Log-scaled distribution of the bill amount, split by each categorical
# predictor in turn. This replaces 21 copy-pasted cells that differed only in
# the ``hue`` column; the loop draws the same figures in the same order.
hue_columns = (
    ["gender", "race", "resident_status"]
    + [f"medical_history_{i}" for i in range(1, 8)]
    + [f"preop_medication_{i}" for i in range(1, 7)]
    + [f"symptom_{i}" for i in range(1, 6)]
)

for hue_column in hue_columns:
    # displot is figure-level, so every iteration opens its own figure.
    ax = sns.displot(data=df, x="amount", hue=hue_column, log_scale=True )

# Feature Selection

## Correlation Heatmap

In [None]:
# Correlation to the amount
# Pairwise correlations of every column in the modelling frame. The matrix is
# computed once and reused for the plot; re-selecting all of its own columns
# from ``data`` and calling corr() again produced the identical matrix.
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

# Remove Collinearity

In [None]:
# Variance inflation factor of every predictor in ``data`` (all columns except
# the trailing log_amount).
X = df[list(data.columns[:-1])]

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so one is added here; without it the
# auxiliary R squared is uncentred and the VIFs are inflated. astype(float)
# gives statsmodels a purely numeric array even when dummy columns are bool.
X_with_const = sm.add_constant(X.astype(float), has_constant='add')

vif_info = pd.DataFrame()
# Position 0 is the intercept column; it is skipped in the report.
vif_info['VIF'] = [variance_inflation_factor(X_with_const.values, i) for i in range(1, X_with_const.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

In [None]:
# Reduced feature set #2. Compared with the full modelling frame it keeps a
# single dummy per categorical (gender_Male, race_Malay,
# resident_status_Foreigner) and leaves out date_of_admission_year, height and
# the three lab results. log_amount stays last so that columns[:-1] are the
# predictors.
independent_data = (
    ['age', 'weight', 'gender_Male', 'race_Malay',
     'hospitalisation_duration', 'hospitalisations_to_date',
     'resident_status_Foreigner']
    + [f'medical_history_{i}' for i in range(1, 8)]
    + [f'preop_medication_{i}' for i in range(1, 7)]
    + [f'symptom_{i}' for i in range(1, 6)]
    + ['log_amount']
)
data_2 = data.loc[:, independent_data]

In [None]:
# Variance inflation factor of every predictor in ``data_2`` (all columns
# except the trailing log_amount).
X = df[list(data_2.columns[:-1])]

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so one is added here; without it the
# auxiliary R squared is uncentred and the VIFs are inflated. astype(float)
# gives statsmodels a purely numeric array even when dummy columns are bool.
X_with_const = sm.add_constant(X.astype(float), has_constant='add')

vif_info = pd.DataFrame()
# Position 0 is the intercept column; it is skipped in the report.
vif_info['VIF'] = [variance_inflation_factor(X_with_const.values, i) for i in range(1, X_with_const.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

In [None]:
# Reduced feature set #3: feature set #2 without hospitalisation_duration and
# hospitalisations_to_date. log_amount stays last so that columns[:-1] are the
# predictors.
independent_data = (
    ['age', 'weight', 'gender_Male', 'race_Malay', 'resident_status_Foreigner']
    + [f'medical_history_{i}' for i in range(1, 8)]
    + [f'preop_medication_{i}' for i in range(1, 7)]
    + [f'symptom_{i}' for i in range(1, 6)]
    + ['log_amount']
)
data_3 = data.loc[:, independent_data]

In [None]:
# Variance inflation factor of every predictor in ``data_3`` (all columns
# except the trailing log_amount).
X = df[list(data_3.columns[:-1])]

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so one is added here; without it the
# auxiliary R squared is uncentred and the VIFs are inflated. astype(float)
# gives statsmodels a purely numeric array even when dummy columns are bool.
X_with_const = sm.add_constant(X.astype(float), has_constant='add')

vif_info = pd.DataFrame()
# Position 0 is the intercept column; it is skipped in the report.
vif_info['VIF'] = [variance_inflation_factor(X_with_const.values, i) for i in range(1, X_with_const.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

In [None]:
# Reduced feature set #4: feature set #3 without weight. log_amount stays last
# so that columns[:-1] are the predictors.
independent_data = (
    ['age', 'gender_Male', 'race_Malay', 'resident_status_Foreigner']
    + [f'medical_history_{i}' for i in range(1, 8)]
    + [f'preop_medication_{i}' for i in range(1, 7)]
    + [f'symptom_{i}' for i in range(1, 6)]
    + ['log_amount']
)
data_4 = data.loc[:, independent_data]

In [None]:
# Variance inflation factor of every predictor in ``data_4`` (all columns
# except the trailing log_amount).
X = df[list(data_4.columns[:-1])]

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so one is added here; without it the
# auxiliary R squared is uncentred and the VIFs are inflated. astype(float)
# gives statsmodels a purely numeric array even when dummy columns are bool.
X_with_const = sm.add_constant(X.astype(float), has_constant='add')

vif_info = pd.DataFrame()
# Position 0 is the intercept column; it is skipped in the report.
vif_info['VIF'] = [variance_inflation_factor(X_with_const.values, i) for i in range(1, X_with_const.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

In [None]:
# Reduced feature set #5: age, two demographic dummies, medical_history_1 and
# medical_history_6, and all five symptoms. log_amount stays last so that
# columns[:-1] are the predictors.
independent_data = (
    ['age', 'race_Malay', 'resident_status_Foreigner']
    + ['medical_history_1', 'medical_history_6']
    + [f'symptom_{i}' for i in range(1, 6)]
    + ['log_amount']
)
data_5 = data.loc[:, independent_data]

In [None]:
# Variance inflation factor of every predictor in ``data_5`` (all columns
# except the trailing log_amount).
X = df[list(data_5.columns[:-1])]

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so one is added here; without it the
# auxiliary R squared is uncentred and the VIFs are inflated. astype(float)
# gives statsmodels a purely numeric array even when dummy columns are bool.
X_with_const = sm.add_constant(X.astype(float), has_constant='add')

vif_info = pd.DataFrame()
# Position 0 is the intercept column; it is skipped in the report.
vif_info['VIF'] = [variance_inflation_factor(X_with_const.values, i) for i in range(1, X_with_const.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

In [None]:
# Reduced feature set #6: age, two race dummies, two resident-status dummies,
# medical_history_1, symptom_3 and symptom_5. log_amount stays last so that
# columns[:-1] are the predictors.
independent_data = (
    ['age']
    + ['race_Chinese', 'race_Malay']
    + ['resident_status_Foreigner', 'resident_status_Singaporean']
    + ['medical_history_1']
    + ['symptom_3', 'symptom_5']
    + ['log_amount']
)
data_6 = data.loc[:, independent_data]

In [None]:
# Variance inflation factor of every predictor in ``data_6`` (all columns
# except the trailing log_amount).
X = df[list(data_6.columns[:-1])]

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so one is added here; without it the
# auxiliary R squared is uncentred and the VIFs are inflated. astype(float)
# gives statsmodels a purely numeric array even when dummy columns are bool.
X_with_const = sm.add_constant(X.astype(float), has_constant='add')

vif_info = pd.DataFrame()
# Position 0 is the intercept column; it is skipped in the report.
vif_info['VIF'] = [variance_inflation_factor(X_with_const.values, i) for i in range(1, X_with_const.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

In [None]:
# Reduced feature set #7: feature set #6 without medical_history_1.
# log_amount stays last so that columns[:-1] are the predictors.
independent_data = (
    ['age']
    + ['race_Chinese', 'race_Malay']
    + ['resident_status_Foreigner', 'resident_status_Singaporean']
    + ['symptom_3', 'symptom_5']
    + ['log_amount']
)
data_7 = data.loc[:, independent_data]

In [None]:
# Variance inflation factor of every predictor in ``data_7`` (all columns
# except the trailing log_amount).
X = df[list(data_7.columns[:-1])]

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept, so one is added here; without it the
# auxiliary R squared is uncentred and the VIFs are inflated. astype(float)
# gives statsmodels a purely numeric array even when dummy columns are bool.
X_with_const = sm.add_constant(X.astype(float), has_constant='add')

vif_info = pd.DataFrame()
# Position 0 is the intercept column; it is skipped in the report.
vif_info['VIF'] = [variance_inflation_factor(X_with_const.values, i) for i in range(1, X_with_const.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

In [None]:
# Response (kept as a one-column frame) and predictors taken from data_4.
y = data_4[['log_amount']]
X = data_4.drop(columns=['log_amount'])

In [None]:
# Response (kept as a one-column frame) and predictors taken from data_5.
y_1 = data_5[['log_amount']]
X_1 = data_5.drop(columns=['log_amount'])

In [None]:
# Response (kept as a one-column frame) and predictors taken from data_6.
y_2 = data_6[['log_amount']]
X_2 = data_6.drop(columns=['log_amount'])

In [None]:
# Response (kept as a one-column frame) and predictors taken from data_7.
y_3 = data_7[['log_amount']]
X_3 = data_7.drop(columns=['log_amount'])

# Model fitting

## Linear Regression

In [None]:
# Splitting the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=1)

# sm.OLS does not add an intercept on its own. Without one the fit is forced
# through the origin and the reported R squared is the uncentred variant,
# which is not comparable with the usual R squared. astype(float) keeps the
# design matrix numeric even when dummy columns are bool.
X_train = sm.add_constant(X_train.astype(float), has_constant='add')
X_test = sm.add_constant(X_test.astype(float), has_constant='add')
results = sm.OLS(y_train, X_train).fit()

# Predict
y_pred = results.predict(X_test)

# Metrics (R squared is measured on the training split, RMSE on the test split)
print("R squared (bigger is better):", results.rsquared)
print("R squared adjusted (bigger is better):", results.rsquared_adj)
print("RMSE (smaller is better):", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print()
print("Variable", " | ", "Coefficients", " | ", "p-values")
# Look coefficients up by label; positional [] access on a label-indexed
# Series is deprecated in pandas.
for variable in results.params.index:
    print(variable, " | ",  results.params[variable], " | ", results.pvalues[variable])

In [None]:
results.summary()

In [None]:
# Splitting the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size = 0.25, random_state=1)

# sm.OLS does not add an intercept on its own. Without one the fit is forced
# through the origin and the reported R squared is the uncentred variant,
# which is not comparable with the usual R squared. astype(float) keeps the
# design matrix numeric even when dummy columns are bool.
X_train = sm.add_constant(X_train.astype(float), has_constant='add')
X_test = sm.add_constant(X_test.astype(float), has_constant='add')
results = sm.OLS(y_train, X_train).fit()

# Predict
y_pred = results.predict(X_test)

# Metrics (R squared is measured on the training split, RMSE on the test split)
print("R squared (bigger is better):", results.rsquared)
print("R squared adjusted (bigger is better):", results.rsquared_adj)
print("RMSE (smaller is better):", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print()
print("Variable", " | ", "Coefficients", " | ", "p-values")
# Look coefficients up by label; positional [] access on a label-indexed
# Series is deprecated in pandas.
for variable in results.params.index:
    print(variable, " | ",  results.params[variable], " | ", results.pvalues[variable])

In [None]:
results.summary()

In [None]:
# Splitting the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size = 0.25, random_state=1)

# sm.OLS does not add an intercept on its own. Without one the fit is forced
# through the origin and the reported R squared is the uncentred variant,
# which is not comparable with the usual R squared. astype(float) keeps the
# design matrix numeric even when dummy columns are bool.
X_train = sm.add_constant(X_train.astype(float), has_constant='add')
X_test = sm.add_constant(X_test.astype(float), has_constant='add')
results = sm.OLS(y_train, X_train).fit()

# Predict
y_pred = results.predict(X_test)

# Metrics (R squared is measured on the training split, RMSE on the test split)
print("R squared (bigger is better):", results.rsquared)
print("R squared adjusted (bigger is better):", results.rsquared_adj)
print("RMSE (smaller is better):", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print()
print("Variable", " | ", "Coefficients", " | ", "p-values")
# Look coefficients up by label; positional [] access on a label-indexed
# Series is deprecated in pandas.
for variable in results.params.index:
    print(variable, " | ",  results.params[variable], " | ", results.pvalues[variable])

In [None]:
results.summary()

In [None]:
# Splitting the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size = 0.25, random_state=1)

# sm.OLS does not add an intercept on its own. Without one the fit is forced
# through the origin and the reported R squared is the uncentred variant,
# which is not comparable with the usual R squared. astype(float) keeps the
# design matrix numeric even when dummy columns are bool.
X_train = sm.add_constant(X_train.astype(float), has_constant='add')
X_test = sm.add_constant(X_test.astype(float), has_constant='add')
results = sm.OLS(y_train, X_train).fit()

# Predict
y_pred = results.predict(X_test)

# Metrics (R squared is measured on the training split, RMSE on the test split)
print("R squared (bigger is better):", results.rsquared)
print("R squared adjusted (bigger is better):", results.rsquared_adj)
print("RMSE (smaller is better):", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print()
print("Variable", " | ", "Coefficients", " | ", "p-values")
# Look coefficients up by label; positional [] access on a label-indexed
# Series is deprecated in pandas.
for variable in results.params.index:
    print(variable, " | ",  results.params[variable], " | ", results.pvalues[variable])

In [None]:
results.summary()