In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys

# Raise NumPy's summarisation threshold to the maximum so arrays are printed
# in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# Any results you write to the current directory are saved as output.

**Importing the relevant Libraries**


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [None]:
# Loading dataset
# train_df is read from training_v2.csv and test_df from unlabeled.csv;
# both frames are used (via copies) by the modelling cells further down.
train_df = pd.read_csv("../input/widsdatathon2020/training_v2.csv")
test_df = pd.read_csv("../input/widsdatathon2020/unlabeled.csv")



In [None]:
# Preview the first rows of the training data.
train_df.head()


In [None]:
# Number of distinct values in each training column.
# nunique has to be called: passing the bare method to display() only shows
# the bound-method repr, not the counts.
display(train_df.nunique())

In [None]:
# Count the missing values in each training column.
# Only the last expression of a cell is displayed, so the per-column totals
# are the single statement here.
train_df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training data.
train_df.describe()

In [None]:
# (rows, columns) of the unlabeled data.
print(test_df.shape)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the unlabeled data.
test_df.describe()

In [None]:
# Boolean frame with the same shape as test_df: True wherever a value is missing.
test_df.isna()

In [None]:
# Number of missing values in each column of the unlabeled data.
test_df.isna().sum()

In [None]:
# dtype of the hospital_death column (the prediction target) in the training data.
train_df['hospital_death'].dtype

In [None]:
# dtype of the hospital_death column in the unlabeled data, for comparison
# with the training data.
test_df['hospital_death'].dtype

**Let's print the number of missing values and the number of unique values for every column**

In [None]:
def display_columns_properties(df):
    """Print, for every column of ``df``, its missing-value and unique-value counts.

    One line is printed per column, containing the column's position, its
    name, the number of null entries and the size of ``unique()`` for that
    column (a missing value counts as one of the unique values). A blank
    separator is printed after the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    for position, column_name in enumerate(df.columns.tolist()):
        column = df[column_name]
        n_missing = column.isnull().sum()
        n_unique = column.unique().size
        print(f'\n ({position} {column_name})  Missing: {n_missing}  UniqValsSz: {n_unique}')
    print('\n')
    

In [None]:
# Missing-value and unique-value counts for every training column.
display_columns_properties(train_df)

In [None]:
# Missing-value and unique-value counts for every column of the unlabeled data.
display_columns_properties(test_df)

**Selecting Categorical Columns**

In [None]:
# Keep only the object-dtype columns of the training data (treated as the
# categorical columns) and preview them.
cat_train_df = train_df.select_dtypes(include='object')
cat_train_df.head()

In [None]:
# Column names, non-null counts and dtypes of the categorical training columns.
cat_train_df.info()

In [None]:
# Keep only the object-dtype columns of the unlabeled data (treated as the
# categorical columns) and preview them.
cat_test_df = test_df.select_dtypes(include='object')
cat_test_df.head()

In [None]:
# Column names, non-null counts and dtypes of the categorical unlabeled columns.
cat_test_df.info()

**Printing unique values per column**

In [None]:
def display_columns_uniqvals(df):
    """Print the distinct values found in every column of ``df``.

    One line is printed per column, containing the column's position, its
    name and the array returned by ``unique()`` for that column. A blank
    separator is printed after the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are listed.

    Returns
    -------
    None
        The listing is written to stdout only.
    """
    for position, column_name in enumerate(df.columns.tolist()):
        distinct_values = df[column_name].unique()
        print(f'\n ({position} {column_name}) Uniq: {distinct_values}')
    print('\n')

In [None]:
# List the distinct values of each categorical column in the unlabeled data.
display_columns_uniqvals(cat_test_df)

**Splitting the data**

We will split the data into two parts: 80% for the training set and 20% for the validation set. We will use the validation set to make predictions and to decide which model or approach works better, based on the validation score.

In [None]:
from sklearn.model_selection import train_test_split

# Work on a copy so train_df itself is left untouched
train = train_df.copy()

# Target column
y = train['hospital_death']

# Features: everything except the target, restricted to the non-object
# (numerical) columns to keep things simple
predictors = train.drop(columns=['hospital_death'])
X = predictors.select_dtypes(exclude='object')

# 80% training / 20% validation split; the fixed random_state makes the
# split reproducible between runs
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

X_train.shape


In [None]:
# (rows, columns) of the validation features.
X_valid.shape

**Handling missing values using Imputation**

For categorical columns, we can fill missing entries with the most frequent value of that column.
For numerical columns, we can fill missing entries with the mean or median of that column. The filled-in values may be far from what the actual values should have been.
Imputation comes in two types. Simple imputation fills in a value and does not remember which positions were missing.

Imputation with extension fills in a value and also remembers which values were missing: new columns are created to record which positions had missing values.

In [None]:
from sklearn.impute import SimpleImputer

# Imputation with SimpleImputer's default settings.
# The imputer is fitted on the training features only; the validation
# features are then filled using those already-fitted statistics.
my_imputer = SimpleImputer()

# The imputer hands back plain arrays, so the original column labels are
# re-attached while rebuilding each DataFrame.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid), columns=X_valid.columns)

In [None]:
# Re-check the missing-value and unique-value counts on the imputed training features.
display_columns_properties(imputed_X_train)

In [None]:
# Re-check the missing-value and unique-value counts on the imputed validation features.
display_columns_properties(imputed_X_valid)

Since there are no missing values left, we can now apply a machine learning algorithm.

In [None]:
# DecisionTreeRegressor stays imported in case other cells still refer to it.
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error


# hospital_death is a class label, not a continuous quantity (expected to be
# 0/1 - confirm with train_df['hospital_death'].unique()), so a classification
# tree is the matching estimator; a regression tree would model it as a number.
# Define model; Specify a number for random_state to ensure the same results in each run.
dt_model = DecisionTreeClassifier(random_state=1)

# Fit model using Training data
dt_model.fit(imputed_X_train, y_train)


In [None]:
# Predict the target for the held-out validation features
predicted_values = dt_model.predict(imputed_X_valid)

# Mean absolute error between the true validation targets and the predictions
score = mean_absolute_error(y_true=y_valid, y_pred=predicted_values)
print('MAE:', score)

In [None]:
# Find difference
# NOTE(review): this cell repeats the MAE computation and print from the end
# of the previous cell with the same inputs; one of the two can be removed.
score = mean_absolute_error(y_valid, predicted_values)
print('MAE:', score)

In [None]:
# Work on a copy so test_df itself is left untouched
test = test_df.copy()

# Pull the target column out of the unlabeled data
y_test = test['hospital_death']

# Same feature selection as for the training data: drop the target and keep
# only the non-object (numerical) columns
predictors_test = test.drop(columns=['hospital_death'])
X_test = predictors_test.select_dtypes(exclude='object')

X_test.head()

In [None]:
# (rows, columns) of the unlabeled feature matrix.
X_test.shape

In [None]:
# Imputation
# Reuse the imputer that was already fitted on X_train, so the unlabeled
# features are filled with the same training-set statistics the model was
# trained on. Fitting a fresh imputer here would fill with test-set values
# instead and would also overwrite my_imputer.
# Assumes X_test has the same columns as X_train (both are built by dropping
# hospital_death and keeping the non-object columns).
imputed_X_test = pd.DataFrame(my_imputer.transform(X_test))

# Imputation removed column names; put them back
imputed_X_test.columns = X_test.columns

In [None]:
# Preview the first rows of the imputed unlabeled features.
imputed_X_test.head()

In [None]:
# get predictions on test data
preds = dt_model.predict(imputed_X_test)

# Save predictions in format used for competition scoring
# dtype=np.int32 casts both columns to integers; encounter_id comes from the
# imputed frame, which was rebuilt from the imputer's array output.
output = pd.DataFrame({'encounter_id': imputed_X_test.encounter_id,
                       'hospital_death': preds},dtype=np.int32)

# Actually write the file: results written to the current directory are kept
# as notebook output. index=False keeps the row index out of the CSV.
output.to_csv('submission.csv', index=False)
print(output)

