## Data Understanding

### Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import sys
import os

In [2]:
# Load the raw dataset from a pickle file located relative to the notebook.
# NOTE(review): the file name and the column listing printed below describe a telecom xDR
# dataset, while the pre-processing cells further down use diabetes columns
# (e.g. 'diag_1', 'readmitted') — confirm which dataset this notebook should load.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
raw_df = pd.read_pickle('../data/telecom_xdr.pkl')

In [3]:
# show every column name of the raw dataframe as a plain Python list
list(raw_df.columns)

['Bearer Id',
 'Start',
 'Start ms',
 'End',
 'End ms',
 'Dur. (ms)',
 'IMSI',
 'MSISDN/Number',
 'IMEI',
 'Last Location Name',
 'Avg RTT DL (ms)',
 'Avg RTT UL (ms)',
 'Avg Bearer TP DL (kbps)',
 'Avg Bearer TP UL (kbps)',
 'TCP DL Retrans. Vol (Bytes)',
 'TCP UL Retrans. Vol (Bytes)',
 'DL TP < 50 Kbps (%)',
 '50 Kbps < DL TP < 250 Kbps (%)',
 '250 Kbps < DL TP < 1 Mbps (%)',
 'DL TP > 1 Mbps (%)',
 'UL TP < 10 Kbps (%)',
 '10 Kbps < UL TP < 50 Kbps (%)',
 '50 Kbps < UL TP < 300 Kbps (%)',
 'UL TP > 300 Kbps (%)',
 'HTTP DL (Bytes)',
 'HTTP UL (Bytes)',
 'Activity Duration DL (ms)',
 'Activity Duration UL (ms)',
 'Dur. (ms).1',
 'Handset Manufacturer',
 'Handset Type',
 'Nb of sec with 125000B < Vol DL',
 'Nb of sec with 1250B < Vol UL < 6250B',
 'Nb of sec with 31250B < Vol DL < 125000B',
 'Nb of sec with 37500B < Vol UL',
 'Nb of sec with 6250B < Vol DL < 31250B',
 'Nb of sec with 6250B < Vol UL < 37500B',
 'Nb of sec with Vol DL < 6250B',
 'Nb of sec with Vol UL < 1250B',
 'Socia

### Data Pre-Processing
Data preprocessing is an integral step in machine learning: the quality of the data, and of the information that can be derived from it, directly affects our model's ability to learn. It is therefore essential to preprocess the data before feeding it into the model.

We'll look at:


*   Handling Null Values
*   Standardization

Others:


*   Handling Categorical Variables
*   One-Hot Encoding

Reference: [https://towardsdatascience.com/introduction-to-data-preprocessing-in-machine-learning-a9fa83a5dc9d](https://towardsdatascience.com/introduction-to-data-preprocessing-in-machine-learning-a9fa83a5dc9d) 




In [5]:
# how many missing values exist or better still what is the % of missing values in the dataset?
def percent_missing(df, dataset_name="Diabetes"):
    """Print the percentage of cells in ``df`` that are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; a cell counts as missing when ``df.isnull()`` flags it.
    dataset_name : str, optional
        Label used in the printed message. Defaults to ``"Diabetes"`` so existing
        calls print exactly the same sentence as before.

    Returns
    -------
    None
        The result is written to stdout only.
    """
    # Calculate total number of cells in dataframe (rows * columns)
    totalCells = int(np.prod(df.shape))

    if totalCells == 0:
        # A frame with no rows or no columns has nothing missing; report 0.0
        # instead of dividing by zero (which would print "nan").
        percentage = 0.0
    else:
        # Count missing values per column, then add the per-column counts together
        totalMissing = int(df.isnull().sum().sum())
        percentage = round((totalMissing / totalCells) * 100, 2)

    # Calculate percentage of missing values
    print("The", dataset_name, "dataset contains", percentage, "%", "missing values.")
def columns_missing_most_values(df, percentage):
    """Return the names of the columns whose share of missing values exceeds ``percentage``.

    The original definition had no body, which made the whole cell fail to parse.
    NOTE(review): the behaviour below is inferred from the function name and from the
    "more than 30% missing values" comment in the following cell — confirm it matches
    the intended contract.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    percentage : float
        Threshold on a 0-100 scale; a column is reported only when its missing
        share is strictly greater than this value.

    Returns
    -------
    list
        Column labels, in their original order, whose missing share exceeds the threshold.
    """
    # isnull().mean() gives the fraction of missing cells per column; scale to percent
    missing_share = df.isnull().mean() * 100
    return missing_share[missing_share > percentage].index.tolist()



# Report the overall share of missing cells in the raw dataframe.
percent_missing(raw_df)

The Diabetes dataset contains 12.5 % missing values.


In [None]:
# drop columns with more than 30% missing values
sparse_columns = ['weight', 'payer_code', 'medical_specialty', 'max_glu_serum', 'A1Cresult']
df_clean = raw_df.drop(columns=sparse_columns)
df_clean.shape

In [None]:
# fill missing with ffill method for columns (diag_1, diag_2, diag_3)

def fix_missing_ffill(df, col):
    """Forward-fill missing values in ``df[col]`` and return the filled column.

    Each missing entry is replaced by the last non-missing value above it; leading
    missing entries have nothing to copy from and stay missing. ``df`` is updated
    in place, so the returned Series is the column now stored in ``df``.

    Uses ``Series.ffill()`` because ``fillna(method='ffill')`` is deprecated in
    recent pandas releases.
    """
    df[col] = df[col].ffill()
    return df[col]


def fix_missing_bfill(df, col):
    """Backward-fill missing values in ``df[col]`` and return the filled column.

    Each missing entry is replaced by the next non-missing value below it; trailing
    missing entries have nothing to copy from and stay missing. ``df`` is updated
    in place, so the returned Series is the column now stored in ``df``.

    Uses ``Series.bfill()`` because ``fillna(method='bfill')`` is deprecated in
    recent pandas releases.
    """
    df[col] = df[col].bfill()
    return df[col]

# Forward-fill the gaps in each of the three diagnosis columns.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    df_clean[diag_col] = fix_missing_ffill(df_clean, diag_col)

# Replace missing 'race' entries with the most frequent value of that column.
most_common_race = df_clean['race'].mode()[0]
df_clean['race'] = df_clean['race'].fillna(most_common_race)

In [None]:
df_clean.head()

In [None]:
# Drop 'encounter_id', 'patient_nbr' and 'age' from the working frame, then preview it.
columns_to_remove = ['encounter_id', 'patient_nbr', 'age']
df_clean = df_clean.drop(columns=columns_to_remove)
df_clean.head()

In [None]:
df_clean.isnull().sum()

In [None]:
# Convert the diagnosis codes to numbers with one vectorised pd.to_numeric call per
# column (instead of calling it once per element through Series.apply).
# errors='coerce' turns every value that cannot be parsed as a number into NaN.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    df_clean[diag_col] = pd.to_numeric(df_clean[diag_col], errors='coerce')


In [None]:
# Discard every row that is missing any of the three diagnosis codes,
# then recount the remaining nulls per column.
diag_columns = ['diag_1', 'diag_2', 'diag_3']
df_clean.dropna(subset=diag_columns, inplace=True)
df_clean.isnull().sum()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns replaced by integer codes, in the order they are encoded.
columns_to_encode = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone',
    'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone',
    'change', 'diabetesMed', 'readmitted', 'race', 'gender',
]

# One fitted encoder per column is kept so the integer codes (including the
# 'readmitted' target) can be mapped back to their original labels later via
# label_encoders[col].inverse_transform(...). Re-fitting a single shared encoder
# would discard every mapping except the last one.
label_encoders = {}
for col in columns_to_encode:
    lb = LabelEncoder()
    df_clean[col] = lb.fit_transform(df_clean[col])
    label_encoders[col] = lb
# After the loop `lb` is the encoder fitted on 'gender', the last column encoded.

df_clean.head()

## Modelling


In [None]:
# Target vector: the 'readmitted' column of the cleaned frame; show how many rows it has.
y= df_clean['readmitted']
len(y)

In [None]:
# Feature matrix: every column of the cleaned frame except the 'readmitted' target.
x = df_clean.drop(columns=['readmitted'])

In [None]:
len(x)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt



# Feature extraction: score every feature against the target with the chi-squared test
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(x, y)

# Pair each score with its column name so a score can be traced back to a feature
# (scores_ holds one value per column of x, in column order).
feature_scores = pd.Series(fit.scores_, index=x.columns)

# Summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)

for i, (feature_name, score) in enumerate(feature_scores.items()):
    print('Feature %d (%s): %f' % (i, feature_name, score))

# plot the scores, one bar per feature, labelled with the feature name
positions = range(len(feature_scores))
fig, ax = plt.subplots()
ax.bar(positions, feature_scores.to_numpy())
ax.set_xticks(list(positions))
ax.set_xticklabels(feature_scores.index, rotation=90)
ax.set_title('Chi-squared score per feature')
ax.set_xlabel('Feature')
ax.set_ylabel('chi2 score')
plt.show()
#features = fit.transform(x)
# Summarize selected features
#print(features[0:5,:])

In [None]:
# Hand-picked feature columns carried forward into modelling.
# NOTE(review): nine columns are listed although SelectKBest was configured with k=4 —
# confirm how this list was chosen.
chosen_columns = [
    'discharge_disposition_id',
    'time_in_hospital',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'change',
    'diabetesMed',
]
selected_features = x[chosen_columns]
len(selected_features)

In [None]:
selected_features.head()

In [None]:
len(y)

In [None]:
# Write the target column to label.csv (path is relative to the working directory).
y.to_csv('label.csv')

In [None]:
# Write the selected feature columns to features.csv (path is relative to the working directory).
selected_features.to_csv('features.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from joblib import dump,load
from sklearn.ensemble import RandomForestClassifier

In [None]:
import pickle

In [None]:
# hold out 20% of the rows for testing; random_state pins the split
xtrain,xtest,ytrain,ytest = train_test_split(selected_features,y,test_size=0.2,random_state=40)

# create the scaler
norm = MinMaxScaler()
# fit the scaler on the training split only
norm_fit = norm.fit(xtrain)
# persist the fitted scaler; the with-block closes the file handle even if dumping fails
with open("scaler.pkl", 'wb') as scaler_file:
    pickle.dump(norm_fit, scaler_file)
dump(norm_fit,'scaler.joblib')
# transformation of training data
scal_xtrain = norm_fit.transform(xtrain)

# transformation of testing data
scal_xtest = norm_fit.transform(xtest)
print(scal_xtrain)

In [None]:
  
# create model variable; random_state makes the fitted forest reproducible across runs
rnd = RandomForestClassifier(random_state=40)

# fit the model
# NOTE(review): the forest is trained on the unscaled xtrain even though scaled copies
# (scal_xtrain / scal_xtest) exist and the scaler is persisted to disk — confirm that
# whatever loads the saved model does not apply the scaler to its inputs first.
fit_rnd = rnd.fit(xtrain,ytrain)

In [None]:
# predict the class label for every row of the held-out test features
y_predict = fit_rnd.predict(xtest)

# per-class probability estimates for the same test rows
y_predict_probabilty = fit_rnd.predict_proba(xtest)

In [None]:
print(list(y_predict))

In [None]:
print(y_predict_probabilty)

In [None]:
# checking the accuracy score on the held-out test split
rnd_score = fit_rnd.score(xtest,ytest)

print('score of model is : ',rnd_score)

# persist the fitted model; the with-block closes the file handle even if dumping fails
with open("model.pkl", 'wb') as model_file:
    pickle.dump(fit_rnd, model_file)
#dump(rnd, 'model.joblib')

In [None]:
from sklearn.metrics import classification_report
# Predict on the held-out features and print the per-class report for those predictions.
test_predictions = rnd.predict(xtest)
x_predict = list(test_predictions)
report_text = classification_report(ytest, x_predict)
print(report_text)
#df = {'predicted':x_predict,'orignal':ytest}
#df