In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('8K_diabetes.csv')

### 1. Perform Exploratory Data Analysis (EDA) and discuss the data and what you observe prior to beginning modeling and how impact how to proceed 

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 51 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      8000 non-null   object
 1   gender                    8000 non-null   object
 2   age                       8000 non-null   object
 3   weight                    8000 non-null   object
 4   admission_type_id         7424 non-null   object
 5   discharge_disposition_id  7627 non-null   object
 6   admission_source_id       7250 non-null   object
 7   time_in_hospital          8000 non-null   int64 
 8   payer_code                8000 non-null   object
 9   medical_specialty         8000 non-null   object
 10  num_lab_procedures        8000 non-null   int64 
 11  num_procedures            8000 non-null   int64 
 12  num_medications           8000 non-null   int64 
 13  number_outpatient         8000 non-null   int64 
 14  number_emergency        

    this data set contains different types of data like text data, numberical data, catagorical data, and bool data. and some of the columns contains large amount missing values.
    if there are some missing values that will impact the accuracy of the model, and if the majority of the data is missing

In [3]:
# the reason why I do this is to replace these two string value "?" and "None" to None to get the true number of missing value.
for x in df.columns:
    for y in range(8000):
        i = df.loc[y,x]
        if i=='?' or i=='None' or type(i)==float:
            df.loc[y,x]= None

I want to seperate the original data data set to categorical, numerical, bool, and text data set to apply different preprocessing processes.

### 2. Pre-processed categorical data for use in the model and justified pre-processing method. Note this may be different for each algorithm you try.

In [4]:
numerical_data= df.columns[10:20]
text_df = df.columns[-3:]
insufficient_data = ['weight','payer_code','medical_specialty','max_glu_serum','A1Cresult']
categorical_data = df.drop(columns = numerical_data)
categorical_data = categorical_data.drop(columns = text_df)
categorical_data = categorical_data.drop(columns = insufficient_data)
# time in hospital is numerical data
categorical_data = categorical_data.drop(columns = ['time_in_hospital'])
useless = []
for x in categorical_data.columns:
    if len(categorical_data[x].value_counts())==1:
        useless.append(x)

cate_df = categorical_data.drop(columns = useless)
cate_df = cate_df.drop(columns = ['readmitted'])

In [5]:
for x in cate_df.columns:
    print('--')
    print(cate_df[x].value_counts())
    print()

--
Caucasian          5891
AfricanAmerican    1639
Hispanic            146
Other               101
Asian                43
Name: race, dtype: int64

--
Female    4314
Male      3686
Name: gender, dtype: int64

--
[70-80)     2049
[60-70)     1749
[50-60)     1391
[80-90)     1248
[40-50)      800
[30-40)      340
[90-100)     214
[20-30)      117
[10-20)       65
[0-10)        27
Name: age, dtype: int64

--
Emergency        3968
Urgent           1545
Elective         1401
Not Available     476
Not Mapped         33
Newborn             1
Name: admission_type_id, dtype: int64

--
Discharged to home                                                                                             4879
Discharged/transferred to SNF                                                                                   948
Discharged/transferred to home with home health service                                                         912
Discharged/transferred to another short term hospital              

In [6]:
cate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      7820 non-null   object
 1   gender                    8000 non-null   object
 2   age                       8000 non-null   object
 3   admission_type_id         7424 non-null   object
 4   discharge_disposition_id  7627 non-null   object
 5   admission_source_id       7250 non-null   object
 6   metformin                 8000 non-null   object
 7   repaglinide               8000 non-null   object
 8   nateglinide               8000 non-null   object
 9   chlorpropamide            8000 non-null   object
 10  glimepiride               8000 non-null   object
 11  glipizide                 8000 non-null   object
 12  glyburide                 8000 non-null   object
 13  tolbutamide               8000 non-null   object
 14  pioglitazone            

        in order to process the decision tree or random forest, we have to transform it to binary value by using one hot encode, so that we can implement the decision tree

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
# onehotencoder with deal with the missing value
cate_encoder = OneHotEncoder()
cate_df_1hot = cate_encoder.fit_transform(cate_df)
one_hot_feature = cate_encoder.get_feature_names_out()
cate_np = cate_df_1hot.toarray()

In [9]:
one_hot_feature

array(['race_AfricanAmerican', 'race_Asian', 'race_Caucasian',
       'race_Hispanic', 'race_Other', 'race_None', 'gender_Female',
       'gender_Male', 'age_[0-10)', 'age_[10-20)', 'age_[20-30)',
       'age_[30-40)', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)',
       'age_[70-80)', 'age_[80-90)', 'age_[90-100)',
       'admission_type_id_Elective', 'admission_type_id_Emergency',
       'admission_type_id_Newborn', 'admission_type_id_Not Available',
       'admission_type_id_Not Mapped', 'admission_type_id_Urgent',
       'admission_type_id_None',
       'discharge_disposition_id_Admitted as an inpatient to this hospital',
       'discharge_disposition_id_Discharged to home',
       'discharge_disposition_id_Discharged/transferred to ICF',
       'discharge_disposition_id_Discharged/transferred to SNF',
       'discharge_disposition_id_Discharged/transferred to a federal health care facility.',
       'discharge_disposition_id_Discharged/transferred to a long term care hospital.',
  

In [10]:
np.shape(cate_np)

(8000, 117)

### 3. Pre-processed numerical data appropriately including handling missing data and justified methods used. Note this may be different for each algorithm you try.

In [11]:
# numerical data set
numerical_data = list(numerical_data)
numerical_data.append('time_in_hospital')
num_df = df[numerical_data]

In [12]:
num_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   num_lab_procedures  8000 non-null   int64 
 1   num_procedures      8000 non-null   int64 
 2   num_medications     8000 non-null   int64 
 3   number_outpatient   8000 non-null   int64 
 4   number_emergency    8000 non-null   int64 
 5   number_inpatient    8000 non-null   int64 
 6   diag_1              8000 non-null   object
 7   diag_2              7945 non-null   object
 8   diag_3              7825 non-null   object
 9   number_diagnoses    8000 non-null   int64 
 10  time_in_hospital    8000 non-null   int64 
dtypes: int64(8), object(3)
memory usage: 687.6+ KB


In [13]:
diags = ['diag_1','diag_2','diag_3']

for i in diags:
    for x in range(8000):
        a = num_df[i].iloc[x]
        if a != None:
            if a[0]=='V'or a[0]=='E':
                num_df.loc[x,i]= None
            else:
                num_df.loc[x,i] = float(a)

In [14]:
for x in diags:
    num_df[x] = num_df[x].fillna(num_df[x].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_df[x] = num_df[x].fillna(num_df[x].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_df[x] = num_df[x].fillna(num_df[x].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_df[x] = num_df[x].fillna(num_df[x].median())


In [15]:
num_np = np.array(num_df)

In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

#### 4. Implement a model to make predictions using text data using tf-idf

    tf(t,d) = count of t in d / number of words in d
    df(t) = occurrence of t in N documents, DF is the number of documents in which the word is present at least once.
    N/df = this means how many docs are we expected to read to have one occurance of term t.
    idf(t) = log(N/(df + 1)), in case we will see some case df = 0, we add 1 on df.
    tf-idf(t, d) = tf(t, d) * idf(t)

In [17]:
# I decided to use random string to fill missing value, because maybe the reason why there is no desc is because the patient is cured
df['diag_2_desc'] = df['diag_2_desc'].fillna('jin')
df['diag_3_desc'] = df['diag_3_desc'].fillna('zhiyuan')

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
def get_stops(v1, v2, top_n=30):
    i = []
    vectorizer = TfidfVectorizer()
    for v in [v1, v2]:
        vectorizer.fit_transform(v)
        features = vectorizer.get_feature_names_out()

        numb_of_appearance_true = {}
        for x in features:
            numb_of_appearance_true[x]=0
        for x in features:
            for y in v:
                if x in y.lower():
                    numb_of_appearance_true[x]+=1    
        i.append(sorted(numb_of_appearance_true.items(), key=lambda x:x[1], reverse = True))
    a = [x[0] for x in i[0][:top_n]]
    b = [x[0] for x in i[1][:top_n]]
    stop = []
    for x in a:
        if x in b:
            stop.append(x)
    return stop

In [21]:
def term_score(DF,column_index,top_n):
    # I split the column into two parts which are readmitted = true, and readmitted = false
    # so that we can find out how each word are related with these to type of documents
    part_true = df.loc[df.iloc[:,-4]==True]
    part_false = df.loc[df.iloc[:,-4]==False]
    stopWords = get_stops(part_true,part_false, top_n)

    part_true = part_true.iloc[:,column_index]
    part_false = part_false.iloc[:,column_index]
    d_true = ''
    for x in part_true:
        d_true = d_true+' '+x
    d_false = ''
    for x in part_false:
        d_false = d_false+' '+x
    corpus = [d_true, d_false]

    vectorizer_ = TfidfVectorizer(stop_words = stopWords)
    X = vectorizer_.fit_transform(corpus)
    toker = vectorizer_.build_tokenizer()
    features = vectorizer_.get_feature_names_out()
    vocabulary = vectorizer_.vocabulary_

    # X is two arrays of tfidf score of each feature in these two document.
    # this is the score for each feature    

    # X.toarray looks like following. the first row is the relativity bewteen each word and doc 1. the second is ....doc 2. 
    # [[0.02490878 0.00373632 0.00124544 ... 0.01245439 0.01245439 0.00871807]
    # [0.01528978 0.00849432 0.00679546 ... 0.00509659 0.01019319 0.00169886]]
    # by subtracting them I have have the words more relate to doc1 having positive score, and the words more realte to doc2 nagative score.
    feature_score = 100*(X.toarray()[0]-X.toarray()[1])    
    return vectorizer_, feature_score

In [22]:
def text_score_calculator(DF,column_index,vectorizer_, feature_score):
    toker = vectorizer_.build_tokenizer()
    features = vectorizer_.get_feature_names_out()
    vocabulary = vectorizer_.vocabulary_
    scores_d1 = []
    for x in DF.iloc[:,column_index]:
        sample_score = 0
        x = x.lower()
        for y in toker(x):
            if y in features:
                index = vocabulary[y]
                sample_score+=feature_score[index]
        scores_d1.append(sample_score)
    return scores_d1

In [23]:
def partition(feature_matrix, target_vector, t, shuffle = True):
    np.random.seed(42)
    test_set_size = int(len(feature_matrix)*t)
    if shuffle ==True:
        shuffled_indices = np.random.permutation(len(feature_matrix))
        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]
        return feature_matrix[train_indices], feature_matrix[test_indices], target_vector[train_indices], target_vector[test_indices]
    else:
        train_set_size = len(feature_matrix)-test_set_size
        return feature_matrix[train_indices], feature_matrix[test_indices], target_vector[train_indices], target_vector[test_indices]


In [24]:
from sklearn.model_selection import KFold
def sFold_text(folds, data, labels, error_function, model, **model_args):
    kf = KFold(n_splits=folds, random_state=None, shuffle = True)
    scores = []
    y = labels
    if model == LogisticRegression:
        M = model(max_iter = 10**9)
    if model == GradientBoostingClassifier:
        M=model()
    if model == RandomForestClassifier:
        M=model()
    if model == AdaBoostClassifier:
        if len(model_args)!=0:
            M=model(learning_rate = model_args['learning_rate'])
        else:
            M = model()
    if model == SVC:
        M=model(probability = True)
    if model == GaussianNB:
        M =model()
    if model == KNeighborsClassifier:
        M = model()
    for train_index,test_index in kf.split(data):        
        x_training_set = data.iloc[train_index]
        y_training_set = y.iloc[train_index]
    # get term score from the training
        v_d1,ts_d1 = term_score(x_training_set, -3,36)
        v_d2,ts_d2 = term_score(x_training_set, -2,36)
        v_d3,ts_d3 = term_score(x_training_set, -1,36)
    # find out the score for text by term score for each text column.
        text_score_d1 = text_score_calculator(x_training_set, -3,v_d1,ts_d1)
        text_score_d2 = text_score_calculator(x_training_set, -2,v_d2,ts_d2)
        text_score_d3 = text_score_calculator(x_training_set, -1,v_d3,ts_d3)
        text_train = np.array([text_score_d1,text_score_d2,text_score_d3])
    # transformed training set
        x_training_set = np.transpose(text_train)
        x_test_set = data.iloc[test_index]
    # use the term score output from training set to get the 
    # score for the score for each text data for test set.
        text_score_d1_test = text_score_calculator(x_test_set, -3, v_d1, ts_d1)
        text_score_d2_test = text_score_calculator(x_test_set, -2, v_d2, ts_d2)
        text_score_d3_test = text_score_calculator(x_test_set, -1, v_d3, ts_d3)
        text_test = np.array([text_score_d1_test,text_score_d2_test,text_score_d3_test])
    # new test set
        x_test_set = np.transpose(text_test)    
        y_test_set = y.iloc[test_index]
        y_training_set = [x for x in np.array(y_training_set)]
        
        M.fit(np.array(x_training_set), y_training_set)
        y_pred = M.predict_proba(x_test_set)
        y_pred = [x[1] for x in y_pred]
        score = error_function(y_test_set,y_pred)
        scores.append(score)
        print('.',end='')
    average_error = round(sum(scores)/folds,4)
    return average_error

In [25]:
target = df['readmitted'].astype(float)

In [65]:
v_d1,ts_d1 = term_score(df, -3,36)
v_d2,ts_d2 = term_score(df, -2,36)
v_d3,ts_d3 = term_score(df, -1,36)
# find out the score for text by term score for each text column.
text_score_d1 = text_score_calculator(df, -3,v_d1,ts_d1)
text_score_d2 = text_score_calculator(df, -2,v_d2,ts_d2)
text_score_d3 = text_score_calculator(df, -1,v_d3,ts_d3)
text_np = np.array([text_score_d1,text_score_d2,text_score_d3]).transpose()

In [53]:
s1 = sFold_text(5, df, target, roc_auc_score , AdaBoostClassifier)
s1

.....

0.5955

### 5. Use model stacking to incorporate tf-idf predictions for all 3 text fields (so 3 models unless you elect to concatenate the text fields into 1 - need to justify if so) in downstream algorithm the uses non-text features

In [54]:
# this function 
def level0_text_predictor(t_train,y, model):
    leng = int(len(t_train)/2)
    t_a, t_b= t_train[:leng], t_train[leng:]
    tg_a, tg_b = y[:leng], y[leng:]
    m1 = model()
    m1.fit(t_a,tg_a)
    tp_b = [i[1] for i in m1.predict_proba(t_b)]

    m2 = model()
    m2.fit(t_b,tg_b)
    tp_a = [i[1] for i in m2.predict_proba(t_a)]
    
    tp = [[x] for x in np.hstack((tp_a,tp_b))]
    
    return tp

In [55]:
def train_stacking(input_x,y,model):
    t1,t2,t3 = [[x] for x in input_x[:,-3]],[[x] for x in input_x[:,-2]],[[x] for x in input_x[:,-1]]
    x = np.delete(input_x,[-1,-2,-3],1)
    y = np.array(y).astype(float)
    t1_meta = level0_text_predictor(t1,y,model)
    t2_meta = level0_text_predictor(t2,y,model)
    t3_meta = level0_text_predictor(t3,y,model)
    output_x = np.concatenate((x,t1_meta,t2_meta, t3_meta),axis = 1)
    m_3 = []
    # m_3 is the output model for the new data set which fed with the entire training set
    for t in [t1,t2,t3]:
        m3 = model()
        m3.fit(t,y)
        m_3.append(m3)
    return output_x,m_3

In [56]:
# this function is to transform the outsample text data to leval0 prediction
# which is predicted by the output model from the function train_stacking
def test_stacking(input_x,model):
    t1,t2,t3 = [[x] for x in input_x[:,-3]],[[x] for x in input_x[:,-2]],[[x] for x in input_x[:,-1]]
    x = np.delete(input_x,[-1,-2,-3],1)
    t1_meta = [[x[1]] for x in model[0].predict_proba(t1)]
    t2_meta = [[x[1]] for x in model[1].predict_proba(t2)]
    t3_meta = [[x[1]] for x in model[2].predict_proba(t3)]
    output_x = np.concatenate((x,t1_meta,t2_meta, t3_meta),axis = 1)
    return output_x

In [57]:
class stacking:
    def __init__(self, m1,m2,**m2_args):
        self._level1_predictor = None
        self._model_for_text_stacking = m1
        if m2==LogisticRegression:
            self._model_for_meta = m2(max_iter = 10**5)
        elif m2==GradientBoostingClassifier:
            if len(m2_args)==0:
                self._model_for_meta = m2()
            else:
                self._model_for_meta = m2(learning_rate = m2_args['learning_rate'],max_depth=m2_args['max_depth'])
        else:
            self._model_for_meta = m2()
    def fit(self, X_train, Y_train):
        output = train_stacking(X_train,Y_train,self._model_for_text_stacking)
        self._level1_predictor = output[1]
        self._model_for_meta.fit(output[0],Y_train)
    def predict_proba(self, new_data):
        new_data_meta = test_stacking(new_data,self._level1_predictor)
        target_pred_proba = self._model_for_meta.predict_proba(new_data_meta)
        target_pred_proba = [i[1] for i in target_pred_proba]
        return target_pred_proba
    def predict(self, new_data):
        new_data_meta = test_stacking(new_data,self._level1_predictor)
        target_pred = self._model_for_meta.predict(new_data_meta)
        return target_pred

In [58]:
from sklearn.model_selection import KFold
def sFold_stacking(folds, data, labels, error_function, model1,model2, **model_args):
    kf = KFold(n_splits=folds, random_state=None, shuffle = True)
    scores = []
    y = pd.DataFrame(labels)
    data = pd.DataFrame(data)
    M = stacking(model1, model2,**model_args)
    for train_index,test_index in kf.split(data):
        x_training_set = data.iloc[train_index]
        y_training_set = y.iloc[train_index]
        x_test_set = data.iloc[test_index]
        y_test_set = y.iloc[test_index]
        y_training_set = [x[0] for x in np.array(y_training_set)]
        M.fit(np.array(x_training_set), y_training_set)
        y_pred = M.predict_proba(np.array(x_test_set))
        score = error_function(y_test_set,y_pred)
        scores.append(score)
        print('.',end='')
    average_error = round(sum(scores)/folds,4)
    return average_error

In [62]:
cate_np.shape

(8000, 117)

In [63]:
num_np.shape

(8000, 11)

In [66]:
text_np.shape

(8000, 3)

In [103]:
con = np.concatenate((cate_np,num_np, text_np),axis = 1)
x_train, x_test, y_train, y_test = partition(con, df['readmitted'],0.2)

In [104]:
print(np.shape(cate_np))
print(np.shape(num_np))
print(np.shape(text_np))

(8000, 117)
(8000, 11)
(8000, 3)


In [105]:
s = ['newton-cg','lbfgs']

i = stacking(AdaBoostClassifier,LogisticRegression, penalty = 'l2', solver ='saga')
i.fit(x_train, y_train)
j = i.predict_proba(x_test)
roc_auc_score(y_test,j)

0.7044660409768078

In [106]:
sFold_stacking(5,x_train, y_train,roc_auc_score,AdaBoostClassifier,LogisticRegression,learning_rate = 0.1,max_depth = 3)

.....

0.6762

### 6. Perform experimentation for multiple modeling algorithms and justify why you selected the experiments you chose

In [72]:
def sFold(folds, data, labels, error_function, model, **model_args):
    kf = KFold(n_splits=folds, random_state=None, shuffle = True)
    scores = []
    y = labels
    data = pd.DataFrame(data)
    if model == LogisticRegression:
        M = model(max_iter = 10**9)
    if model == GradientBoostingClassifier:
        M=model()
    if model == RandomForestClassifier:
        M=model()
    if model == AdaBoostClassifier:
        if len(model_args)!=0:
            M=model(learning_rate = model_args['learning_rate'])
        else:
            M = model()
    if model == SVC:
        M=model(probability = True)
    if model == GaussianNB:
        M =model()
    if model == KNeighborsClassifier:
        M = model()
    for train_index,test_index in kf.split(data):        
        x_training_set = data.iloc[train_index]
        y_training_set = y.iloc[train_index]

        x_test_set = data.iloc[test_index] 
        y_test_set = y.iloc[test_index]
        y_training_set = [x for x in np.array(y_training_set)]
        
        M.fit(np.array(x_training_set), y_training_set)
        y_pred = M.predict_proba(x_test_set)
        y_pred = [x[1] for x in y_pred]
        score = error_function(y_test_set,y_pred)
        scores.append(score)
        print('.',end='')
    average_error = round(sum(scores)/folds,4)
    return average_error

I want to know the accuracy of this model without stacking. and the importance of each type of data. 

In [73]:
target

0       0.0
1       0.0
2       1.0
3       0.0
4       0.0
       ... 
7995    0.0
7996    0.0
7997    0.0
7998    0.0
7999    1.0
Name: readmitted, Length: 8000, dtype: float64

In [74]:
con = np.concatenate((cate_np,num_np),axis = 1)

sFold(5, con,target, roc_auc_score,GradientBoostingClassifier)

.....

0.6872

In [75]:
con2 = np.concatenate((cate_np,text_np),axis = 1)

sFold(5,  con,target, roc_auc_score,GradientBoostingClassifier)

.....

0.6856

In [76]:
con3 = np.concatenate((num_np,text_np),axis = 1)
sFold(5,  con,target, roc_auc_score,GradientBoostingClassifier)

.....

0.6863

In [138]:
con4 = np.concatenate((cate_np,num_np,text_np),axis = 1)
sFold(5,  con,target, roc_auc_score,GradientBoostingClassifier)

.....

0.6918

---

In [113]:
models = [GradientBoostingClassifier, AdaBoostClassifier,LogisticRegression]

In [81]:
x_train, x_test, y_train, y_test = partition(text_np, df['readmitted'],0.2)
s1 = sFold(5, x_train, y_train, roc_auc_score , LogisticRegression)

.....

In [82]:
for x in models:
    s1 = sFold(5, x_train, y_train, roc_auc_score , x)
    print(x.__name__, s1)

.....GradientBoostingClassifier 0.5972
.....RandomForestClassifier 0.5571
.....AdaBoostClassifier 0.6009
.....LogisticRegression 0.5853
.....GaussianNB 0.5861
.....KNeighborsClassifier 0.5368


adaboost preforms the best for the level 1 prediction

In [83]:
SCORE = []
for s in range(1,20):
    i = sFold(5, x_train, y_train, roc_auc_score,AdaBoostClassifier,learning_rate =0.01*s)
    print(s,i)

.....1 0.5794
.....2 0.5881
.....3 0.5904
.....4 0.5937
.....5 0.5958
.....6 0.5955
.....7 0.5957
.....8 0.5956
.....9 0.5943
.....10 0.5946
.....11 0.5965
.....12 0.5937
.....13 0.5962
.....14 0.5921
.....15 0.5945
.....16 0.5943
.....17 0.5924
.....18 0.5929
.....19 0.5955


adaboost preforms best while the learning rate is 0.10, so that I changed the function level_0_text_predictor(). like following

In [None]:
def level0_text_predictor(t_train,y, model):
    leng = int(len(t_train)/2)
    t_a, t_b= t_train[:leng], t_train[leng:]
    tg_a, tg_b = y[:leng], y[leng:]
    if model==AdaBoostClassifier:
        m1 = model(learning_rate = 0.9)
        m2 = model(learning_rate = 0.9)

    m1.fit(t_a,tg_a)
    tp_b = [i[1] for i in m1.predict_proba(t_b)]
    m2.fit(t_b,tg_b)
    tp_a = [i[1] for i in m2.predict_proba(t_a)]
    
    tp = [[x] for x in np.hstack((tp_a,tp_b))]
    
    return tp

In [None]:
con = np.concatenate((cate_np,num_np, text_np),axis = 1)
x_train, x_test, y_train, y_test = partition(con, df['readmitted'],0.2)
for x in models:
    i = sFold_stacking(5, x_train, y_train, roc_auc_score,AdaBoostClassifier, x)
    print(i,x.__name__)

GradientBoostingClassifier preforms the best for the level 2 prediction

In [None]:
for y in range(2,5):
    for x in range(1,20):
        i = sFold_stacking(5, x_train, y_train, roc_auc_score,AdaBoostClassifier, GradientBoostingClassifier,learning_rate = 0.02*x, max_depth = y)
        print(i,x*0.02,y)    

this experiment illustrated for the GradientBoostingClassifier in the range of 0.04<learning_rate<0.26, max depth = 2 and 0.04<learning_rate <0.2, max_depth =3 perfoms the best. so I kept as default for gradientboosting

#### 7. Final model selection and discussion of your model choice and the model weaknesses

my selection of the model is Adaboost for the level 1 prediction, and gradientboost for level 2 prediction.

the weaknee of this model is the model stacking didn't really increase the accuracy of the overall model. it is better of keep the text_np as it is than do the stacking for the text data.

In [119]:
for x in models:
    for y in models:
        SCORE = sFold_stacking(5,x_train,y_train, roc_auc_score,y,x)
        print(SCORE, y.__name__, x.__name__)

.....0.6848 GradientBoostingClassifier GradientBoostingClassifier
.....0.6833 AdaBoostClassifier GradientBoostingClassifier
.....0.6789 LogisticRegression GradientBoostingClassifier
.....0.6767 GradientBoostingClassifier AdaBoostClassifier
.....0.6786 AdaBoostClassifier AdaBoostClassifier
.....0.6767 LogisticRegression AdaBoostClassifier
.....0.6876 GradientBoostingClassifier LogisticRegression
.....0.6777 AdaBoostClassifier LogisticRegression
.....0.6822 LogisticRegression LogisticRegression


0.6872 GradientBoostingClassifier LogisticRegression
0.6849 GradientBoostingClassifier LogisticRegression
0.6853 GradientBoostingClassifier LogisticRegression

---

### part B


In [120]:
df_t = pd.read_csv('2K_diabetes_scoring.csv')

In [121]:
for x in df_t.columns:
    for y in range(2000):
        i = df_t.loc[y,x]
        if i=='?' or i=='None' or type(i)==float:
            df_t.loc[y,x]= None
numerical_test= df_t.columns[10:20]
categorical_test = df_t.drop(columns = numerical_data)
categorical_test = categorical_test.drop(columns = text_df)
categorical_test = categorical_test.drop(columns = insufficient_data)

cate_test = categorical_test.drop(columns = useless)

In [36]:
cate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      7820 non-null   object
 1   gender                    8000 non-null   object
 2   age                       8000 non-null   object
 3   admission_type_id         7424 non-null   object
 4   discharge_disposition_id  7627 non-null   object
 5   admission_source_id       7250 non-null   object
 6   metformin                 8000 non-null   object
 7   repaglinide               8000 non-null   object
 8   nateglinide               8000 non-null   object
 9   chlorpropamide            8000 non-null   object
 10  glimepiride               8000 non-null   object
 11  glipizide                 8000 non-null   object
 12  glyburide                 8000 non-null   object
 13  tolbutamide               8000 non-null   object
 14  pioglitazone            

In [37]:
cate_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      1959 non-null   object
 1   gender                    2000 non-null   object
 2   age                       2000 non-null   object
 3   admission_type_id         1855 non-null   object
 4   discharge_disposition_id  1904 non-null   object
 5   admission_source_id       1814 non-null   object
 6   metformin                 2000 non-null   object
 7   repaglinide               2000 non-null   object
 8   nateglinide               2000 non-null   object
 9   chlorpropamide            2000 non-null   object
 10  glimepiride               2000 non-null   object
 11  glipizide                 2000 non-null   object
 12  glyburide                 2000 non-null   object
 13  tolbutamide               2000 non-null   object
 14  pioglitazone            

In [122]:
temp_cate = pd.concat([cate_df,cate_test])

In [123]:
cate_encoder_t = OneHotEncoder()
cate_df_1hot = cate_encoder_t.fit_transform(temp_cate)
temp_cate_np = cate_df_1hot.toarray()

In [124]:
cate_test = temp_cate_np[8000:]

In [125]:
np.shape(cate_test)

(2000, 118)

In [42]:
numerical_test

Index(['num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses'],
      dtype='object')

In [126]:
numerical_test = list(numerical_test)
numerical_test.append('time_in_hospital')
num_test = df_t[numerical_data]

In [44]:
num_test

Unnamed: 0,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,time_in_hospital
0,1,0,10,0,0,0,311,401,244,7,7
1,21,0,15,1,0,1,428,411,284,8,8
2,37,0,19,1,0,0,493,401,250,6,3
3,32,2,7,0,0,0,552,682,250,3,4
4,37,0,14,0,0,0,434,433,401,5,7
...,...,...,...,...,...,...,...,...,...,...,...
1995,30,1,29,0,0,0,715,401,272,7,3
1996,1,5,15,0,0,0,436,250.02,V53,9,8
1997,46,2,14,0,0,1,38,585,536,6,13
1998,62,1,7,0,0,3,535,276,599,9,2


In [127]:
diags = ['diag_1','diag_2','diag_3']

for i in diags:
    for x in range(2000):
        a = num_test[i].iloc[x]
        if a != None:
            if str(a)[0]=='V'or str(a)[0]=='E':
                num_test.loc[x,i]= None
            else:
                num_test.loc[x,i] = float(a)

In [128]:
for x in diags:
    m = num_test[x].median()
    num_test[x] = num_test[x].fillna(m)
num_test = np.array(num_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_test[x] = num_test[x].fillna(m)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_test[x] = num_test[x].fillna(m)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_test[x] = num_test[x].fillna(m)


In [129]:
df_t['diag_1_desc'] = df_t['diag_1_desc'].fillna('A')
df_t['diag_2_desc'] = df_t['diag_2_desc'].fillna('jin')
df_t['diag_3_desc'] = df_t['diag_3_desc'].fillna('zhiyuan')
text_score_d1 = text_score_calculator(df_t, -3,v_d1,ts_d1)
text_score_d2 = text_score_calculator(df_t, -2,v_d2,ts_d2)
text_score_d3 = text_score_calculator(df_t, -1,v_d3,ts_d3)

In [130]:
text_test = np.array([text_score_d1,text_score_d2,text_score_d3])
text_test = np.transpose(text_test)

In [131]:
np.shape(cate_test)

(2000, 118)

In [132]:
np.shape(num_test)

(2000, 11)

In [133]:
x_train = np.concatenate((temp_cate_np[:8000],num_np, text_np),axis = 1)
y_train = df['readmitted']
test = np.concatenate((cate_test,num_test, text_test),axis = 1)


In [134]:
np.shape(temp_cate_np[:8000])

(8000, 118)

In [135]:
x_train.shape

(8000, 132)

In [136]:
test.shape

(2000, 132)

In [139]:
i = GradientBoostingClassifier()
i.fit(x_train,y_train)
prediction = i.predict_proba(test).astype(float)

In [None]:
j = stacking(AdaBoostClassifier, GradientBoostingClassifier)
j.fit(x_train, y_train)
prd = j.predict(test).astype(float)

In [140]:
export = pd.DataFrame(prediction)

In [141]:
export.to_csv('Jin_Zhiyuan_pred2.csv')

In [142]:
pred2 = pd.read_csv('pred2.csv')

In [None]:
pred2 = pred2['0']

In [None]:
prediction

In [None]:
roc_auc_score(prediction,pred2)

### generanl stacking

Now I want to apply stacking not only for the text data, I want to use 3 different types of model to predict data, and use the predictions as three different features to make final prediction. this is the way how I thought the genernal stacking should be.

In [None]:
def level0_predictor(t_train,l_train, model):
    leng = int(len(t_train)/2)
    t_a, t_b= t_train[:leng], t_train[leng:]
    tg_a, tg_b = l_train[:leng], l_train[leng:]
    if model == LogisticRegression:
        m1 = model(max_iter = 10**8)
        m2 = model(max_iter = 10**8)
    else:
        m1 = model()
        m2 = model()
    m1.fit(t_a,tg_a)
    tp_b = [i[1] for i in m1.predict_proba(t_b)]

    m2.fit(t_b,tg_b)
    tp_a = [i[1] for i in m2.predict_proba(t_a)]
    
    tp = [[x] for x in np.hstack((tp_a,tp_b))]
    
    return tp

In [None]:
def train_stacking0(x,y,m_list):
    y = np.array(y).astype(float)
    meta1 = level0_predictor(x,y,m_list[0])
    meta2 = level0_predictor(x,y,m_list[1])
    meta3 = level0_predictor(x,y,m_list[2])
    output = np.concatenate((meta1, meta2, meta3),axis = 1)
    m_3 = []
    # m_3 is the output model for the new data set which fed with the entire training set
    for t in m_list:
        if t==LogisticRegression:
            m = t(max_iter = 10**9)
        else:
            m = t()
        m.fit(x,y)
        m_3.append(m)
    return output,m_3

In [None]:
# this function is to transform the outsample text data to leval0 prediction
# which is predicted by the output model from the function train_stacking
def test_stacking0(x,m_list):
    prediction1 = [[x[1]] for x in m_list[0].predict_proba(x)]
    prediction2 = [[x[1]] for x in m_list[1].predict_proba(x)]
    prediction3 = [[x[1]] for x in m_list[2].predict_proba(x)]
    prediction = np.concatenate((prediction1, prediction2, prediction3),axis=1)
    return prediction

In [None]:
class stacking0:
    def __init__(self, l1_model_list, l2_model, **l2_args):
        self._l1_predictors = None
        self._stacking_model = l1_model_list
        if l2_model==LogisticRegression:
            self._l2_predictor = l2_model(max_iter = 10**5,solver = l2_args['solver'], penalty = l2_args['penalty'])
        elif l2_model==GradientBoostingClassifier:
            self._l2_predictor = l2_model(learning_rate = l2_args['learning_rate'],max_depth=l2_args['max_depth'])
        else:
            self._l2_predictor = l2_model()
    def fit(self, X_train, Y_train):
        output = train_stacking0(X_train,Y_train,self._stacking_model)
        self._l1_predictors = output[1]
        self._l2_predictor.fit(output[0],Y_train)
    def predict_proba(self, x_test):
        l1_prediction = test_stacking0(x_test, self._l1_predictors)
        l2_prediction = self._l2_predictor.predict_proba(l1_prediction)
        return [i[1] for i in l2_prediction]
        

In [None]:
def sFold_stacking0(folds, data, labels, error_function, model1_list,model2, **model_args):
    kf = KFold(n_splits=folds, random_state=None, shuffle = True)
    scores = []
    y = pd.DataFrame(labels)
    data = pd.DataFrame(data)
    M = stacking0(model1_list, model2,**model_args)
    for train_index,test_index in kf.split(data):
        x_training_set = data.iloc[train_index]
        y_training_set = y.iloc[train_index]
        x_test_set = data.iloc[test_index]
        y_test_set = y.iloc[test_index]
        y_training_set = [x[0] for x in np.array(y_training_set)]
        M.fit(np.array(x_training_set), y_training_set)
        y_pred = M.predict_proba(np.array(x_test_set))
        score = error_function(y_test_set,y_pred)
        scores.append(score)
        print('.',end='')
    average_error = round(sum(scores)/folds,4)
    return average_error

In [None]:
con = np.concatenate((cate_np,num_np,text_np),axis = 1)
x_train, x_test, y_train, y_test = partition(con, df['readmitted'],0.2)
model_list = [LogisticRegression,GaussianNB, KNeighborsClassifier]
training_meta,m_3 = train_stacking0(x_train,y_train,model_list)

In [None]:
i = stacking0(model_list,GradientBoostingClassifier, learning_rate = 0.7,max_depth = 3)

In [None]:
i.fit(x_train,y_train)

In [None]:
p= i.predict_proba(x_test)

In [None]:
model_list1 = [LogisticRegression,RandomForestClassifier, AdaBoostClassifier]

In [None]:
model_list2 = [KNeighborsClassifier,RandomForestClassifier, AdaBoostClassifier]

In [None]:
sFold_stacking0(5, x_train, y_train, roc_auc_score,model_list1, GradientBoostingClassifier,learning_rate =0.17,max_depth=3)

---

In [None]:
df['readmitted'].info()

In [None]:
cate_train,cate_test,y_train,y_test = partition(cate_np, df['readmitted'],0.2)

In [None]:
np.shape(y_train)

In [None]:
for x in models:
    s1 = sFold(5, cate_train, y_train, roc_auc_score , x)
    print(x.__name__, s1)

In [None]:
num_train,num_test,y_train, y_test = partition(num_np,df['readmitted'],0.2)

In [None]:
for x in models:
    s1 = sFold(5, num_train, y_train, roc_auc_score , x)
    print(x.__name__, s1)

In [None]:
text_train, text_test, y_train, y_test = partition(text_np, df['readmitted'],0.2)

In [None]:
for x in models:
    s1 = sFold(5, text_train, y_train, roc_auc_score , x)
    print(x.__name__, s1)

In [2]:
import numpy as np

In [5]:
i = np.random.permutation(5).tolist()

In [8]:
i.remove(2)

In [14]:
w = np.array([0 for x in range(5)])
w[0] = 1


In [25]:
for x in range(10):
    h = np.random.permutation(5).tolist()
    for y in h:
        if w[y]>0:
            j = y
            h.remove(j)
            i = h[0]
    print(w[i],w[j])
    w[i]+=0.01
    w[j]-=0.01

0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0


1