In [1]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
def level_1_predictor(t_train,y, model):
    """Produce out-of-fold positive-class scores with a two-way split.

    The rows are cut at the midpoint into a first and a second half. One
    estimator is fitted on each half and used to score the *other* half,
    so no row is scored by the estimator that was fitted on it.

    Parameters
    ----------
    t_train : sliceable feature matrix (rows are samples).
    y : sliceable labels, aligned row-for-row with ``t_train``.
    model : estimator class. It is instantiated without arguments, except
        ``LogisticRegression`` which receives ``max_iter = 20**9``.

    Returns
    -------
    list
        ``[[p0], [p1], ...]`` in the original row order, where each ``p``
        is element 1 of the corresponding ``predict_proba`` row.
    """
    half = int(len(t_train)/2)
    first_x, second_x = t_train[:half], t_train[half:]
    first_y, second_y = y[:half], y[half:]

    # Only LogisticRegression gets a (very large) iteration cap.
    ctor_kwargs = {'max_iter': 20**9} if model == LogisticRegression else {}
    first_model = model(**ctor_kwargs)
    second_model = model(**ctor_kwargs)

    # Each half is scored by the estimator trained on the opposite half.
    first_model.fit(first_x, first_y)
    second_scores = [row[1] for row in first_model.predict_proba(second_x)]
    second_model.fit(second_x, second_y)
    first_scores = [row[1] for row in second_model.predict_proba(first_x)]

    # Re-assemble in input order and wrap every score in its own list.
    return [[p] for p in np.hstack((first_scores, second_scores))]

In [3]:
def updata_important_l1(features=None, path='important_l1.csv'):
    """Increment the per-feature counters stored in a one-row CSV file.

    Parameters
    ----------
    features : iterable of column names to increment. When omitted, the
        notebook-level variable ``important_l1`` is used.
    path : CSV file holding the counters; read and then overwritten.

    The file is written back to the *same* path without the index column,
    so repeated calls accumulate and do not add an extra column on reload.
    """
    if features is None:
        features = important_l1
    counts = pd.read_csv(path)
    for name in features:
        counts[name] += 1
    counts.to_csv(path, index=False)

In [4]:
from sklearn.preprocessing import StandardScaler
def stder(X):
    """Return a standardized copy of DataFrame ``X``.

    A ``StandardScaler`` (mean-centering and unit-variance scaling enabled)
    is fitted on ``X`` and applied to it. The result keeps both the column
    labels and the row index of ``X`` so it stays aligned with other frames
    derived from the same rows.
    """
    scaler = StandardScaler(copy=True, with_mean = True, with_std = True)
    scaler.fit(X)
    return pd.DataFrame(scaler.transform(X), columns = X.columns, index = X.index)

In [5]:
def prepare_training(n,m, data_dir=r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train'):
    """Load training shards ``train_{n}.csv`` .. ``train_{m-1}.csv``.

    Parameters
    ----------
    n, m : shard numbers; files for ``range(n, m)`` are read.
    data_dir : directory containing the shard files. Defaults to the
        location the notebook was originally written against.

    Returns
    -------
    (features, labels)
        ``features`` is every column except the first and the last, with
        missing values replaced by the column mean over the concatenated
        shards, indexed 0..rows-1. ``labels`` is the ``'target'`` column as
        a 1-D NumPy array.
        NOTE(review): the positional slice presumably drops an id column
        and the target column -- confirm against the CSV layout.

    Raises
    ------
    ValueError
        If ``range(n, m)`` is empty.
    """
    if m <= n:
        raise ValueError('prepare_training needs n < m, got n={0}, m={1}'.format(n, m))
    frames = [
        pd.read_csv(os.path.join(data_dir, 'train_{0}.csv'.format(k)))
        for k in range(n, m)
    ]
    # ignore_index renumbers the rows 0..N-1 across all shards.
    training_set = pd.concat(frames, axis = 0, ignore_index = True)
    training_label = training_set['target']
    training_feature = training_set.iloc[:,1:-1]
    # fillna with a Series fills each column with its own mean and returns a
    # new frame, so nothing is written into a slice of training_set.
    training_feature = training_feature.fillna(training_feature.mean())
    return training_feature, training_label.to_numpy()

In [6]:
from sklearn.model_selection import KFold
def cross_validation(data, labels, model, error_function=roc_auc_score,folds=5, **model_args):
    """Shuffled K-fold evaluation of an estimator class.

    Parameters
    ----------
    data : DataFrame of features (rows are selected with ``.iloc``).
    labels : array-like of targets, aligned with ``data``.
    model : estimator class, instantiated once and re-fitted on each fold.
    error_function : ``metric(y_true, y_pred)`` callable.
    folds : number of splits.
    **model_args : forwarded to the estimator constructor.
        ``LogisticRegression`` additionally gets ``max_iter = 20**9``
        unless the caller supplies its own ``max_iter``.

    Returns
    -------
    float
        Mean metric over the folds, rounded to 4 decimals.

    Notes
    -----
    When the metric is ``roc_auc_score`` and the estimator exposes
    ``predict_proba`` for a two-class problem, the positive-class
    probability is scored instead of the hard ``predict`` labels. ROC AUC
    is a ranking metric, and thresholded labels collapse the ranking to two
    levels. Any other metric keeps receiving ``predict`` output.
    """
    kf = KFold(n_splits=folds, random_state=None, shuffle = True)
    y = np.array(labels)

    params = dict(model_args)
    if model == LogisticRegression:
        params.setdefault('max_iter', 20**9)
    M = model(**params)

    wants_scores = error_function is roc_auc_score and hasattr(M, 'predict_proba')
    score = []
    for train_index,test_index in kf.split(data):
        M.fit(np.array(data.iloc[train_index]), y[train_index])
        x_test_set = np.array(data.iloc[test_index])
        y_pred = None
        if wants_scores:
            proba = M.predict_proba(x_test_set)
            if proba.shape[1] == 2:
                y_pred = proba[:, 1]
        if y_pred is None:
            y_pred = M.predict(x_test_set)
        score.append(error_function(y[test_index], y_pred))
        print('.',end=' ')
    return round(sum(score)/len(score),4)

In [7]:
def feature_splitor(important_feature,model = GradientBoostingClassifier):
    """Bucket feature names by leading letter and by trailing statistic tag.

    Every name is placed in up to two buckets:

    * a category bucket chosen by its first character (``D``, ``S``, ``P``,
      ``B``, ``R``); names starting with anything else get no category;
    * exactly one statistic bucket chosen by its last three characters
      (``std``, ``25%``, ``50%``, ``75%``, ``min``, ``max``), falling back
      to ``'mean'`` when none match.

    Parameters
    ----------
    important_feature : iterable of feature names (iterating a DataFrame
        yields its column labels).
    model : unused.

    Returns
    -------
    dict
        Bucket name -> list of feature names, in iteration order.
    """
    by_first_char = {'D': 'delinquency', 'S': 'spend', 'P': 'payment',
                     'B': 'balance', 'R': 'risk'}
    by_suffix = {'std': 'std', '25%': '_25', '75%': '_75', '50%': '_50',
                 'min': '_min', 'max': '_max'}
    bucket_order = ('delinquency', 'spend', 'payment', 'balance', 'risk', 'std',
                    'mean', '_25', '_50', '_75', '_min', '_max')

    feature_sets = {bucket: [] for bucket in bucket_order}
    for name in important_feature:
        category = by_first_char.get(name[0])
        if category is not None:
            feature_sets[category].append(name)
        feature_sets[by_suffix.get(name[-3:], 'mean')].append(name)
    return feature_sets

In [8]:
def l1_predictor(important_feature, label,feature_sets, model = GradientBoostingClassifier):
    """Build one level-1 prediction column per feature group.

    Parameters
    ----------
    important_feature : DataFrame holding every column named in
        ``feature_sets``.
    label : targets aligned with the rows of ``important_feature``.
    feature_sets : mapping of group name -> column names.
    model : estimator class handed to ``level_1_predictor``.

    Returns
    -------
    DataFrame
        One float column per non-empty group, indexed like
        ``important_feature``.

    Notes
    -----
    * ``level_1_predictor`` returns ``[[p], [p], ...]``; it is flattened to
      a 1-D float array so each cell holds a number, not a one-element list.
    * Groups with no columns are skipped, since an estimator cannot be
      fitted on zero features.
    """
    columns = {}
    for group, members in feature_sets.items():
        members = list(members)
        if not members:
            continue
        l1_feature = np.array(important_feature[members])
        scores = level_1_predictor(l1_feature, label, model)
        columns[group] = np.asarray(scores, dtype=float).ravel()
    return pd.DataFrame(columns, index=important_feature.index)

----------------

### preparation

In [11]:
# Load shards for range(4, 6) via prepare_training: a mean-imputed feature
# frame plus the matching label array.
feature_train, label_train = prepare_training(4,6)

In [12]:
# Standardized version of the same features; the unscaled frame is kept so
# both variants can be compared below.
feature_train_std = stder(feature_train)

In [13]:
# Estimator classes (not instances); the helper functions instantiate them.
models = [GradientBoostingClassifier,LogisticRegression,RandomForestClassifier]

### first glance

In [14]:
# Empty result table: one row per model, one column per feature variant.
score = pd.DataFrame(columns = ['std', 'No_std'], index = ['GradientBoostingClassifier', 'LogisticRegression', 'RandomForestClassifier'])

In [None]:
# Cross-validate every model on the standardized features and record the
# score in the 'std' column; the model name is printed as progress.
for x in models:
    score.loc[x.__name__,'std']=cross_validation(feature_train_std, label_train, x)
    print(x.__name__)

. . . . . GradientBoostingClassifier
. 

In [None]:
# Same comparison on the unscaled features, recorded in the 'No_std' column.
for x in models:
    score.loc[x.__name__,'No_std']=cross_validation(feature_train, label_train, x) 

In [None]:
# Display the baseline comparison table.
score

## feature_importance

In [None]:
def get_important_feature(X,Y,mdl = GradientBoostingClassifier):
    """Split the columns of ``X`` by the sign of their permutation importance.

    An ``mdl()`` instance is fitted on ``(X, Y)`` and permutation
    importance is computed on the same data with 10 repeats. The elapsed
    wall-clock time is printed.

    Parameters
    ----------
    X : DataFrame of features.
    Y : targets aligned with ``X``.
    mdl : estimator class, instantiated without arguments.

    Returns
    -------
    (important, unimportant)
        ``important`` holds the columns whose mean importance is > 0 and
        ``unimportant`` those whose mean importance is <= 0.
    """
    started = time.time()

    estimator = mdl()
    estimator.fit(X, Y)
    result = permutation_importance(estimator, X, Y, n_repeats=10)

    importance = pd.Series(result.importances_mean, index=X.columns)
    # Two explicit comparisons (not a negated mask) so a NaN importance lands
    # in neither group.
    helpful_cols = importance[importance > 0].index
    unhelpful_cols = importance[importance <= 0].index
    important_fe = X[helpful_cols]
    unimportant_fe = X[unhelpful_cols]

    print(" %s second " %(time.time()-started))
    return important_fe, unimportant_fe

In [None]:
# Unscaled features split into positive-importance columns and the rest.
impt_fe, unimpt_fe = get_important_feature(feature_train, label_train)

In [None]:
# Same split on the standardized features (a separate model is fitted, so
# the selected columns may differ from the unscaled run).
impt_fe_std, unimpt_fe_std = get_important_feature(feature_train_std, label_train)

### predict with important features only

In [None]:
# Empty result table for the important-features-only runs.
score1 = pd.DataFrame(columns = ['std', 'No_std'], index = ['GradientBoostingClassifier', 'LogisticRegression', 'RandomForestClassifier'])

In [None]:
# Cross-validate each model on the important standardized features.
for x in models:
    score1.loc[x.__name__,'std']=cross_validation(impt_fe_std, label_train, x)
    print(x.__name__)

In [None]:
# Cross-validate each model on the important unscaled features.
for x in models:
    score1.loc[x.__name__,'No_std']=cross_validation(impt_fe, label_train, x)
    print(x.__name__)

In [None]:
# Display the important-features-only comparison table.
score1

###  stacking, create l2 feature

In [None]:
# Group the important column names (iterating a DataFrame yields its column
# labels) by first letter and by 3-character suffix, for each variant.
feature_set = feature_splitor(impt_fe)
feature_set_std = feature_splitor(impt_fe_std)

In [None]:
# One level-1 prediction column per feature group, for the unscaled and the
# standardized variant respectively.
l2_feature = l1_predictor(impt_fe, label_train,feature_set)
l2_feature_std =l1_predictor(impt_fe_std, label_train,feature_set_std)

### predict with l2 feature

In [None]:
# Empty result table for the runs that use only the level-2 features.
score2 = pd.DataFrame(columns = ['std_l2', 'No_std_l2'], index = ['GradientBoostingClassifier', 'LogisticRegression', 'RandomForestClassifier'])

In [None]:
# Cross-validate each model on the level-2 features derived from the
# *standardized* inputs; the 'std_l2' column must be fed l2_feature_std.
for x in models:
    score2.loc[x.__name__,'std_l2']=cross_validation(l2_feature_std, label_train, x)
    print(x.__name__)

In [None]:
# Cross-validate each model on the level-2 features derived from the
# *unscaled* inputs; the 'No_std_l2' column must be fed l2_feature.
for x in models:
    score2.loc[x.__name__,'No_std_l2']=cross_validation(l2_feature, label_train, x)
    print(x.__name__)

In [None]:
# Display the level-2-only comparison table.
score2

### combine important_feature with L2 feature

In [None]:
# Empty result table for the combined (important + level-2) feature runs.
score3 = pd.DataFrame(columns = ['meta', 'meta_std'], index = ['GradientBoostingClassifier', 'LogisticRegression', 'RandomForestClassifier'])

In [None]:
# Column-wise concat (aligned on the row index) of the important features
# with their level-2 columns, for each variant.
meta_feature = pd.concat([impt_fe,l2_feature],axis=1)
meta_feature_std = pd.concat([impt_fe_std,l2_feature_std],axis=1)

In [None]:
# Cross-validate each model on the combined unscaled features.
for x in models:
    score3.loc[x.__name__,'meta']=cross_validation(meta_feature, label_train, x)
    print(x.__name__)

In [None]:
# Cross-validate each model on the combined *standardized* features; the
# 'meta_std' column must be fed meta_feature_std, not meta_feature.
for x in models:
    score3.loc[x.__name__,'meta_std']=cross_validation(meta_feature_std, label_train, x)
    print(x.__name__)

In [None]:
# Display the combined-feature comparison table.
score3

### 9. check feature importance again using logistic regression

In [None]:
# Fit a logistic regression on new_feature and compute its permutation
# importance (10 repeats) on the same data.
# NOTE(review): `new_feature` is only assigned in a later draft cell and
# `label` is not assigned anywhere in the visible notebook, so this cell
# relies on leftover kernel state -- confirm which frames were intended.
# NOTE(review): `.T[0]` presumably selects the single label column of a
# 2-D `label` -- confirm.
LR = LogisticRegression(max_iter = 20**9)
LR.fit(new_feature,  np.array(label).T[0])
feature_importance_LR = permutation_importance(LR, new_feature, np.array(label).T[0], n_repeats=10)

In [None]:
# Mean permutation importance per column of new_feature, as a one-column table.
fi_LR = pd.DataFrame(feature_importance_LR.importances_mean, index = new_feature.columns, columns = ['importance'])

In [None]:
# Display the features whose mean importance is strictly positive.
fi_LR.loc[fi_LR['importance']>0]

In [None]:
# Split new_feature's columns by the sign of their mean importance
# (> 0 versus <= 0).
important_feature_LR = new_feature[fi_LR.loc[fi_LR['importance']>0].index]
unimportant_feature_LR = new_feature[fi_LR.loc[fi_LR['importance']<=0].index]

### 10. create final_feature: use unimportant_feature to produce a new stacking feature and add it to important_feature_LR

In [None]:
# Collapse the unimportant columns into one level-1 prediction column.
# Author's note: GradientBoosting gave better level-1 results here than
# logistic regression.
# NOTE(review): `np_label` is only assigned in a later draft cell, so this
# cell fails on a fresh top-to-bottom run -- confirm the intended labels.
unimportant_feature_LR_st = pd.DataFrame(level_1_predictor(unimportant_feature_LR,np_label.T[0], GradientBoostingClassifier),columns=['unimportant_feature_LR'])

In [None]:
# Important columns plus the single stacked column, joined column-wise
# (pd.concat aligns on the row index).
final_feature = pd.concat([important_feature_LR,unimportant_feature_LR_st], axis=1)

### 11.  Final test

In [None]:
# Hold out 20% of the rows, then cross-validate logistic regression on the
# remaining 80%. Arguments follow cross_validation's signature:
# (data, labels, model, error_function, folds).
# NOTE(review): `label` is not assigned anywhere in the visible notebook.
X_train, X_test, y_train, y_test = train_test_split(final_feature,label, test_size=0.2, random_state=11)
cross_validation(X_train, y_train, LogisticRegression, roc_auc_score, 5)

----------

In [None]:
# Create the important_l1.csv counter file: one row, one zero-initialised
# column per feature, written without the index column.
# NOTE(review): `feature` is not assigned anywhere in the visible notebook.
dic = {}
for x in feature.columns:
    dic[x]=[0]
i = pd.DataFrame(dic)
i.to_csv('important_l1.csv', index=False)

In [None]:
# Increment the counter of every feature named in important_l1 and write the
# table back to the same file without the index column.
# NOTE(review): `important_l1` is not assigned anywhere in the visible notebook.
i = pd.read_csv('important_l1.csv')
for x in important_l1:
    i[x]+=1
i.to_csv('important_l1.csv',index=False)

### experiments and draft code

In [None]:
class fillNa(BaseEstimator, TransformerMixin):
    """Transformer that replaces missing values with per-column means.

    ``fit`` learns the numeric column means of the frame it is given;
    ``transform`` returns a new frame with missing values filled from those
    means. If ``transform`` is called before ``fit``, the means of the frame
    being transformed are used instead.
    """
    def fit(self, X, y = None):
        """Record the numeric column means of ``X``; returns ``self``."""
        self.means_ = X.mean(numeric_only=True)
        return self
    def transform(self, X):
        """Return a copy of ``X`` with NaNs replaced by the column means."""
        means = getattr(self, 'means_', None)
        if means is None:
            means = X.mean(numeric_only=True)
        # fillna with a Series fills each column from the matching label and
        # returns a new frame; the input is left untouched.
        return X.fillna(means)

In [None]:
class get_label(BaseEstimator, TransformerMixin):
    """Transformer that extracts the ``'target'`` entry of its input."""

    def fit(self, X, y = None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X['target']``."""
        target = X['target']
        return target

In [None]:
# Load a single training shard for the class-balance check below.
df = pd.read_csv(r'C:\Users\johnk\OneDrive\Desktop\project\python project\kaggle\amex-default-prediction\new_train\train_2.csv')

In [None]:
# Row counts for target == 1 and target == 0.
positive = df.loc[df['target']==1].shape[0]
negative = df.loc[df['target']==0].shape[0]

In [None]:
# Fraction of rows with target == 1.
rate = positive/(positive+negative)

In [None]:
# Displays 1 - 0.05 / rate.
# NOTE(review): the meaning of the 0.05 constant is not documented -- confirm.
1-0.05/rate

In [None]:
# NumPy copy of the labels.
# NOTE(review): `label` is not assigned anywhere in the visible notebook.
np_label = np.array(label)

In [None]:
# NOTE(review): neither `stacker` nor `feature_sets` is defined at notebook
# level in the visible cells; presumably an earlier version of l1_predictor
# -- confirm.
stack_features = stacker(feature_sets)

In [None]:
# Hold out 20% of the rows and cross-validate every model on the stacked
# features. Uses `stack_features` (the name assigned above) and passes
# arguments in cross_validation's order:
# (data, labels, model, error_function, folds).
# NOTE(review): `label` is not assigned anywhere in the visible notebook.
X_train, X_test, y_train, y_test = train_test_split(stack_features,label, test_size=0.2, random_state=11)
for x in models:
    print(x.__name__,cross_validation(X_train, y_train, x, roc_auc_score, 5))

----

In [None]:
# Display the shape of the first row of the transposed label array.
np.array(label).T[0].shape

In [None]:
# Time a logistic-regression fit on `feature` followed by its permutation
# importance (10 repeats) on the same data.
# NOTE(review): `feature` and `label` are not assigned anywhere in the
# visible notebook.
start_time = time.time()
LR = LogisticRegression(max_iter = 20**9)
LR.fit(feature,  np.array(label).T[0])
feature_importance_LR = permutation_importance(LR, feature, np.array(label).T[0], n_repeats=10)
print(" %s second " %(time.time()-start_time))

In [None]:
# Mean permutation importance per column of `feature`, as a one-column table.
fi_LR = pd.DataFrame(feature_importance_LR.importances_mean, index = feature.columns, columns = ['importance'])

In [None]:
# Columns whose importance is exactly zero according to `fi`.
# NOTE(review): `fi` is not assigned at notebook level in the visible cells
# (it is only a local inside get_important_feature), while the neighbouring
# cells use `fi_LR` with `<= 0` -- confirm which table and test were meant.
unimportant_feature = feature[fi.loc[fi['importance']==0].index]

In [None]:
# Columns of `feature` with strictly positive mean importance in fi_LR.
important_feature_LR = feature[fi_LR.loc[fi_LR['importance']>0].index]

In [None]:
# Columns of `feature` with zero or negative mean importance in fi_LR.
unimportant_feature_LR = feature[fi_LR.loc[fi_LR['importance']<=0].index]

In [None]:
# Hold out 20% of the rows and cross-validate every model on the unimportant
# features only. Arguments follow cross_validation's signature:
# (data, labels, model, error_function, folds).
# NOTE(review): `label` is not assigned anywhere in the visible notebook.
X_train, X_test, y_train, y_test = train_test_split(unimportant_feature,label, test_size=0.2, random_state=11)
for x in models:
    print(x.__name__,cross_validation(X_train, y_train, x, roc_auc_score, 5))

In [None]:
# Time a gradient-boosting cross-validation on the important features only.
# Arguments follow cross_validation's signature:
# (data, labels, model, error_function, folds).
# NOTE(review): `important_feature` and `label` are not assigned anywhere in
# the visible notebook.
start_time = time.time()
X_train, X_test, y_train, y_test = train_test_split(important_feature,label, test_size=0.2, random_state=11)
for x in [GradientBoostingClassifier]:
    print(x.__name__,cross_validation(X_train, y_train, x, roc_auc_score, 5))
print(" %s second " %str(time.time()-start_time))

In [None]:
# Feature groups extended with one extra group holding the unimportant columns.
# NOTE(review): delinquency, spend, ..., _max are not assigned at notebook
# level in the visible cells (they are locals of feature_splitor) -- this
# cell relies on leftover kernel state.
feature_sets_U ={'delinquency':delinquency,'spend':spend,'payment':payment,'balance':balance,'risk':risk,'std':std,'mean':mean, '_25':_25, '_50':_50, '_75':_75,'_min':_min,'_max':_max,'unimportant_feature':unimportant_feature.columns}

In [None]:
# Display the stacked feature table.
stack_features

----

In [None]:
# Rebinds `models` with a different order than the list defined in the
# preparation section (random forest now precedes logistic regression).
models = [GradientBoostingClassifier,RandomForestClassifier,LogisticRegression]

In [None]:
# Column-wise concat of the important, stacked and unimportant features.
new_feature = pd.concat([important_feature,stack_features, unimportant_feature],axis=1)

In [None]:
# Hold out 20% of the rows and cross-validate every model on new_feature.
# Arguments follow cross_validation's signature:
# (data, labels, model, error_function, folds).
# NOTE(review): `label` is not assigned anywhere in the visible notebook.
X_train, X_test, y_train, y_test = train_test_split(new_feature,label, test_size=0.2, random_state=11)
for x in models:
    print(x.__name__,cross_validation(X_train, y_train, x, roc_auc_score, 5))

In [None]:
# Fit a logistic regression on new_feature and compute its permutation
# importance (10 repeats) on the same data.
# NOTE(review): `label` is not assigned anywhere in the visible notebook.
LR = LogisticRegression(max_iter = 20**9)
LR.fit(new_feature,  np.array(label).T[0])
feature_importance_LR = permutation_importance(LR, new_feature, np.array(label).T[0], n_repeats=10)

In [None]:
# Mean permutation importance per column of new_feature, as a one-column table.
fi_LR = pd.DataFrame(feature_importance_LR.importances_mean, index = new_feature.columns, columns = ['importance'])

In [None]:
# Keep only the columns of new_feature with strictly positive mean importance.
better_feature = new_feature[fi_LR.loc[fi_LR['importance']>0].index]

In [None]:
# Hold out 20% of the rows and cross-validate every model on better_feature.
# Arguments follow cross_validation's signature:
# (data, labels, model, error_function, folds).
# NOTE(review): `label` is not assigned anywhere in the visible notebook.
X_train, X_test, y_train, y_test = train_test_split(better_feature,label, test_size=0.2, random_state=11)
for x in models:
    print(x.__name__,cross_validation(X_train, y_train, x, roc_auc_score, 5))

--------

In [None]:
# Display the features whose mean importance is strictly positive.
fi_LR.loc[fi_LR['importance']>0]