In [131]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

app_train = pd.read_csv('application_train.csv')
app_test = pd.read_csv('application_test.csv')
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
pos_cash_balance = pd.read_csv('POS_CASH_balance.csv')
credit_card_balance = pd.read_csv('credit_card_balance.csv')
previous_application = pd.read_csv('previous_application.csv')
installments_payments = pd.read_csv('installments_payments.csv')

In [168]:
app_train.shape

(307511, 123)

In [144]:
#imbalance
app_train['TARGET'].shape[0] - np.sum(app_train['TARGET'])

282686

In [132]:
#pre-process
all_numerical_cols = list(app_train.select_dtypes(exclude='object').columns)
cont_cols = [col for col in all_numerical_cols if col != "TARGET" and col[:5]!='FLAG_']

app_train = app_train.drop(columns=['CNT_FAM_MEMBERS','LIVE_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_WORK_REGION', 'OBS_60_CNT_SOCIAL_CIRCLE'])
app_test = app_test.drop(columns=['CNT_FAM_MEMBERS','LIVE_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_WORK_REGION', 'OBS_60_CNT_SOCIAL_CIRCLE'])
cols_to_remove = ['AMT_CREDIT', 'CNT_FAM_MEMBERS', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'OBS_60_CNT_SOCIAL_CIRCLE','SK_ID_CURR']
cont_cols = list(set(cont_cols) - set(cols_to_remove))
cont_cols

['LIVINGAREA_MODE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'ELEVATORS_MODE',
 'DAYS_EMPLOYED',
 'EXT_SOURCE_3',
 'FLOORSMIN_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'ELEVATORS_MEDI',
 'REG_REGION_NOT_LIVE_REGION',
 'COMMONAREA_MEDI',
 'OWN_CAR_AGE',
 'NONLIVINGAREA_MEDI',
 'AMT_GOODS_PRICE',
 'ENTRANCES_AVG',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'ELEVATORS_AVG',
 'LIVINGAREA_AVG',
 'LANDAREA_AVG',
 'AMT_ANNUITY',
 'LIVINGAPARTMENTS_MEDI',
 'YEARS_BUILD_MEDI',
 'TOTALAREA_MODE',
 'LANDAREA_MEDI',
 'EXT_SOURCE_2',
 'NONLIVINGAPARTMENTS_MODE',
 'HOUR_APPR_PROCESS_START',
 'LIVINGAPARTMENTS_MODE',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'FLOORSMIN_AVG',
 'ENTRANCES_MODE',
 'YEARS_BEGINEXPLUATATION_AVG',
 'EXT_SOURCE_1',
 'NONLIVINGAREA_AVG',
 'REG_CITY_NOT_WORK_CITY',
 'DAYS_LAST_PHONE_CHANGE',
 'COMMONAREA_AVG',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'CNT_CHILDREN',
 'APARTMENTS_MEDI',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'FLOORSMAX_AVG',
 'LIVE_CITY_NOT_WORK_CITY',
 'FLOORS

In [167]:
len(cont_cols)

73

In [133]:

app_train['LTV'] = app_train['AMT_CREDIT']/app_train['AMT_GOODS_PRICE']
app_train['DTI'] = app_train['AMT_ANNUITY']/app_train['AMT_INCOME_TOTAL']
app_train['Employed/Birth'] = app_train['DAYS_EMPLOYED']/app_train['DAYS_BIRTH'] 
app_train['Flag_Greater_30'] = (app_train['DAYS_BIRTH']/-365.25).apply(lambda x: 1 if x > 30 else 0)
app_train['Flag_Employment_Greater_5'] = (app_train['DAYS_EMPLOYED']/-365.25).apply(lambda x: 1 if x > 5 else 0)

# #用不上的
# app_test['LTV'] = app_test['AMT_CREDIT']/app_test['AMT_GOODS_PRICE']
# app_test['DTI'] = app_test['AMT_ANNUITY']/app_test['AMT_INCOME_TOTAL']
# app_test['Employed/Birth'] = app_test['DAYS_EMPLOYED']/app_test['DAYS_BIRTH']
# app_test['Flag_Greater_30'] = (app_test['DAYS_BIRTH']/-365.25).apply(lambda x: 1 if x > 30 else 0)
# app_test['Flag_Employment_Greater_5'] = (app_test['DAYS_EMPLOYED']/-365.25).apply(lambda x: 1 if x > 5 else 0) 

In [134]:
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder 
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix

#set X1 and Y1 and try to train
cat_col = app_train.select_dtypes('object')
enc = TargetEncoder()
app_train[cat_col.columns] = enc.fit_transform(app_train[cat_col.columns], app_train['TARGET'])

In [135]:
#fill na
k = 0
for i in cat_col:
    app_train[i].fillna(0, inplace=True)
for i in cont_cols:
    app_train[i].fillna(app_train[i].mean(), inplace=True)
    sc = StandardScaler().fit(app_train[[i]])
    app_train[i] = sc.transform(app_train[[i]])
for i in app_train.columns:
    if max(app_train[i])>5 and i not in cat_col:
        sc = StandardScaler().fit(app_train[[i]])
        app_train[i] = sc.transform(app_train[[i]])
print(app_train.shape)

(307511, 123)


In [136]:
def posi_extend(X, y):
    more_posi = (int) ((y.shape[0] - np.sum(y))*2)
    posi_idx = [i for i in range(len(y)) if y.iloc[i] == 1]
    indices = np.random.choice(posi_idx, size=more_posi, replace=True)
    X_ex = X.iloc[indices]
    y_ex = y.iloc[indices]
    X = pd.concat([X, X_ex], axis=0)
    y = pd.concat([y, y_ex], axis=0)
    return X, y

train1 = app_train.loc[:190000,:].dropna(axis=0)
mask = np.isinf(train1)
train1[mask] = np.nan
train1 = train1.dropna(axis=0)

train2 = app_train.loc[190000:260000,:].dropna(axis=0)
mask = np.isinf(train2)
train2[mask] = np.nan
train2 = train2.dropna(axis=0)
print(train1.shape, train2.shape)

Y1 = train1.iloc[:,1]
X1 = train1.drop(["TARGET", "SK_ID_CURR"],axis=1)
Y2 = train2.iloc[:,1]
X2 = train2.drop(["TARGET", "SK_ID_CURR"],axis=1)

X1, Y1 = posi_extend(X1, Y1)
X2, Y2 = posi_extend(X2, Y2)
# print(app_test.head())
# test = app_test.loc[:,vs].dropna(axis=0)
# print(test.head(10))
# Y_test = test.iloc[:,0]
# X_test = test.drop(["TARGET"],axis=1)
# print(X.shape, X_test.shape)
print(X1.shape, X2.shape)

#final test set, imbalanced
test_data = app_train.loc[260000:,:].dropna(axis=0)
YTest = test_data.iloc[:,1]
XTest = test_data.drop(["TARGET", "SK_ID_CURR"],axis=1)
print(XTest.shape)

(189839, 123) (69927, 123)
(538685, 121) (198509, 121)
(47457, 121)


In [170]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

#m1
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3, random_state=42)
print(X_train.shape)
# Create a decision tree classifier object
clf = DecisionTreeClassifier(random_state=42, max_depth=15) #0.63
#clf = RandomForestClassifier(random_state=42, max_depth=15) #40k imbalanced test: 0.7282299442204214 0.2274404834326966
#clf = MLPClassifier(max_iter=100, hidden_layer_sizes=(512,128,32,32,32,16),alpha=0.1,learning_rate=0.001) #imbalanced test: AUC 0.662085296871909, F1 0.216791319919941
clf.fit(X_train, y_train)
print("done fit")
# Predict using the testing data
y_pred = clf.predict_proba(X_test)
# #m2
# # Create a neural network classifier object
# nn = MLPClassifier(hidden_layer_sizes=(50, ), max_iter=100)

# # Fit the classifier to the training data
# nn.fit(X_train, y_train)

# # Predict using the testing data
# y_pred_nn = nn.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# accuracy_nn = accuracy_score(y_test, y_pred_nn)
# f1 = f1_score(y_test, y_pred[:,1])
auc = roc_auc_score(y_test, y_pred[:,1])
# auc_nn = roc_auc_score(y_test, y_pred_nn)
print(auc, f1_score(y_test, clf.predict(X_test)))

(377079, 121)
done fit
0.83388966445565 0.8854320332830551


In [171]:
#On imbalanced raw 60k test set (no augmentation)
print(roc_auc_score(YTest, clf.predict_proba(XTest)[:,1]), f1_score(YTest, clf.predict(XTest)))
#on balanced validation set and on imbalanced test set
print(confusion_matrix(y_test,  clf.predict(X_test)))
print(confusion_matrix(YTest, clf.predict(XTest)))

0.6616511219947462 0.1979995619478718
[[ 27120  25210]
 [  2438 106838]]
[[22775 20930]
 [ 1040  2712]]


In [None]:
# # #models weight learner
# import numpy as np

# def linear_regression(X1, Y1, X, y, models, X_test, y_test, alpha=0.01, num_iterations=1000):
    
#     y_hats = []
#     #models train on predictions
#     ms = []
#     X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3, random_state=42)
#     for i in range(len(models)):
#         m = models[i]
#         print(m)
#         m.fit(X_train, y_train)
#         y_hat = m.predict(X)
#         y_hats.append(y_hat)
#         ms.append(m)
#     y_hats = np.array(y_hats)
#     print(np.sum(y_hats[0]), np.sum(y_hats[1]))
    
 
#     X = y_hats.T 
#     print("after transpose, the input shape is: " + str(X.shape))
#     X = np.hstack((np.ones((X.shape[0], 1)), X))
#     print("X's value after stacking constant:")
#     print(X)
#     theta = np.zeros(X.shape[1])

#     # Define the cost function
#     def compute_cost(X, y, theta):
#         m = len(y)
#         # X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
#         z = X.dot(theta)
#         h = 1 / (1 + np.exp(-z))

#         # Compute the cost function
#         J = -1 / len(y) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
#         return J

#     # Define the gradient descent function
#     def gradient_descent(X, y, theta, alpha, num_iterations):
#         m = len(y)
#         J_history = np.zeros(num_iterations)
#         for i in range(num_iterations):
#             h = X.dot(theta)
#             theta = theta - alpha*(1/m)*X.T.dot(h-y)
#             J_history[i] = compute_cost(X, y, theta)
#         return theta
    
#     def lr_predict(input, theta, ms):
#         #input原始数据
#         y_hats = []
#         for m in ms:
#             y_hats.append(m.predict(input))
#         y_hats = np.array(y_hats)
#         print(y_hats.shape)
#         X = y_hats.T 
#         X = np.hstack((np.ones((X.shape[0], 1)), X))
        
#         print(X.shape)
#         print(theta.shape)
#         # Compute the hypothesis
#         z = X.dot(theta)
#         h = 1 / (1 + np.exp(-z))
#         # Threshold the probabilities to get the predictions
#         y_pred = (h > 0.5).astype(int)
#         return y_pred.ravel()

#     # Run gradient descent
#     theta_star = gradient_descent(X, y, theta, alpha, num_iterations)

#     # Predict using the testing data
#     y_pred = lr_predict(X_test, theta_star, ms)
    
#     return accuracy_score(y_test,y_pred), f1_score(y_test,y_pred), roc_auc_score(y_test, y_pred)



In [None]:
# #model weights learner
# models = [DecisionTreeClassifier(random_state=42), MLPClassifier(hidden_layer_sizes=(50, ), max_iter=100)]
# X2 = train2.drop(["TARGET"],axis=1)
# Y2 = train2.iloc[:,0]
# X2_train, X2_test, y2_train, y2_test = train_test_split(X2,Y2, test_size=0.3, random_state=42)

# score = linear_regression(X, Y, X2_train, y2_train, models, X2_test, y2_test)
# print(score)


DecisionTreeClassifier(random_state=42)
MLPClassifier(hidden_layer_sizes=(50,), max_iter=100)
4493 3
after transpose, the input shape is: (75174, 2)
X's value after stacking constant:
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
(2, 59949)
(59949, 3)
(3,)
(0.08073529166458156, 0.14940807853184954, 0.5)


In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix

def linear_regression(X1, Y1, X2, y2, models, XTest, YTest, alpha=0.01, num_iterations=1000):
    
    y_hats = []
    #models train on predictions
    ms = []
    X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3, random_state=42)
    for i in range(len(models)):
        m = models[i]
        print(m)
        m.fit(X_train, y_train)
        print(roc_auc_score(y2, m.predict_proba(X2)[:,1]))
        y_hat = m.predict_proba(X2)[:,1]
        
        
        y_hats.append(y_hat)
        ms.append(m)
    y_hats = np.array(y_hats)
    lr = LogisticRegression()
    lr.fit(y_hats.T , y2)
    #检查回归方式是不是有问题
    print(lr.coef_)
    print(np.sqrt(mean_squared_error(y2, lr.predict(y_hats.T))))

    #pred
    input = []
    for m in ms:
        input.append(m.predict_proba(XTest)[:,1])
    input = np.array(input)
    y_pred = lr.predict_proba(input.T)[:,1]
    print(np.sum(lr.predict(input.T)))
    print(np.sum(YTest))

    return roc_auc_score(YTest, y_pred), f1_score(YTest, lr.predict(input.T)), accuracy_score(YTest,lr.predict(input.T)), confusion_matrix(YTest, lr.predict(input.T))

def lasso_reg(X1, Y1, X2, y2, models, X_test, y_test):
    y_hats = []
    #models train on predictions
    ms = []
    X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3, random_state=42)
    for i in range(len(models)):
        m = models[i]
        print(m)
        m.fit(X_train, y_train)
        y_hat = m.predict(X2)
        print(np.sum(y_hat))
        y_hats.append(y_hat)
        ms.append(m)
    y_hats = np.array(y_hats)
    lr = Lasso(alpha=0.1)
    lr.fit(y_hats.T , y2)

    #pred
    input = []
    for m in ms:
        input.append(m.predict(X_test))
    input = np.array(input)
    # z = lr.predict(input.T)
    # h = 1 / (1 + np.exp(-z))
    # y_pred = (h > 0.5).astype(int)
    y_pred = lr.predict_proba(input.T)
    return balanced_accuracy_score(y_test,y_pred), f1_score(y_test,y_pred), roc_auc_score(y_test, y_pred), confusion_matrix(y_test, y_pred)




In [199]:
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB


# Load the digits dataset
other_train_X = pd.read_csv("/Users/tq/Downloads/r/other/train_binary.csv")
other_train_y = pd.read_csv("/Users/tq/Downloads/r/other/y_train_binary.csv")
sp = (int)(len(other_train_X)*0.7)
X1 = other_train_X.iloc[:sp,:]
Y1 = other_train_y.iloc[:sp,:]
X2 = other_train_X.iloc[:sp,:]
Y2 = other_train_y.iloc[:sp,:]

X_test = pd.read_csv("/Users/tq/Downloads/r/other/test_binary.csv")
Y_test = pd.read_csv("/Users/tq/Downloads/r/other/y_test_binary.csv")

models = [DecisionTreeClassifier(random_state=42),Perceptron(random_state=42),GaussianNB()]
score = linear_regression(X1, Y1, X2, Y2, models, X_test, y_test)
print(score)
#(0.9655362897038515, 0.9585798816568047, 0.9655362897038514)
#DT (0.8940309958569894, 0.861764705882353, 0.8940309958569894)
#Percep (0.9617768912076109, 0.9515418502202644, 0.9617768912076108)
#NN (0.9699555009973915, 0.9644970414201184, 0.9699555009973914)
#DT + P 0.9617768912076109, 0.9515418502202644, 0.9617768912076108)

DecisionTreeClassifier(random_state=42)
1.0
Perceptron(random_state=42)


  y = column_or_1d(y, warn=True)


AttributeError: 'Perceptron' object has no attribute 'predict_proba'

In [106]:
#check the distribution of positives 
np.unique(Y2, return_counts=True) #Ok to go

X2_train, X2_test, y2_train, y2_test = train_test_split(X2,Y2, test_size=0.3, random_state=42)
np.unique(y2_test, return_counts=True)

(array([0, 1]), array([29535, 32326]))

In [83]:
! python3 -m pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 7.8 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [130]:
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost
from xgboost.sklearn import XGBClassifier

models = [DecisionTreeClassifier(random_state=42,max_depth=5),
          XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100, seed=42),
          DecisionTreeClassifier(random_state=12,max_depth=5),
          XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100, seed=123),
          RandomForestClassifier(random_state=42, max_depth=15),
          XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=100, seed=34),
          DecisionTreeClassifier(random_state=42,max_depth=5,criterion='entropy'),
          DecisionTreeClassifier(random_state=123,max_depth=5,max_features=50),
          DecisionTreeClassifier(random_state=42,max_depth=5,max_features=50,min_samples_leaf=50),
          DecisionTreeClassifier(random_state=333,max_depth=5,min_samples_leaf=50),
          DecisionTreeClassifier(random_state=42,max_depth=15,criterion='gini'),
          DecisionTreeClassifier(random_state=1,max_depth=5,max_features=10),
          DecisionTreeClassifier(random_state=12,max_depth=5,min_samples_leaf=100),
          DecisionTreeClassifier(random_state=42,max_depth=15,min_samples_leaf=50),
          DecisionTreeClassifier(random_state=123,max_depth=15,criterion='entropy'),
          DecisionTreeClassifier(random_state=42,max_depth=5,criterion='gini'),
          DecisionTreeClassifier(random_state=42,max_depth=5,criterion='log_loss'),
          DecisionTreeClassifier(random_state=42,max_depth=5,criterion='entropy'),
          DecisionTreeClassifier(random_state=42,max_depth=10),
          DecisionTreeClassifier(random_state=42,max_depth=5,criterion='entropy'),
          DecisionTreeClassifier(random_state=42,max_depth=15,min_samples_leaf=200),
          DecisionTreeClassifier(random_state=123,max_depth=5, criterion='entropy'),
          DecisionTreeClassifier(random_state=42,max_depth=5,max_features=50),
          DecisionTreeClassifier(random_state=123,max_depth=5,min_samples_split=50),
          DecisionTreeClassifier(random_state=42,max_depth=5,max_features=10),
          DecisionTreeClassifier(random_state=42,max_depth=5,criterion='log_loss', min_samples_split=10),
          DecisionTreeClassifier(random_state=123,max_depth=5,criterion='entropy'),
          DecisionTreeClassifier(random_state=42,max_depth=5,min_samples_leaf=100),
          DecisionTreeClassifier(random_state=42,max_depth=5),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(20,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(100,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(5,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(5,5)),
          MLPClassifier(hidden_layer_sizes=(10,10,1)),
          MLPClassifier(hidden_layer_sizes=(5,10,10,1)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(5,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(30,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(30,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(80,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
          MLPClassifier(hidden_layer_sizes=(200,)),
          MLPClassifier(hidden_layer_sizes=(10,)),
]
score = linear_regression(X1, Y1, X2, Y2, models, XTest, YTest)
print(score)

DecisionTreeClassifier(max_depth=5, random_state=42)
0.7126845428802702
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
0.757446129595033
DecisionTreeClassifier(max_depth=5, random_state=12)
0.7126845428802702
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylev



0.6848982334429878
MLPClassifier(hidden_layer_sizes=(10,))
0.7428590011058929
MLPClassifier(hidden_layer_sizes=(10,))
0.742271303826304
MLPClassifier(hidden_layer_sizes=(5,))
0.7481371492652447
MLPClassifier(hidden_layer_sizes=(10,))
0.7468735401787696
MLPClassifier(hidden_layer_sizes=(10,))
0.7426437344058363
MLPClassifier(hidden_layer_sizes=(5, 5))
0.7464214834340681
MLPClassifier(hidden_layer_sizes=(10, 10, 1))
0.739112185907542
MLPClassifier(hidden_layer_sizes=(5, 10, 10, 1))
0.7407038251393572
MLPClassifier(hidden_layer_sizes=(10,))
0.7452051604429887
MLPClassifier(hidden_layer_sizes=(10,))
0.7429389730477013
MLPClassifier(hidden_layer_sizes=(5,))
0.7461844804070284
MLPClassifier(hidden_layer_sizes=(10,))
0.743167009237893
MLPClassifier(hidden_layer_sizes=(30,))
0.7287523972667147
MLPClassifier(hidden_layer_sizes=(10,))
0.7415399641020053
MLPClassifier(hidden_layer_sizes=(30,))




0.7276411837008769
MLPClassifier(hidden_layer_sizes=(10,))
0.7440157595387369
MLPClassifier(hidden_layer_sizes=(10,))
0.7428163657007336
MLPClassifier(hidden_layer_sizes=(80,))




0.6937879094940456
MLPClassifier(hidden_layer_sizes=(10,))
0.7432178158266106
MLPClassifier(hidden_layer_sizes=(200,))




0.6634398470388071
MLPClassifier(hidden_layer_sizes=(10,))
0.7412447782886717
[[ 0.02729774  1.39394647  0.02729774  1.39394647 -0.83746658  0.74584803
   0.01177244 -0.54611626  0.08093013  0.02729774 -0.08324886  0.28641155
   0.02729774 -0.0762537  -0.01481117  0.02729774  0.01177244  0.01177244
   0.02064224  0.01177244  0.13539455  0.01177244  0.1033838   0.02729774
  -0.06416864  0.01177244  0.01177244  0.02729774  0.02729774  0.0601474
   0.21374377  0.27171509  0.02948591  0.17730095 -0.29776511  0.59630169
   0.59157335  0.04217116  0.61357489  0.14995234 -0.02118433  0.3252643
  -0.03419235 -0.12859908 -0.14447747  0.15821255 -0.23107029  0.12328013
   0.35055679 -0.25487153  0.04130753  0.02699131  0.05383864 -0.66544115]]
0.552372662494147
17200
3752
(0.7583002583955376, 0.25496372661321115, 0.6710706534336347, array([[29176, 14529],
       [ 1081,  2671]]))


In [126]:
print(YTest.shape)

(47457,)


In [149]:
#aggregates previous.csv
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,LTV,DTI,Employed/Birth,Flag_Greater_30,Flag_Employment_Greater_5
0,-1.733423,1,0.083459,0.101419,0.085002,0.079616,-0.577538,0.142129,-0.478095,-0.166149,...,-0.06795715,-0.1805048,-0.313873,-0.3594746,-0.5176655,0.285399,0.121978,0.067329,0,0
1,-1.733413,0,0.083459,0.069993,0.085002,0.083249,-0.577538,0.426792,1.72545,0.592677,...,-0.06795715,-0.1805048,-0.313873,-0.3594746,-1.092866,0.179,0.132217,0.070862,1,0
2,-1.733403,0,0.054783,0.101419,0.072437,0.079616,-0.577538,-0.427196,-1.152888,-1.404676,...,-0.06795715,-0.1805048,-0.313873,-0.3594746,-1.092866,-0.991538,0.1,0.011814,1,0
3,-1.733384,0,0.083459,0.069993,0.085002,0.079616,-0.577538,-0.142533,-0.71143,0.177869,...,-2.920631e-17,2.207782e-17,-1.5019070000000003e-17,-4.456258e-17,-5.1724180000000005e-17,-0.565861,0.2199,0.159905,1,1
4,-1.733374,0,0.083459,0.101419,0.085002,0.079616,-0.577538,-0.199466,-0.213734,-0.361755,...,-0.06795715,-0.1805048,-0.313873,-0.3594746,-1.092866,-0.991538,0.179963,0.152418,1,1


0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_CREDIT,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,NAME_CONTRACT_STATUS,DAYS_DECISION,DAYS_LAST_DUE,DAYS_TERMINATION,TOTAL_ANNUITY,TOTAL_APPLICATION
0,1369693,100001,Consumer loans,3951.0,23787.0,FRIDAY,13,Approved,-1740,-1619.0,-1612.0,3951.000,23787.0
0,1038818,100002,Consumer loans,9251.775,179055.0,SATURDAY,9,Approved,-606,-25.0,-17.0,9251.775,179055.0
0,1810518,100003,Cash loans,98356.995,1035882.0,FRIDAY,12,Approved,-746,-536.0,-527.0,169661.970,1452573.0
0,1564014,100004,Consumer loans,5357.25,20106.0,FRIDAY,5,Approved,-815,-724.0,-714.0,5357.250,20106.0
0,1857999,100005,Cash loans,,0.0,FRIDAY,10,Canceled,-315,,,4813.200,40153.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2248017,456251,Consumer loans,6605.91,40455.0,THURSDAY,17,Approved,-273,-30.0,-25.0,6605.910,40455.0
0,1503599,456252,Consumer loans,10074.465,56821.5,WEDNESDAY,10,Approved,-2497,-2316.0,-2311.0,10074.465,56821.5
0,1686207,456253,Consumer loans,5567.715,27306.0,SATURDAY,12,Approved,-1909,-1716.0,-1712.0,9540.810,41251.5
0,2016407,456254,Consumer loans,19065.825,247423.5,SATURDAY,18,Approved,-277,365243.0,365243.0,21362.265,268879.5
