In [60]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import xgboost as xgb
# check version number
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
import matplotlib.pyplot as plt

### Data Sources

In [5]:
app_train = pd.read_csv('../../Data/Processed/app_train_processed.csv')
app_test = pd.read_csv('../../Data/Processed/app_test_processed.csv')
bureau = pd.read_csv('../../Data/Processed/bureau_data.csv')
installments = pd.read_csv('../../Data/Processed/installments.csv')
app_test.drop(columns = 'Unnamed: 0', inplace = True)
app_train.drop(columns = 'Unnamed: 0', inplace = True)
installments.drop(columns = 'Unnamed: 0', inplace = True)

In [6]:
app_train.EXT_SOURCE_1.fillna(app_train.EXT_SOURCE_1.median(), inplace = True)
app_test.EXT_SOURCE_1.fillna(app_train.EXT_SOURCE_1.median(), inplace = True)
data = app_train.merge(bureau, on = 'SK_ID_CURR', how = 'left')
data = data.merge(installments, on = 'SK_ID_CURR', how = 'left')
data.fillna(0, inplace = True)
data_test = app_test.merge(bureau, on = 'SK_ID_CURR', how = 'left')
data_test = data_test.merge(installments, on = 'SK_ID_CURR', how = 'left')
data_test.fillna(0, inplace = True)
data.drop(columns = [ 'SK_ID_CURR'], inplace = True)
data_test.drop(columns = ['SK_ID_CURR'], inplace = True)

### Train Test Split

In [7]:
y = data['TARGET']
col = list(data.columns)
col.remove('TARGET')
X = data[col]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)

In [9]:
X_train.columns

Index(['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION',
       ...
       'Other', 'DPD', 'MAX_Over_due', 'Current_balance', 'Current_debt',
       'current_overdue', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL',
       'AMT_PAYMENT_CURRENT', 'AMT_TOTAL_RECEIVABLE'],
      dtype='object', length=109)

In [10]:
corr_matrix = X_train.corr().abs()

#the matrix is symmetric so we need to extract upper triangle matrix without diagonal (k = 1)

sol = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
                  .stack()
                  .sort_values(ascending=False))


In [11]:
drop_cols = [ 'CODE_GENDER_F', 'AMT_TOTAL_RECEIVABLE', 'Government', 'OBS_30_CNT_SOCIAL_CIRCLE',
            'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE_Unaccompanied', 'REGION_RATING_CLIENT_W_CITY', 'Number_of_closed_loans',
            'NAME_EDUCATION_TYPE_Secondary / secondary special', 'CNT_CHILDREN', 'DEF_60_CNT_SOCIAL_CIRCLE', 'LIVE_REGION_NOT_WORK_REGION',
            'AMT_ANNUITY', 'AMT_CREDIT_LIMIT_ACTUAL', 'NAME_HOUSING_TYPE_With parents', 'ORGANIZATION_TYPE_XNA', 
             'REGION_RATING_CLIENT_W_CITY' ]

X_train.drop(columns = drop_cols, inplace = True)
X_test.drop(columns = drop_cols, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246008 entries, 181648 to 285162
Data columns (total 93 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   NAME_CONTRACT_TYPE                     246008 non-null  int64  
 1   FLAG_OWN_CAR                           246008 non-null  int64  
 2   FLAG_OWN_REALTY                        246008 non-null  int64  
 3   AMT_INCOME_TOTAL                       246008 non-null  float64
 4   AMT_CREDIT                             246008 non-null  float64
 5   REGION_POPULATION_RELATIVE             246008 non-null  float64
 6   DAYS_REGISTRATION                      246008 non-null  float64
 7   DAYS_ID_PUBLISH                        246008 non-null  int64  
 8   FLAG_EMP_PHONE                         246008 non-null  int64  
 9   FLAG_CONT_MOBILE                       246008 non-null  int64  
 10  FLAG_EMAIL                             246008 non-n

In [None]:
def run_xgb_cv(X, y):
    clf = xgb.XGBClassifier()
    
    params = {'max_depth' : (4, 8, 12) , 'n_estimators' :  (400, 600, 800), 'gamma' : [2], 'subsample': [0.7], 'scale_pos_weight': [10], 'alpha' : [1], 'lambda': [0]}
    cv = GridSearchCV(clf, params, cv = 5)
    cv.fit(X, y)
    return cv

In [None]:
def run_xgb(X, y, params):
    clf = xgb.XGBClassifier(**params)
    clf.fit(X, y)
    return clf

In [None]:
cv = run_xgb_cv(X_train, y_train)

In [None]:
cv.best_params_

In [None]:
params = cv.best_params_

In [None]:
params = {'alpha': 1, 'gamma': 2, 'lambda': 0, 'max_depth': 6, 'n_estimators': 700, 'scale_pos_weight': 10, 'subsample': 0.7}
model = run_xgb(X_train, y_train, params)

In [None]:
y_pred = model.predict(X_test)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
y_pred = model.predict(X_train)

In [None]:

confusion_matrix(y_train, y_pred)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
importance = list(X_train.columns).append(model.feature_importances_)

In [None]:
list(model.feature_importances_).append(list(X_train.columns))

In [None]:
l = list(model.feature_importances_)

In [None]:
c = list(X_train.columns)

In [None]:
out = []
for i,j in zip(l, c):
    out.append([j, i])
    

In [None]:
for i in sorted(out,key = lambda x: x[1], reverse = True):
    print(i)

In [None]:

plt.bar(X_train.columns, model.feature_importances_)
plt.show()

In [None]:
corr_matrix = X_train.corr().abs()

#the matrix is symmetric so we need to extract upper triangle matrix without diagonal (k = 1)

sol = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
                  .stack()
                  .sort_values(ascending=False))


In [None]:
sol[30:40]

In [None]:
drop_cols = ['Unnamed: 0_x', 'SK_ID_CURR', 'CODE_GENDER_F', 'AMT_TOTAL_RECEIVABLE', 'Government', 'OBS_30_CNT_SOCIAL_CIRCLE',
            'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE_Unaccompanied', 'REGION_RATING_CLIENT_W_CITY', 'Number_of_closed_loans',
            'NAME_EDUCATION_TYPE_Secondary / secondary special', 'CNT_CHILDREN', 'DEF_60_CNT_SOCIAL_CIRCLE', 'LIVE_REGION_NOT_WORK_REGION',
            'AMT_ANNUITY', 'AMT_CREDIT_LIMIT_ACTUAL', 'NAME_HOUSING_TYPE_With parents', 'ORGANIZATION_TYPE_XNA', 'Unnamed: 0_y', 
             'REGION_RATING_CLIENT_W_CITY' ]

In [None]:
X_train.drop(columns = drop_cols, inplace = True)

In [None]:
data_test.drop(columns = drop_cols, inplace =True )

In [None]:
X_test.drop(columns = drop_cols, inplace = True)

In [None]:
pca = PCA(n)
pca.fit(X_train)

In [None]:
pca.explained_variance_ratio_

In [None]:
def get_scores(X_train, y_train, X_test, y_test, model):
    ypred = model.predict(X_train)
    print(classification_report(y_train, ypred))
    ypred = model.predict(X_test)
    print(classification_report(y_test, ypred))
    

In [None]:
pca_train = pca.transform(X_train)
pca_test = pca.transform(X_test)

In [None]:
params = {'alpha': 1, 'gamma': 2, 'lambda': 0, 'max_depth': 6, 'n_estimators': 700, 'scale_pos_weight': 10, 'subsample': 0.7}




In [None]:

model = run_xgb(pca_train[:,:6], y_train, params)
get_scores(pca_train[:,:6], y_train, pca_test[:,:6], y_test, model)

In [None]:

model = run_xgb(pca_train[:,:15], y_train, params)
get_scores(pca_train[:,:15], y_train, pca_test[:,:15], y_test, model)

In [None]:

model = run_xgb(pca_train[:,:25], y_train, params)
get_scores(pca_train[:,:25], y_train, pca_test[:,:25], y_test, model)

In [None]:
get_scores(pca_train[:,:25], y_train, pca_test[:,:25], y_test, model)

In [None]:
X_train.columns


In [None]:
cat = ['CODE_GENDER_M','NAME_INCOME_TYPE_Businessman','NAME_INCOME_TYPE_Student','NAME_INCOME_TYPE_Unemployed','NAME_EDUCATION_TYPE_Academic degree','NAME_EDUCATION_TYPE_Higher education','NAME_EDUCATION_TYPE_Incomplete higher','NAME_EDUCATION_TYPE_Lower secondary','NAME_HOUSING_TYPE_Co-op apartment','NAME_HOUSING_TYPE_House / apartment','NAME_HOUSING_TYPE_Municipal apartment','NAME_HOUSING_TYPE_Office apartment','NAME_HOUSING_TYPE_Rented apartment','ORGANIZATION_TYPE_Advertising','ORGANIZATION_TYPE_Agriculture','ORGANIZATION_TYPE_Bank','ORGANIZATION_TYPE_Construction','ORGANIZATION_TYPE_Culture','ORGANIZATION_TYPE_Insurance','ORGANIZATION_TYPE_Legal Services','ORGANIZATION_TYPE_Medicine','ORGANIZATION_TYPE_Mobile','ORGANIZATION_TYPE_Other','ORGANIZATION_TYPE_Police','ORGANIZATION_TYPE_Realtor','ORGANIZATION_TYPE_Religion','ORGANIZATION_TYPE_Restaurant','ORGANIZATION_TYPE_Self-employed','ORGANIZATION_TYPE_Services','ORGANIZATION_TYPE_Telecom','ORGANIZATION_TYPE_University','transport','trade','industry','Businesss','white_collar','services','occ_low_level','occ_medium_level','occ_high_leve','occ_missing','accompanied','Working ','Married','Consumer_credit','Credit_card','Micro_loan','Mortgage','Other','FLAG_EMP_PHONE','FLAG_CONT_MOBILE','FLAG_EMAIL','FLAG_DOCUMENT_3','FLAG_DOCUMENT_6']
cont = ['NAME_CONTRACT_TYPE','FLAG_OWN_CAR','FLAG_OWN_REALTY','AMT_INCOME_TOTAL','AMT_CREDIT','REGION_POPULATION_RELATIVE','DAYS_REGISTRATION','DAYS_ID_PUBLISH','CNT_FAM_MEMBERS','REGION_RATING_CLIENT','HOUR_APPR_PROCESS_START','REG_REGION_NOT_LIVE_REGION','REG_REGION_NOT_WORK_REGION','REG_CITY_NOT_LIVE_CITY','REG_CITY_NOT_WORK_CITY','LIVE_CITY_NOT_WORK_CITY','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE','DAYS_LAST_PHONE_CHANGE','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT','AGE','Years_experience','Number_of_total_loans','Number_of_active_loans','Number_of_sold_loans','Number_of_bad_loans','Car_loans','DPD','MAX_Over_due','Current_balance','Current_debt','current_overdue','AMT_BALANCE','AMT_PAYMENT_CURRENT']       

In [None]:
X_train_part1 = X_train[cont]
X_test_part1 = X_test[cont]
X_train_part2 = X_train[cat]
X_test_part2 = X_test[cat]
pca.fit(X_train_part1)
pca_train_part1 = pd.DataFrame(pca.transform(X_train_part1)[:,0:5])
pca_test_part1 = pd.DataFrame(pca.transform(X_test_part1)[:,0:5])
X_train_part2.reset_index(inplace = True)
X_test_part2.reset_index(inplace = True)
X_train_part2.drop(columns = 'index', inplace = True)
X_test_part2.drop(columns = 'index', inplace = True)
train = pd.concat([pca_train_part1, X_train_part2], axis = 1)
test = pd.concat([pca_test_part1, X_test_part2], axis = 1)
model = run_xgb(train, y_train, params)
get_scores(train, y_train, test, y_test, model)

In [None]:
def score(num):
    X_train_part1 = X_train[cont]
    X_test_part1 = X_test[cont]
    X_train_part2 = X_train[cat]
    X_test_part2 = X_test[cat]
    
    pca.fit(X_train_part1)
    pca_train_part1 = pd.DataFrame(pca.transform(X_train_part1)[:,0:num])
    pca_test_part1 = pd.DataFrame(pca.transform(X_test_part1)[:,0:num])
    pca_data_part1 = pd.DataFrame(pca.transform(data_test_cont)[:,0:num])
    X_train_part2.reset_index(inplace = True)
    X_test_part2.reset_index(inplace = True)
    pca_data_part2.reset_index(inplace = True)
    X_train_part2.drop(columns = 'index', inplace = True)
    X_test_part2.drop(columns = 'index', inplace = True)
    pca_data_part2.drop(columns = 'index', inplace = True)
    train = pd.concat([pca_train_part1, X_train_part2], axis = 1)
    test = pd.concat([pca_test_part1, X_test_part2], axis = 1)
    datatest = pd.concat([pca_data_part1, pca_data_part2], axis = 1)
    model = run_xgb(train, y_train, params)
    get_scores(train, y_train, test, y_test, model)
    out = model.predict_proba(datatest)
    return out

In [None]:
pca.explained_variance_ratio_

In [None]:
pca_data_part2 = data_test[cat]
data_test_cont = data_test[cont]

In [None]:
data_test_cat.shape

In [None]:
output = score(25)

In [None]:
np.savetxt("submission11.csv", output[:,1] , delimiter="\n")

In [None]:
score(5)

In [None]:
data_test

In [None]:
pd.DataFrame(pca_train_part1)

In [62]:
def grid_search(model, params, X, y):
    
    cv = GridSearchCV(model, params, cv = 3)
    cv.fit(X, y)
    
    return cv.best_params_

def train_model(model, X, y):
    model.fit(X, y)
    y_pred = model.predict(X)
    
    return model

def get_scores(model, X, y):
    y_pred = model.predict(X)
    print(classification_report(y, y_pred))

def get_output(model, X):
    
    ypred =model.predict_proba(X)
    
    return ypred
    
    

In [13]:
from sklearn.preprocessing import StandardScaler
cols_svm = ['NAME_CONTRACT_TYPE', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'FLAG_EMP_PHONE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_MON', 'CODE_GENDER_M', 'NAME_INCOME_TYPE_Businessman', 'NAME_INCOME_TYPE_Student', 'NAME_EDUCATION_TYPE_Academic degree', 'NAME_EDUCATION_TYPE_Higher education', 'NAME_HOUSING_TYPE_Co-op apartment', 'ORGANIZATION_TYPE_Medicine', 'ORGANIZATION_TYPE_Services', 'AGE', 'Years_experience', 'occ_low_level', 'occ_medium_level', 'occ_high_leve', 'Number_of_total_loans', 'Number_of_active_loans', 'Number_of_bad_loans', 'Credit_card', 'DPD', 'MAX_Over_due', 'Current_balance', 'Current_debt', 'current_overdue', 'AMT_BALANCE', 'AMT_PAYMENT_CURRENT']
X_train_svm = X_train[cols_svm]
X_test_svm = X_test[cols_svm]
scaler = StandardScaler()
scaler.fit(X_train_svm)
X_train_svm = scaler.transform(X_train_svm)
X_test_svm = scaler.transform(X_test_svm)

In [21]:
clf = svm.SVC()
params = {'kernel' : ['rbf'], 'C' : [0.1, 0.5, 1], 'gamma' : [ 0.1, 0.5], 'class_weight' : ['balanced'], 'max_iter' : [1000]}
best_params = grid_search(clf, params, X_train_svm, y_train)
print("Best Params are:", best_params)

clf = svm.SVC(**best_params)

clf = train_model(clf, X_train_svm, y_train)

print("Training Scores")
get_scores(clf, X_train_svm, y_train)

print("Testing Scores")
get_scores(clf, X_test_svm, y_test)

Training Scores


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00    226148
           1       0.08      1.00      0.15     19860

    accuracy                           0.08    246008
   macro avg       0.04      0.50      0.07    246008
weighted avg       0.01      0.08      0.01    246008

Testing Scores
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     56538
           1       0.08      1.00      0.15      4965

    accuracy                           0.08     61503
   macro avg       0.04      0.50      0.07     61503
weighted avg       0.01      0.08      0.01     61503



  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
bagging_cols = ['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_EMP_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_6', 'CODE_GENDER_M', 'NAME_INCOME_TYPE_Businessman', 'NAME_INCOME_TYPE_Student', 'NAME_EDUCATION_TYPE_Academic degree', 'NAME_EDUCATION_TYPE_Higher education', 'NAME_HOUSING_TYPE_Co-op apartment', 'ORGANIZATION_TYPE_Medicine', 'ORGANIZATION_TYPE_Restaurant', 'ORGANIZATION_TYPE_Services', 'ORGANIZATION_TYPE_Telecom', 'ORGANIZATION_TYPE_University', 'AGE', 'Years_experience', 'Businesss', 'white_collar', 'occ_low_level', 'occ_medium_level', 'occ_high_leve', 'Married', 'Number_of_total_loans', 'Number_of_active_loans', 'Number_of_bad_loans', 'Car_loans', 'Consumer_credit', 'Credit_card', 'Micro_loan', 'DPD', 'MAX_Over_due', 'Current_balance', 'Current_debt', 'current_overdue', 'AMT_BALANCE', 'AMT_PAYMENT_CURRENT']
X_train_bag = X_train[bagging_cols]
X_test_bag = X_test[bagging_cols]


In [33]:
clf = RandomForestClassifier()
params = {'max_depth' : [3, 5, 10], 'min_samples_split' : [100], 'criterion' : ['gini', 'entropy'], 'n_estimators' : [50, 100, 200, 500], 'class_weight' : ['balanced']}
best_params = grid_search(clf, params, X_train_bag, y_train)
print("Best Params are:", best_params)
clf = RandomForestClassifier(**best_params)

clf = train_model(clf, X_train, y_train)

print("Training Scores")
get_scores(clf, X_train, y_train)

print("Testing Scores")
get_scores(clf, X_test, y_test)

Best Params are: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 100, 'n_estimators': 500}
              precision    recall  f1-score   support

           0       0.97      0.72      0.83    226148
           1       0.18      0.71      0.29     19860

    accuracy                           0.72    246008
   macro avg       0.57      0.72      0.56    246008
weighted avg       0.90      0.72      0.78    246008

Training Scores
              precision    recall  f1-score   support

           0       0.97      0.72      0.83    226148
           1       0.18      0.71      0.29     19860

    accuracy                           0.72    246008
   macro avg       0.57      0.72      0.56    246008
weighted avg       0.90      0.72      0.78    246008

Testing Scores
              precision    recall  f1-score   support

           0       0.96      0.72      0.82     56538
           1       0.17      0.65      0.27      4965

    accuracy        

In [35]:
clf = RandomForestClassifier()
params = {'max_depth' : [20, 25, 15], 'min_samples_split' : [100], 'criterion' : ['gini'], 'n_estimators' : [400, 500, 750], 'class_weight' : ['balanced']}
best_params = grid_search(clf, params, X_train_bag, y_train)
print("Best Params are:", best_params)
clf = RandomForestClassifier(**best_params)

clf = train_model(clf, X_train, y_train)

print("Training Scores")
get_scores(clf, X_train, y_train)

print("Testing Scores")
get_scores(clf, X_test, y_test)

Best Params are: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 25, 'min_samples_split': 100, 'n_estimators': 750}
              precision    recall  f1-score   support

           0       0.98      0.89      0.94    226148
           1       0.40      0.80      0.53     19860

    accuracy                           0.89    246008
   macro avg       0.69      0.85      0.73    246008
weighted avg       0.93      0.89      0.90    246008

Training Scores
              precision    recall  f1-score   support

           0       0.98      0.89      0.94    226148
           1       0.40      0.80      0.53     19860

    accuracy                           0.89    246008
   macro avg       0.69      0.85      0.73    246008
weighted avg       0.93      0.89      0.90    246008

Testing Scores
              precision    recall  f1-score   support

           0       0.95      0.88      0.91     56538
           1       0.23      0.42      0.30      4965

    accuracy        

In [36]:
clf_best = clf

In [37]:
clf = RandomForestClassifier()
params = {'max_depth' : [24, 25, 30], 'min_samples_split' : [100], 'criterion' : ['gini'], 'n_estimators' : [1000, 700, 750], 'class_weight' : ['balanced']}
best_params = grid_search(clf, params, X_train_bag, y_train)
print("Best Params are:", best_params)
clf = RandomForestClassifier(**best_params)

clf = train_model(clf, X_train, y_train)

print("Training Scores")
get_scores(clf, X_train, y_train)

print("Testing Scores")
get_scores(clf, X_test, y_test)

Best Params are: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 100, 'n_estimators': 1000}
              precision    recall  f1-score   support

           0       0.98      0.90      0.94    226148
           1       0.42      0.80      0.55     19860

    accuracy                           0.89    246008
   macro avg       0.70      0.85      0.75    246008
weighted avg       0.94      0.89      0.91    246008

Training Scores
              precision    recall  f1-score   support

           0       0.98      0.90      0.94    226148
           1       0.42      0.80      0.55     19860

    accuracy                           0.89    246008
   macro avg       0.70      0.85      0.75    246008
weighted avg       0.94      0.89      0.91    246008

Testing Scores
              precision    recall  f1-score   support

           0       0.94      0.89      0.91     56538
           1       0.24      0.41      0.30      4965

    accuracy       

In [38]:
clf = RandomForestClassifier()
params = {'max_depth' : [28, 30, 35], 'min_samples_split' : [100], 'criterion' : ['gini'], 'n_estimators' : [1000, 950, 1200], 'class_weight' : ['balanced']}
best_params = grid_search(clf, params, X_train_bag, y_train)
print("Best Params are:", best_params)
clf = RandomForestClassifier(**best_params)

clf = train_model(clf, X_train, y_train)

print("Training Scores")
get_scores(clf, X_train, y_train)

print("Testing Scores")
get_scores(clf, X_test, y_test)

Best Params are: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 35, 'min_samples_split': 100, 'n_estimators': 1000}
              precision    recall  f1-score   support

           0       0.98      0.90      0.94    226148
           1       0.43      0.80      0.56     19860

    accuracy                           0.90    246008
   macro avg       0.70      0.85      0.75    246008
weighted avg       0.94      0.90      0.91    246008

Training Scores
              precision    recall  f1-score   support

           0       0.98      0.90      0.94    226148
           1       0.43      0.80      0.56     19860

    accuracy                           0.90    246008
   macro avg       0.70      0.85      0.75    246008
weighted avg       0.94      0.90      0.91    246008

Testing Scores
              precision    recall  f1-score   support

           0       0.94      0.89      0.91     56538
           1       0.24      0.41      0.30      4965

    accuracy       

In [42]:
data_testing = data_test[X_test.columns]

In [45]:
def predict_out(data, cols, filename, clf):
    out = data[cols]
    out = clf.predict_proba(out)
    pd.DataFrame(out).to_csv(filename)
    

In [50]:
importances = clf.feature_importances_
features = list(X_train.columns)

In [57]:
imp = zip(features, importances)


In [58]:
list(sorted(imp, key = lambda t:t[1]))

[('occ_missing', 0.0),
 ('NAME_INCOME_TYPE_Businessman', 6.351406488933979e-20),
 ('NAME_INCOME_TYPE_Student', 7.795306611012928e-07),
 ('NAME_EDUCATION_TYPE_Academic degree', 1.299954145124713e-05),
 ('Number_of_bad_loans', 1.6098959751400062e-05),
 ('ORGANIZATION_TYPE_Religion', 2.4955981633838377e-05),
 ('NAME_INCOME_TYPE_Unemployed', 6.330737698125212e-05),
 ('ORGANIZATION_TYPE_Mobile', 0.00013513706981186014),
 ('ORGANIZATION_TYPE_Insurance', 0.00016746303601646986),
 ('ORGANIZATION_TYPE_Legal Services', 0.00017277832575027127),
 ('FLAG_CONT_MOBILE', 0.0001743625463326113),
 ('ORGANIZATION_TYPE_Culture', 0.00017734028642773847),
 ('ORGANIZATION_TYPE_Advertising', 0.000225329136236892),
 ('ORGANIZATION_TYPE_Realtor', 0.0002868486406544982),
 ('ORGANIZATION_TYPE_Telecom', 0.0002897540382763482),
 ('ORGANIZATION_TYPE_University', 0.000298554903624419),
 ('NAME_HOUSING_TYPE_Co-op apartment', 0.00044371922553111584),
 ('AMT_REQ_CREDIT_BUREAU_HOUR', 0.0004709712893055032),
 ('ORGANIZATI

In [46]:
predict_out(data_test,X_test.columns, 'submission_test.csv', clf )

In [59]:
clf1 = RandomForestClassifier()
params = {'max_depth' : [40, 33, 35], 'min_samples_split' : [100], 'criterion' : ['gini'], 'n_estimators' : [1000, 1050, 950], 'class_weight' : ['balanced']}
best_params = grid_search(clf1, params, X_train_bag, y_train)
print("Best Params are:", best_params)
clf1 = RandomForestClassifier(**best_params)

clf1 = train_model(clf1, X_train, y_train)

print("Training Scores")
get_scores(clf1, X_train, y_train)

print("Testing Scores")
get_scores(clf1, X_test, y_test)

Best Params are: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 35, 'min_samples_split': 100, 'n_estimators': 1000}
              precision    recall  f1-score   support

           0       0.98      0.90      0.94    226148
           1       0.42      0.80      0.56     19860

    accuracy                           0.90    246008
   macro avg       0.70      0.85      0.75    246008
weighted avg       0.94      0.90      0.91    246008

Training Scores
              precision    recall  f1-score   support

           0       0.98      0.90      0.94    226148
           1       0.42      0.80      0.56     19860

    accuracy                           0.90    246008
   macro avg       0.70      0.85      0.75    246008
weighted avg       0.94      0.90      0.91    246008

Testing Scores
              precision    recall  f1-score   support

           0       0.94      0.89      0.91     56538
           1       0.24      0.41      0.30      4965

    accuracy       

In [63]:
clf2 = XGBClassifier(scale_pos_weight= 90)
params = {'eta ' : [0.01, 0.1], 'gamma ' : [1, 10], 'max_depth ' : [12,16], 'n_estimators' : [1000, 800, 1200], 'alpha ' : ['0.8']}
best_params = grid_search(clf2, params, X_train_bag, y_train)
print("Best Params are:", best_params)
clf2 = XGBClassifier(**best_params)

clf2 = train_model(clf2, X_train, y_train)

print("Training Scores")
get_scores(clf2, X_train, y_train)

print("Testing Scores")
get_scores(clf2, X_test, y_test)

Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This 

Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This 

Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This 

Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This 

TypeError: __init__() got an unexpected keyword argument 'alpha '

In [64]:
clf2 = XGBClassifier(**best_params)

clf2 = train_model(clf2, X_train, y_train)

print("Training Scores")
get_scores(clf2, X_train, y_train)

print("Testing Scores")
get_scores(clf2, X_test, y_test)



Parameters: { alpha , eta , gamma , max_depth  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Training Scores
              precision    recall  f1-score   support

           0       0.97      1.00      0.99    226148
           1       1.00      0.67      0.80     19860

    accuracy                           0.97    246008
   macro avg       0.98      0.83      0.89    246008
weighted avg       0.97      0.97      0.97    246008

Testing Scores
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     56538
           1       0.36      0.06      0.10      4965

    accuracy                           0.92     61503
   macro avg       0.64      0.52      0.53     61503
weighted avg       0.88      0.92      0.89     61503



In [83]:
def get_combined_score(clf1, clf2, wt, data):
    y_pred_xgb = clf2.predict_proba(data)
    y_pred_rf = clf1.predict_proba(data)
    yout = y_pred_xgb*wt + y_pred_rf*(1 - wt)
    out = yout[:,1] 
    return out
    

In [78]:
y_pred_xgb = clf2.predict_proba(X_test)
y_pred_rf = clf1.predict_proba(X_test)

In [79]:
yout = y_pred_xgb*0.15 + y_pred_rf*0.85

In [80]:
out = yout[:,1] 

In [81]:
pred = [1 if i > 0.5 else 0 for i in out ]

In [82]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94     56538
           1       0.28      0.28      0.28      4965

    accuracy                           0.88     61503
   macro avg       0.61      0.61      0.61     61503
weighted avg       0.88      0.88      0.88     61503



In [84]:
sub =  get_combined_score(clf1, clf2, 0.15, data_test[X_test.columns])

In [86]:
pd.DataFrame(sub).to_csv('sub_ensemble.csv')

In [88]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X_train, y_train)

In [91]:
best_params = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 100, 'n_estimators': 100}
clf1 = RandomForestClassifier(**best_params)

clf1 = train_model(clf1,X, y)

print("Training Scores")
get_scores(clf1, X_train, y_train)

print("Testing Scores")
get_scores(clf1, X_test, y_test)

Training Scores
              precision    recall  f1-score   support

           0       0.93      0.98      0.95    226148
           1       0.31      0.11      0.16     19860

    accuracy                           0.91    246008
   macro avg       0.62      0.54      0.56    246008
weighted avg       0.88      0.91      0.89    246008

Testing Scores
              precision    recall  f1-score   support

           0       0.92      0.97      0.95     56538
           1       0.20      0.07      0.11      4965

    accuracy                           0.90     61503
   macro avg       0.56      0.52      0.53     61503
weighted avg       0.86      0.90      0.88     61503

