# Covariant Shift (Credit Model 281)

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb

import pickle
from sklearn.externals import joblib
pd.set_option('display.max_columns', 500)

In [35]:
# Read all data
df_demo = pd.read_csv('data/demo.csv',index_col='ip_id')
df_y_train = pd.read_csv('data/y_train.csv',index_col='ip_id')
df_y_test = pd.read_csv('data/y_test_index.csv',index_col='ip_id')

- Create age at open account

In [36]:
## Create Feature Age when opened the account
df_demo['age_open_acc'] = df_demo['act_strt_dt'].str.split('-').str.get(0).apply(float) - df_demo['brth_yr']

### Merge data

- For Feature Selection

In [37]:
# df_ = pd.concat([df_cc.drop(['label'],axis=1),df_cc_test],axis=0)
df = df_demo

In [38]:
df.shape

(10299, 10)

- For test model

In [39]:
df_test = df_demo[(df_demo.index.isin(df_y_test.index)) & \
                  (df_demo.index.isin(pd.read_csv('data/cc_txn.csv',index_col='ip_id').index.unique())== False) &
                 (df_demo.index.isin(pd.read_csv('data/sa.csv',index_col='ip_id').index.unique())== False)]
df_test.shape

(281, 10)

In [40]:
df_test

Unnamed: 0_level_0,brth_yr,act_strt_dt,no_of_dpnd_chl,cis_income,crn_bal,gnd_cd,mar_st_cd,ctf_tp_cd,ocp_cd,age_open_acc
ip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6391,1963.0,2008-01-18,1.0,42238.0,4151.0,1.0,2.0,2.0,7,45.0
6399,1976.0,2014-07-22,1.0,34839.0,500.0,1.0,2.0,5.0,9,38.0
6406,1961.0,2010-11-12,0.0,31662.0,7903.0,2.0,1.0,1.0,3,49.0
6418,1961.0,2001-11-17,0.0,26304.0,1817.0,2.0,1.0,4.0,3,40.0
6422,1983.0,2013-09-10,0.0,33199.0,35765.0,1.0,2.0,4.0,3,30.0
6458,1975.0,2017-06-13,1.0,23449.0,500.0,1.0,2.0,3.0,3,42.0
6477,1972.0,2017-10-14,0.0,27297.0,6098.0,2.0,1.0,4.0,3,45.0
6486,1978.0,2007-09-04,0.0,42848.0,500.0,1.0,1.0,4.0,3,29.0
6504,1967.0,2006-07-20,1.0,27746.0,500.0,1.0,1.0,4.0,3,39.0
6510,1985.0,2018-01-11,0.0,18519.0,500.0,2.0,1.0,4.0,3,33.0


# Manage the categorical data

### One-hot  Option 1

- For Feature Selection

In [41]:
# df.drop(['act_strt_dt'],axis=1,inplace=True)
# dummy = pd.get_dummies(df[df.select_dtypes('object').columns.tolist()])
# df = pd.concat([dummy,df[df.select_dtypes(['int64','float64']).columns.tolist()]],axis=1)

In [42]:
# df_test.dtypes

- For test model

In [43]:
# df_test = pd.get_dummies(df_test[df_test.select_dtypes('object').columns.tolist()])
# df_test = pd.concat([dummy,df_test[df_test.select_dtypes(['int64','float64']).columns.tolist()]],axis=1)
# df_test.shape

### Drop Option 2

- For Feature Selection

In [44]:
df.drop(df.select_dtypes('object').columns.tolist(),axis=1,inplace=True)

- For Test model

In [45]:
df_test.drop(df_test.select_dtypes('object').columns.tolist(),axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


# Create Label "is_train" 0 or 1

In [46]:
df['is_train'] = np.where(df.index.isin(df_y_train.index),1,0)

# Feature Selection
- Find the feature which have the same distribution

In [47]:
def feature_select(df_all, thres = 0.5):
    res = 1
    while res > thres:
        columns = df_all.columns.tolist()

        y = df_all['is_train'].values
        X = df_all.drop(['is_train'],axis=1).values
        X_train, X_val, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=0)
        
        
        rfc = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf = 5,
                                    n_estimators=100)
        rfc.fit(X_train,y_train)
        res_prob = rfc.predict_proba(X_val)[:, 1]

        fpr, tpr, thresholds = metrics.roc_curve(y_test.ravel(), res_prob, pos_label=1)
        res = metrics.auc(fpr, tpr)
        
        if res > thres:
            df_all= df_all.drop([columns[rfc.feature_importances_.argmax()]],axis=1)
    
    print(res)
    return df_all

In [48]:
df_select = feature_select(df,0.7)

0.6906286913769435


In [49]:
df_select.head()

Unnamed: 0_level_0,no_of_dpnd_chl,gnd_cd,age_open_acc,is_train
ip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,2.0,21.0,1
2,1.0,2.0,24.0,1
3,0.0,2.0,30.0,1
4,0.0,2.0,25.0,1
5,1.0,2.0,20.0,1


# Use the selected feature to build a credit model with randomForest

In [50]:
df_train_features = df[df.index.isin(df_y_train.index)][[col for col in df_select.drop(['is_train'],axis=1).columns]]
df_train_label = df_y_train

In [52]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                          random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
eclf1 = eclf1.fit(df_train_features, df_train_label)
print(eclf1.predict(df_train_features))
import numpy as np
from sklearn.metrics import roc_auc_score
y_true = df_train_label
y_scores = eclf1.predict_proba(df_train_features)[:,1]
print (roc_auc_score(y_true, y_scores))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[0. 0. 0. ... 1. 0. 0.]
0.8461658904867393


  if diff:


# Use the trained model to predict the test data

In [53]:
df_test = df_test[df_test.index.isin(df_y_test.index)][[col for col in df_select.drop(['is_train'],axis=1).columns]]

In [54]:
result = eclf1.predict_proba(df_test)[:,1]

In [55]:
df_test['prob1'] = result

In [None]:
df_test[['prob1']].to_csv('result_1127.csv')

In [56]:
df_test[['prob1']].

Unnamed: 0_level_0,prob1
ip_id,Unnamed: 1_level_1
6391,0.169343
6399,0.107880
6406,0.455780
6418,0.096379
6422,0.070481
6458,0.110590
6477,0.194794
6486,0.073785
6504,0.092281
6510,0.070980
