# Covariant Shift (Sa Model 2530)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb

import pickle
from sklearn.externals import joblib
pd.set_option('display.max_columns', 500)

  from numpy.core.umath_tests import inner1d


In [2]:
# Read all data
df_sa = pd.read_csv('prep_data/sa_summary_sorn_train.csv',index_col='ip_id')
df_sa_test = pd.read_csv('prep_data/sa_summary_sorn_test.csv',index_col='ip_id')
df_demo = pd.read_csv('data/demo.csv',index_col='ip_id')
df_y_train = pd.read_csv('data/y_train.csv',index_col='ip_id')
df_y_test = pd.read_csv('data/y_test_index.csv',index_col='ip_id')

- Fix the column in df_sa_test

In [3]:
df_sa_test['n_CR_EDC'] = 0

In [4]:
sa_cols = df_sa.columns
sa_test_cols = df_sa_test.columns

common_cols = sa_cols.intersection(sa_test_cols)
train_not_test = sa_cols.difference(sa_test_cols)
print(train_not_test)

Index(['label'], dtype='object')


### Merge data

- For Feature Selection

In [5]:
df_sa_all = pd.concat([df_sa.drop(['label'],axis=1),df_sa_test],axis=0,sort=True)
df = df_demo.join(df_sa_all,how='inner')

In [6]:
df.shape

(9963, 46)

- For test model

In [7]:
df_test = df_demo[df_demo.index.isin(df_y_test.index)].join(df_sa_test,how='inner')
df_test.shape

(3602, 46)

- Scope the data of df_test to 2530 (have no cc, have sa)

In [8]:
df_test = df_test[df_test.index.isin(pd.read_csv('data/cc_txn.csv')['ip_id'])==False]
df_test.shape

(2530, 46)

# Manage the categorical data

### One-hot  Option 1

- For Feature Selection

In [9]:
# df.drop(['act_strt_dt'],axis=1,inplace=True)
# dummy = pd.get_dummies(df[df.select_dtypes('object').columns.tolist()])
# df = pd.concat([dummy,df[df.select_dtypes(['int64','float64']).columns.tolist()]],axis=1)

- For test model

In [10]:
# df_test = pd.get_dummies(df_test[df_test.select_dtypes('object').columns.tolist()])
# df_test = pd.concat([dummy,df_test[df_test.select_dtypes(['int64','float64']).columns.tolist()]],axis=1)
# df_test.shape

### Drop Option 2

- For Feature Selection

In [11]:
df.drop(df.select_dtypes('object').columns.tolist(),axis=1,inplace=True)

- For Test model

In [12]:
df_test.drop(df_test.select_dtypes('object').columns.tolist(),axis=1,inplace=True)

# Create Label "is_train" 0 or 1

In [13]:
df['is_train'] = np.where(df.index.isin(df_y_train.index),1,0)

# Feature Selection
- Find the feature which have the same distribution

In [14]:
def feature_select(df_all, thres = 0.5):
    res = 1
    while res > thres:
        columns = df_all.columns.tolist()

        y = df_all['is_train'].values
        X = df_all.drop(['is_train'],axis=1).values
        X_train, X_val, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=0)
        
        
        rfc = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf = 5,
                                    n_estimators=100)
        rfc.fit(X_train,y_train)
        res_prob = rfc.predict_proba(X_val)[:, 1]

        fpr, tpr, thresholds = metrics.roc_curve(y_test.ravel(), res_prob, pos_label=1)
        res = metrics.auc(fpr, tpr)
        
        if res > thres:
            df_all= df_all.drop([columns[rfc.feature_importances_.argmax()]],axis=1)
    
    print(res)
    return df_all

In [15]:
df_select = feature_select(df,0.7)

0.5247657865124433


In [16]:
df_select.head()

Unnamed: 0_level_0,no_of_dpnd_chl,gnd_cd,n_CR_EDC,is_train
ip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,2.0,0.0,1
2,1.0,2.0,0.0,1
3,0.0,2.0,0.0,1
4,0.0,2.0,0.0,1
5,1.0,2.0,0.0,1


In [17]:
df_train_features = df[df.index.isin(df_y_train.index)][[col for col in df_select.drop(['is_train'],axis=1).columns]]
df_train_label = df_y_train

### PCA-Train

In [18]:
from sklearn import decomposition

pca = decomposition.PCA(n_components=3)
pca.fit(df_train_features)
pca_df_train_features = pca.transform(df_train_features)

In [19]:
pca_df_train_features

array([[-0.35174383,  0.49031352, -0.01237142],
       [ 0.64808791,  0.50863384, -0.0114448 ],
       [-0.35174383,  0.49031352, -0.01237142],
       ...,
       [ 4.66573894, -0.41790808, -0.01195628],
       [-0.35174383,  0.49031352, -0.01237142],
       [-0.35174383,  0.49031352, -0.01237142]])

In [20]:
df_train_features['pc1'] = pca_df_train_features[:,0]
df_train_features['pc2'] = pca_df_train_features[:,1]
df_train_features['pc3'] = pca_df_train_features[:,2]

# Use the selected feature to build a credit model with randomForest

In [21]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

# clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial',random_state=1)
clf2 = RandomForestClassifier(n_estimators=100,min_samples_leaf=5 ,n_jobs=-1,class_weight='balanced')
clf3 = RandomForestClassifier(n_estimators=50,min_samples_leaf=10, max_depth=10 ,n_jobs=-1,class_weight='balanced')
clf4 = RandomForestClassifier(n_estimators=100,min_samples_leaf=20, max_depth=5 ,n_jobs=-1,class_weight='balanced')
# clf3 = GaussianNB()
# clf4 = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(30,50,10,))
clf5 = GradientBoostingClassifier(min_samples_leaf=5, max_depth=None)
clf6 = GradientBoostingClassifier(min_samples_leaf=1, max_depth=10,learning_rate=0.01)
clf7 = GradientBoostingClassifier(min_samples_leaf=8, max_depth=20,learning_rate=0.01)
# X_train = df_train_features[:5000]
# y_train = df_train_label[:5000]
# eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('nn',clf4), ('gdb',clf5)], voting='soft')
# eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nn',clf4), ('gdb',clf5)], voting='soft')
eclf1 = VotingClassifier(estimators=[('rf', clf2),('rf2', clf3),('rf3', clf4) ,('gdb',clf5),('gdb2',clf6),('gdb3',clf7)], voting='soft',n_jobs=-1)
eclf1 = eclf1.fit(df_train_features, df_train_label)
print(eclf1.predict(df_train_features))

import numpy as np
from sklearn.metrics import roc_auc_score
y_true = df_train_label
y_scores = eclf1.predict_proba(df_train_features)[:,1]
print (roc_auc_score(y_true, y_scores))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:


[0. 0. 0. ... 1. 0. 0.]
0.635074600192197


# Use the trained model to predict the test data

### PCA-Test

In [22]:
df_test = df_test[df_test.index.isin(df_y_test.index)][[col for col in df_select.drop(['is_train'],axis=1).columns]]

In [23]:
pca_df_test_features = pca.transform(df_test)

df_test['pc1'] = pca_df_test_features[:,0]
df_test['pc2'] = pca_df_test_features[:,1]
df_test['pc3'] = pca_df_test_features[:,2]

In [24]:
result = eclf1.predict_proba(df_test)[:,1]

In [25]:
df_test['prob1'] = result

In [26]:
df_test[['prob1']].to_csv('nat_result_2530.csv')