In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import normalize

from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from scipy.stats import ks_2samp

from matplotlib import pyplot as plt
%matplotlib inline

inputFolder='../data/'
trainFolder=inputFolder+'train/'
testFolder=inputFolder+'test/'





In [2]:
def readFiles(type='train'):
    if type not in ('train', 'test'):
        print 'error: type must be either \'train\' or \'test\''
        return
    
    folder=trainFolder if type=='train' else testFolder
    
    usersFile=folder+'user_info_'+type+'.txt'
    banksFile=folder+'bank_detail_'+type+'.txt'
    browseFile=folder+'browse_history_'+type+'.txt'
    billsFile=folder+'bill_detail_'+type+'.txt'
    loanTimeFile=folder+'loan_time_'+type+'.txt'
    overdueFile=folder+'overdue_'+type+'.txt'

    users=pd.read_csv(usersFile, \
              names=['id','sex','profession','education','marriage','citizen_type'],\
              index_col=0)
    banks=pd.read_csv(banksFile,\
              names=['user_id', 'time', 'transaction_type', 'amount', 'is_salary'])

    browse=pd.read_csv(browseFile,\
               names=['user_id', 'time', 'browse_type', 'sub_type'])
    bills=pd.read_csv(billsFile,\
              names=['user_id', 'time', 'bank_id', 'last_bill_amount', 'last_bill_pay', 'credit_line',\
                     'cur_bill_balance', 'cur_bill_min_due', 'transactionNum', 'cur_bill_amount', \
                     'adjusted_amount', 'cumulative_interest', 'available_deposit', 'available_credit', 'debt_status'])

    loanTimes=pd.read_csv(loanTimeFile,\
              names=['user_id', 'time'])

    overdues=pd.read_csv(overdueFile,\
              names=['user_id', 'label'], index_col=0) if type=='train' else None
    
    return users, banks, browse, bills, loanTimes, overdues

trainUsers, trainBanks, trainBrowse, trainBills, trainLoanTime, trainOverdue=readFiles(type='train')
testUsers, testBanks, testBrowse, testBills, testLoanTime, _=readFiles(type='test')


        

# trainUsers=pd.read_csv(trainFolder+'user_info_train.txt', \
#                            names=['id','sex','profession','education','marriage','citizen_type'],\
#                           index_col=0)


# trainBanks=pd.read_csv(trainFolder+'bank_detail_train.txt',\
#                           names=['user_id', 'time', 'transaction_type', 'amount', 'is_salary'])

# trainBrowse=pd.read_csv(trainFolder+'browse_history_train.txt',\
#                            names=['user_id', 'time', 'browse_type', 'sub_type'])

# trainBills=pd.read_csv(trainFolder+'bill_detail_train.txt',\
#                           names=['user_id', 'time', 'bank_id', 'last_bill_amount', 'last_bill_pay', 'credit_line',\
#                                  'cur_bill_balance', 'cur_bill_min_due', 'transactionNum', 'cur_bill_amount', \
#                                  'adjusted_amount', 'cumulative_interest', 'available_deposit', 'available_credit', 'debt_status'])

# trainLoanTime=pd.read_csv(trainFolder+'loan_time_train.txt',\
#                                 names=['user_id', 'time'])

# trainOverdue=pd.read_csv(trainFolder+'overdue_train.txt',\
#                             names=['user_id', 'label'], index_col=0)

In [3]:
# # process transaction amounts by converting back to real sum of money.  Have to guess the function

# trialTrainBanks=trainBanks.copy()
# fundConv=lambda x: 2*np.power((np.exp(x)-1),0.5)
# trialTrainBanks['amount']=fundConv(trialTrainBanks['amount'])
# print trialTrainBanks['amount'].head()

In [4]:
def extractBillFeatures(bills, n_ids):
    # prepare a list of banks in the bills form
    
    bank_ids=bills['bank_id'].unique()
    bankBills=dict() # keys are banks ids, and values are bills associated with that bank id.
    for b_id in bank_ids:
        bs=bills[bills.bank_id==b_id]
        if len(bs)>=1:
            bankBills[b_id]=bs
            
    # prepare bank-wise bill features
    billFeatures=pd.DataFrame(np.arange(n_ids)+1, columns=['id'])
    billFeatures.set_index(['id'], inplace=True)
    for b_id in bankBills.keys():
        bankBillName='bank'+str(b_id).zfill(6)
        billFeatures[bankBillName+'AvgCreditLine']=bankBills[b_id].groupby('user_id')['credit_line'].mean()
    #     billFeatures[bankBillName+'MaxCreditLine']=bankBills[b_id].groupby('user_id')['available_credit'].mean()
    billFeatures.fillna(-1, inplace=True)
    
    return billFeatures

billFeatures=extractBillFeatures(trainBills, len(trainUsers))

In [5]:
def extractBrowseTypeFeatures(browse, n_ids):
    # prepare browseType wise features:

    browseType_ids=browse['browse_type'].unique()
    browseTypes=dict()
    for b_id in browseType_ids:
        bs=browse[browse.browse_type==b_id]
        if len(bs)>=1:
            browseTypes[b_id]=bs

    browseTypeFeatures=pd.DataFrame(np.arange(n_ids)+1, columns=['id'])
    browseTypeFeatures.set_index(['id'], inplace=True)
    for b_id in browseTypes.keys():
        browseTypeName='browseType'+str(b_id).zfill(3)
    #     browseTypeFeatures[browseTypeName+'TotalTimes']=browseTypes[b_id].groupby('user_id')['time'].count()
        browseTypeFeatures[browseTypeName+'Freq']=browseTypes[b_id].groupby('user_id')['time'].count().div\
        (browseTypes[b_id].groupby('user_id')['time'].max()-browseTypes[b_id].groupby('user_id')['time'].min()+1+0.000001)
    browseTypeFeatures.fillna(0, inplace=True)

    return browseTypeFeatures

browseTypeFeatures=extractBrowseTypeFeatures(trainBrowse, len(trainUsers))

In [6]:
# make a working dataset for training and testing

userFeatures=trainUsers.copy()
userFeatures=userFeatures.join(billFeatures)
userFeatures=userFeatures.join(browseTypeFeatures)
userFeatures.sort_index(inplace=True)
trainOverdue.sort_index(inplace=True)


In [None]:
# add manually picked features to the working dataset


bankRecordsPersonal=trainBanks.groupby('user_id')

userFeatures['transactionNum']=bankRecordsPersonal['amount'].count()
userFeatures['transactionAmount']=bankRecordsPersonal['amount'].sum()


userFeatures['transactionNetAmount']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].sum())-\
        bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].sum())


userFeatures['transactionExpense']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].sum())
userFeatures['transactionExpenseMax']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].max())
userFeatures['transactionExpenseAvg']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].mean())

userFeatures['transactionIncome']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].sum())

userFeatures['transactionIncomeMax']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].max())
userFeatures['transactionIncomeAvg']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].mean())

userFeatures['transactionExpenseNum']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].count())
userFeatures['transactionIncomeNum']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].count())

userFeatures['salaryNum']=bankRecordsPersonal['is_salary'].sum()
userFeatures['salaryTotal']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].sum())
userFeatures['salaryAvg']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].mean())
userFeatures['salaryMax']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].max())

browseRecordsPersonal=trainBrowse.groupby('user_id')
userFeatures['browseNum']=browseRecordsPersonal['time'].count()


userFeatures['browseFreq']=(browseRecordsPersonal['time'].count()).div(browseRecordsPersonal['time'].max()-browseRecordsPersonal['time'].min()+1+0.000001)
userFeatures['browseTypes']=browseRecordsPersonal['browse_type'].nunique()
userFeatures['browseSubTypes']=browseRecordsPersonal.apply(lambda x: x.groupby('browse_type')['sub_type'].nunique().sum())


bills=trainBills.groupby('user_id')
userFeatures['billBanksNum']=bills['bank_id'].nunique()
userFeatures['billNum']=bills['time'].count()
userFeatures['underpay']=bills.apply(lambda x: 1.0*sum(x['last_bill_amount']-x['last_bill_pay'])/len(x))

userFeatures['avgCreditLine']=bills['credit_line'].mean()
userFeatures['avgTransactionNum']=bills['transactionNum'].mean()
userFeatures['maxBankCredit_line']=bills.apply(lambda x: x.groupby('bank_id')['credit_line'].mean().max())
userFeatures['avgLastBillAmount']=bills.apply(lambda x: x.groupby('bank_id')['last_bill_amount'].mean().mean())
userFeatures['avgMaxLastBillAmount']=bills.apply(lambda x: x.groupby('bank_id')['last_bill_amount'].max().mean())

userFeatures['relative_bill_adjustment']=(bills['cur_bill_amount'].mean()-bills['adjusted_amount'].mean()).div(bills['cur_bill_amount'].mean()+0.00001)


userFeatures['avgBillTimeSpan']=bills['time'].max()-bills['time'].min()
userFeatures['maxBankBillTimeSpan']=bills.apply(lambda x: (x.groupby('bank_id')['time'].max()-x.groupby('bank_id')['time'].min()).max())
userFeatures['totalBankBillTimeSpan']=bills.apply(lambda x: (x.groupby('bank_id')['time'].max()-x.groupby('bank_id')['time'].min()).sum())
userFeatures['debtNum']=bills['debt_status'].mean()


userFeatures['loanTime']=trainLoanTime['time']




In [None]:
# add manually picked features to the working dataset
def handPickedFeatures(users, banks, browse, bills, loanTimes):

    handPickedFeatures=pd.DataFrame(np.arange(len(users))+1, columns=['id'])
    handPickedFeatures.set_index(['id'], inplace=True)


    # bank record features
    bankRecordsPersonal=banks.groupby('user_id')
    handPickedFeatures['transactionNum']=bankRecordsPersonal['amount'].count()
    handPickedFeatures['transactionAmount']=bankRecordsPersonal['amount'].sum()
    handPickedFeatures['transactionNetAmount']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].sum())-\
            bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].sum())
    handPickedFeatures['transactionExpense']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].sum())
    handPickedFeatures['transactionExpenseMax']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].max())
    handPickedFeatures['transactionExpenseAvg']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].mean())
    handPickedFeatures['transactionIncome']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].sum())
    handPickedFeatures['transactionIncomeMax']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].max())
    handPickedFeatures['transactionIncomeAvg']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].mean())
    handPickedFeatures['transactionExpenseNum']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].count())
    handPickedFeatures['transactionIncomeNum']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].count())
    handPickedFeatures['salaryNum']=bankRecordsPersonal['is_salary'].sum()
    handPickedFeatures['salaryTotal']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].sum())
    handPickedFeatures['salaryAvg']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].mean())
    handPickedFeatures['salaryMax']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].max())


    browseRecordsPersonal=browse.groupby('user_id')
    handPickedFeatures['browseNum']=browseRecordsPersonal['time'].count()
    handPickedFeatures['browseFreq']=(browseRecordsPersonal['time'].count()).div(browseRecordsPersonal['time'].max()-browseRecordsPersonal['time'].min()+1+0.000001)
    handPickedFeatures['browseTypes']=browseRecordsPersonal['browse_type'].nunique()
    handPickedFeatures['browseSubTypes']=browseRecordsPersonal.apply(lambda x: x.groupby('browse_type')['sub_type'].nunique().sum())


    billsPersonal=bills.groupby('user_id')
    handPickedFeatures['billBanksNum']=billsPersonal['bank_id'].nunique()
    handPickedFeatures['billNum']=billsPersonal['time'].count()
    handPickedFeatures['underpay']=billsPersonal.apply(lambda x: 1.0*sum(x['last_bill_amount']-x['last_bill_pay'])/len(x))
    handPickedFeatures['avgCreditLine']=billsPersonal['credit_line'].mean()
    handPickedFeatures['avgTransactionNum']=billsPersonal['transactionNum'].mean()
    handPickedFeatures['maxBankCredit_line']=billsPersonal.apply(lambda x: x.groupby('bank_id')['credit_line'].mean().max())
    handPickedFeatures['avgLastBillAmount']=billsPersonal.apply(lambda x: x.groupby('bank_id')['last_bill_amount'].mean().mean())
    handPickedFeatures['avgMaxLastBillAmount']=billsPersonal.apply(lambda x: x.groupby('bank_id')['last_bill_amount'].max().mean())
    handPickedFeatures['relative_bill_adjustment']=(billsPersonal['cur_bill_amount'].mean()-billsPersonal['adjusted_amount'].mean()).div(billsPersonal['cur_bill_amount'].mean()+0.00001)
    handPickedFeatures['avgBillTimeSpan']=billsPersonal['time'].max()-billsPersonal['time'].min()
    handPickedFeatures['maxBankBillTimeSpan']=billsPersonal.apply(lambda x: (x.groupby('bank_id')['time'].max()-x.groupby('bank_id')['time'].min()).max())
    handPickedFeatures['totalBankBillTimeSpan']=billsPersonal.apply(lambda x: (x.groupby('bank_id')['time'].max()-x.groupby('bank_id')['time'].min()).sum())
    handPickedFeatures['debtNum']=billsPersonal['debt_status'].mean()


    handPickedFeatures['loanTime']=trainLoanTime['time']

    return handPickedFeatures

userFeatures.join(handPickedFeatures(trainUsers, trainBanks, trainBrowse, trainBills, trainLoanTime))

In [10]:
# make one-hot encoding for categorical features
def convert2Dummies(df, cln):
    dummies=pd.get_dummies(df[cln]).astype(int)
    dummies.columns=[cln+`c` for c in dummies.columns]

    return pd.concat([df, dummies], axis=1)

userFeatures=convert2Dummies(userFeatures, 'sex')
userFeatures=convert2Dummies(userFeatures, 'profession')
userFeatures=convert2Dummies(userFeatures, 'education')
userFeatures=convert2Dummies(userFeatures, 'marriage')
userFeatures=convert2Dummies(userFeatures, 'citizen_type')




# fill in NA with zeros or other numbers

userFeatures['transactionNum'].fillna(0, inplace=True)
userFeatures['transactionNum']=userFeatures['transactionNum'].astype(int)


userFeatures['salaryNum'].fillna(0, inplace=True)
userFeatures['salaryNum']=userFeatures['salaryNum'].astype(int)

# userFeatures['browseNum'].fillna(0, inplace=True)
userFeatures['browseTypes'].fillna(0, inplace=True)
userFeatures['browseTypes']=userFeatures['browseTypes'].astype(int)

userFeatures['browseSubTypes'].fillna(0, inplace=True)
userFeatures['browseSubTypes']=userFeatures['browseSubTypes'].astype(int)

userFeatures['billBanksNum'].fillna(0, inplace=True)
userFeatures['billBanksNum']=userFeatures['billBanksNum'].astype(int)

userFeatures['billNum'].fillna(0, inplace=True)
userFeatures['billNum']=userFeatures['billNum'].astype(int)

userFeatures['salaryTotal'].fillna(0, inplace=True)
userFeatures['transactionAmount'].fillna(0, inplace=True)
# userFeatures['browseNum']=userFeatures['browseNum'].astype(int)

In [11]:
# place to make addition change to the working dataset, without poluting the data

userFeatures_select=userFeatures.copy()


# temporary feature testing cells
#userFeatures_select['try']=bills.apply(lambda x: (x.groupby('bank_id')['cur_bill_amount'].mean()-\
#             x.groupby('bank_id')['adjusted_amount'].mean()).div(x.groupby('bank_id')['cur_bill_amount'].mean()+0.000001).mean())



#userFeatures_select['try']=bills.apply(lambda x: x.groupby('bank_id')['cur_bill_amount'].mean().mean())
#userFeatures_select['try']=bills.apply(lambda x: x.groupby('bank_id')['cumulative_interest'].mean().max())

#userFeatures_select['try']=bills.apply(lambda x: x.groupby('bank_id')['cur_bill_min_due'].mean().mean())
#userFeatures_select['nonSalary']=bankRecordsPersonal.apply(\
#     lambda x: x[(x['is_salary']==0) & (x['transaction_type']==0)]['amount']/ \
#     x[(x['is_salary']==1)]['amount'])
# printuserFeatures_select[['try', 'billNum']].head(20)

In [12]:
# userFeatures.drop(['try'], axis=1, inplace=True)

In [13]:
imp=Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
userFeatures_select=imp.fit_transform(userFeatures_select)

# print userFeatures_select.head()

In [14]:
# split to training and testing sets
y=trainOverdue.values.ravel()
X_train, X_test, y_train, y_test=train_test_split(userFeatures_select, y\
                                                  , test_size=0.2, random_state=8)
# make a data weight list for random forests to handle uneven distribution
weights=y_train+0.3

In [15]:
# standardization
scaler=StandardScaler()
scaler.fit_transform(X_train)
scaler.transform(X_test)

array([[-0.33689176,  2.14097994,  0.99953475, ..., -0.62446927,
        -0.41475882, -0.58884405],
       [-0.33689176,  0.81791615, -0.30181203, ...,  1.60135982,
        -0.41475882, -0.58884405],
       [-0.33689176, -0.50514763, -0.30181203, ...,  1.60135982,
        -0.41475882, -0.58884405],
       ..., 
       [-0.33689176, -0.50514763, -0.30181203, ..., -0.62446927,
        -0.41475882,  1.6982425 ],
       [-0.33689176, -0.50514763, -0.30181203, ..., -0.62446927,
        -0.41475882, -0.58884405],
       [-0.33689176, -0.50514763,  0.99953475, ...,  1.60135982,
        -0.41475882, -0.58884405]])

In [16]:
# compute the correlation between variants and labels
corrmatrix=np.corrcoef(userFeatures_select.T, y)
print corrmatrix[-1,:-1]

[-0.09984565  0.00826746  0.03870428  0.00396767  0.01066433 -0.00226623
 -0.00947552 -0.03160595 -0.03647206  0.0024244  -0.0269219  -0.04960687
 -0.01497956 -0.01787624 -0.01638896 -0.00927952 -0.00131292 -0.03357353
 -0.02491264 -0.01962355 -0.03972253  0.00327372 -0.00364836 -0.00230947
 -0.00282958  0.01101058 -0.00163363 -0.00163363 -0.00205874 -0.00163363
 -0.00231011 -0.00163363 -0.00230867  0.00282292 -0.00228205 -0.00163363
 -0.00291592 -0.00370093  0.00165323 -0.00709271 -0.01112004 -0.00437161
 -0.00973534 -0.00863096 -0.00411298  0.00149692 -0.00732318 -0.00589689
 -0.0052874  -0.00048004 -0.00169119 -0.00231032  0.00281764 -0.01531177
 -0.00137294 -0.00497841 -0.01940946 -0.00280437 -0.01527393 -0.0031122
 -0.00334446 -0.02459072  0.00356079  0.00499028 -0.01134316  0.00056615
 -0.00992807 -0.01083622 -0.02848537 -0.01102328 -0.00027087 -0.0509346
 -0.00889017 -0.00066495 -0.01224795 -0.00605801 -0.00830329 -0.0210204
 -0.05103155 -0.00510511 -0.00163363 -0.00275662 -0.00

In [17]:
# compute relative importance of features
factors=abs(np.array(corrmatrix[-1, :-1]))**1.7
for i in xrange(2):
    factors=normalize(factors.reshape(1,-1))
    factors*=abs(corrmatrix[-1, :-1])

# scale features according to their lative importance
X_train*=factors
X_test*=factors

In [18]:
# nearest neighbors
knnc = KNeighborsClassifier(n_neighbors=10, weights='distance')
knnc.fit(X_train, y_train)
pred=knnc.predict_proba(X_test)[:,1]

In [19]:
print len(pred[pred>0.5])
print 1.0*sum(y_test)/len(y_test)
print sum(y_test)
print sum(y_train)
print 1.0*len(pred[(pred>0.5)&(y_test==1)])/sum(y_test==1)

p=pred[y_test==1]
n=pred[y_test==0]
print ks_2samp(p,n)

352
0.130575539568
1452
5731
0.0564738292011
Ks_2sampResult(statistic=0.16809935591671021, pvalue=1.263097638816552e-31)


In [20]:
# random forest classifier
clf_rf=rfc(n_estimators=1200, max_features=8*int(np.sqrt(X_train.shape[1])), max_depth=None, n_jobs=7, min_samples_split=16, random_state=0)
clf_rf.fit(X_train, y_train, sample_weight=weights)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=136, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=16, min_weight_fraction_leaf=0.0,
            n_estimators=1200, n_jobs=7, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [21]:
pred=clf_rf.predict_proba(X_test)[:,1]
print len(pred[pred>0.5])
print 1.0*sum(y_test)/len(y_test)
print sum(y_test)
print sum(y_train)
print 1.0*len(pred[(pred>0.5)&(y_test==1)])/sum(y_test==1)

p=pred[y_test==1]
n=pred[y_test==0]
print ks_2samp(p,n)

446
0.130575539568
1452
5731
0.154269972452
Ks_2sampResult(statistic=0.43616184031612626, pvalue=1.8035114625754209e-210)


In [22]:
clf_xgb=XGBClassifier(learning_rate=0.04, n_estimators=1200, subsample=0.9, colsample_bylevel=0.9, objective='binary:logistic', \
                      max_depth=5, gamma=2, seed=0)
clf_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=0.9, colsample_bytree=1,
       gamma=2, learning_rate=0.04, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9)

In [23]:
pred = clf_xgb.predict_proba(X_test)[:,1]

In [24]:
print len(pred[pred>0.5])
print 1.0*sum(y_test)/len(y_test)
print sum(y_test)
print sum(y_train)
print 1.0*len(pred[(pred>0.5)&(y_test==1)])/sum(y_test==1)

p=pred[y_test==1]
n=pred[y_test==0]
print ks_2samp(p,n)

315
0.130575539568
1452
5731
0.129476584022
Ks_2sampResult(statistic=0.45133130682459305, pvalue=2.4635188337377945e-225)
