In [107]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import normalize

from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from scipy.stats import ks_2samp

import time

from matplotlib import pyplot as plt
%matplotlib inline

inputFolder='../data/'
trainFolder=inputFolder+'train/'
testFolder=inputFolder+'test/'



In [108]:
def readFiles(type='train'):
    if type not in ('train', 'test'):
        print 'error: type must be either \'train\' or \'test\''
        return
    
    folder=trainFolder if type=='train' else testFolder
    
    usersFile=folder+'user_info_'+type+'.txt'
    banksFile=folder+'bank_detail_'+type+'.txt'
    browseFile=folder+'browse_history_'+type+'.txt'
    billsFile=folder+'bill_detail_'+type+'.txt'
    loanTimeFile=folder+'loan_time_'+type+'.txt'
    overdueFile=folder+'overdue_'+type+'.txt'

    users=pd.read_csv(usersFile, \
              names=['id','sex','profession','education','marriage','citizen_type'],\
              index_col=0).sort_index()
    banks=pd.read_csv(banksFile,\
              names=['user_id', 'time', 'transaction_type', 'amount', 'is_salary'])

    browse=pd.read_csv(browseFile,\
               names=['user_id', 'time', 'browse_type', 'sub_type'])
    bills=pd.read_csv(billsFile,\
              names=['user_id', 'time', 'bank_id', 'last_bill_amount', 'last_bill_pay', 'credit_line',\
                     'cur_bill_balance', 'cur_bill_min_due', 'transactionNum', 'cur_bill_amount', \
                     'adjusted_amount', 'cumulative_interest', 'available_deposit', 'available_credit', 'debt_status'])

    loanTimes=pd.read_csv(loanTimeFile,\
              names=['user_id', 'time'], index_col=0).sort_index()

    overdues=pd.read_csv(overdueFile,\
              names=['user_id', 'label'], index_col=0).sort_index().values.ravel() if type=='train' else None
    
    return users, banks, browse, bills, loanTimes, overdues


In [122]:
def extractBillFeatures(bills, index):
    # prepare a list of banks in the bills form
    
    bank_ids=bills['bank_id'].unique()
    bankBills=dict() # keys are banks ids, and values are bills associated with that bank id.
    for b_id in bank_ids:
        bs=bills[bills.bank_id==b_id]
        if len(bs)>=1:
            bankBills[b_id]=bs
            
    # prepare bank-wise bill features
    billFeatures=pd.DataFrame(index, columns=['id'])
    billFeatures.set_index(['id'], inplace=True)
    for b_id in bankBills.keys():
        bankBillName='bank'+str(b_id).zfill(6)
        billFeatures[bankBillName+'AvgCreditLine']=bankBills[b_id].groupby('user_id')['credit_line'].mean()
    #     billFeatures[bankBillName+'MaxCreditLine']=bankBills[b_id].groupby('user_id')['available_credit'].mean()
    billFeatures.fillna(-1, inplace=True)
    
    return billFeatures

In [123]:
def extractBrowseTypeFeatures(browse, index):
    # prepare browseType wise features:

    browseType_ids=browse['browse_type'].unique()
    browseTypes=dict()
    for b_id in browseType_ids:
        bs=browse[browse.browse_type==b_id]
        if len(bs)>=1:
            browseTypes[b_id]=bs

    browseTypeFeatures=pd.DataFrame(index, columns=['id'])
    browseTypeFeatures.set_index(['id'], inplace=True)
    for b_id in browseTypes.keys():
        browseTypeName='browseType'+str(b_id).zfill(7)
    #     browseTypeFeatures[browseTypeName+'TotalTimes']=browseTypes[b_id].groupby('user_id')['time'].count()
        browseTypeFeatures[browseTypeName+'Freq']=browseTypes[b_id].groupby('user_id')['time'].count().div\
        (browseTypes[b_id].groupby('user_id')['time'].max()-browseTypes[b_id].groupby('user_id')['time'].min()+1+0.000001)
    browseTypeFeatures.fillna(0, inplace=True)

    return browseTypeFeatures

In [134]:
trainBills=pd.read_csv(trainFolder+'bill_detail_train.txt',\
              names=['user_id', 'time', 'bank_id', 'last_bill_amount', 'last_bill_pay', 'credit_line',\
                     'cur_bill_balance', 'cur_bill_min_due', 'transactionNum', 'cur_bill_amount', \
                     'adjusted_amount', 'cumulative_interest', 'available_deposit', 'available_credit', 'debt_status'])
testBills=pd.read_csv(testFolder+'bill_detail_test.txt',\
              names=['user_id', 'time', 'bank_id', 'last_bill_amount', 'last_bill_pay', 'credit_line',\
                     'cur_bill_balance', 'cur_bill_min_due', 'transactionNum', 'cur_bill_amount', \
                     'adjusted_amount', 'cumulative_interest', 'available_deposit', 'available_credit', 'debt_status'])
trainLoanTime=pd.read_csv(trainFolder+'loan_time_train.txt',\
              names=['user_id', 'time'], index_col=0).sort_index()
testLoanTime=pd.read_csv(testFolder+'loan_time_test.txt',\
              names=['user_id', 'time'], index_col=0).sort_index()

allBills=pd.concat([trainBills, testBills])
loanTime=pd.concat([trainLoanTime, testLoanTime])

finalTime=trainBills['time'].max()+1
startTime=trainBills['time'].min()-1


def afterLoanBillFreq(x):
    lt=loanTime.loc[x['user_id'].mean(),'time']
    return float(x[x['time']>lt]['time'].count())/float(finalTime-lt)

# afterLoanBrowseFreq=afterLoanBillFreq

def afterLoanSalaryChange(x):
    lt=loanTime.loc[x['user_id'].mean(),'time']
    beforeSalary=x[(x['is_salary']==1) & (x['time']<=lt)]['amount'].mean()
    afterSalary=x[(x['is_salary']==1) & (x['time']>lt)]['amount'].mean()
#     if np.isnan(afterSalary) and ~np.isnan(beforeSalary):
#         afterSalary=0
#     if np.isnan(beforeSalary) and ~np.isnan(afterSalary):
#         beforeSalary=0
    return afterSalary-beforeSalary

# def afterLoanNetIncome(x):
#     lt=trainLoanTime.loc[x['user_id'].mean(),'time']
#     income=x[(x['transaction_type']==0) & (x['time']>lt)]['amount'].sum()
#     expense=x[(x['transaction_type']==1) & (x['time']>lt)]['amount'].sum()
#     return income-expense

# def afterLoanIncomeChange(x):
#     lt=trainLoanTime.loc[x['user_id'].mean(),'time']
#     beforeIncome=float(x[(x['transaction_type']==0) & (x['time']<=lt)]['amount'].sum())/float(lt-startTime)
#     afterIncome=float(x[(x['transaction_type']==0) & (x['time']>lt)]['amount'].sum())/float(finalTime-lt)
#     return afterIncome-beforeIncome

# def loanBillFreqChange(x):
#     lt=trainLoanTime.loc[x['user_id'].mean(),'time']
#     beforeFreq=float(x[x['time']<=lt]['time'].count())/float(lt-startTime)
#     afterFreq= float(x[x['time']>lt]['time'].count())/float(finalTime-lt)
#     return afterFreq-beforeFreq
# def newBanksAfterLoan(x):
#     lt=trainLoanTime.loc[x['user_id'].mean(),'time']
#     beforeBanks=set(x[x['time']<=lt]['bank_id'].unique())
#     afterBanks=set(x[x['time']>lt]['bank_id'].unique())
#     return len(afterBanks-beforeBanks)
# def terminatedBanksAfterLoan(x):
#     lt=trainLoanTime.loc[x['user_id'].mean(),'time']
#     beforeBanks=set(x[x['time']<=lt]['bank_id'].unique())
#     afterBanks=set(x[x['time']>lt]['bank_id'].unique())
#     return len(beforeBanks-afterBanks)


def afterLoanBillAdjustment(x):
    lt=loanTime.loc[x['user_id'].mean(),'time']
    afterLoanX=x[x['time']<=lt]
    return (afterLoanX['cur_bill_amount'].mean()-afterLoanX['adjusted_amount'].mean())/(afterLoanX['cur_bill_amount'].mean()+0.00001)

def afterLoanUnderpay(x):
    lt=loanTime.loc[x['user_id'].mean(),'time']
    afterX=x[x['time']>lt]
    return (afterX['last_bill_amount']-afterX['last_bill_pay']).mean()


In [136]:
# add manually picked features to the working dataset
def handPickedFeatures(users, banks, browse, bills, loanTimes):

    handPickedFeatures=pd.DataFrame(users.index, columns=['id'])
    handPickedFeatures.set_index(['id'], inplace=True)


    # bank record features
    bankRecordsPersonal=banks.groupby('user_id')
    handPickedFeatures['transactionNum']=bankRecordsPersonal['amount'].count()
    handPickedFeatures['transactionAmount']=bankRecordsPersonal['amount'].sum()
    handPickedFeatures['transactionNetAmount']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].sum())-\
            bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].sum())
    handPickedFeatures['transactionExpense']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].sum())
    handPickedFeatures['transactionExpenseMax']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].max())
    handPickedFeatures['transactionExpenseAvg']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].mean())
    handPickedFeatures['transactionIncome']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].sum())
    handPickedFeatures['transactionIncomeMax']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].max())
    handPickedFeatures['transactionIncomeAvg']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].mean())
    handPickedFeatures['transactionExpenseNum']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].count())
    handPickedFeatures['transactionIncomeNum']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].count())
    handPickedFeatures['salaryNum']=bankRecordsPersonal['is_salary'].sum()
    handPickedFeatures['salaryTotal']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].sum())
    handPickedFeatures['salaryAvg']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].mean())
    handPickedFeatures['salaryMax']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].max())


    browseRecordsPersonal=browse.groupby('user_id')
    handPickedFeatures['browseNum']=browseRecordsPersonal['time'].count()
    handPickedFeatures['browseFreq']=(browseRecordsPersonal['time'].count()).div(browseRecordsPersonal['time'].max()-browseRecordsPersonal['time'].min()+1+0.000001)
    handPickedFeatures['browseTypes']=browseRecordsPersonal['browse_type'].nunique()
    handPickedFeatures['browseSubTypes']=browseRecordsPersonal.apply(lambda x: x.groupby('browse_type')['sub_type'].nunique().sum())


    billsPersonal=bills.groupby('user_id')
    handPickedFeatures['billBanksNum']=billsPersonal['bank_id'].nunique()
    handPickedFeatures['billNum']=billsPersonal['time'].count()
    handPickedFeatures['underpay']=billsPersonal.apply(lambda x: 1.0*sum(x['last_bill_amount']-x['last_bill_pay'])/len(x))
    handPickedFeatures['avgCreditLine']=billsPersonal['credit_line'].mean()
    handPickedFeatures['avgTransactionNum']=billsPersonal['transactionNum'].mean()
    handPickedFeatures['maxBankCredit_line']=billsPersonal.apply(lambda x: x.groupby('bank_id')['credit_line'].mean().max())
    handPickedFeatures['avgLastBillAmount']=billsPersonal.apply(lambda x: x.groupby('bank_id')['last_bill_amount'].mean().mean())
    handPickedFeatures['avgMaxLastBillAmount']=billsPersonal.apply(lambda x: x.groupby('bank_id')['last_bill_amount'].max().mean())
    handPickedFeatures['relative_bill_adjustment']=(billsPersonal['cur_bill_amount'].mean()-billsPersonal['adjusted_amount'].mean()).div(billsPersonal['cur_bill_amount'].mean()+0.00001)
    handPickedFeatures['avgBillTimeSpan']=billsPersonal['time'].max()-billsPersonal['time'].min()
    handPickedFeatures['maxBankBillTimeSpan']=billsPersonal.apply(lambda x: (x.groupby('bank_id')['time'].max()-x.groupby('bank_id')['time'].min()).max())
    handPickedFeatures['totalBankBillTimeSpan']=billsPersonal.apply(lambda x: (x.groupby('bank_id')['time'].max()-x.groupby('bank_id')['time'].min()).sum())
    handPickedFeatures['debtNum']=billsPersonal['debt_status'].mean()
    
    
    
    
    handPickedFeatures['afterLoanBillFreq']=billsPersonal.apply(afterLoanBillFreq)
    handPickedFeatures['afterLoanBillNum']=billsPersonal.apply(lambda x: x[x['time']>loanTime.loc[x['user_id'].mean(),'time']]['time'].count())
    handPickedFeatures['afterLoanUnderpay']=billsPersonal.apply(afterLoanUnderpay)
    handPickedFeatures['afterLoanSalaryChange']=bankRecordsPersonal.apply(afterLoanSalaryChange)
    handPickedFeatures['afterLoanBillTimeSpan']=billsPersonal.apply(lambda x: x['time'].max()-loanTime.loc[x['user_id'].mean(),'time'])
    handPickedFeatures['beforeLoanBillTimeSpan']=billsPersonal.apply(lambda x: loanTime.loc[x['user_id'].mean(),'time']-x['time'].min())
    handPickedFeatures['afterLoanBillAdjustment']=billsPersonal.apply(afterLoanBillAdjustment)


    handPickedFeatures['loanTime']=loanTimes['time']

    return handPickedFeatures

In [117]:
# make one-hot encoding for categorical features
def convert2Dummies(df, cln):
    dummies=pd.get_dummies(df[cln]).astype(int)
    dummies.columns=[cln+`c` for c in dummies.columns]

    return pd.concat([df, dummies], axis=1)

In [118]:
def makeXYdata(type='train'):
    if type=='combine':
        users_train, banks_train, browse_train, bills_train, loanTime_train, overdue_train=readFiles(type='train')
        users_test, banks_test, browse_test, bills_test, loanTime_test, _=readFiles(type='test')
        users=pd.concat([users_train,users_test])
        banks=pd.concat([banks_train, banks_test])
        browse=pd.concat([browse_train, browse_test])
        bills=pd.concat([bills_train, bills_test])
        loanTime=pd.concat([loanTime_train, loanTime_test])

    else:
        users, banks, browse, bills, loanTime, overdue=readFiles(type='train') if type=='train' else readFiles(type='test')

    billFeatures=extractBillFeatures(bills, users.index)
    browseTypeFeatures=extractBrowseTypeFeatures(browse, users.index)
    madeFeatures=handPickedFeatures(users, banks, browse, bills, loanTime)
    userFeatures=users.join(billFeatures).join(browseTypeFeatures)
    userFeatures=userFeatures.join(madeFeatures)

    # create dummies
    userFeatures=convert2Dummies(userFeatures, 'sex')
    userFeatures=convert2Dummies(userFeatures, 'profession')
    userFeatures=convert2Dummies(userFeatures, 'education')
    userFeatures=convert2Dummies(userFeatures, 'marriage')
    userFeatures=convert2Dummies(userFeatures, 'citizen_type')


    # fill in NA with zeros or other numbers
    userFeatures['transactionNum'].fillna(0, inplace=True)
    userFeatures['transactionNum']=userFeatures['transactionNum'].astype(int)


    userFeatures['salaryNum'].fillna(0, inplace=True)
    userFeatures['salaryNum']=userFeatures['salaryNum'].astype(int)

    # userFeatures['browseNum'].fillna(0, inplace=True)
    userFeatures['browseTypes'].fillna(0, inplace=True)
    userFeatures['browseTypes']=userFeatures['browseTypes'].astype(int)

    userFeatures['browseSubTypes'].fillna(0, inplace=True)
    userFeatures['browseSubTypes']=userFeatures['browseSubTypes'].astype(int)

    userFeatures['billBanksNum'].fillna(0, inplace=True)
    userFeatures['billBanksNum']=userFeatures['billBanksNum'].astype(int)

    userFeatures['billNum'].fillna(0, inplace=True)
    userFeatures['billNum']=userFeatures['billNum'].astype(int)

    userFeatures['salaryTotal'].fillna(0, inplace=True)
    userFeatures['transactionAmount'].fillna(0, inplace=True)

    print len(userFeatures.columns)

    imp=Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    userFeatures=imp.fit_transform(userFeatures)

    print userFeatures.shape

    return userFeatures, overdue if type=='train' else None

In [119]:
def produceTrainTest(X_train_, y_train_, X_test=None, y_test=None, test_size=0.2, random_state=8):
    if X_test is None:
        nTrain=int((1-test_size)*len(X_train_))
        X_train=X_train_[:nTrain]
        y_train=y_train_[:nTrain]
        
        X_test=X_train_[nTrain:]
        y_test=y_train_[nTrain:]
#         X_train, X_test, y_train, y_test=train_test_split(X_train_, y_train_\
#                                                   , test_size=test_size, random_state=random_state)
        scaleTemp=X_train_
    else:
        X_train, y_train=X_train_, y_train_
        scaleTemp=np.concatenate([X_train_, X_test])
    # standardization
    scaler=StandardScaler()
    scaler.fit(scaleTemp)
    X_train=scaler.transform(X_train)
    X_test=scaler.transform(X_test)  
    
    # compute the correlation between variants and labels
    corrmatrix=np.corrcoef(X_train_.T, y_train_)
    # compute relative importance of features
    
    
    
    factors=abs(np.array(corrmatrix[-1, :-1]))**1.5
    for i in xrange(0):
        factors=normalize(factors.reshape(1,-1))
        factors*=abs(corrmatrix[-1, :-1])
    # scale features according to their lative importance
    X_train*=factors
    X_test*=factors
    
    return X_train, X_test, y_train, y_test
    
    
            

In [120]:
def makeClassifier(X_train, y_train, clf_name):
    weights=y_train+0.3
    if clf_name=='knn':
        # nearest neighbors
        return KNeighborsClassifier(n_neighbors=10, weights='distance').fit(X_train, y_train)
    elif clf_name=='rf':
        # random forest classifier
        
        return rfc(n_estimators=1200, max_features=8*int(np.sqrt(X_train.shape[1])), max_depth=None, \
                   n_jobs=7, min_samples_split=16, random_state=0).fit(X_train, y_train, sample_weight=weights)
    elif clf_name=='xgb':
        # xgboost
        return XGBClassifier(learning_rate=0.04, n_estimators=1200, subsample=0.9, colsample_bylevel=0.9, \
                         objective='binary:logistic', max_depth=5, gamma=2, seed=0).fit(X_train, y_train)

In [127]:
X, y=makeXYdata()

312
(55596, 312)


In [11]:
print len(X)

55596


In [137]:
X_all, _=makeXYdata(type='combine')

315
(69495, 315)


In [138]:
X=X_all[:55596]
X_val=X_all[55596:]

In [139]:
X_train, X_test, y_train, y_test=produceTrainTest(X, y, X_val)
clf=makeClassifier(X_train, y_train, 'xgb')
prob=clf.predict_proba(X_test)[:,1]

In [140]:
submit=pd.DataFrame()
submit['userid']=np.arange(13899)+55597
submit['probability']=prob

In [141]:
outFile='../results/submit'+time.ctime()+'.csv'
submit.to_csv(outFile, index=False)

In [129]:
X_train, X_test, y_train, y_test=produceTrainTest(X, y)
clf=makeClassifier(X_train, y_train, 'rf')
prob=clf.predict_proba(X_test)[:,1]

print len(prob[prob>0.5])
print 1.0*sum(y_test)/len(y_test)
print sum(y_test)
print sum(y_train)
print 1.0*len(prob[(prob>0.5)&(y_test==1)])/sum(y_test==1)

p=prob[y_test==1]
n=prob[y_test==0]
print ks_2samp(p,n)

300
0.133992805755
1490
5693
0.0852348993289
Ks_2sampResult(statistic=0.44183793653780484, pvalue=9.9750448265880043e-221)


In [81]:
if __name__=='__main__':
    


In [45]:
X_train, X_test, y_train, _=produceTrainTest(X, y, X_val)

ValueError: operands could not be broadcast together with shapes (13899,37) (305,) (13899,37) 

In [33]:
clf=makeClassifier(X_train, y_train, 'xgb')
prob=clf.predict_proba(X_test)[:,1]

ValueError: setting an array element with a sequence.

In [39]:
print len(X_val[0])

55596
