In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import normalize

from sklearn.ensemble import RandomForestClassifier as rfc

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

from scipy.stats import ks_2samp

from matplotlib import pyplot as plt
%matplotlib inline

# Locations of the raw data, relative to the notebook's working directory.
# The train/ and test/ sub-folders sit directly under inputFolder.
inputFolder = '../data/'
trainFolder = ''.join([inputFolder, 'train/'])
testFolder = ''.join([inputFolder, 'test/'])

In [16]:
# Load the six raw training tables.
# `names=` is passed without `header=`, so pandas treats every line of each
# file (including the first) as data and labels the columns with these names.

# Static user attributes; the first column ('id') becomes the index.
trainUsers = pd.read_csv(
    trainFolder + 'user_info_train.txt',
    names=['id', 'sex', 'profession', 'education', 'marriage', 'citizen_type'],
    index_col=0)

# Bank statement records (several rows per user).
trainBanks = pd.read_csv(
    trainFolder + 'bank_detail_train.txt',
    names=['user_id', 'time', 'transaction_type', 'amount', 'is_salary'])

# Browsing-behaviour records (several rows per user).
trainBrowse = pd.read_csv(
    trainFolder + 'browse_history_train.txt',
    names=['user_id', 'time', 'browse_type', 'sub_type'])

# Credit-card bill records (several rows per user and bank).
trainBills = pd.read_csv(
    trainFolder + 'bill_detail_train.txt',
    names=['user_id', 'time', 'bank_id',
           'last_bill_amount', 'last_bill_pay', 'credit_line',
           'cur_bill_balance', 'cur_bill_min_due', 'transactionNum',
           'cur_bill_amount', 'adjusted_amount', 'cumulative_interest',
           'available_deposit', 'available_credit', 'debt_status'])

# Loan timestamps. NOTE: no index_col here, so this frame keeps a 0-based
# positional index and 'user_id' stays an ordinary column.
trainLoanTime = pd.read_csv(
    trainFolder + 'loan_time_train.txt',
    names=['user_id', 'time'])

# Target labels; the first column ('user_id') becomes the index.
trainOverdue = pd.read_csv(
    trainFolder + 'overdue_train.txt',
    names=['user_id', 'label'],
    index_col=0)

In [17]:
# Split the bill records into one DataFrame per bank_id.
#
# A single groupby pass replaces the original "filter the whole table once per
# bank" loop. groupby only yields non-empty groups and skips rows whose
# bank_id is NaN, which is exactly what the old `len(bs)>=1` guard achieved.
# sort=False keeps the banks in order of first appearance, i.e. the order
# `unique()` returns them in.
bank_ids = trainBills.bank_id.unique()
bankBills = dict()
for b_id, bs in trainBills.groupby('bank_id', sort=False):
    bankBills[b_id] = bs

# list(...) so the output is a plain list on both Python 2 and Python 3.
print(list(bankBills.keys()))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 510014, 510016, 510017, 510022, 510024, 510026, 510027, 510033, 510037, 510044, 510050, 510053, 510057]


In [40]:
# Experiment: re-express the bank 'amount' column through fundConv on a copy of
# the bank records, leaving trainBanks itself untouched.
# NOTE(review): presumably an attempt to undo a transformation applied to the
# published amounts - TODO confirm. trialTrainBanks is not read by any later
# cell visible in this notebook.
def fundConv(x):
    """Return 2 * (exp(x) - 1) ** 0.5, element-wise for array-like input."""
    return 2 * np.power(np.exp(x) - 1, 0.5)


# assign() returns a new frame with 'amount' replaced in its original position.
trialTrainBanks = trainBanks.assign(amount=fundConv(trainBanks['amount']))
print(trialTrainBanks['amount'].head())

0    1942.009713
1    1942.009713
2    2746.415581
3     386.454945
4    1842.351624
Name: amount, dtype: float64


In [18]:
# Per-bank bill features: one row per training user, one column per bank
# holding that user's mean credit_line at that bank.
#
# The frame is indexed by the real user ids taken from trainUsers instead of a
# fabricated 1..N range, so the later index join with the user table lines up
# even if the ids are not contiguous or do not start at 1.
billFeatures = pd.DataFrame(index=trainUsers.index)
for b_id in bankBills.keys():
    # Zero-pad the id to at least two characters, e.g. 'bank03AvgCreditLine'.
    bankBillName = 'bank' + str(b_id).zfill(2)
    # The groupby result is indexed by user_id; assignment aligns it on the
    # frame's index, leaving NaN for users with no bill at this bank.
    billFeatures[bankBillName + 'AvgCreditLine'] = bankBills[b_id].groupby('user_id')['credit_line'].mean()
#     billFeatures[bankBillName+'MaxCreditLine']=bankBills[b_id].groupby('user_id')['available_credit'].mean()

# -1 marks "no bill record for this user at this bank".
billFeatures = billFeatures.fillna(-1)


In [83]:
# Start the feature table from the static user attributes and attach the
# per-bank bill columns by index. join() returns a new frame, so trainUsers
# itself is left unmodified.
userFeatures = trainUsers.join(billFeatures)

In [None]:


# ---- Bank-statement features (one value per user) ---------------------------
#
# Every statistic below used to be a `groupby('user_id').apply(<lambda>)`,
# i.e. one Python call per user per feature. The same numbers are now produced
# by filtering the rows once and using the built-in grouped aggregations.
#
# Users who have bank records but none matching a filter must still get 0 for
# sums and counts (and NaN for max / mean), as before, so sums and counts are
# re-indexed onto the full set of users present in trainBanks.
# Users with no bank records at all end up NaN for every column here, because
# assignment aligns on the userFeatures index.
bankRecordsPersonal = trainBanks.groupby('user_id')
bankUsers = bankRecordsPersonal.size().index


def _amountBy(rowMask):
    """Group the 'amount' of the bank rows selected by rowMask per user."""
    return trainBanks.loc[rowMask].groupby('user_id')['amount']


def _zeroFilled(perUserStat):
    """Give users that have bank records but no matching rows a value of 0."""
    return perUserStat.reindex(bankUsers, fill_value=0)


# The feature names below call transaction_type 1 "expense" and 0 "income".
isExpense = trainBanks['transaction_type'] == 1
isIncome = trainBanks['transaction_type'] == 0
isSalary = trainBanks['is_salary'] == 1

expenseAmounts = _amountBy(isExpense)
incomeAmounts = _amountBy(isIncome)
salaryAmounts = _amountBy(isSalary)

expenseTotal = _zeroFilled(expenseAmounts.sum())
incomeTotal = _zeroFilled(incomeAmounts.sum())

userFeatures['transactionNum'] = bankRecordsPersonal['amount'].count()
userFeatures['transactionAmount'] = bankRecordsPersonal['amount'].sum()

# Type-1 total minus type-0 total.
userFeatures['transactionNetAmount'] = expenseTotal - incomeTotal

userFeatures['transactionExpense'] = expenseTotal
userFeatures['transactionExpenseMax'] = expenseAmounts.max()
userFeatures['transactionExpenseAvg'] = expenseAmounts.mean()

userFeatures['transactionIncome'] = incomeTotal
userFeatures['transactionIncomeMax'] = incomeAmounts.max()
userFeatures['transactionIncomeAvg'] = incomeAmounts.mean()

userFeatures['transactionExpenseNum'] = _zeroFilled(expenseAmounts.count())
userFeatures['transactionIncomeNum'] = _zeroFilled(incomeAmounts.count())

userFeatures['salaryNum'] = bankRecordsPersonal['is_salary'].sum()
userFeatures['salaryTotal'] = _zeroFilled(salaryAmounts.sum())
userFeatures['salaryAvg'] = salaryAmounts.mean()
userFeatures['salaryMax'] = salaryAmounts.max()

# Mean amount of type-0 rows that are not flagged as salary.
userFeatures['nonSalary'] = _amountBy((trainBanks['is_salary'] == 0) & isIncome).mean()


# ---- Browsing-history features (one value per user) -------------------------
browseRecordsPersonal = trainBrowse.groupby('user_id')
userFeatures['browseNum'] = browseRecordsPersonal['time'].count()
userFeatures['browseTypes'] = browseRecordsPersonal['browse_type'].nunique()

# browseSubTypes: for each user, the number of distinct sub_type values within
# each browse_type, summed over that user's browse types. A two-key groupby
# replaces the former nested groupby inside a per-user apply().
subTypesPerType = trainBrowse.groupby(['user_id', 'browse_type'])['sub_type'].nunique()
userFeatures['browseSubTypes'] = (
    subTypesPerType.groupby(level='user_id').sum()
    # A user whose rows all have a NaN browse_type has no (user, type) group;
    # the empty sum for such a user is 0, as it was with apply().
    .reindex(browseRecordsPersonal.size().index, fill_value=0)
)


# ---- Credit-card bill features (one value per user) -------------------------
#
# "Per bank, then across banks" statistics used to run a nested groupby inside
# a per-user apply(). They are now computed once on a (user_id, bank_id)
# grouping and then reduced over the user level, giving the same values.
bills = trainBills.groupby('user_id')
billsPerBank = trainBills.groupby(['user_id', 'bank_id'])


def _acrossBanks(perBankStat):
    """Regroup a (user_id, bank_id)-indexed statistic by user_id only."""
    return perBankStat.groupby(level='user_id')


userFeatures['billBanksNum'] = bills['bank_id'].nunique()
userFeatures['billNum'] = bills['time'].count()

# Mean of (last_bill_amount - last_bill_pay) over all of a user's bill rows.
# skipna=False keeps the original behaviour of the builtin sum(): a NaN in
# either column of any row makes that user's value NaN.
billGap = trainBills['last_bill_amount'] - trainBills['last_bill_pay']
userFeatures['underpay'] = billGap.groupby(trainBills['user_id']).agg(
    lambda gap: gap.mean(skipna=False))

userFeatures['avgCreditLine'] = bills['credit_line'].mean()
userFeatures['avgTransactionNum'] = bills['transactionNum'].mean()
# Highest per-bank mean credit line.
userFeatures['maxBankCredit_line'] = _acrossBanks(billsPerBank['credit_line'].mean()).max()
# Mean over banks of the per-bank mean / per-bank max last bill amount.
userFeatures['avgLastBillAmount'] = _acrossBanks(billsPerBank['last_bill_amount'].mean()).mean()
userFeatures['avgMaxLastBillAmount'] = _acrossBanks(billsPerBank['last_bill_amount'].max()).mean()

# Despite the name, this is the overall span (latest minus earliest bill time).
userFeatures['avgBillTimeSpan'] = bills['time'].max() - bills['time'].min()

# Span of bill times within each bank, then the longest / the sum over banks.
bankTimeSpan = billsPerBank['time'].max() - billsPerBank['time'].min()
userFeatures['maxBankBillTimeSpan'] = _acrossBanks(bankTimeSpan).max()
userFeatures['totalBankBillTimeSpan'] = (
    _acrossBanks(bankTimeSpan).sum()
    # Users whose rows all have a NaN bank_id have no per-bank group; their
    # empty sum is 0, as it was with apply().
    .reindex(bills.size().index, fill_value=0)
)

# Despite the name, this is the mean of debt_status over the user's bill rows.
userFeatures['debtNum'] = bills['debt_status'].mean()


# Loan time per user.
# trainLoanTime is loaded without index_col, so it carries a 0-based positional
# index while userFeatures is indexed by user id. Assigning the raw 'time'
# column would therefore align row position k with user id k (an off-by-one
# that gives each user some other row's loan time). Index by user_id first so
# the assignment aligns on the actual user.
userFeatures['loanTime'] = trainLoanTime.set_index('user_id')['time']


In [None]:
# userFeatures['try']=bills.apply(lambda x: x.groupby('bank_id')['cur_bill_amount'].mean().mean())
# userFeatures['try']=bills.apply(lambda x: x.groupby('bank_id')['cumulative_interest'].mean().max())

# userFeatures['try']=bills.apply(lambda x: x.groupby('bank_id')['cur_bill_min_due'].mean().mean())

# print userFeatures[['try', 'billNum']].head(20)

In [None]:
# Users with no bank / browse / bill records come out of the aggregations as
# NaN; for the columns below a missing value is recorded as 0.
#
# Each column is reassigned instead of calling fillna(..., inplace=True) on
# `userFeatures[col]`: that is an in-place call on a column selection, which
# does not update the parent frame under pandas Copy-on-Write (and would then
# make the following astype(int) fail on the remaining NaNs).

# Count-like columns: fill with 0 and store as integers.
countColumns = ['transactionNum', 'salaryNum', 'browseTypes',
                'browseSubTypes', 'billBanksNum', 'billNum']
for countColumn in countColumns:
    userFeatures[countColumn] = userFeatures[countColumn].fillna(0).astype(int)

# Amount totals: fill with 0 and keep the existing dtype.
for amountColumn in ['salaryTotal', 'transactionAmount']:
    userFeatures[amountColumn] = userFeatures[amountColumn].fillna(0)

# 'browseNum' is deliberately left unfilled, as before (its fillna / astype
# lines were commented out in the original cell).

print(userFeatures.head())

In [None]:
# Put features and labels in the same (ascending index) order; later cells
# take `.values` from both and pair them up row by row.
# NOTE(review): this only lines up if both frames contain exactly the same
# ids - TODO confirm.
userFeatures = userFeatures.sort_index()
trainOverdue = trainOverdue.sort_index()

In [None]:
def convert2Dummies(df, cln):
    """Append 0/1 indicator columns for the categories of ``df[cln]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cln : str
        Name of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column (``cln`` included) followed
        by one integer indicator column per distinct value of ``df[cln]``,
        named ``cln + str(value)`` (e.g. ``'sex1'``, ``'sex2'``).
    """
    dummies = pd.get_dummies(df[cln]).astype(int)
    # str() replaces the backtick repr syntax, which only exists in Python 2.
    # For integer category codes the resulting names are the same.
    dummies.columns = [cln + str(c) for c in dummies.columns]

    return pd.concat([df, dummies], axis=1)

In [None]:
# One-hot expand each categorical user attribute in turn. The original code
# column is kept next to its indicator columns.
# NOTE: re-running this cell appends the indicator columns a second time.
for categoricalColumn in ['sex', 'profession', 'education', 'marriage', 'citizen_type']:
    userFeatures = convert2Dummies(userFeatures, categoricalColumn)


In [None]:
# overdueAmount=userFeatures['transactionAmount'].values[trainOverdue.values.ravel()==1]
# plt.hist(overdueAmount, bins=20)
# plt.figure()
# plt.hist(userFeatures['transactionAmount'].values, bins=20)

In [None]:
# Quick look at the first rows of the feature table after dummy expansion.
print(userFeatures.head())

In [None]:
# Working copy used for modelling; every column of userFeatures is kept.
userFeatures_select = userFeatures.copy(deep=True)

In [None]:
# Design matrix: remaining NaNs are replaced column by column (axis=0) with the
# column's most frequent value.
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
X = imp.fit_transform(userFeatures_select.values)

# Label vector, flattened to 1-D.
# NOTE(review): rows of X and entries of y are paired purely by position, so
# this assumes userFeatures_select and trainOverdue hold the same ids in the
# same order - TODO confirm.
y = trainOverdue.values.ravel()

In [None]:
# Correlation matrix of all feature columns plus the label: corrcoef treats
# each row of X.T as one variable and appends y as the final variable.
corrmatrix = np.corrcoef(X.T, y)

# Last row without its final entry = correlation of every feature with the label.
labelRow = corrmatrix[-1]
print(labelRow[:-1])

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=8)

# Per-sample training weights: label + 1, so a label of 1 weighs twice as much
# as a label of 0.
weights = 1 + y_train

In [None]:
# Standardise the features: fit the scaler on the training split only, then
# apply the same transformation to the hold-out split.
#
# fit_transform() / transform() return new arrays (StandardScaler copies by
# default), so the results must be assigned back. The original cell discarded
# them, which left X_train / X_test unscaled for the distance-based KNN model.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Per-feature weights derived from how strongly each feature correlates with
# the label.
#
# The correlation is measured on the training split only. Using the matrix
# computed on the full data set would let hold-out labels influence the
# weights and bias the hold-out diagnostics. Correlation is unaffected by
# per-column standardisation, so this works whether or not X_train is scaled.
#
# A column that is constant within the training split has an undefined (NaN)
# correlation; nan_to_num gives it weight 0 rather than letting NaN spread
# through normalize() and the feature matrices.
trainLabelCorr = np.abs(np.nan_to_num(np.corrcoef(X_train.T, y_train)[-1, :-1]))

# Start from |corr| ** 1.7, then twice: L2-normalise the weight row and
# multiply by |corr| again. The result has shape (1, n_features).
factors = trainLabelCorr ** 1.7
for i in range(2):
    factors = normalize(factors.reshape(1, -1))
    factors *= trainLabelCorr
print(factors)

In [None]:
# Scale every feature column by its weight, in place; the (1, n_features)
# factors row broadcasts across all rows.
# NOTE: not idempotent - running this cell twice applies the weights twice.
np.multiply(X_train, factors, out=X_train)
np.multiply(X_test, factors, out=X_test)
    

In [None]:
# K-nearest-neighbours baseline: 10 neighbours, votes weighted by distance.
# fit() returns the estimator itself, so construction and fitting are chained.
knnc = KNeighborsClassifier(weights='distance', n_neighbors=10).fit(X_train, y_train)

# Second predict_proba column, used as the score for label 1.
# NOTE(review): assumes the classes are ordered [0, 1] - TODO confirm.
knnScores = knnc.predict_proba(X_test)
pred = knnScores[:, 1]

In [None]:
# Hold-out diagnostics for the scores currently held in `pred`.
flagged = pred > 0.5

print(len(pred[flagged]))                      # rows scored above 0.5
print(1.0 * sum(y_test) / len(y_test))         # mean label in the hold-out split
print(sum(y_test))                             # label total, hold-out split
print(sum(y_train))                            # label total, training split
# Share of label-1 hold-out rows that were scored above 0.5.
print(1.0 * len(pred[flagged & (y_test == 1)]) / sum(y_test == 1))

# Two-sample Kolmogorov-Smirnov test between the scores of label-1 rows and
# the scores of label-0 rows.
p = pred[y_test == 1]
n = pred[y_test == 0]
print(ks_2samp(p, n))

In [None]:
# Random forest: 1200 unpruned trees, at least 16 samples required to split a
# node, 7 parallel jobs, fixed seed.
rfParams = dict(
    n_estimators=1200,
    max_depth=None,
    n_jobs=7,
    min_samples_split=16,
    random_state=0,
)
clf_rf = rfc(**rfParams)

# Fit using the per-sample weights (label + 1) built alongside the split.
clf_rf.fit(X_train, y_train, sample_weight=weights)

In [None]:
# Score the hold-out rows with the random forest (second predict_proba column),
# then print the same diagnostics as for the KNN model.
rfScores = clf_rf.predict_proba(X_test)
pred = rfScores[:, 1]

aboveHalf = pred > 0.5
print(len(pred[aboveHalf]))                    # rows scored above 0.5
print(1.0 * sum(y_test) / len(y_test))         # mean label in the hold-out split
print(sum(y_test))                             # label total, hold-out split
print(sum(y_train))                            # label total, training split
# Share of label-1 hold-out rows that were scored above 0.5.
print(1.0 * len(pred[aboveHalf & (y_test == 1)]) / sum(y_test == 1))

# Two-sample Kolmogorov-Smirnov test between the scores of label-1 rows and
# the scores of label-0 rows.
p = pred[y_test == 1]
n = pred[y_test == 0]
print(ks_2samp(p, n))