In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import normalize

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

from scipy.stats import ks_2samp

from matplotlib import pyplot as plt
%matplotlib inline

inputFolder='../data/'
trainFolder=inputFolder+'train/'
testFolder=inputFolder+'test/'

In [399]:
trainUsers=pd.read_csv(trainFolder+'user_info_train.txt', \
                           names=['id','sex','profession','education','marriage','citizen_type'],\
                          index_col=0)

trainBanks=pd.read_csv(trainFolder+'bank_detail_train.txt',\
                          names=['user_id', 'time', 'transaction_type', 'amount', 'is_salary'])

trainBrowse=pd.read_csv(trainFolder+'browse_history_train.txt',\
                           names=['user_id', 'time', 'browse_type', 'sub_type'])

trainBills=pd.read_csv(trainFolder+'bill_detail_train.txt',\
                          names=['user_id', 'time', 'bank_id', 'last_bill_amount', 'last_bill_pay', 'credit_line',\
                                 'cur_bill_balance', 'cur_bill_min_due', 'transactionNum', 'cur_bill_amount', \
                                 'adjusted_amount', 'cumulative_interest', 'available_deposit', 'available_credit', 'debt_status'])

trainLoanTime=pd.read_csv(trainFolder+'loan_time_train.txt',\
                                names=['user_id', 'time'])

trainOverdue=pd.read_csv(trainFolder+'overdue_train.txt',\
                            names=['user_id', 'label'], index_col=0)

In [9]:
print 




In [383]:
userFeatures=trainUsers.copy()

bankRecordsPersonal=trainBanks.groupby('user_id')

userFeatures['transactionNum']=bankRecordsPersonal['amount'].count()
userFeatures['transactionAmount']=bankRecordsPersonal['amount'].sum()

# got worse
# userFeatures['transactionNetAmount']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].sum())-\
#         bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].sum())
# userFeatures['transactionTimeSpan']=bankRecordsPersonal['time'].max()-bankRecordsPersonal['time'].min()

userFeatures['transactionExpense']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].sum())
userFeatures['transactionExpenseMax']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].max())
userFeatures['transactionExpenseAvg']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].mean())

userFeatures['transactionIncome']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].sum())
# not useful
# userFeatures['transactionIncomeMax']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].max())
# userFeatures['transactionIncomeAvg']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].mean())

userFeatures['transactionExpenseNum']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==1]['amount'].count())
userFeatures['transactionIncomeNum']=bankRecordsPersonal.apply(lambda x: x[x['transaction_type']==0]['amount'].count())
userFeatures['salaryNum']=bankRecordsPersonal['is_salary'].sum()
userFeatures['salaryTotal']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].sum())
userFeatures['salaryAvg']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].mean())
userFeatures['salaryMax']=bankRecordsPersonal.apply(lambda x: x[x['is_salary']==1]['amount'].max())
# userFeatures['salaryAvg']=

browseRecordsPersonal=trainBrowse.groupby('user_id')
userFeatures['browseNum']=browseRecordsPersonal['time'].count()
userFeatures['browseTypes']=browseRecordsPersonal['browse_type'].nunique()
userFeatures['browseSubTypes']=browseRecordsPersonal.apply(lambda x: x.groupby('browse_type')['sub_type'].nunique().sum())

bills=trainBills.groupby('user_id')
userFeatures['billBanksNum']=bills['bank_id'].nunique()
userFeatures['billNum']=bills['time'].count()
userFeatures['available_credit']=bi

# userFeatures['loanTime']=trainLoanTime['time']

In [384]:
userFeatures['transactionNum'].fillna(0, inplace=True)
userFeatures['transactionNum']=userFeatures['transactionNum'].astype(int)


userFeatures['salaryNum'].fillna(0, inplace=True)
userFeatures['salaryNum']=userFeatures['salaryNum'].astype(int)

# userFeatures['browseNum'].fillna(0, inplace=True)
userFeatures['browseTypes'].fillna(0, inplace=True)
userFeatures['browseTypes']=userFeatures['browseTypes'].astype(int)

userFeatures['browseSubTypes'].fillna(0, inplace=True)
userFeatures['browseSubTypes']=userFeatures['browseSubTypes'].astype(int)

userFeatures['billBanksNum'].fillna(0, inplace=True)
userFeatures['billBanksNum']=userFeatures['billBanksNum'].astype(int)

userFeatures['billNum'].fillna(0, inplace=True)
userFeatures['billNum']=userFeatures['billNum'].astype(int)

userFeatures['salaryTotal'].fillna(0, inplace=True)
userFeatures['transactionAmount'].fillna(0, inplace=True)
# userFeatures['browseNum']=userFeatures['browseNum'].astype(int)





print userFeatures.head()

      sex  profession  education  marriage  citizen_type  transactionNum  \
id                                                                         
3150    1           2          4         1             4               0   
6965    1           2          4         3             2             364   
1265    1           3          4         3             1             419   
6360    1           2          4         3             2               0   
2583    2           2          2         1             1             831   

      transactionAmount  transactionAvgAmount  transactionExpense  \
id                                                                  
3150           0.000000                   NaN                 NaN   
6965        4207.382203             11.558742         3234.531975   
1265        5370.663258             12.817812         3662.457063   
6360           0.000000                   NaN                 NaN   
2583        9800.785996             11.793966        

In [385]:
userFeatures.sort_index(inplace=True)
trainOverdue.sort_index(inplace=True)

In [386]:
def convert2Dummies(df, cln):
    dummies=pd.get_dummies(df[cln]).astype(int)
    dummies.columns=[cln+`c` for c in dummies.columns]

    return pd.concat([df, dummies], axis=1)

In [387]:
userFeatures=convert2Dummies(userFeatures, 'sex')
userFeatures=convert2Dummies(userFeatures, 'profession')
userFeatures=convert2Dummies(userFeatures, 'education')
userFeatures=convert2Dummies(userFeatures, 'marriage')
userFeatures=convert2Dummies(userFeatures, 'citizen_type')


In [388]:
# overdueAmount=userFeatures['transactionAmount'].values[trainOverdue.values.ravel()==1]
# plt.hist(overdueAmount, bins=20)
# plt.figure()
# plt.hist(userFeatures['transactionAmount'].values, bins=20)

In [389]:
print userFeatures.head()

    sex  profession  education  marriage  citizen_type  transactionNum  \
id                                                                       
1     1           2          3         1             3              86   
2     1           2          3         2             1               0   
3     1           4          4         1             4             679   
4     1           4          4         3             2             291   
5     1           2          2         3             1               0   

    transactionAmount  transactionAvgAmount  transactionExpense  \
id                                                                
1         1156.718031             13.450210          676.025269   
2            0.000000                   NaN                 NaN   
3         7264.831053             10.699309         4985.957607   
4         3293.768106             11.318791         2129.425722   
5            0.000000                   NaN                 NaN   

    transac

In [390]:
userFeatures_select=userFeatures.copy()

In [391]:
X=userFeatures_select.values
imp=Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
X=imp.fit_transform(X)
y=trainOverdue.values.ravel()

In [392]:
corrmatrix=np.corrcoef(X.T, y)
print corrmatrix[-1,:-1]

[-0.09984565  0.00826746  0.03870428  0.00396767  0.01066433  0.02240804
  0.01833539  0.02938393  0.01928639 -0.052578    0.02922111  0.01526059
  0.01430856  0.00833712 -0.00175276 -0.00548025 -0.01266307 -0.01799713
 -0.02566857  0.01834179  0.01834026 -0.07649229 -0.05819992  0.13775726
 -0.01059447 -0.05074661  0.01815592  0.00654069 -0.01376585 -0.00278987
  0.0146259   0.01804614 -0.00943854 -0.0422042  -0.00684591  0.03592918
  0.01793669 -0.00793756  0.00428987 -0.00054338  0.00666468  0.01036003
  0.01804614 -0.02056155  0.01613081 -0.01261837  0.01202309]


In [393]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=8)
weights=y_train+1

In [394]:
scaler=StandardScaler()
scaler.fit_transform(X_train)
scaler.transform(X_test)

array([[-0.33689176,  2.14097994,  0.99953475, ..., -0.62446927,
        -0.41475882, -0.58884405],
       [-0.33689176,  0.81791615, -0.30181203, ...,  1.60135982,
        -0.41475882, -0.58884405],
       [-0.33689176, -0.50514763, -0.30181203, ...,  1.60135982,
        -0.41475882, -0.58884405],
       ..., 
       [-0.33689176, -0.50514763, -0.30181203, ..., -0.62446927,
        -0.41475882,  1.6982425 ],
       [-0.33689176, -0.50514763, -0.30181203, ..., -0.62446927,
        -0.41475882, -0.58884405],
       [-0.33689176, -0.50514763,  0.99953475, ...,  1.60135982,
        -0.41475882, -0.58884405]])

In [395]:
factors=abs(np.array(corrmatrix[-1, :-1]))**1.7
for i in xrange(2):
    factors=normalize(factors.reshape(1,-1))
    factors*=abs(corrmatrix[-1, :-1])
print factors

[[  3.75782888e-02   3.72989658e-06   1.12752794e-03   2.46603444e-07
    9.56693608e-06   1.49249670e-04   7.10546474e-05   4.06842089e-04
    8.56736677e-05   3.50262852e-03   3.98563203e-04   3.60274373e-05
    2.83872668e-05   3.84749474e-06   1.20001948e-08   8.14670218e-07
    1.80639722e-05   6.63240754e-05   2.46720482e-04   7.11464696e-05
    7.11245881e-05   1.40218264e-02   5.10072720e-03   1.23635379e-01
    9.33711659e-06   3.07202916e-03   6.85152170e-05   1.56756394e-06
    2.46031249e-05   6.70010602e-08   3.07874109e-05   6.69948523e-05
    6.08928515e-06   1.55322838e-03   1.85572734e-06   8.56198260e-04
    6.55036963e-05   3.20819762e-06   3.29201956e-07   1.57505213e-10
    1.68036243e-06   8.59517488e-06   6.69948523e-05   1.08574041e-04
    4.42330877e-05   1.78291861e-05   1.49100259e-05]]


In [396]:
X_train*=factors
X_test*=factors
    

In [397]:
knnc = KNeighborsClassifier(n_neighbors=10, weights='distance')
knnc.fit(X_train, y_train)
pred=knnc.predict_proba(X_test)[:,1]

In [398]:
print len(pred[pred>0.5])
print 1.0*sum(y_test)/len(y_test)
print sum(y_test)
print sum(y_train)
print 1.0*len(pred[(pred>0.5)&(y_test==1)])/sum(y_test==1)

p=pred[y_test==1]
n=pred[y_test==0]
print ks_2samp(p,n)

383
0.130575539568
1452
5731
0.0771349862259
Ks_2sampResult(statistic=0.13507997187050858, pvalue=1.4274786593642516e-20)
