# 신용카드 사용정보로 해당 Transaction 의 이상(Fraud)여부를 예측

In [523]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as sm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [524]:
# Load the credit-card transaction dataset from the local CSV file.
data = pd.read_csv("15.csv")

In [525]:
# Show summary statistics of the loaded data.
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,...,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,992.0
mean,94813.585781,2e-06,6.661837e-07,-2e-06,2e-06,4.405008e-08,2e-06,-6e-06,1e-06,-2e-06,...,-9.166149e-07,-2e-06,-1e-06,-3.088756e-08,2e-06,3e-06,8.483873e-09,-4.792707e-08,88.349168,0.495968
std,47488.00453,1.958699,1.651311,1.516257,1.415871,1.380249,1.332273,1.237092,1.194355,1.098634,...,0.7345251,0.725702,0.624461,0.6056481,0.521278,0.482225,0.4036332,0.3300838,250.120432,0.500236
min,0.0,-56.40751,-72.71573,-48.325589,-5.683171,-113.7433,-26.160506,-43.557242,-73.216718,-13.434066,...,-34.83038,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.25,-0.920374,-0.5985522,-0.890368,-0.848642,-0.6915995,-0.768296,-0.55408,-0.208628,-0.643098,...,-0.2283974,-0.542351,-0.161846,-0.3545895,-0.317142,-0.326979,-0.07083961,-0.05295995,5.6,0.0
50%,84691.5,0.018109,0.06549621,0.179846,-0.019845,-0.05433621,-0.274186,0.040097,0.022358,-0.051429,...,-0.0294502,0.006781,-0.011196,0.04097671,0.016596,-0.052134,0.001342244,0.01124381,22.0,0.0
75%,139320.0,1.315645,0.8037257,1.027198,0.743348,0.6119267,0.398567,0.570426,0.327346,0.59714,...,0.1863701,0.528548,0.147641,0.439527,0.350716,0.240955,0.09104579,0.07828043,77.16,1.0
max,172788.0,2.45493,22.05773,9.382558,16.875344,34.80167,73.301626,120.589494,20.007208,15.594995,...,27.20284,10.50309,22.528412,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [526]:
# Peek at five sampled rows.
data.sample(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
124763,77230.0,-1.271877,0.248638,-0.073104,-0.336529,-0.782379,-0.062836,1.776083,0.286979,-0.736094,...,0.315112,0.390624,0.686424,0.032392,-0.177793,0.28518,0.107888,0.183209,349.61,
77912,56957.0,-1.64329,-1.886028,1.771232,-1.25219,1.142856,-2.011341,-0.637147,-0.065567,-1.298345,...,-0.318193,-1.065566,0.473506,0.473567,-0.543376,0.567544,-0.220625,-0.076304,45.8,
274125,165849.0,0.443117,-0.344064,-2.49224,-1.675471,2.090221,3.914964,-1.336886,-1.943732,-1.234642,...,-1.108148,1.19278,-0.133106,0.738858,0.216407,0.079869,0.249301,0.379855,110.14,
161242,113722.0,2.058702,-0.204885,-1.640988,-0.025752,0.365018,-0.303526,-0.061372,-0.010453,0.743192,...,0.276858,0.846477,-0.027507,0.204098,0.314687,-0.434967,-0.007236,-0.065154,1.0,
152845,97374.0,1.929557,-0.493178,0.187064,0.560419,-0.976007,-0.372312,-0.929273,-0.066178,2.651414,...,-0.106463,0.041204,0.285473,-0.13804,-0.607347,0.446228,-0.043545,-0.04009,39.0,


In [527]:
# Label availability: rows without a Class value vs. rows with one.
class_col = data['Class']
print(class_col.isna().sum())      # missing labels : 283814
print(class_col.notna().sum())     # labelled rows : 992
print(data['Time'].notna().sum())  # rows with a Time value : 284806

283814
992
284806


## Analysis
- Time : 발생시간
- V1~V28 : 신용카드 사용 Transaction에 대한 정보(PCA를 통해 변형된 형태)
- Amount : 구매액
- Class : Fraud 여부(0: 정상, 1: Fraud)

1) Class가 Null인 데이터 중 Amount가 200 이하인 데이터 제거(소액 제거)<br>
2) Class가 Null이 아닌 992개는 Amount와 무관하게 제거하지 않고 그대로 둔다.

```
data_new_df = data[(~data['Class'].isnull()) | 
                   ((data['Class'].isnull()) & (data['Amount'] > 200))]
```
```
data_new_df = data[(data['Class'].notnull()) |
                   ((data['Class'].isnull()) & (data['Amount'] > 200))]
```

In [528]:
# Work on a copy of the loaded frame.
data_cp = data.copy()

In [529]:
# Split the working copy by label availability.
label_missing = data_cp['Class'].isna()
large_amount = data_cp['Amount'] > 200

# Unlabelled rows are kept only when Amount exceeds 200 (len : 28704).
data_df_isnull = data_cp.loc[label_missing & large_amount]
# Every labelled row is kept regardless of Amount (len : 992).
data_df_notnull = data_cp.loc[~label_missing]

3) V1~V28을 이용하여 Class가 Null이 아닌 모든 데이터와 유클리디안 거리 계산.<br>
   (Class가 Null인 점마다 5개 가장 가까운 Class가 Null이 아닌 점을 선별)
  

- V1~V28을 이용하여 Class가 Null이 아닌 모든 데이터
```
data_df_notnull = data_new_df[data_new_df['Class'].notnull()]
data_df_isnull = data_new_df[data_new_df['Class'].isnull()]
print(len(data_df_isnull))
print(len(data_df_notnull))
```

In [530]:
# Display the unlabelled rows whose Amount exceeds 200.
data_df_isnull

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
994,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,
1012,16.0,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.445290,-0.446196,...,-0.295583,-0.571955,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.71,
1043,36.0,-1.004929,-0.985978,-0.038039,3.710061,-6.631951,5.122103,4.371691,-2.006868,-0.278736,...,1.393406,-0.381671,0.969719,0.019445,0.570923,0.333278,0.857373,-0.075538,1402.95,
1056,42.0,-0.522666,1.009923,0.276470,1.475289,-0.707013,0.355243,1.559849,-0.399579,-0.479813,...,0.172401,1.011543,0.069666,0.157820,-1.109224,-0.302369,0.318170,0.316910,243.66,
1077,55.0,-4.575093,-4.429184,3.402585,0.903915,3.002224,-0.491078,-2.705393,0.666451,1.922216,...,-0.047365,0.853360,-0.971600,-0.114862,0.408300,-0.304576,0.547785,-0.456297,200.01,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284732,172721.0,1.076175,-3.116353,-2.051439,-0.953189,-1.544838,-1.124645,0.385570,-0.698014,-1.829401,...,0.104853,-0.598243,-0.343164,0.088299,-0.267658,-0.313651,-0.132091,0.056029,664.60,
284733,172726.0,-1.353026,0.289945,0.173684,-3.780442,-1.173016,-0.159307,3.009433,-1.544902,1.422788,...,0.415421,-0.701799,-0.144663,1.056709,1.074381,-1.129447,0.033781,-0.325899,405.09,
284735,172727.0,-1.661169,-0.565425,0.294268,-1.549156,-2.301359,2.365956,-0.248881,-0.857361,0.137784,...,1.432397,-0.257828,-0.072471,-1.035804,-0.437889,-0.238543,0.365302,-0.448621,381.05,
284748,172738.0,1.634178,-0.486939,-1.975967,0.495364,0.263635,-0.713049,0.459925,-0.336879,0.743676,...,-0.113197,-0.493594,0.001993,0.602533,-0.049936,-0.145522,-0.040554,0.024884,220.28,


In [531]:
# Only V1~V28 serve as features, so the label and the Time/Amount columns
# are removed from both frames.
non_feature_cols = ['Class', 'Time', 'Amount']

# Labelled rows: features and target used to fit the classifier.
X = data_df_notnull.drop(columns=non_feature_cols)
y = data_df_notnull['Class']

# Unlabelled rows: same feature columns, to be scored by the classifier.
test_X = data_df_isnull.drop(columns=non_feature_cols)

In [532]:
# Preview the first five feature rows of the labelled data.
X.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.416956,0.126911,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276
1,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,-0.838587,...,0.283345,2.102339,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764
2,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,-1.525412,...,-1.334441,-0.430022,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029
3,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,-4.801637,...,0.308334,-0.171608,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573
4,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,-2.447469,...,-2.721853,0.009061,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793


In [533]:
# Fit a 5-nearest-neighbour classifier on the labelled rows.
# The estimator repr shown below the cell reports metric='minkowski', p=2
# and weights='uniform'.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [534]:
# Predicted class and per-class probability for every unlabelled row.
pred = knn.predict(test_X)
pred_prob = knn.predict_proba(test_X)

In [535]:
# Wrap the probability array in a DataFrame (columns 0 and 1, one per class).
pred_df = pd.DataFrame(pred_prob)

In [536]:
# Inspect the column names
list(pred_df.columns)

[0, 1]

In [537]:
# Give the two probability columns descriptive names (modifies pred_df in place).
pred_df.rename(columns={0:"prob_0", 1:"prob_1"}, inplace=True)

In [538]:
# Reset to a fresh 0..n-1 index so the rows line up with pred_df on join.
# drop=True discards the old index directly instead of first materialising it
# as an 'index' column and then dropping that column.
data_df_isnull = data_df_isnull.reset_index(drop=True)

In [539]:
# Join the rows whose Class is null with their KNN predict_proba values
# (index-aligned; every row of pred_df is kept).
data_df_join = data_df_isnull.join(pred_df, how='right')

In [540]:
# Display the unlabelled rows together with prob_0 / prob_1.
data_df_join

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,prob_0,prob_1
0,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,,1.0,0.0
1,16.0,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.445290,-0.446196,...,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.71,,0.8,0.2
2,36.0,-1.004929,-0.985978,-0.038039,3.710061,-6.631951,5.122103,4.371691,-2.006868,-0.278736,...,0.969719,0.019445,0.570923,0.333278,0.857373,-0.075538,1402.95,,0.6,0.4
3,42.0,-0.522666,1.009923,0.276470,1.475289,-0.707013,0.355243,1.559849,-0.399579,-0.479813,...,0.069666,0.157820,-1.109224,-0.302369,0.318170,0.316910,243.66,,0.8,0.2
4,55.0,-4.575093,-4.429184,3.402585,0.903915,3.002224,-0.491078,-2.705393,0.666451,1.922216,...,-0.971600,-0.114862,0.408300,-0.304576,0.547785,-0.456297,200.01,,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28699,172721.0,1.076175,-3.116353,-2.051439,-0.953189,-1.544838,-1.124645,0.385570,-0.698014,-1.829401,...,-0.343164,0.088299,-0.267658,-0.313651,-0.132091,0.056029,664.60,,1.0,0.0
28700,172726.0,-1.353026,0.289945,0.173684,-3.780442,-1.173016,-0.159307,3.009433,-1.544902,1.422788,...,-0.144663,1.056709,1.074381,-1.129447,0.033781,-0.325899,405.09,,0.8,0.2
28701,172727.0,-1.661169,-0.565425,0.294268,-1.549156,-2.301359,2.365956,-0.248881,-0.857361,0.137784,...,-0.072471,-1.035804,-0.437889,-0.238543,0.365302,-0.448621,381.05,,1.0,0.0
28702,172738.0,1.634178,-0.486939,-1.975967,0.495364,0.263635,-0.713049,0.459925,-0.336879,0.743676,...,0.001993,0.602533,-0.049936,-0.145522,-0.040554,0.024884,220.28,,0.6,0.4


```
data_df_join[(data_df_join['prob_0']==1) | (data_df_join['prob_1']==1)]
```

In [541]:
# Class is 1 when either prob_0 or prob_1 equals 1; otherwise it is 0.
unanimous = (data_df_join['prob_0'] == 1) | (data_df_join['prob_1'] == 1)
data_df_join['Class'] = np.where(unanimous, 1.0, 0.0)

In [542]:
# Number of rows whose Class is 1
(data_df_join['Class'] == 1).sum()

18171

In [543]:
# Stack the newly labelled rows on top of the originally labelled rows.
# The latter have no prob_0/prob_1 values, and each input keeps its own
# index labels (see the displayed frame below).
data_df_tot = pd.concat([data_df_join, data_df_notnull], axis=0, sort=False)

In [544]:
# Display the combined frame.
data_df_tot

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,prob_0,prob_1
0,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,1.0,1.0,0.0
1,16.0,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.445290,-0.446196,...,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.71,0.0,0.8,0.2
2,36.0,-1.004929,-0.985978,-0.038039,3.710061,-6.631951,5.122103,4.371691,-2.006868,-0.278736,...,0.969719,0.019445,0.570923,0.333278,0.857373,-0.075538,1402.95,0.0,0.6,0.4
3,42.0,-0.522666,1.009923,0.276470,1.475289,-0.707013,0.355243,1.559849,-0.399579,-0.479813,...,0.069666,0.157820,-1.109224,-0.302369,0.318170,0.316910,243.66,0.0,0.8,0.2
4,55.0,-4.575093,-4.429184,3.402585,0.903915,3.002224,-0.491078,-2.705393,0.666451,1.922216,...,-0.971600,-0.114862,0.408300,-0.304576,0.547785,-0.456297,200.01,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,160792.0,-0.283373,0.465155,-0.126904,-0.563983,1.835674,-0.302165,0.940219,-0.373045,0.286529,...,-0.401573,0.204514,-0.530178,0.432400,-0.257356,-0.249556,1.99,0.0,,
988,78634.0,0.896786,-0.856679,0.732347,0.122525,-1.097110,-0.198544,-0.385943,-0.037021,0.874510,...,-0.176457,-0.014605,0.106133,1.007842,-0.069551,0.046387,196.92,0.0,,
989,52988.0,-0.350712,0.254978,0.552831,-1.621550,-0.967831,0.087754,-0.667752,-2.533414,-0.790715,...,-0.270141,-0.476262,0.797467,0.940991,0.195055,0.150789,118.00,0.0,,
990,161926.0,1.709154,-1.199642,-1.084415,-0.322759,-0.806666,-0.779844,-0.088780,-0.257100,1.743393,...,-0.132582,0.018951,0.004579,0.828962,-0.083130,-0.037102,189.73,0.0,,


### Amount의 Range = Max-Min

In [545]:
# Spread of Amount over the combined data: largest value minus smallest.
amount = data_df_tot['Amount']
Range = amount.max() - amount.min()
print("Range :", round(Range, 0))

Range : 25691.0


# Q1. Range : 25691

### 검정 수행(검정 통계량 T-value)
- 두 집단 평균 검정(독립표본 t-검정) : 등분산(동질성) 가정 전제 
- Class "0"과 Class "1"의 V2에 대해서

In [546]:
# V2 values split by class label, kept both as Series and as plain lists.
v2 = data_df_tot['V2']
Class_0_V2 = v2[data_df_tot['Class'] == 0.0]
Class_1_V2 = v2[data_df_tot['Class'] == 1.0]
Class_0_V2_list, Class_1_V2_list = Class_0_V2.tolist(), Class_1_V2.tolist()

In [547]:
# Sanity check: the two groups together must account for every row.
n_class_0 = len(Class_0_V2_list)
n_class_1 = len(Class_1_V2_list)
print(n_class_0)
print(n_class_1)
print(n_class_0 + n_class_1 == len(data_df_tot))

11033
18663
True


In [548]:
# Independent two-sample t-test on V2 between Class 0 and Class 1.
# equal_var=True (the default) spells out the equal-variance assumption.
ttest = stats.ttest_ind(Class_0_V2_list, Class_1_V2_list, equal_var=True)
print(ttest)

Ttest_indResult(statistic=-36.28262034140589, pvalue=4.443749646953339e-282)


In [549]:
# Report the magnitude of the t statistic, rounded to a whole number.
t_value = abs(np.round(ttest[0], 0))
print("검정 통계량(T-value) = {}".format(t_value))

검정 통계량(T-value) = 36.0


# Q2. 검정 통계량(T-value) = 36

In [550]:
# Regression subset: Class 1 rows only, with the non-feature columns removed
# so that V1~V28 and Amount remain.
class_1_rows = data_df_tot.loc[data_df_tot['Class'] == 1]
data_df_linear = class_1_rows.drop(columns=['Time', 'Class', 'prob_0', 'prob_1'])
print(len(data_df_linear))
print(data_df_linear.columns)

18663
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


In [551]:
# Target is Amount; predictors are every remaining column.
X_linear = data_df_linear.drop(columns='Amount')
y_linear = data_df_linear['Amount']

In [552]:
# Preview the first five predictor rows.
X_linear.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752
4,-4.575093,-4.429184,3.402585,0.903915,3.002224,-0.491078,-2.705393,0.666451,1.922216,-0.614312,...,-1.637573,-0.960963,-0.047365,0.85336,-0.9716,-0.114862,0.4083,-0.304576,0.547785,-0.456297
5,-0.773293,-4.146007,-0.932038,0.027094,-1.698307,0.460188,0.737344,-0.314216,-0.842673,0.017276,...,-0.00507,2.442782,0.890978,0.026123,-1.134769,-0.654958,0.098386,-0.20915,-0.171709,0.208057
8,0.270725,-1.615317,1.054982,1.66151,-1.737687,0.065894,-0.313977,0.089081,1.069842,-0.348638,...,-0.707148,0.754994,0.475338,0.496083,-0.482194,0.418871,0.235961,-0.265185,-0.001063,0.120126
9,0.607153,-0.957413,0.761505,1.59005,-1.177298,0.011232,-0.166991,0.15272,0.74286,-0.151018,...,0.154422,0.304714,0.166237,0.100682,-0.317001,0.543778,0.484232,-0.330425,-0.001545,0.062098


In [553]:
# Preview the first five target (Amount) values.
y_linear.head(5)

0     378.66
4     200.01
5    1142.02
8     459.39
9     268.78
Name: Amount, dtype: float64

In [554]:
# Fit a linear regression of Amount on the predictor columns.
mlr = LinearRegression()
mlr.fit(X_linear, y_linear)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [555]:
# Score the model on the same rows it was fitted on, then print the value
# rounded to two decimals.
R_squared = mlr.score(X_linear, y_linear)
print("R-Squared = {}".format(round(R_squared, 2)))

R-Squared = 0.92


# Q3. R_squared = 0.92

In [556]:
# data split
# X : V1~V28, Amount
# y : Class
# train data set : Time <= 100,000  => 16,142 rows
# test data set : Time > 100,000  => 13,554 rows

In [557]:
# Split on Time at 100000: rows up to and including the cutoff form the
# training set, rows after it form the test set.
time_cutoff = 100000
in_train_window = data_df_tot['Time'] <= time_cutoff
in_test_window = data_df_tot['Time'] > time_cutoff
train_data_set = data_df_tot[in_train_window]
test_data_set = data_df_tot[in_test_window]
print(len(train_data_set), len(test_data_set))

16142 13554


In [558]:
# Features are whatever remains after removing Time, Class and the two
# probability columns; Class is the target.
excluded_cols = ['Time', 'Class', 'prob_0', 'prob_1']
X_train = train_data_set.drop(columns=excluded_cols)
X_test = test_data_set.drop(columns=excluded_cols)
y_train = train_data_set['Class']
y_test = test_data_set['Class']