# 신용카드 사용정보로 해당 Transaction 의 이상(Fraud)여부를 예측

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as sm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [3]:
# Load the raw credit-card transaction table from the local CSV file.
data = pd.read_csv("15.csv")

In [4]:
# Summary statistics per numeric column (the `count` row exposes missing values).
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,...,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,284806.0,992.0
mean,94813.585781,2e-06,6.661837e-07,-2e-06,2e-06,4.405008e-08,2e-06,-6e-06,1e-06,-2e-06,...,-9.166149e-07,-2e-06,-1e-06,-3.088756e-08,2e-06,3e-06,8.483873e-09,-4.792707e-08,88.349168,0.495968
std,47488.00453,1.958699,1.651311,1.516257,1.415871,1.380249,1.332273,1.237092,1.194355,1.098634,...,0.7345251,0.725702,0.624461,0.6056481,0.521278,0.482225,0.4036332,0.3300838,250.120432,0.500236
min,0.0,-56.40751,-72.71573,-48.325589,-5.683171,-113.7433,-26.160506,-43.557242,-73.216718,-13.434066,...,-34.83038,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.25,-0.920374,-0.5985522,-0.890368,-0.848642,-0.6915995,-0.768296,-0.55408,-0.208628,-0.643098,...,-0.2283974,-0.542351,-0.161846,-0.3545895,-0.317142,-0.326979,-0.07083961,-0.05295995,5.6,0.0
50%,84691.5,0.018109,0.06549621,0.179846,-0.019845,-0.05433621,-0.274186,0.040097,0.022358,-0.051429,...,-0.0294502,0.006781,-0.011196,0.04097671,0.016596,-0.052134,0.001342244,0.01124381,22.0,0.0
75%,139320.0,1.315645,0.8037257,1.027198,0.743348,0.6119267,0.398567,0.570426,0.327346,0.59714,...,0.1863701,0.528548,0.147641,0.439527,0.350716,0.240955,0.09104579,0.07828043,77.16,1.0
max,172788.0,2.45493,22.05773,9.382558,16.875344,34.80167,73.301626,120.589494,20.007208,15.594995,...,27.20284,10.50309,22.528412,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [5]:
# Peek at five randomly drawn rows (no seed is set, so the rows differ per run).
data.sample(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
37335,38568.0,-1.167605,1.466997,0.779613,1.513887,0.276054,-0.577632,1.153755,0.23792,-2.024241,...,0.226816,0.377257,-0.055328,0.535819,0.017618,-0.159549,-0.154842,0.037076,76.72,
3275,1828.0,1.277153,-1.297327,0.312629,-1.640637,-0.773766,1.317263,-1.407473,0.551383,-1.974433,...,-0.050738,0.300033,0.156144,-1.055581,0.007423,-0.092862,0.089864,-0.004879,20.0,
13518,22076.0,1.090666,0.336109,-0.072621,1.384967,0.060575,-1.096974,0.623667,-0.489797,0.919411,...,-0.082523,-0.131621,-0.226813,0.358635,0.810954,-0.346882,-0.050001,0.022698,99.99,
282843,171190.0,1.8843,-2.03602,-0.98746,-1.490773,-1.460388,-0.305543,-1.014028,-0.118446,-1.386881,...,-0.003478,0.070693,0.036431,-0.399682,-0.317717,-0.193146,-0.014699,-0.026547,207.65,
43900,41376.0,-2.5111,-1.221818,2.010529,1.553456,0.959792,-0.00296,-1.464054,1.125825,-0.66442,...,0.41442,0.294772,-0.111115,-0.297208,-0.090617,-0.224727,0.052261,-0.311082,22.8,


In [6]:
# Label coverage: rows missing `Class`, rows that have it, and the number of
# rows with a non-null `Time`.
print(data['Class'].isna().sum())     # missing labels : 283814
print(data['Class'].notna().sum())    # labelled rows  : 992
print(data['Time'].notna().sum())     # 284806

283814
992
284806


## Analysis
- Time : 발생시간
- V1~V28 : 신용카드 사용 Transaction에 대한 정보(PCA를 통해 변형된 형태)
- Amount : 구매액
- Class : Fraud 여부(0: 정상, 1: Fraud)

1) Amount 200이하 데이터 제거(소액 제거)<br>
2) 단, Class가 not null인 992개는 Amount 값과 관계없이 제거하지 않고 모두 유지한다.

```
data_new_df = data[(~data['Class'].isnull()) | 
                   ((data['Class'].isnull()) & (data['Amount'] > 200))]
```
```
data_new_df = data[(data['Class'].notnull()) |
                   ((data['Class'].isnull()) & (data['Amount'] > 200))]
```

In [7]:
# Work on a copy so the frame loaded from disk stays untouched.
data_cp = data.copy()

In [8]:
# Unlabelled transactions are kept only when the purchase amount exceeds 200.
class_missing = data_cp['Class'].isnull()
data_df_isnull = data_cp.loc[class_missing & (data_cp['Amount'] > 200)]
# len : 28704
# Every labelled transaction is kept, whatever its amount.
data_df_notnull = data_cp.loc[data_cp['Class'].notnull()]
# len : 992

3) V1~V28을 이용하여 Class가 Null이 아닌 모든 데이터와 유클리디안 거리 계산.<br>
   (Class가 Null인 점마다 5개 가장 가까운 Class가 Null이 아닌 점을 선별)
  

- V1~V28을 이용하여 Class가 Null이 아닌 모든 데이터
```
data_df_notnull = data_df[data_df['Class'].notnull()]
data_df_isnull = data_df[data_df['Class'].isnull()]
print(len(data_df_isnull))
print(len(data_df_notnull))
```

In [9]:
# Display the unlabelled rows that passed the Amount > 200 filter.
data_df_isnull

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
994,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,
1012,16.0,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.445290,-0.446196,...,-0.295583,-0.571955,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.71,
1043,36.0,-1.004929,-0.985978,-0.038039,3.710061,-6.631951,5.122103,4.371691,-2.006868,-0.278736,...,1.393406,-0.381671,0.969719,0.019445,0.570923,0.333278,0.857373,-0.075538,1402.95,
1056,42.0,-0.522666,1.009923,0.276470,1.475289,-0.707013,0.355243,1.559849,-0.399579,-0.479813,...,0.172401,1.011543,0.069666,0.157820,-1.109224,-0.302369,0.318170,0.316910,243.66,
1077,55.0,-4.575093,-4.429184,3.402585,0.903915,3.002224,-0.491078,-2.705393,0.666451,1.922216,...,-0.047365,0.853360,-0.971600,-0.114862,0.408300,-0.304576,0.547785,-0.456297,200.01,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284732,172721.0,1.076175,-3.116353,-2.051439,-0.953189,-1.544838,-1.124645,0.385570,-0.698014,-1.829401,...,0.104853,-0.598243,-0.343164,0.088299,-0.267658,-0.313651,-0.132091,0.056029,664.60,
284733,172726.0,-1.353026,0.289945,0.173684,-3.780442,-1.173016,-0.159307,3.009433,-1.544902,1.422788,...,0.415421,-0.701799,-0.144663,1.056709,1.074381,-1.129447,0.033781,-0.325899,405.09,
284735,172727.0,-1.661169,-0.565425,0.294268,-1.549156,-2.301359,2.365956,-0.248881,-0.857361,0.137784,...,1.432397,-0.257828,-0.072471,-1.035804,-0.437889,-0.238543,0.365302,-0.448621,381.05,
284748,172738.0,1.634178,-0.486939,-1.975967,0.495364,0.263635,-0.713049,0.459925,-0.336879,0.743676,...,-0.113197,-0.493594,0.001993,0.602533,-0.049936,-0.145522,-0.040554,0.024884,220.28,


In [10]:
# The label, the timestamp and the amount are removed from both frames so that
# only the remaining (V*) columns act as features for the neighbour search.
non_feature_cols = ['Class', 'Time', 'Amount']

y = data_df_notnull['Class']
X = data_df_notnull.drop(columns=non_feature_cols)

test_X = data_df_isnull.drop(columns=non_feature_cols)

In [11]:
# Preview the feature matrix built from the labelled rows.
X.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.416956,0.126911,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276
1,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,-0.838587,...,0.283345,2.102339,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764
2,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,-1.525412,...,-1.334441,-0.430022,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029
3,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,-4.801637,...,0.308334,-0.171608,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573
4,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,-2.447469,...,-2.721853,0.009061,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793


In [12]:
# 5-nearest-neighbour classifier fitted on the labelled transactions.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the estimator repr as the cell output.
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [13]:
# Class-membership fractions among the 5 neighbours for every unlabelled row,
# followed by the corresponding hard labels.
pred_prob = knn.predict_proba(test_X)
pred = knn.predict(test_X)

In [14]:
# Wrap the probability array in a DataFrame (columns are auto-numbered 0 and 1).
pred_df = pd.DataFrame(pred_prob)

In [15]:
# Look up the current column names.
pred_df.columns.tolist()

[0, 1]

In [16]:
# Give the probability columns descriptive names (re-binding instead of
# mutating in place keeps the cell safe to re-run).
pred_df = pred_df.rename(columns={0: "prob_0", 1: "prob_1"})

```
# 인덱스 리셋 : data_df_isnull.reset_index()
data_df_isnull = data_df_isnull.reset_index().drop(['index'], axis=1)
```

In [19]:
# Re-label the prediction rows with the index of the rows they were computed
# from, so the two frames can be combined by index.
pred_df = pred_df.set_index(data_df_isnull.index)

In [20]:
# Attach the KNN class probabilities to the rows whose Class is null
# (index-on-index merge, keeping every prediction row).
data_df_join = data_df_isnull.merge(
    pred_df,
    how='right',
    left_index=True,
    right_index=True,
)

In [21]:
# Display the unlabelled rows together with their prob_0 / prob_1 columns.
data_df_join

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,prob_0,prob_1
994,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,,1.0,0.0
1012,16.0,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.445290,-0.446196,...,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.71,,0.8,0.2
1043,36.0,-1.004929,-0.985978,-0.038039,3.710061,-6.631951,5.122103,4.371691,-2.006868,-0.278736,...,0.969719,0.019445,0.570923,0.333278,0.857373,-0.075538,1402.95,,0.6,0.4
1056,42.0,-0.522666,1.009923,0.276470,1.475289,-0.707013,0.355243,1.559849,-0.399579,-0.479813,...,0.069666,0.157820,-1.109224,-0.302369,0.318170,0.316910,243.66,,0.8,0.2
1077,55.0,-4.575093,-4.429184,3.402585,0.903915,3.002224,-0.491078,-2.705393,0.666451,1.922216,...,-0.971600,-0.114862,0.408300,-0.304576,0.547785,-0.456297,200.01,,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284732,172721.0,1.076175,-3.116353,-2.051439,-0.953189,-1.544838,-1.124645,0.385570,-0.698014,-1.829401,...,-0.343164,0.088299,-0.267658,-0.313651,-0.132091,0.056029,664.60,,1.0,0.0
284733,172726.0,-1.353026,0.289945,0.173684,-3.780442,-1.173016,-0.159307,3.009433,-1.544902,1.422788,...,-0.144663,1.056709,1.074381,-1.129447,0.033781,-0.325899,405.09,,0.8,0.2
284735,172727.0,-1.661169,-0.565425,0.294268,-1.549156,-2.301359,2.365956,-0.248881,-0.857361,0.137784,...,-0.072471,-1.035804,-0.437889,-0.238543,0.365302,-0.448621,381.05,,1.0,0.0
284748,172738.0,1.634178,-0.486939,-1.975967,0.495364,0.263635,-0.713049,0.459925,-0.336879,0.743676,...,0.001993,0.602533,-0.049936,-0.145522,-0.040554,0.024884,220.28,,0.6,0.4


```
data_df_join[(data_df_join['prob_0']==1) | (data_df_join['prob_1']==1)]
```

In [22]:
# A row is labelled 1.0 as soon as prob_1 is non-zero, otherwise 0.0.
# NOTE(review): this is stricter than a majority vote - confirm it is the intended rule.
data_df_join['Class'] = (data_df_join['prob_1'] > 0).astype(float)

In [23]:
# Number of rows whose Class is 1
data_df_join.loc[data_df_join['Class'] == 1, 'Class'].count()

10579

In [24]:
# Stack the KNN-labelled rows on top of the originally labelled rows.
# The latter have no prob_0 / prob_1 columns, so those cells become NaN there.
data_df_tot = pd.concat(
    objs=[data_df_join, data_df_notnull],
    axis=0,
    sort=False,
)

In [25]:
# First ten rows recorded after Time = 100000.
data_df_tot.loc[data_df_tot['Time'] > 100000].head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,prob_0,prob_1
154154,100003.0,1.513001,-1.46022,0.573419,1.28172,-1.377666,1.134751,-1.397979,0.303419,3.763767,...,0.002744,0.611141,-0.369068,0.35934,-0.003411,0.002982,212.5,0.0,1.0,0.0
154159,100015.0,-5.596405,-8.732726,-0.80685,2.177562,-3.16808,2.781225,2.501315,-0.893235,2.021656,...,-2.741873,-0.968652,0.116252,-0.177389,-0.369793,0.62143,2793.6,1.0,0.6,0.4
154192,100122.0,1.451098,-1.305284,0.856605,2.197358,-1.58295,0.871658,-1.330239,0.305305,3.763204,...,-0.037731,-0.12246,-0.249242,-0.524634,0.070275,0.008423,208.0,0.0,1.0,0.0
154206,100186.0,-5.103307,-2.642674,-1.2051,-1.887425,0.519254,2.572556,2.243553,-3.863595,1.029016,...,-1.073555,-0.809874,0.982825,-0.019724,-1.197496,1.528615,618.87,1.0,0.8,0.2
154219,100231.0,-11.431664,-7.130159,-3.229779,-0.243238,0.429332,1.973722,1.887113,-6.583154,1.862978,...,2.262377,-1.419308,2.327538,0.350273,-0.693349,5.050808,560.03,1.0,0.8,0.2
154220,100241.0,-1.623548,-6.105468,-2.378487,2.257853,-2.309414,0.030371,2.101908,-0.662193,2.347694,...,-1.194434,-0.037456,-1.14241,-0.985416,-0.349231,0.283417,1873.12,1.0,0.6,0.4
154221,100245.0,-5.456901,-2.281444,-0.335536,-0.269222,-0.68217,0.385128,2.335741,-1.334832,4.275531,...,-0.907491,-0.211392,0.234043,0.007811,-0.31218,3.214474,500.0,1.0,0.6,0.4
154229,100310.0,-0.701508,-1.231016,0.242227,0.271435,-1.325206,1.044076,1.292137,0.108918,1.811668,...,1.313805,0.594874,-1.365827,-0.022,0.113249,0.320304,500.0,1.0,0.8,0.2
154233,100341.0,-0.431772,0.670348,-0.445506,0.080966,-1.211844,1.072223,2.069026,-1.175973,1.520056,...,0.75172,0.410956,-1.531651,-0.859704,0.543209,0.222248,464.04,0.0,1.0,0.0
154234,100352.0,-1.578734,-0.501762,0.645626,-1.05034,-2.080498,-0.468154,2.00469,-1.100976,2.718551,...,0.64329,0.994164,-0.594658,0.515928,-0.267095,-0.466762,378.0,0.0,1.0,0.0


In [26]:
# First ten rows of the combined frame.
data_df_tot.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,prob_0,prob_1
994,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0,1.0,0.0
1012,16.0,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.44529,-0.446196,...,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.71,1.0,0.8,0.2
1043,36.0,-1.004929,-0.985978,-0.038039,3.710061,-6.631951,5.122103,4.371691,-2.006868,-0.278736,...,0.969719,0.019445,0.570923,0.333278,0.857373,-0.075538,1402.95,1.0,0.6,0.4
1056,42.0,-0.522666,1.009923,0.27647,1.475289,-0.707013,0.355243,1.559849,-0.399579,-0.479813,...,0.069666,0.15782,-1.109224,-0.302369,0.31817,0.31691,243.66,1.0,0.8,0.2
1077,55.0,-4.575093,-4.429184,3.402585,0.903915,3.002224,-0.491078,-2.705393,0.666451,1.922216,...,-0.9716,-0.114862,0.4083,-0.304576,0.547785,-0.456297,200.01,0.0,1.0,0.0
1081,59.0,-0.773293,-4.146007,-0.932038,0.027094,-1.698307,0.460188,0.737344,-0.314216,-0.842673,...,-1.134769,-0.654958,0.098386,-0.20915,-0.171709,0.208057,1142.02,0.0,1.0,0.0
1126,83.0,-1.897331,0.955626,0.052543,1.276656,-3.323084,3.229911,1.029631,1.515607,-0.059627,...,0.477537,-0.608981,-1.120892,-0.413851,0.061399,-0.187964,552.18,1.0,0.8,0.2
1132,87.0,-5.101877,1.897022,-3.458034,-1.277543,-5.517758,2.098366,3.329603,1.250966,0.271501,...,-0.5559,-0.76166,0.066611,0.767227,0.731634,-0.86031,919.6,1.0,0.8,0.2
1134,89.0,0.270725,-1.615317,1.054982,1.66151,-1.737687,0.065894,-0.313977,0.089081,1.069842,...,-0.482194,0.418871,0.235961,-0.265185,-0.001063,0.120126,459.39,0.0,1.0,0.0
1139,92.0,0.607153,-0.957413,0.761505,1.59005,-1.177298,0.011232,-0.166991,0.15272,0.74286,...,-0.317001,0.543778,0.484232,-0.330425,-0.001545,0.062098,268.78,0.0,1.0,0.0


### Amount의 Range = Max-Min

In [27]:
# Spread of the purchase amount: largest value minus smallest value.
amount_col = data_df_tot['Amount']
Range = amount_col.max() - amount_col.min()
print("Range :", round(Range, 0))

Range : 25691.0


# Q1. Range : 25691

### 검정 수행(검정 통계량 T-value)
- 두 집단 평균 검정 : 동질성 가정 전제 
- Class "0"과 Class "1"의 V2에 대해서

In [28]:
# V2 values of the Class 0 rows and of the Class 1 rows, as Series and as
# plain Python lists.
Class_0_V2 = data_df_tot.loc[data_df_tot['Class'] == 0.0, 'V2']
Class_1_V2 = data_df_tot.loc[data_df_tot['Class'] == 1.0, 'V2']
Class_0_V2_list, Class_1_V2_list = Class_0_V2.tolist(), Class_1_V2.tolist()

In [29]:
# Group sizes, then a sanity check that the two groups cover every row.
print(len(Class_0_V2_list), len(Class_1_V2_list), sep="\n")
print((len(Class_0_V2_list) + len(Class_1_V2_list)) == len(data_df_tot))

18625
11071
True


In [30]:
# Mean V2 of each group, one value per line.
print(np.average(Class_0_V2_list), np.average(Class_1_V2_list), sep="\n")

-1.450720210444725
-2.424646112206937


In [31]:
# Two-sample t-test on V2 (Class 0 vs Class 1).
# equal_var=True is SciPy's default (pooled variance) and is spelled out here
# because the analysis assumes homogeneous variances.
ttest = stats.ttest_ind(a=Class_0_V2_list, b=Class_1_V2_list, equal_var=True)
print(ttest)

Ttest_indResult(statistic=28.51539510452883, pvalue=1.811474220735816e-176)


In [32]:
# Report the magnitude of the t statistic, rounded to the nearest integer.
print("검정 통계량(T-value) = {}".format(abs(np.round(ttest.statistic, 0))))

검정 통계량(T-value) = 29.0


# Q2. 검정 통계량(T-value) = 29

In [33]:
# Regression sample: only the rows with Class == 1, without the time, label
# and KNN-probability columns.
fraud_rows = data_df_tot['Class'] == 1
data_df_linear = data_df_tot.loc[fraud_rows].drop(
    columns=['Time', 'Class', 'prob_0', 'prob_1']
)
print(len(data_df_linear))
print(data_df_linear.columns)

11071
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


In [34]:
# Target is the purchase amount; every other remaining column is a predictor.
y_linear = data_df_linear['Amount']
X_linear = data_df_linear.drop(columns=['Amount'])

In [35]:
# Preview the regression predictors.
X_linear.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
1012,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.44529,-0.446196,0.568521,...,-1.300408,-0.138334,-0.295583,-0.571955,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499
1043,-1.004929,-0.985978,-0.038039,3.710061,-6.631951,5.122103,4.371691,-2.006868,-0.278736,-0.230873,...,1.190398,-0.00198,1.393406,-0.381671,0.969719,0.019445,0.570923,0.333278,0.857373,-0.075538
1056,-0.522666,1.009923,0.27647,1.475289,-0.707013,0.355243,1.559849,-0.399579,-0.479813,0.516352,...,0.917399,-0.013016,0.172401,1.011543,0.069666,0.15782,-1.109224,-0.302369,0.31817,0.31691
1126,-1.897331,0.955626,0.052543,1.276656,-3.323084,3.229911,1.029631,1.515607,-0.059627,-1.461403,...,-0.930024,-0.83263,0.128416,0.776078,0.477537,-0.608981,-1.120892,-0.413851,0.061399,-0.187964
1132,-5.101877,1.897022,-3.458034,-1.277543,-5.517758,2.098366,3.329603,1.250966,0.271501,-0.305483,...,-0.698936,-1.270478,-0.871744,-0.678879,-0.5559,-0.76166,0.066611,0.767227,0.731634,-0.86031


In [36]:
# Preview the regression target (Amount).
y_linear.head(5)

1012     231.71
1043    1402.95
1056     243.66
1126     552.18
1132     919.60
Name: Amount, dtype: float64

In [37]:
# Ordinary least-squares fit of Amount on the predictors.
# `fit` returns the estimator, so it is chained; the trailing name keeps the
# estimator repr as the cell output.
mlr = LinearRegression().fit(X_linear, y_linear)
mlr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [38]:
# R^2 evaluated on the same rows the model was fitted on (in-sample fit).
R_squared = mlr.score(X=X_linear, y=y_linear)
print("R-Squared = {}".format(round(R_squared, 2)))

R-Squared = 0.93


# Q3. R_squared = 0.93

In [39]:
# Train/test split by time
# X : V1~V28, Amount
# y : Class
# train data set : Time <= 100,000  => 16,142 rows
# test data set  : Time >  100,000  => 13,554 rows

In [40]:
# Chronological hold-out: rows up to and including Time = 100000 form the
# training set, all later rows form the test set.
is_train = data_df_tot['Time'] <= 100000
is_test = data_df_tot['Time'] > 100000
train_data_set = data_df_tot.loc[is_train]
test_data_set = data_df_tot.loc[is_test]
print(len(train_data_set), len(test_data_set))

16142 13554


In [41]:
# Features are everything except the time, the label and the KNN probability
# columns; the label is the Class column.
dropped_cols = ['Time', 'Class', 'prob_0', 'prob_1']

X_train = train_data_set.drop(columns=dropped_cols)
X_test = test_data_set.drop(columns=dropped_cols)
y_train = train_data_set['Class']
y_test = test_data_set['Class']

In [42]:
# L2-penalised logistic regression solved with newton-cg.
# In scikit-learn C is the inverse regularisation strength, so C=100000 makes
# the penalty very weak.
log_reg = LogisticRegression(
    C=100000,
    penalty='l2',
    solver='newton-cg',
    random_state=1234,
)

In [43]:
# Fit the classifier on the training rows (Time <= 100000).
log_reg.fit(X_train, y_train)

LogisticRegression(C=100000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1234, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
# Class probabilities (rather than hard labels from log_reg.predict) are kept
# so that a custom probability cut-off can be applied afterwards.
# Note: this re-binds `pred`, which previously held the KNN predictions.
pred = log_reg.predict_proba(X_test)

In [45]:
# Probability table for the test rows.
# The rows of `pred` are in the same order as the rows of X_test, so the frame
# is given X_test's index. Without it the frame gets a fresh 0..n-1 index that
# no longer matches y_test's labels, and any index-aligned comparison between
# predictions and ground truth would be misaligned.
pred_df = pd.DataFrame(pred, columns=['prob_0', 'prob_1'], index=X_test.index)

In [46]:
# Predict class 1 when its estimated probability exceeds the 0.4 cut-off.
pred_df['Class_pred'] = (pred_df['prob_1'] > 0.4).astype(int)

In [47]:
# Predicted labels for the test rows.
y_pred = pred_df['Class_pred']

In [48]:
# Both vectors must contain one entry per test row.
print(y_test.shape[0], y_pred.shape[0])

13554 13554


In [49]:
# Share of test rows whose predicted label equals the reference label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

In [50]:
# First ten reference labels (plus the total count), then the first ten predictions.
print(y_test.iloc[:10], len(y_test))
print(y_pred.iloc[:10])

154154    0.0
154159    1.0
154192    0.0
154206    1.0
154219    1.0
154220    1.0
154221    1.0
154229    1.0
154233    0.0
154234    0.0
Name: Class, dtype: float64 13554
0    0
1    1
2    0
3    0
4    1
5    1
6    1
7    0
8    1
9    1
Name: Class_pred, dtype: int32


In [51]:
# Report the test-set accuracy rounded to three decimals.
print('Accuracy :', round(acc, 3))

Accuracy : 0.815


# Q4. Accuracy : 0.815