# BIG DATA ANALYSIS : 이상탐지 적용
---


## 신용카드 사기 결제에 대한 데이터 로드

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the credit-card transaction data into `df`.
# An alternate file can be used instead by uncommenting the next line.
# df = pd.read_csv('credit2.csv')
df = pd.read_csv('credit.csv')

## EDA

In [None]:
# Preview the first rows of the loaded frame.
df.head()


In [None]:
# Print the frame's structural summary as reported by DataFrame.info.
df.info()

In [None]:
# Descriptive statistics for the frame's columns.
df.describe()


## Class Imbalance 확인

In [None]:
# Count the rows per value of the 'Class' label to gauge the class imbalance.
df['Class'].value_counts()

In [None]:
# Bar chart of how many transactions fall into each class.
# LABELS is indexed by class value: 0 -> "Normal", 1 -> "Fraud".
LABELS = ["Normal", "Fraud"]

# Series.value_counts replaces the deprecated top-level pd.value_counts.
# Sorting by index (class value) instead of by frequency guarantees the bars
# appear in 0, 1 order, so the positional tick labels below always match.
count_classes = df['Class'].value_counts().sort_index()
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xticks(range(2), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency");

## 사기와 정상 결제의 비교

In [None]:
# Partition the transactions by label: rows with Class == 1 go to `Fraud`,
# rows with Class == 0 go to `Normal`.
Fraud = df.loc[df['Class'].eq(1)]
Normal = df.loc[df['Class'].eq(0)]

In [None]:
# (rows, columns) of the fraud subset.
Fraud.shape

In [None]:
# (rows, columns) of the normal subset.
Normal.shape

In [None]:
# Descriptive statistics of the Amount column for fraud rows.
Fraud.Amount.describe()

In [None]:
# Descriptive statistics of the Amount column for normal rows.
Normal.Amount.describe()

## 혹시 결제 금액에 따라 구별 할 수 있지 않을까?

In [None]:
# Compare the distribution of transaction amounts for fraud vs. normal rows,
# one histogram per class stacked vertically on a shared x-axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount per transaction by class')
bins = 50

ax1.hist(Fraud.Amount, bins=bins)
ax1.set_title('Fraud')

ax2.hist(Normal.Amount, bins=bins)
ax2.set_title('Normal')

# Axis labels, the x-limit and the log y-scale are set on the lower panel
# (the current axes right after plt.subplots); the x-limit also reaches the
# upper panel through sharex, while the log scale affects the lower one only.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Number of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')
plt.show();

## 혹시 결제 시간에 따라 구별 할 수 있지 않을까?

In [None]:
# Scatter Amount against Time separately for fraud and normal rows,
# stacked vertically on a shared x-axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

ax1.scatter(Fraud.Time, Fraud.Amount)
ax1.set_title('Fraud')

ax2.scatter(Normal.Time, Normal.Amount)
ax2.set_title('Normal')

# Axis labels go on the lower panel (the current axes after plt.subplots).
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()


## 전체 Feature에 대한 시각화

In [None]:
# Shuffle all rows (frac = 1 keeps every row) with a fixed seed for reproducibility.
data1= df.sample(frac = 1,random_state=42)

# (rows, columns) of the shuffled frame.
data1.shape

In [None]:
# Histogram grid over the columns of the shuffled frame on a 20x20 figure.
data1.hist(figsize=(20,20))
plt.show()

## 이상치의 비율 확인

In [None]:
# Split the shuffled frame by label and measure how rare fraud is.
Fraud = data1[data1['Class']==1]
Valid = data1[data1['Class']==0]

# The outlier fraction is the share of fraud rows among ALL rows. Dividing by
# len(Valid) would give the fraud-to-valid odds instead of a proportion.
# An empty frame yields 0.0 rather than a ZeroDivisionError.
outlier_fraction = len(Fraud) / len(data1) if len(data1) else 0.0

In [None]:
# Report the outlier fraction followed by the size of each class.
print(outlier_fraction)
print(f"Fraud Cases : {len(Fraud)}")
print(f"Valid Cases : {len(Valid)}")

## 속성간의 상관관계 비교

In [None]:
# Pairwise correlations between the columns of the shuffled frame.
correlation_matrix = data1.corr()

# Draw the matrix as a heatmap with square cells, capping the colour scale at 0.8.
fig = plt.figure(figsize=(12, 9))
sns.heatmap(data=correlation_matrix, vmax=0.8, square=True)
plt.show()

## 상관관계 절대값 순으로 정렬

In [None]:

# Absolute correlation of every column with 'Class', sorted ascending
# (the strongest correlations end up at the bottom).
correlation_matrix['Class'].abs().sort_values()

## 속성과 레이블 분리

In [None]:
# Feature columns fed to the models. The list is hand-picked; earlier drafts
# derived it from all non-label columns or from the columns most correlated
# with 'Class', but those assignments were immediately overwritten and have
# been removed as dead code.
# NOTE(review): presumably these three are the top correlates of 'Class' — confirm.
columns = ["V14", "V17", "V12"]
print(columns)

# Name of the label column being predicted.
target = "Class"

# Plain NumPy arrays: X holds the selected features, y the labels.
X = data1[columns].values
y = data1[target].values

# Print the shapes of X and y as a sanity check.
print(X.shape)
print(y.shape)

In [None]:
# Boolean row masks for each class.
normal = (y == 0)
fraud = (y == 1)

# V14 vs. V17: normal rows in faint blue, fraud rows in stronger red on top.
plt.scatter(
    X[normal, columns.index("V14")],
    X[normal, columns.index("V17")],
    s=10,
    color='b',
    alpha=0.1,
)
plt.scatter(
    X[fraud, columns.index("V14")],
    X[fraud, columns.index("V17")],
    s=10,
    color='r',
    alpha=0.5,
)

In [None]:
# Baseline accuracy obtained by predicting "normal" (0) for every transaction.
np.mean(y == 0)

## 이상탐지 알고리즘 Import

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

# plot_confusion_matrix was removed in scikit-learn 1.2 (its replacement is
# ConfusionMatrixDisplay.from_estimator). Guard the import so this cell still
# runs on current releases; the name is None when the function is unavailable.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None


## 데이터셋 분리

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix

# Single stratified 60/40 split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)

# n_splits=1 yields exactly one (train, test) index pair, so take it directly.
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]


## 분석 시작

### 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Supervised baseline: fit on the training split, then score the held-out split.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy, per-class precision/recall/F1, and the confusion matrix.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

### 2. Isolation Forest

In [None]:
# Split the training rows by label and print the observed fraud proportion.
train_normal = X_train[y_train == 0]
train_outliers = X_train[y_train == 1]
outlier_prop = len(train_outliers) / len(X_train)
print(outlier_prop)

# Unsupervised fit on the full training split (labels are not used).
# NOTE(review): contamination is a hard-coded 0.0015 rather than outlier_prop.
clf = IsolationForest(contamination=0.0015, random_state=42).fit(X_train)

# IsolationForest.predict returns +1 for inliers and -1 for outliers;
# map that to the dataset's convention (0 = normal, 1 = fraud).
y_pred = np.where(clf.predict(X_test) == -1, 1, 0)

# Accuracy, per-class precision/recall/F1, and the confusion matrix.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

### 3. One-Class SVM

In [None]:
from sklearn.svm import OneClassSVM

# Split the training rows by label; only the normal rows are used for fitting.
train_normal = X_train[y_train == 0]
train_outliers = X_train[y_train == 1]

# Learn the boundary of the normal class from normal transactions alone.
svm = OneClassSVM(kernel='rbf', nu=0.00095, gamma=0.002, verbose=True)
svm.fit(train_normal)

# OneClassSVM.predict returns +1 for inliers and -1 for outliers;
# map that to the dataset's convention (0 = normal, 1 = fraud).
y_pred = np.where(svm.predict(X_test) == -1, 1, 0)

# Accuracy, per-class precision/recall/F1, and the confusion matrix.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))