# BIG DATA ANALYSIS : 이상탐지 적용
---


## 신용카드 사기 결제에 대한 데이터 로드

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the credit-card transactions CSV from the working directory into a DataFrame.
df = pd.read_csv('creditcard.csv')


In [None]:
# Preview the first rows to confirm the file loaded.
df.head()

In [None]:
# NOTE(review): no path argument is passed; per the pandas docs this returns the
# CSV text instead of writing a file -- confirm whether a target path was intended.
df.to_csv()

## EDA

In [None]:
# EDA: look at the first rows of the raw data.
df.head()

In [None]:
# EDA: print the DataFrame's info summary.
df.info()

In [None]:
# EDA: summary statistics of the DataFrame.
df.describe()


## Class Imbalance 확인

In [None]:
# Count rows per label value to see how imbalanced the classes are.
df['Class'].value_counts()

In [None]:
# Tick labels listed in class-value order: 0 -> Normal, 1 -> Fraud.
LABELS = ["Normal", "Fraud"]

# Series.value_counts replaces the deprecated top-level pd.value_counts.
# Sorting by index pins the bar order to class 0, 1 so the bars always line up
# with LABELS, instead of relying on Normal happening to be the majority class.
count_classes = df['Class'].value_counts().sort_index()
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xticks(range(len(LABELS)), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency");

## 사기와 정상 결제의 비교

In [None]:
# Split the full data set by label: Class == 1 is fraud, Class == 0 is normal.
Fraud = df.loc[df['Class'] == 1]
Normal = df.loc[df['Class'] == 0]

In [None]:
# (rows, columns) of the fraud subset.
Fraud.shape

In [None]:
# (rows, columns) of the normal subset.
Normal.shape

In [None]:
# Summary statistics of the Amount column for fraudulent transactions.
Fraud.Amount.describe()

In [None]:
# Summary statistics of the Amount column for normal transactions.
Normal.Amount.describe()

## 혹시 결제 금액에 따라 구별 할 수 있지 않을까?

In [None]:
# Stacked histograms of transaction amount: fraud on top, normal below,
# sharing one x-axis so the two distributions can be compared directly.
bins = 50
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount per transaction by class')

ax1.hist(Fraud.Amount, bins=bins)
ax1.set_title('Fraud')

ax2.hist(Normal.Amount, bins=bins)
ax2.set_title('Normal')

# Axis labels, x-limits and the log y-scale are set on the bottom panel only;
# the x-limits carry over to the top panel through the shared x-axis.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Number of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')
plt.show()

## 혹시 결제 시간에 따라 구별 할 수 있지 않을까?

In [None]:
# Stacked scatter plots of amount against transaction time: fraud on top,
# normal below, on a shared x-axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

ax1.scatter(Fraud.Time, Fraud.Amount)
ax1.set_title('Fraud')

ax2.scatter(Normal.Time, Normal.Amount)
ax2.set_title('Normal')

# Axis labels are attached to the bottom panel only.
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()


## 전체 Feature에 대한 시각화

In [None]:
# Work on a reproducible 20% random sample of the data (fixed random_state).
data1= df.sample(frac = 0.2,random_state=1)

# Shape of the sampled subset.
data1.shape

In [None]:
# Histogram of every column in the sample on one large figure.
data1.hist(figsize=(20,20))
plt.show()

## 이상치의 비율 확인

In [None]:
# Split the 20% sample by label. Note: this rebinds `Fraud`, which earlier
# referred to the fraud rows of the full data set.
Fraud = data1[data1['Class'] == 1]
Valid = data1[data1['Class'] == 0]

# Share of fraud rows in the whole sample. This value is passed to the
# detectors as `contamination`, which is the proportion of outliers in the
# data set, so the denominator is the sample size rather than the valid count.
outlier_fraction = len(Fraud) / len(data1)

In [None]:
# Report the outlier ratio and the per-class row counts of the sample.
print(outlier_fraction)
print(f"Fraud Cases : {len(Fraud)}")
print(f"Valid Cases : {len(Valid)}")

## 속성간의 상관관계 비교

In [None]:
# Pairwise correlation of all columns in the sample, drawn as a heatmap.
correlation_matrix = data1.corr()

fig = plt.figure(figsize=(12, 9))
# Cap the colour scale at 0.8 and draw square cells.
sns.heatmap(data=correlation_matrix, vmax=0.8, square=True)
plt.show()

## 상관관계 절대값 순으로 정렬

In [None]:

# Absolute correlation of every column with the label, in ascending order.
correlation_matrix['Class'].abs().sort_values()

## 속성과 레이블 분리

In [None]:
# Name of the label column we are predicting.
target = "Class"

# Every column except the label is used as an input feature.
# (Alternative kept for reference: use only the features most correlated with
#  the label, e.g. correlation_matrix['Class'].abs().sort_values()[-6:-1].keys())
columns = [name for name in data1.columns.tolist() if name != target]
print(columns)

# Seeded random generator for the synthetic noise below.
state = np.random.RandomState(42)

# Feature matrix and label vector taken from the sample.
X = data1[columns]
Y = data1[target]

# Uniform noise between 0 and 1 with the same shape as the feature matrix.
X_outliers = state.uniform(low=0, high=1, size=X.shape)

# Show the shapes of X and Y.
print(X.shape)
print(Y.shape)

In [None]:
# Sum of the label column of the sample.
Y.sum()

In [None]:
# 만약 모두다 정상이라고 했을 때의 Accuracy
(Y == 0).sum()/Y.count()

## 이상탐지 알고리즘 Import

In [None]:
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

In [None]:
# state = np.random.RandomState(42)

# classifiers = {
#     "Isolation Forest":IsolationForest(n_estimators=100, max_samples=len(X), 
#                                        contamination=outlier_fraction,random_state=state, verbose=0),
#     "Local Outlier Factor":LocalOutlierFactor(n_neighbors=20, algorithm='auto', 
#                                               leaf_size=30, metric='minkowski',
#                                               p=2, metric_params=None, contamination=outlier_fraction)
   
# }


In [None]:
# Seeded generator handed to the Isolation Forest.
state = np.random.RandomState(42)

# Registry of unsupervised detectors, keyed by display name. Insertion order
# (Isolation Forest, LOF, One-Class SVM) is the order in which they are run.
classifiers = {}

# The forest and LOF both take the measured outlier share as contamination.
classifiers["Isolation Forest"] = IsolationForest(
    n_estimators=100,
    max_samples=len(X),
    contamination=outlier_fraction,
    random_state=state,
    verbose=0,
)

classifiers["Local Outlier Factor"] = LocalOutlierFactor(
    n_neighbors=20,
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    p=2,
    metric_params=None,
    contamination=outlier_fraction,
)

# The One-Class SVM uses a fixed nu instead of the measured outlier share.
classifiers["Support Vector Machine"] = OneClassSVM(
    kernel='rbf',
    degree=3,
    gamma=0.1,
    nu=0.05,
    max_iter=-1,
)

## 분석 시작

In [None]:
# The number of true fraud rows does not depend on the model, so count it once.
n_fraud = (Y == 1).sum()

for clf_name, clf in classifiers.items():
    # Fit each detector on X and label the same rows it was fitted on.
    if clf_name == "Local Outlier Factor":
        # LOF is fitted and labelled in a single fit_predict call.
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    elif clf_name == "Support Vector Machine":
        clf.fit(X)
        y_pred = clf.predict(X)
    else:
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)

    # The detectors emit +1 / -1; recode to the data set's labels in one step:
    # -1 -> 1 (fraud), everything else -> 0 (valid).
    y_pred = np.where(y_pred == -1, 1, 0)

    n_errors = (y_pred != Y).sum()
    n_outliers = (y_pred == 1).sum()

    # Report: name, misclassified rows, rows flagged as fraud, true fraud rows.
    print("{}: {}, {},{}".format(clf_name, n_errors, n_outliers, n_fraud))
    print("Accuracy Score :")
    print(accuracy_score(Y, y_pred))
    print("Classification Report :")
    print(classification_report(Y, y_pred))

위 classification_report 결과에서 <br>
0이라고 예측한 데이터의 100%가 실제로 0이었고 <br> 
1이라고 예측한 데이터의 26%만 실제로 1이었음을 알 수 있다 (precision). <br>
또한 <br>
실제 0인 데이터 중의 100%가 0으로 판별되었고 <br>
실제 1인 데이터 중의 26%만 1로 판별되었음을 알 수 있다 (recall).<br>

## 지도학습과 비교

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


# Supervised baseline: hold out 33% of the sample for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=1)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# predict() already returns hard class labels, so no 0.5 thresholding is needed.
y_pred = model.predict(X_test)

n_errors = (y_pred != y_test).sum()
n_outliers = (y_pred == 1).sum()
# Count true frauds in the held-out split so the number is comparable with the
# predictions, which are made on that split only.
n_fraud = (y_test == 1).sum()

# Same summary layout as the anomaly-detector report: name, misclassified rows,
# rows predicted as fraud, true fraud rows. It replaces a duplicated accuracy print.
print("{}: {}, {},{}".format("Logistic Regression", n_errors, n_outliers, n_fraud))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))