### Recognize Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 


plt.rcParams['font.sans-serif']=['SimHei'] # display Chinese labels
plt.rcParams['axes.unicode_minus']=False   # these two lines have to be set manually
# Plot style for the figures
plt.style.use('fivethirtyeight')
# Maximum number of columns pandas prints
pd.set_option('display.max_columns', None)
# Maximum number of rows pandas prints
pd.set_option('display.max_rows', 50)
# Set the random seed
np.random.seed(1)

In [None]:
# Read the data from the credit-card CSV file
data = pd.read_csv('./data/creditcard/creditcard.csv')

In [None]:
# Show basic information about the DataFrame
data.info()

一共284806行数据, 并且数据集中没有缺失的数据.

打印数据前10行查看一下具体的数据.

In [None]:
# Print the first 10 rows of the data
data.head(10)

由上表可以看到, 数据一共有31列, 其中最后一列Class是我们要预测的目标变量.

并且, 因为信用卡交易涉及到了用户的隐私, 上述数据用了PCA方法, 对数据进行了特殊处理.

可以看到, 除了时间, 交易量, 这两个特征以外, 其余的特征都已经被PCA处理过了.

下面让我们来看一下target的分布情况.

In [None]:
# Pie chart of the target variable
def plot_target_pie(target):
    """Draw the class distribution as a pie chart, save it and show it.

    target: the wedge sizes; the first wedge is labelled 'Not Fraud' and
        the second 'Fraud'.
    """
    class_names = ('Not Fraud', 'Fraud')
    # Offset only the second wedge ('Fraud') from the centre of the pie.
    wedge_offsets = (0, 0.1)
    _, axis = plt.subplots(figsize=(7, 7))
    axis.pie(
        target,
        labels=class_names,
        explode=wedge_offsets,
        autopct='%.4f%%',
        startangle=120,
        shadow=True,
    )
    axis.axis('equal')
    axis.legend(class_names, loc='best')
    plt.title('Data Distribution of Target Variable', fontsize='large')
    plt.savefig('./credit_figures/类别.png')
    plt.show()


plot_target_pie(
    np.array(data['Class'].value_counts())
)

可以看到类别极端不平衡, 正类样本(欺诈)只占结果的0.1727%

接下来, 我们查看一下其他变量的数据分布

In [None]:
### Distribution of transaction time and transaction amount (histograms)
def plot_hist(labels):
    """Plot one histogram per column of the global `data`, side by side.

    labels: column names to plot. The figure is a 1x2 grid and the colour
        list has two entries, so two names are expected.
    """
    palette = ['blue', 'darkorange']
    plt.figure(figsize=(12, 6))
    for position, column in enumerate(labels, start=1):
        plt.subplot(1, 2, position)
        column_values = data[column]
        plt.hist(
            column_values,
            bins=100,
            alpha=0.8,
            color=palette[position - 1],
            edgecolor='none',
            label=column + ' Frequence',
        )
        plt.xlabel(column)
        plt.ylabel('Frequence')
        plt.legend(loc='best')
        plt.title(f'Histogram of Transaction {column}', fontsize='large')
    plt.savefig('./credit_figures/交易量与时间.png')
    plt.show()


plot_hist(['Amount', 'Time'])

可以看到, 交易量(Amount)的分布比较集中: 存在大额交易, 但数量很少.

可以看到交易时间 呈现一个周期的走势, 符合白天交易频繁, 凌晨交易次数减少.

In [None]:
### Correlation heatmap
def plot_heatmap(corr_matrix):
    """Render `corr_matrix` as an annotated seaborn heatmap, save and show it."""
    heatmap_options = dict(
        annot=True,
        linewidths=0.2,
        fmt=".2f",
        cmap="YlGnBu",
    )
    plt.figure(figsize=(22, 22))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.savefig('./credit_figures/热力图.png')
    plt.show()


# Pairwise correlation of the columns of `data`
corr_matrix = data.corr()
plot_heatmap(corr_matrix)

In [None]:
# Look at the variables that are strongly correlated with Class
class_relative = corr_matrix['Class']


def plot_barh(class_relative, labels):
    """Show a horizontal bar chart of correlations, one bar per label.

    class_relative: correlation values, in the same order as `labels`.
    labels: names used as tick labels.

    The last entry of both inputs is left out of the chart (in this
    notebook that is presumably Class itself -- confirm against the data).
    """
    bar_names = np.array(labels[:-1])
    bar_values = np.array(class_relative)[:-1]
    plt.figure(figsize=(12, 8))
    plt.barh(
        range(len(bar_values)),
        bar_values,
        facecolor='tan',
        height=0.5,
        edgecolor='r',
        alpha=0.6,
        tick_label=bar_names,
    )
    plt.xlabel('Variable Correlation')
    plt.show()


plot_barh(class_relative, data.columns)

找出与目标变量(Class)正相关性较大的三个变量

V1, V4, V11

In [None]:
### Box plots of V1, V4 and V11 against Class

def plot_box():
    """Draw one box plot per column (V1, V4, V11) of the global `data`,
    grouped by Class, then save and show the figure."""
    columns = ('V1', 'V4', 'V11')
    _, axes = plt.subplots(ncols=3, figsize=(20, 6))
    for axis, column in zip(axes, columns):
        sns.boxplot(x="Class", y=column, data=data, ax=axis)
        axis.set_title(f'{column} vs Class Positive Correlation')
    plt.savefig('./credit_figures/箱线图.png')
    plt.show()


plot_box()

从箱线图可以看出, 这几个与目标变量(Class)正相关的变量, 在两个类别下的数据分布并不相同

### UnderSampling

使用自编码器来提取特征

由于样本并不平衡, 我们对负例进行采样.

从负例中随机采样1000个样本.

In [None]:
# Randomly keep 1000 rows with Class == 0 and every row with Class == 1.
not_fraud = data.loc[data['Class'] == 0].sample(n=1000)
fraud = data.loc[data['Class'] == 1]

可以看到没有明显的边界

In [None]:
# Stack the sampled non-fraud rows on top of the fraud rows, shuffle the
# result (sample(frac=1)) and renumber the index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([not_fraud, fraud]).sample(frac=1).reset_index(drop=True)
# Feature matrix (every column except Class) and label vector as arrays.
X = df.drop(['Class'], axis = 1).values
Y = df["Class"].values

In [None]:
# Pie chart of the target variable (under-sampled data)
def plot_target_pie(target, save_path='./credit_figures/类别.png'):
    """Draw the class distribution as a pie chart, save it and show it.

    target: the wedge sizes; the first wedge is labelled 'Not Fraud' and
        the second 'Fraud'.
    save_path: file the figure is written to. The default is the path this
        function previously hard-coded.
    """
    labels = 'Not Fraud', 'Fraud'
    # Offset only the second wedge ('Fraud') from the centre of the pie.
    explode = (0, 0.1)
    fig1, ax1 = plt.subplots(figsize=(7, 7))
    ax1.pie(target, explode=explode, labels=labels, autopct='%.4f%%',
            shadow=True, startangle=120)
    ax1.axis('equal')
    ax1.legend(labels, loc='best')
    plt.title('Data Distribution of Target Variable', fontsize='large')
    plt.savefig(save_path)
    plt.show()


# Save under its own file name: the pie chart of the full data set is written
# to './credit_figures/类别.png' earlier in this notebook and would otherwise
# be overwritten by this one.
plot_target_pie(np.array(df['Class'].value_counts()),
                save_path='./credit_figures/类别_undersampled.png')

In [None]:
# Input layer: one input per column of X
input_layer = Input(shape=(X.shape[1],))

# Encoder: Dense layers with 100 then 50 units; the first one carries an
# L1 activity regularizer
encoded = Dense(100, activation='tanh', activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoded = Dense(50, activation='relu')(encoded)

# Decoder: Dense layers with 50 then 100 units
decoded = Dense(50, activation='tanh')(encoded)
decoded = Dense(100, activation='tanh')(decoded)

# Output layer: same width as the input layer
output_layer = Dense(X.shape[1], activation='relu')(decoded)

In [None]:
# Build the autoencoder from the input and output layers, then compile it
# with the SGD optimizer and mean-squared-error loss.
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="SGD", loss="mse")

In [None]:
# Separate the features from the Class label on the full data set.
x = data.drop(columns=["Class"])
y = data["Class"].values

# Rescale the features with MinMaxScaler, then split the scaled rows by class.
x_scale = preprocessing.MinMaxScaler().fit_transform(x.values)
x_norm = x_scale[y == 0]
x_fraud = x_scale[y == 1]

In [None]:
# Train the autoencoder on the first 2000 non-fraud rows, using the same
# array as input and target; validation_split=0.20 is passed to fit.
autoencoder.fit(x_norm[0:2000], x_norm[0:2000], 
                batch_size = 256, epochs = 20, 
                shuffle = True, validation_split = 0.20);

In [None]:
# Build a separate model out of the first three layers of the autoencoder.
# NOTE(review): presumably these are the input layer and the two encoder
# Dense layers, so the model outputs the encoded representation -- confirm.
hidden_representation = Sequential()
hidden_representation.add(autoencoder.layers[0])
hidden_representation.add(autoencoder.layers[1])
hidden_representation.add(autoencoder.layers[2])

In [None]:
# Run the first 1000 non-fraud rows and all fraud rows through the
# hidden-representation model.
norm_hid_rep = hidden_representation.predict(x_norm[:1000])
fraud_hid_rep = hidden_representation.predict(x_fraud)

In [None]:
def tsne_plot(x1, y1, name):
    """Project `x1` to two dimensions with t-SNE and scatter-plot it by class.

    x1: feature matrix passed to TSNE.fit_transform.
    y1: labels, one per row of `x1`; rows labelled 0 are drawn as
        'Non Fraud' and rows labelled 1 as 'Fraud'.
    name: path the figure is saved to.
    """
    # Convert once so that list inputs also give element-wise comparisons.
    labels = np.asarray(y1)
    tsne = TSNE(n_components=2, random_state=0)
    X_t = tsne.fit_transform(x1)

    plt.figure(figsize=(12, 8))
    class_styles = (
        (0, 'darkblue', 'Non Fraud'),
        (1, 'darkorange', 'Fraud'),
    )
    for class_value, colour, legend_label in class_styles:
        # A boolean mask selects the rows of this class directly.
        mask = labels == class_value
        # linewidth is numeric; the original passed the string '1'.
        plt.scatter(X_t[mask, 0], X_t[mask, 1], marker='o', color=colour,
                    linewidth=1, alpha=0.8, label=legend_label)
    plt.title('t-SNE Data Dimensionality Reduction')
    plt.legend(loc='best')
    plt.savefig(name)
    plt.show()


tsne_plot(X, Y, "./credit_figures/original.png")

In [None]:
# Stack the encoded non-fraud rows on top of the encoded fraud rows and
# build the matching label vector (0 for non-fraud, 1 for fraud).
rep_x = np.concatenate((norm_hid_rep, fraud_hid_rep), axis=0)
y_n = np.zeros(len(norm_hid_rep))
y_f = np.ones(len(fraud_hid_rep))
rep_y = np.concatenate((y_n, y_f))
tsne_plot(rep_x, rep_y, './credit_figures/Autoencoder.png')

我们在这个基础上进行分类.

In [None]:
# Hold out 25% of the encoded rows, fit a logistic regression on the rest
# and report its scores on the held-out part.
train_x, val_x, train_y, val_y = train_test_split(
    rep_x, rep_y, test_size=0.25
)
clf = LogisticRegression(solver="lbfgs")
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)

print("Classification Report: ")
print(classification_report(val_y, pred_y))

print("Accuracy Score: ", accuracy_score(val_y, pred_y))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Candidate classifiers, keyed by display name; each one is created with
# its default constructor arguments.
models = {
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier(),
    'AdaBoost' : AdaBoostClassifier(),
    'GradientBoost' : GradientBoostingClassifier(),
    'RandomForest' : RandomForestClassifier(),
    'ETC' : ExtraTreesClassifier(),
    'Bag' : BaggingClassifier(),
    'XGBoost' : XGBClassifier(),
    'SVC' : SVC(),
    'KNN' : KNeighborsClassifier(),
    'NB' : GaussianNB(),
    'QDA' : QuadraticDiscriminantAnalysis(),
    'NN' : MLPClassifier(),
}

# Parameter grid per classifier, using the same keys as `models`.
# Every grid is empty here.
params = {
    'Logistic Regression' : {},
    'Decision Tree' : {},
    'AdaBoost' : {},
    'GradientBoost' : {},
    'RandomForest' : {},
    'XGBoost' : {},
    'SVC' : {},
    'KNN' : {},
    'NB' : {},
    'ETC' : {},
    'Bag' : {},
    'QDA' : {},
    'NN' : {},
}

In [None]:
class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and summarise the fold scores.

    Attributes set by ``__init__``:
        models: dict mapping a name to an estimator.
        params: dict mapping the same names to a parameter grid.
        keys: the names in ``models``.
        grid_searches: name -> fitted GridSearchCV, filled by ``fit``.
        best: name -> {'score', 'params'} of the best grid point, filled
            by ``fit``.
    """

    def __init__(self, models, params):
        """Store the estimators and their pre-set hyper-parameter grids.

        Raises:
            ValueError: if some key of ``models`` has no entry in ``params``.
        """
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
        self.best = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Cross-validate every parameter combination of every estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are passed
        straight to GridSearchCV. The fitted search is stored in
        ``self.grid_searches`` and its best score/params in ``self.best``.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv,
                              n_jobs=n_jobs, verbose=verbose, scoring=scoring,
                              refit=refit, return_train_score=True)
            gs.fit(X, y)
            self.best[key] = {'score': gs.best_score_, 'params': gs.best_params_}
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarise the per-fold test scores of every fitted grid search.

        Returns a DataFrame with one row per (estimator, parameter
        combination). The columns are estimator, min_score, mean_score,
        max_score and std_score, followed by the parameter names, and the
        rows are sorted by ``sort_by`` in descending order.
        """
        def row(key, scores, params):
            stats = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **stats})

        rows = []
        for name, gs in self.grid_searches.items():
            print(name)
            results = gs.cv_results_
            params = results['params']
            # Use the fitted number of folds instead of the `cv` argument,
            # which may be None, a splitter object or an iterable.
            n_splits = gs.n_splits_
            # One row per parameter combination, one column per fold.
            all_scores = np.column_stack([
                results["split{}_test_score".format(i)]
                for i in range(n_splits)
            ])
            for p, s in zip(params, all_scores):
                rows.append(row(name, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

    def best_params(self):
        """Return the best score and parameters recorded for each estimator."""
        return self.best

In [None]:
# Grid-search every model on the training split with cv=5, then collect the
# per-fold scores into a table sorted by mean score.
Helper = EstimatorSelectionHelper(models, params)
Helper.fit(train_x, train_y, cv=5)
result = Helper.score_summary(sort_by='mean_score')

In [None]:
# Classifier performance comparison (horizontal bar chart)
def plot_performance(performance):
    """Plot the mean score of each estimator as a horizontal bar chart.

    performance: table with an 'estimator' column (used as tick labels) and
        a 'mean_score' column (used as bar lengths).
    """
    # Reverse both columns so the first row gets the highest bar position.
    names = np.array(performance['estimator'])[::-1]
    scores = np.array(performance['mean_score'])[::-1]
    plt.figure(figsize=(12, 8))
    plt.barh(
        range(len(scores)),
        scores,
        facecolor='darkorange',
        height=0.5,
        edgecolor=None,
        alpha=0.8,
        tick_label=names,
    )
    plt.xlabel('Performance (Mean Score)')
    plt.title('Performance Compare (Mean Score)')
    plt.savefig('./credit_figures/performance_compare.png')
    plt.show()


performance = result[['estimator', 'mean_score']]
print(performance['estimator'])
plot_performance(performance)

In [None]:
# Fit an SVC with default settings on the training split and report its
# scores on the validation split.
clf = SVC()
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)

print("Classification Report: ")
print(classification_report(val_y, pred_y))

print("Accuracy Score: ", accuracy_score(val_y, pred_y))

In [None]:
# Learning curve
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, 
                        train_sizes=np.linspace(.1, 1., 10), verbose=0, plot=True):
    """Compute a learning curve for `estimator` and optionally plot it.

    estimator, X, y, cv, n_jobs, train_sizes and verbose are passed to
    sklearn's `learning_curve`. `title` and `ylim` are only used when
    `plot` is true.

    Returns (midpoint, diff), both taken at the largest training size:
    `diff` is (train mean + train std) - (validation mean - validation std)
    and `midpoint` is the average of those two bounds.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    # Mean and spread across folds for every training size.
    fit_mean = np.mean(fit_scores, axis=1)
    fit_std = np.std(fit_scores, axis=1)
    cv_mean = np.mean(cv_scores, axis=1)
    cv_std = np.std(cv_scores, axis=1)

    if plot:
        curves = (
            (fit_mean, fit_std, "b", "Score in Train Set"),
            (cv_mean, cv_std, "r", "Score in Val Set"),
        )
        plt.figure(figsize=(10, 10))
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Number of Train Samples")
        plt.ylabel("Score")

        # Shaded bands (mean +/- one std) first, then the mean lines.
        for mean, std, colour, _ in curves:
            plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
        for mean, _, colour, legend_label in curves:
            plt.plot(sizes, mean, 'o-', color=colour, label=legend_label)

        plt.legend(loc="best")

        plt.draw()
        # NOTE(review): this flips the y-axis, so higher scores are drawn
        # lower on the figure -- confirm this is intended.
        plt.gca().invert_yaxis()
        plt.show()

    upper_train = fit_mean[-1] + fit_std[-1]
    lower_val = cv_mean[-1] - cv_std[-1]
    midpoint = (upper_train + lower_val) / 2
    diff = upper_train - lower_val
    return midpoint, diff


plot_learning_curve(SVC(), "Learning Curve", train_x, train_y)