In [771]:
%matplotlib inline
import gluonbook as gb
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

In [772]:
train_data = pd.read_csv('cstraining.csv')
test_data = pd.read_csv('cstest.csv')

In [773]:
train_data.shape

(150000, 12)

In [774]:
train_data["NumberOfDependents"].describe()

count    146076.000000
mean          0.757222
std           1.115086
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          20.000000
Name: NumberOfDependents, dtype: float64

In [775]:
yichang_list = []

In [776]:
yichang_list.extend(train_data[train_data["NumberOfDependents"]==10].index.tolist())  #异常

In [777]:
yichang_list.extend(train_data[train_data["NumberOfDependents"]==11].index.tolist())  #异常

In [778]:
yichang_list.extend(train_data[train_data["NumberOfDependents"]==13].index.tolist())  #异常

In [779]:
yichang_list.extend(train_data[train_data["NumberOfDependents"]==20].index.tolist())  #异常

In [780]:
train_data = train_data.drop(yichang_list)

In [781]:
train_data["NumberOfDependents"].isnull().value_counts()

False    146069
True       3924
Name: NumberOfDependents, dtype: int64

In [782]:
train_data["NumberOfDependents"] = train_data["NumberOfDependents"].fillna('9999')

In [783]:
null_list = train_data[train_data["NumberOfDependents"]=='9999'].index.tolist()

In [784]:
train_data = train_data.drop(null_list)

In [785]:
train_data.shape

(146069, 12)

In [786]:
train_data.head(5)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0


In [787]:
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [788]:
all_features.shape

(247572, 11)

In [789]:
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index #all_features中dtypes不等于object的index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean(0)) / (x.std(0))                             )
all_features = all_features.fillna(all_features.mean())#mode
all_features = nd.array(all_features)
#df.mean()等价于df.mean(0)。把轴向数据求平均，得到每列数据的平均值。
#df.mean(1)按照另外一个axis的方向来求平均，得到每行数据的平均值。

In [790]:
numeric_features

Index(['DebtRatio', 'MonthlyIncome', 'NumberOfDependents',
       'NumberOfOpenCreditLinesAndLoans',
       'NumberOfTime30-59DaysPastDueNotWorse',
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'RevolvingUtilizationOfUnsecuredLines',
       'SeriousDlqin2yrs', 'age'],
      dtype='object')

In [791]:
n_train = train_data.shape[0]
train_features = nd.array(all_features[:n_train])
test_features = nd.array(all_features[n_train:])
train_labels = nd.array(train_data.NumberOfDependents.values).reshape((-1, 1))
#train_features = nd.array(train_features)
#test_features = nd.array(test_features)

In [792]:
def make_onehot(labels):
    labels.reshape(labels.shape[0])
    return (np.arange(10)==labels[:,None]).astype(np.integer)
def get_nd_onehot(np_one_hot):
    return nd.array(np.squeeze(np_one_hot))
train_labels_one_hot = get_nd_onehot(make_onehot(train_labels.asnumpy()))
train_labels_one_hot


[[0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
<NDArray 146069x10 @cpu(0)>

In [793]:
loss = gloss.SoftmaxCrossEntropyLoss()

def get_net():
    net = nn.Sequential()
    net.add(nn.Dense(10,activation = 'relu'),
            nn.Dense(10,activation = 'relu'),
            nn.Dense(10)
           )
    net.initialize()
    return net

In [794]:
#计算AUC函数
# input: y_true =[] and y_score=[]
# output: auc
def computeAUC(net,train_features,train_labels):
    y_score = net(train_features)
    auc = roc_auc_score(train_labels,y_score)
    return auc.asscalar()

In [795]:
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(
        train_features, train_labels), batch_size, shuffle=True)
    # 这里使用了 Adam 优化算法。
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': learning_rate, 'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            #print("X in train_iter shape is:",X.shape)  #  (16, 11)
            #print("y in train_iter shape is:",y.shape)   # (16, 10)
            #print("netX in train_iter shape is:",net(X).shape) #  (16, 10)
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(computeAUC(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(computeAUC(net, test_features, test_labels))
    return train_ls, test_ls

In [802]:
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k             # X.shape[0]多少个样本  fold_size每一折多少样本
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = nd.concat(X_train, X_part, dim=0)
            y_train = nd.concat(y_train, y_part, dim=0)
    return X_train, y_train, X_valid, y_valid

In [803]:
def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                  weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'auc',
                        range(1, num_epochs + 1), valid_ls,
                        ['train', 'valid'])
        print('fold %d, train auc: %f, valid auc: %f' % (
            i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k

In [804]:
k, num_epochs, lr, weight_decay, batch_size =5,50, 0.05,50, 16
verbose_epoch = num_epochs - 2
train_l, valid_l = k_fold(k, train_features, train_labels_one_hot, num_epochs, lr,
                         weight_decay, batch_size)
print('%d-fold validation: avg train auc: %f, avg valid auc: %f'
      % (k, train_l, valid_l))

MXNetError: Shape inconsistent, Provided = [16,10], inferred shape=[16,1]

In [499]:
%matplotlib inline
import gluonbook as gb
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
train_data = pd.read_csv('cstraining.csv')
test_data = pd.read_csv('cstest.csv')
train_data.shape
train_data["NumberOfDependents"].describe()
yichang_list = []
yichang_list.extend(train_data[train_data["NumberOfDependents"]==10].index.tolist())  #异常
yichang_list.extend(train_data[train_data["NumberOfDependents"]==11].index.tolist())  #异常
yichang_list.extend(train_data[train_data["NumberOfDependents"]==13].index.tolist())  #异常
yichang_list.extend(train_data[train_data["NumberOfDependents"]==20].index.tolist())  #异常
train_data = train_data.drop(yichang_list)
train_data["NumberOfDependents"].isnull().value_counts()
train_data["NumberOfDependents"] = train_data["NumberOfDependents"].fillna('9999')
null_list = train_data[train_data["NumberOfDependents"]=='9999'].index.tolist()
train_data = train_data.drop(null_list)
train_data.shape
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
all_features.shape
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index #all_features中dtypes不等于object的index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean(0)) / (x.std(0))                             )
all_features = all_features.fillna(all_features.mean())#mode
all_features = nd.array(all_features)
numeric_features
n_train = train_data.shape[0]
train_features = nd.array(all_features[:n_train])
test_features = nd.array(all_features[n_train:])
train_labels = nd.array(train_data.NumberOfDependents.values).reshape((-1, 1))

def make_onehot(labels):
    labels.reshape(labels.shape[0])
    return (np.arange(10)==labels[:,None]).astype(np.integer)
def get_nd_onehot(np_one_hot):
    return nd.array(np.squeeze(np_one_hot))
train_labels_one_hot = get_nd_onehot(make_onehot(train_labels.asnumpy()))
train_labels_one_hot
loss = gloss.SoftmaxCrossEntropyLoss()

def get_net():
    net = nn.Sequential()
    net.add(nn.Dense(10,activation = 'relu'),
            nn.Dense(10,activation = 'relu'),
            nn.Dense(10)
           )
    net.initialize()
    return net

#计算AUC函数
# input: y_true =[] and y_score=[]
# output: auc
def computeAUC(net,train_features,train_labels):
    y_score = net(train_features)
    auc = roc_auc_score(train_labels,y_score)
    return auc.asscalar()

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(
        train_features, train_labels), batch_size, shuffle=True)
    # 这里使用了 Adam 优化算法。
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': learning_rate, 'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            #print("X in train_iter shape is:",X.shape)  #  (16, 11)
            #print("y in train_iter shape is:",y.shape)   # (16, 10)
            #print("netX in train_iter shape is:",net(X).shape) #  (16, 10)
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(computeAUC(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(computeAUC(net, test_features, test_labels))
    return train_ls, test_ls

def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k             # X.shape[0]多少个样本  fold_size每一折多少样本
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = nd.concat(X_train, X_part, dim=0)
            y_train = nd.concat(y_train, y_part, dim=0)
    return X_train, y_train, X_valid, y_valid

def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                  weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'auc',
                        range(1, num_epochs + 1), valid_ls,
                        ['train', 'valid'])
        print('fold %d, train auc: %f, valid auc: %f' % (
            i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k

k, num_epochs, lr, weight_decay, batch_size =5,50, 0.05,50, 16
verbose_epoch = num_epochs - 2
train_l, valid_l = k_fold(k, train_features, train_labels_one_hot, num_epochs, lr,
                         weight_decay, batch_size)
print('%d-fold validation: avg train auc: %f, avg valid auc: %f'
      % (k, train_l, valid_l))