# 1.Prepare data

将原始训练数据集按照给定的比例(val_ratio)分割成训练集和验证集;测试集从单独的 test.csv 文件中读取

In [7]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve,roc_auc_score,classification_report 
import matplotlib.pyplot as plt
import csv
import pandas as pd


def get_train_data(train_path,test_path,val_ratio):
    """Load the train/test CSV files and hold out part of the training rows for validation.

    Parameters
    ----------
    train_path : str
        CSV file containing the feature columns plus an ``ACTION`` column,
        which is used as the label.
    test_path : str
        CSV file containing the feature columns plus an ``id`` column.
    val_ratio : float or int
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val, X_test, id_test)``.
    """
    train_data = pd.read_csv(train_path)
    y_train = train_data["ACTION"]
    # ``axis`` is passed by keyword: the positional form ``drop("ACTION", 1)``
    # was deprecated in pandas 1.3 and removed in pandas 2.0.
    X_train = train_data.drop("ACTION", axis=1)

    test_data = pd.read_csv(test_path)
    X_test = test_data.drop("id", axis=1)
    id_test = test_data["id"]

    # Fixed random_state keeps the train/validation split reproducible.
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio, random_state=3)
    print("Size of training data: ",len(X_train),", Size of validation data: ",len(X_val))
    return X_train, X_val, y_train,y_val,X_test,id_test


# 2. Version1: Traditional machine learning method
一些传统机器学习方法的测试,其中随机森林的结果最佳,而逻辑斯特回归效果欠佳,原因可能和数据集中标签为1的数据较多有关

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm,linear_model
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np

def train_test(model,ans_path,X_train, X_val, y_train,y_val,X_test,id_test):
    """Fit ``model``, report validation metrics and write test-set probabilities.

    Prints the validation AUC (computed from the second ``predict_proba``
    column) and the validation confusion matrix, then saves ``id_test``
    stacked with every ``predict_proba`` column for ``X_test`` to
    ``ans_path`` as comma-separated text.
    """
    fitted = model.fit(X_train, y_train)

    # Validation AUC uses the probability column at index 1.
    val_proba = fitted.predict_proba(X_val)
    print("AUC:")
    print(roc_auc_score(y_val, val_proba[:, 1]))

    # Confusion matrix uses the hard label predictions.
    val_labels = fitted.predict(X_val)
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, val_labels))

    # Persist the id column next to the predicted probabilities.
    test_proba = fitted.predict_proba(X_test)
    submission = np.column_stack((id_test, test_proba))
    np.savetxt(ans_path, submission, delimiter=',')
    
def main():
    """Run the three classical baselines on one shared train/validation split.

    Each experiment prints its banner, then ``train_test`` prints the
    validation metrics and writes that model's predictions to its own CSV.
    """
    splits = get_train_data("train.csv", "test.csv", 0.15)

    # (banner, estimator, output file) for every baseline, in execution order.
    experiments = [
        ("Random Forest:",
         RandomForestClassifier(criterion='entropy',
                                n_estimators=1000,
                                random_state=1,
                                n_jobs=2),
         "forest.csv"),
        ("\nLogistic Regression:", linear_model.LogisticRegression(), "logist.csv"),
        ("\nNaive Bayes:", GaussianNB(), "naive_bayes.csv"),
    ]
    for banner, estimator, out_csv in experiments:
        print(banner)
        train_test(estimator, out_csv, *splits)
main()

Size of training data:  27853 , Size of validation data:  4916
Random Forest:
AUC:
0.860190210829386
Confusion Matrix:
[[ 110  181]
 [  58 4567]]

Logistic Regression:
AUC:
0.508703259960992
Confusion Matrix:
[[   0  291]
 [   0 4625]]

Naive Bayes:
AUC:
0.5738333797715242
Confusion Matrix:
[[  18  273]
 [ 194 4431]]


# 3.Version2: Deep learning method
利用pytorch和keras两种框架尝试了,可以在validation上达到0.94的准确度

## 3.1 pytorch版本
Tricks:

(1)这里尝试了几种loss,目前使用的是Binary Cross Entropy(BCEloss).

(2)optimizer使用的是Adam, learning rate为1e-6(与下面代码中的 `optim.Adam(D.parameters(), lr=1e-6)` 一致)

(3)在loss上加了类别权重约束。由于数据集中类别严重不平衡:
标签为1的样本有30872个,
标签为0的样本只有1897个,
因此给标签为0和1的样本,loss权重分别设置为约2和0.5

In [43]:
import torch
import torch.nn.functional as nn
import torch.autograd as autograd
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
from torch.autograd import Variable
import sklearn
import time
import random
import re


def validation_acc(model,X_val,y_val,use_gpu,class_weight):
    """Score ``model`` on the validation set.

    Parameters
    ----------
    model : torch.nn.Module
        Network whose output is one score per row of ``X_val``.
    X_val, y_val : torch Variable/Tensor
        Validation features and 0/1 labels (labels are float so they can be
        fed to ``BCELoss``).
    use_gpu : bool
        Unused here; kept so existing callers do not break.
    class_weight : torch Variable/Tensor
        Per-class loss weights, indexed by the integer label.

    Returns
    -------
    tuple
        ``(accuracy, auc, loss_val)`` where ``accuracy`` uses a 0.5 threshold,
        ``auc`` is computed from the raw scores and ``loss_val`` is the
        class-weighted BCE loss (still a tensor).
    """
    # Dropout must be off while scoring, otherwise the metrics are noisy.
    # The previous mode is restored so a surrounding training loop is unaffected.
    was_training = model.training
    model.eval()
    try:
        score = model(X_val).view(-1)
    finally:
        model.train(was_training)

    score_ = score.data.cpu().numpy()
    y_val_ = y_val.data.cpu().numpy()

    val_ans = (score_ >= 0.5).astype("int64")
    correct = (y_val_ == val_ans).sum()
    accuracy = 1.0 * correct / len(X_val)

    # AUC is a ranking metric: it must see the continuous scores. Feeding the
    # thresholded 0/1 predictions collapses the ROC curve to a single point.
    auc = roc_auc_score(y_val_, score_)

    weight = class_weight[y_val.long()]
    loss_fn = torch.nn.BCELoss(weight=weight)
    loss_val = loss_fn(score, y_val)
    return accuracy,auc,loss_val

def convert2onehot(label,config):
    """One-hot encode integer class labels.

    Parameters
    ----------
    label : sequence or array of ints
        Class indices in ``[0, config["y_dim"])``. Assumed to live on the CPU
        (list, NumPy array or CPU tensor).
    config : dict
        Must contain ``"y_dim"``, the number of classes.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(label), config["y_dim"])`` with a single
        1 per row at the label's index.
    """
    class_num = config["y_dim"]
    batch_size = len(label)
    # Encode the labels that were passed in. The previous code replaced
    # ``label`` with random class indices, so the result was unrelated to the input.
    index = torch.from_numpy(np.array(label, dtype=np.int64)).view(batch_size, 1)
    one_hot = torch.zeros(batch_size, class_num).scatter_(1, index, 1)

    return one_hot.numpy()

def train(X_train, X_val, y_train,y_val,config):
    """Train a fully-connected binary classifier with class-weighted BCE loss.

    Parameters
    ----------
    X_train, X_val : numpy.ndarray
        Feature matrices; each row must have ``config["X_dim"]`` columns.
    y_train, y_val : numpy.ndarray
        0/1 labels aligned with the feature matrices.
    config : dict
        Keys read here: ``model_path`` (prefix for checkpoint files),
        ``batch_size``, ``hidden_dim`` (list of hidden layer widths),
        ``X_dim``, ``epoch`` (number of epochs) and ``use_gpu``.

    Side effects
    ------------
    Prints the class weights and one progress line per epoch, and saves the
    whole model to ``model_path + '<epoch>_D_model.pkl'`` whenever the
    0-based epoch index is a multiple of 10.

    Raises
    ------
    ValueError
        If the training set is smaller than one batch.
    """
    model_path=config["model_path"]
    batch_size=config["batch_size"]
    hidden_dim =config["hidden_dim"]
    X_dim=config["X_dim"]
    num_epochs=config["epoch"]
    use_gpu=config["use_gpu"]

    # Only full batches are used; the trailing partial batch is dropped.
    it_per_epoch=int( len(X_train)/batch_size )
    if it_per_epoch < 1:
        raise ValueError("batch_size (%d) is larger than the training set (%d rows)"
                         % (batch_size, len(X_train)))

    def _to_variable(array):
        """Convert array-like data to a float32 Variable on the configured device."""
        tensor = torch.from_numpy(np.asarray(array)).float()
        if use_gpu:
            tensor = tensor.cuda()
        return Variable(tensor)

    def _as_float(value):
        """Read a one-element loss as a Python float (``.item()`` if available, else ``.data[0]``)."""
        return value.item() if hasattr(value, "item") else value.data[0]

    # Both branches share one conversion path. The CPU branch previously read
    # the names ``X`` and ``y`` before they were assigned and could not run.
    X_train = _to_variable(X_train)
    y_train = _to_variable(y_train)
    X_val = _to_variable(X_val)
    y_val = _to_variable(y_val)

    # Linear -> ReLU -> Dropout(0.2) for every entry of hidden_dim, followed by
    # a single sigmoid output unit.
    layers = []
    in_dim = X_dim
    for out_dim in hidden_dim:
        layers += [torch.nn.Linear(in_dim, out_dim), torch.nn.ReLU(), torch.nn.Dropout(0.2)]
        in_dim = out_dim
    layers += [torch.nn.Linear(in_dim, 1), torch.nn.Sigmoid()]
    D = torch.nn.Sequential(*layers)

    # Move the model before building the optimizer so it tracks the final parameters.
    if use_gpu:
        D=D.cuda()

    #Optimizer
    D_solver = optim.Adam(D.parameters(), lr=1e-6)

    # 'balanced' weights are inversely proportional to class frequency; the
    # weight of class 0 is then divided by 4.
    labels_np = y_train.data.cpu().numpy()
    c_weight = list(sklearn.utils.class_weight.compute_class_weight('balanced',
                                                                    classes=np.unique(labels_np),
                                                                    y=labels_np))
    c_weight[0]/=4
    print(c_weight)
    class_weight = _to_variable(c_weight)

    # torch.save does not create missing directories.
    ckpt_dir = os.path.dirname(model_path)
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)

    print("Begin training!,lenth of training dataset is %d"%(len(X_train)))

    for epoch in range(num_epochs):
        D.train()
        start=time.time()
        for it in range(it_per_epoch):
            # Consecutive, non-shuffled mini-batches.
            begin = it*batch_size
            X = X_train[begin:begin+batch_size]
            y = y_train[begin:begin+batch_size]

            score=D(X).view(-1)
            # Per-sample weight looked up from the sample's class.
            weight = class_weight[y.long()]
            loss = torch.nn.BCELoss(weight=weight)(score,y)

            D.zero_grad()
            loss.backward()
            D_solver.step()
        end=time.time()

        #Calculating precision of validation dataset
        val_acc,auc,loss_val=validation_acc(D,X_val,y_val,use_gpu,class_weight)

        print('Epoch: %d; Iters: %d; loss: %.4f ; accuracy of validation: %.4f ; loss_val: %.4f ; time elasped %.2f.'
                      %(epoch,it, _as_float(loss),val_acc,_as_float(loss_val),end-start))

        if epoch%10==0:
            torch.save(D, model_path+'%d_D_model.pkl'%epoch)

def test(id_test,X_test,model_path,config):
    """Load a saved model, predict on ``X_test`` and write ``id,score`` rows.

    Parameters
    ----------
    id_test : array-like
        Identifier of each test row; written as the first output column.
    X_test : numpy.ndarray
        Test feature matrix.
    model_path : str
        File produced by ``torch.save(model, ...)``.
    config : dict
        Keys read here: ``use_gpu`` and ``ans_path`` (output CSV path).
    """
    use_gpu=config["use_gpu"]
    ans_path=config["ans_path"]

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    if use_gpu:
        model = torch.load(model_path).cuda()
    else:
        # Remap storages to the CPU so a checkpoint saved from a GPU run still loads.
        model = torch.load(model_path, map_location=lambda storage, loc: storage)

    # The model is pickled in whatever mode it was saved in. Switch to eval so
    # Dropout is disabled and predictions are deterministic.
    model.eval()

    features = torch.from_numpy(np.asarray(X_test)).float()
    if use_gpu:
        features = features.cuda()
    ans=model(Variable(features))

    ans=np.column_stack((  id_test,ans.data.cpu().numpy()  ))
    np.savetxt(ans_path, ans, delimiter = ',')
    print("Ans saved.")
    
def main():
    """Train the PyTorch model and write predictions from its newest checkpoint."""
    config={
        "train_path":"train.csv",
        "test_path":"test.csv",
        "ans_path":'ans.csv',
        "model_path":"model/",
        "batch_size":64,
        "hidden_dim" : [16,64,128,256,128,64],
        "X_dim" :9,
        "y_dim" : 2,
        "epoch" :40,
        "use_gpu":True,
        "use_one_hot":False
    }
    X_train, X_val, y_train,y_val,X_test,id_test=get_train_data(config["train_path"],config["test_path"],0.2)

    train(X_train.values,X_val.values, y_train.values,y_val.values,config)

    # train() saves a checkpoint when the 0-based epoch index is a multiple of
    # 10, so with 40 epochs the files are 0, 10, 20 and 30. "40_D_model.pkl" is
    # never written; load the newest checkpoint that actually exists.
    last_ckpt_epoch = ((config["epoch"] - 1) // 10) * 10
    test(id_test,X_test.values,config["model_path"]+"%d_D_model.pkl"%last_ckpt_epoch,config)
main()

Size of training data:  26215 , Size of validation data:  6554
[2.1845833333333333, 0.5303459437588509]
Begin training!,lenth of training dataset is 26215
Epoch: 0; Iters: 408; loss: 11.6188 ; accuracy of validation: 0.5285 ; loss_val: 8.1185 ; time elasped 1.47.
Epoch: 1; Iters: 408; loss: 8.5586 ; accuracy of validation: 0.6454 ; loss_val: 6.7672 ; time elasped 2.69.
Epoch: 2; Iters: 408; loss: 7.8113 ; accuracy of validation: 0.7309 ; loss_val: 5.7922 ; time elasped 1.29.
Epoch: 3; Iters: 408; loss: 6.8388 ; accuracy of validation: 0.7890 ; loss_val: 5.1666 ; time elasped 1.45.
Epoch: 4; Iters: 408; loss: 4.0795 ; accuracy of validation: 0.8351 ; loss_val: 4.6950 ; time elasped 1.46.
Epoch: 5; Iters: 408; loss: 5.5615 ; accuracy of validation: 0.8651 ; loss_val: 4.3531 ; time elasped 1.48.
Epoch: 6; Iters: 408; loss: 5.5469 ; accuracy of validation: 0.8789 ; loss_val: 4.2750 ; time elasped 1.47.
Epoch: 7; Iters: 408; loss: 5.0536 ; accuracy of validation: 0.8892 ; loss_val: 4.1812 ;

## 3.2 keras版本
Tricks:

(1)loss依旧是Cross Entropy.

(2)optimizer使用的是Adam, learning rate为5e-5

(3)在loss上加了权重约束,loss权重分别设置为8和0.5

In [46]:
from keras.models import Model,load_model
from keras.layers import Input,Dense, Dropout, Activation, Flatten,advanced_activations
from keras.layers.normalization import BatchNormalization
import keras.backend as K
from keras import regularizers
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from sklearn.utils import class_weight
import math
import os
import numpy as np


import keras
from sklearn.metrics import roc_auc_score
import numpy as np


# create the base model
def dense_block(x,filters):
    """Return ``x`` passed through Dense(``filters``) -> ReLU -> Dropout(0.2).

    The Dense layer uses Glorot-normal initialisation and an L2 kernel
    regulariser with factor 0.
    """
    dense = Dense(filters,
                  kernel_initializer='glorot_normal',
                  kernel_regularizer=regularizers.l2(0))
    # ReLU is the active choice; LeakyReLU(alpha=0.3) was the alternative tried here.
    relu = Activation('relu')
    dropout = Dropout(0.2)
    return dropout(relu(dense(x)))

def train(X_train,X_val, y_train,y_val,config):
    """Build (or reload) the Keras classifier and fit it with class weights.

    Parameters
    ----------
    X_train, X_val : array-like
        Feature matrices.
    y_train, y_val : numpy.ndarray
        One-hot encoded labels; column 1 is the indicator of class 1.
    config : dict
        Keys read here: ``batch_size``, ``num_classes``, ``epoch``,
        ``hidden_dim``, ``model_path`` (checkpoint file) and
        ``load_pre_model`` (resume from ``model_path`` when it exists).

    Side effects
    ------------
    Prints the class weights and saves the best model (by the monitored
    validation metric) to ``config["model_path"]``.
    """
    batch_size=config["batch_size"]
    num_classes =config["num_classes"]
    epochs=config["epoch"]
    hidden_dim =config["hidden_dim"]
    filepath=config["model_path"]
    load_pre_model=config["load_pre_model"]

    if load_pre_model and os.path.exists(filepath):
        model=load_model(filepath)
    else:
        if load_pre_model:
            # Fresh run: nothing to resume from, so build a new model instead of crashing.
            print("No saved model found at %s; training from scratch." % filepath)
        main_input = Input(shape=X_train.shape[1:],name='input')
        x=main_input
        for units in hidden_dim:
            x=dense_block(x,units)

        # NOTE(review): sigmoid outputs combined with categorical_crossentropy is
        # unusual (softmax is the conventional pairing). Left unchanged so that
        # previously saved checkpoints stay comparable.
        predictions = Dense(num_classes, activation='sigmoid',name='main_output',kernel_initializer='glorot_normal',
                           kernel_regularizer=regularizers.l2(0))(x)
        model = Model(inputs=main_input, outputs=predictions)

    opt=Adam(lr=5e-5, beta_1=0.99, beta_2=0.999, epsilon=1e-8)

    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    # 'balanced' weights are inversely proportional to class frequency.
    labels = y_train[:,1]
    classes = np.unique(labels)
    weights = class_weight.compute_class_weight('balanced', classes=classes, y=labels)
    print(weights)
    # Keras documents ``class_weight`` as a dict mapping class index -> weight;
    # a bare ndarray is not a supported form.
    c_weight = {int(cls): float(w) for cls, w in zip(classes, weights)}

    # ModelCheckpoint does not create missing directories.
    ckpt_dir = os.path.dirname(filepath)
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)

    # NOTE(review): newer Keras releases name this metric 'val_accuracy' — confirm
    # against the installed version.
    model.fit(X_train,y_train,
              validation_data=(X_val,y_val),class_weight=c_weight,
              epochs=epochs, batch_size=batch_size,
              callbacks=[ModelCheckpoint(filepath,monitor='val_acc',
                                         verbose=0,save_best_only=True,mode='auto')])
    
    
def test(id_test,X_test,X_val,y_val,config):
    """Evaluate the saved Keras model on validation data and export test predictions.

    Loads the model from ``config["model_path"]``, prints the result of
    ``model.evaluate`` on the validation set, and writes ``id_test`` next to
    the second prediction column to ``config["ans_path"]`` as comma-separated
    text.
    """
    net = load_model(config["model_path"])

    # evaluate() returns the loss followed by the compiled metrics.
    val_scores = net.evaluate(X_val, y_val, verbose=1)
    print("validation accuracy:",val_scores)

    # Column 1 of the prediction is exported as the score.
    proba = net.predict(X_test)
    submission = np.column_stack((id_test, proba[:, 1]))
    np.savetxt(config["ans_path"], submission, delimiter=',')
    
def main():
    """Run the Keras pipeline: load data, one-hot the labels, train, then evaluate and export."""
    config = dict(
        train_path="train.csv",
        test_path="test.csv",
        ans_path='ans_keras.csv',
        model_path="model/keras.h5",
        batch_size=32,
        hidden_dim=[16,64,128,256,128,64],
        X_dim=9,
        num_classes=2,
        epoch=40,
        use_gpu=True,
        use_one_hot=False,
        load_pre_model=True,
    )

    splits = get_train_data(config["train_path"], config["test_path"], 0.25)
    X_train, X_val, y_train, y_val, X_test, id_test = splits

    # The network has one output unit per class, so the labels are one-hot encoded.
    n_classes = config["num_classes"]
    y_train = to_categorical(y_train, n_classes)
    y_val = to_categorical(y_val, n_classes)

    train(X_train, X_val, y_train, y_val, config)
    test(id_test, X_test, X_val, y_val, config)

main()

Size of training data:  24576 , Size of validation data:  8193
[8.75213675 0.53029518]
Train on 24576 samples, validate on 8193 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
validation accuracy: [0.22815047834697375, 0.9398266813133163]
