In [1]:
import pandas as pd
import numpy as np
import random
import math
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
import os
project_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('.')), '../'))
import sys
sys.path.append(project_path)
from util.visualization import draw_lines
from util.visualization import draw_scatters
from util.evaluate_process import mse, rmse, mae, classifier_evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_auc_score, roc_curve

In [2]:
# Load the preprocessed "samecar" CSV; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('../../data/preprocessed.samecar.csv')

In [3]:
# Columns of the preprocessed frame that are used as model inputs.
feats = [
    'colorp1',
    'colorp2',
    'fuel_typep1',
    'fuel_typep2',
    'displacement_standard1',
    'displacement_standard2',
    'gearboxp1',
    'gearboxp2',
    'displacement_diff',
    'displacement_diff_sparse',
    'mile_diff',
    'mile_diff_sparse',
    'mile_diff_rate',
    'mile_diff_rate_sparse',
    'year_diff',
    'year_diff_sparse',
    'licensed_city_diff_sparse',
    'title_diff',
    'title_diff_sparse',
    'register_time_diff',
    'register_time_diff_sparse',
    'is_import_diff_sparse',
    'transfer_times_diff',
    'transfer_times_diff_sparse',
]
# Keep only the discrete features: same order as `feats`, with the raw
# (non-`_sparse`) difference columns filtered out.
sparse_feats = [
    name
    for name in feats
    if name not in (
        'displacement_diff',
        'mile_diff',
        'mile_diff_rate',
        'year_diff',
        'title_diff',
        'register_time_diff',
        'transfer_times_diff',
    )
]

In [4]:
# Preview the selected feature columns. Only the first rows are shown so the
# notebook does not render (and store) the entire frame as cell output.
df[feats].head(10)

Unnamed: 0,colorp1,colorp2,fuel_typep1,fuel_typep2,displacement_standard1,displacement_standard2,gearboxp1,gearboxp2,displacement_diff,displacement_diff_sparse,...,year_diff,year_diff_sparse,licensed_city_diff_sparse,title_diff,title_diff_sparse,register_time_diff,register_time_diff_sparse,is_import_diff_sparse,transfer_times_diff,transfer_times_diff_sparse
0,0,0,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,2.0,8.0,0.0,-24.0,0.0,2.0,-1.0,1
1,11,7,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,0.0,7.0,0.0,34.0,0.0,2.0,0.0,2
2,11,-1,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,2.0,4.0,1.0,65.0,0.0,2.0,-2.0,0
3,0,-1,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,2.0,6.0,0.0,123.0,0.0,2.0,-1.0,1
4,7,7,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,2.0,3.0,1.0,5.0,0.0,2.0,0.0,2
5,11,7,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,2.0,5.0,0.0,46.0,0.0,2.0,-1.0,1
6,-1,0,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,2.0,5.0,0.0,-69.0,0.0,2.0,1.0,1
7,-1,-1,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,2.0,9.0,0.0,104.0,0.0,2.0,-1.0,1
8,0,0,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,0.0,4.0,1.0,14.0,0.0,2.0,0.0,2
9,11,-1,-1,-1,-1,-1,-1,1,-1.0,0.5,...,-1.0,0.5,0.0,8.0,0.0,43.0,0.0,2.0,0.0,2


In [5]:
# Model matrix: the selected features plus a constant column 'c' of ones, so the
# extra (last) weight plays the role of the intercept in the dot product.
rdf = df[feats].assign(c=1)
# Hold out 25% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    rdf,
    df['is_same'],
    test_size=0.25,
    random_state=10,
)
len(X_train), len(X_test)

(8038, 2680)

In [6]:
def sigmoid(v):
    """Return the logistic function 1 / (1 + e^-v) for a scalar `v`.

    The result is computed in a numerically stable way: `math.exp` is only ever
    called with a non-positive argument, so very negative inputs yield a value
    close to 0.0 instead of raising OverflowError.
    """
    if v >= 0:
        return 1.0 / (1 + math.exp(-v))
    # For v < 0 use the algebraically equivalent form e^v / (1 + e^v); e^v
    # underflows harmlessly towards 0.0 where e^-v would overflow.
    ev = math.exp(v)
    return ev / (1 + ev)


def sigmoid_arr(iy):
    """Apply `sigmoid` to every element of the iterable `iy`.

    Returns a plain list of floats in the same order as the input. Because
    `sigmoid` no longer overflows, there is no fallback value: previously an
    overflowing (very negative) score was printed and replaced by 0.5, which is
    far from its true value of ~0.
    """
    return [sigmoid(v) for v in iy]

In [9]:
# 
# Training state. It lives outside the training loop, so re-running the loop
# cell continues from the current values instead of starting over.
weight = [1] * (len(feats) + 1)  # one weight per feature plus one extra slot
run_times = []                   # iteration index of each recorded evaluation
ret_arr = {
    metric: []
    for metric in ('recall', 'precision', 'accuracy', 'tp', 'tn', 'fp', 'fn')
}
weight_arr = []                  # weights captured at each recorded evaluation
run_index = 0                    # total number of update steps applied so far

In [14]:
# Batch gradient descent for logistic regression on the training split.
# Uses and updates `weight`, `run_index`, `run_times`, `ret_arr` and `weight_arr`
# from the state cell, so re-running this cell continues training.
eta = 0.00001    # learning rate
n_iters = 500    # gradient steps per run of this cell
eval_every = 20  # record training metrics every this many steps
for i in range(n_iters):
    pred = sigmoid_arr(np.dot(X_train, weight))
    # Periodically snapshot the training metrics and the weights that produced
    # them; the hard labels are only needed for these snapshots.
    if run_index % eval_every == 0:
        intpred = [round(x) for x in pred]
        ret = classifier_evaluate(y_train, intpred)
        for k in ret:
            ret_arr[k].append(ret[k])
        run_times.append(run_index)
        weight_arr.append(weight)
    # The gradient of the summed log-loss w.r.t. the weights is X^T (pred - y).
    error = pred - y_train
    weight = weight - eta * np.dot(X_train.T, error)
    run_index += 1

# Final evaluation with the weights obtained after the last step.
pred = sigmoid_arr(np.dot(X_train, weight))
intpred = [round(x) for x in pred]
ret = classifier_evaluate(y_train, intpred)
for k in ret:
    ret_arr[k].append(ret[k])
print(ret)
# Record the real step count. Using `run_times[-1] + 1` would plot this point
# one step after the last snapshot even though further updates were applied
# since then, and it would fail if no snapshot had been taken yet.
run_times.append(run_index)
weight_arr.append(weight)
# Stack the recorded weight vectors into a 2-D array (one row per snapshot).
w = np.array(weight_arr)
draw_lines(run_times, [ret_arr['accuracy'], ret_arr['recall'], ret_arr['precision']], ['acc', 'recall', 'precision'])

{'recall': 0.9779932860872809, 'precision': 0.9472543352601156, 'accuracy': 0.9744961433192336, 'tp': 2622, 'fp': 146, 'tn': 5211, 'fn': 59}


In [15]:
## Test set: score the held-out rows with the trained weights
pred = sigmoid_arr(np.dot(X_test, weight))
# Round each sigmoid score to the nearest integer to obtain hard 0/1 labels.
intpred = list(map(round, pred))
ret = classifier_evaluate(y_test, intpred)
ret

{'recall': 0.9678899082568807,
 'precision': 0.9451287793952967,
 'accuracy': 0.9712686567164179,
 'tp': 844,
 'fp': 49,
 'tn': 1759,
 'fn': 28}

In [16]:
# Plot the ROC curve and compute the AUC.
# Expects `pred` to hold the sigmoid scores of the test split (set by the
# test-set evaluation cell), not the rounded labels.
fpr, tpr, thresholds = roc_curve(y_test, pred)
roc_auc = auc(fpr, tpr)
print(f'auc{roc_auc}')
# NOTE(review): presumably draw_lines plots each series against its first
# argument, in which case the second series (fpr vs. fpr) is the diagonal
# reference line — confirm against util.visualization.
draw_lines(fpr, [tpr, fpr], ['tp', 'fp'])

auc0.9922563200251685


In [17]:
# Display the learned weights; they are ordered like the columns of `rdf`,
# i.e. the entries of `feats` followed by the constant column 'c'.
weight

array([-7.39681551e-02,  1.54805275e-02,  1.73703583e-01,  1.02684028e-01,
       -2.99325889e-01,  4.23062609e-03,  9.05478110e-01, -1.65674532e-01,
        8.22282272e-01, -5.18061402e-01, -1.56292964e-02,  1.07491859e+00,
        8.54638477e-01,  1.42915681e+00,  1.41727977e+00, -5.31684906e-01,
        2.06906380e-01, -2.42923723e-01, -3.51596962e-01,  1.99604187e-03,
        2.32753185e+00, -1.29402034e+00, -8.10080444e-03,  3.74225942e-01,
       -1.05382562e-01])

In [None]:
### sklearn logistic regression results
# Pasted reference values (presumably coefficients and intercept, kept for
# comparison with the hand-trained weights). The pasted reprs used a bare
# `array(...)`, which is undefined in this notebook (only `np` is imported) and
# would raise NameError on a full re-run, so they are built via `np.array`.
sklearn_coef = np.array([[-0.07349292,  0.02651833,  0.06006925,  0.05488411,
                          -0.05494073,  0.02156823,  1.45419423, -1.4281623 ,
                          -0.81332124,  0.43560987,  0.05411366,  0.46573722,
                          -1.42587228,  2.27748505, -0.71848951, -0.58548616,
                           0.81885668,  0.18227067,  1.01951188, -0.01014606,
                           3.08776385, -1.73645331, -0.02141894,  0.80922805]])
sklearn_intercept = np.array([-6.96266783])
# Show both arrays (a bare sequence of expressions would display only the last).
sklearn_coef, sklearn_intercept