In [64]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from logistic import LogisticRegression

数据读取

In [65]:
features_raw=pd.read_csv('./ex4x.dat',header=None,sep=r'\s+')
labels_raw=pd.read_csv('./ex4y.dat',header=None,sep=r'\s+')

数据载入

In [66]:
features=np.column_stack((features_raw[0].values.tolist(),features_raw[1].values.tolist()))
labels=np.array(labels_raw[0].values.tolist()).reshape(-1,1)
print('{},{}'.format(features.shape,labels.shape))

(80, 2),(80, 1)


K倍交叉验证数据切分

In [67]:
def K_fold_split(data,k,test_data_index):
    data_copy=np.array(data)
    splited_datas=np.array_split(data_copy,k)
    test_data=splited_datas[test_data_index]
    train_data=[]
    for i in range(k):
        if i==test_data_index:
            continue
        train_data.extend(splited_datas[i])
    
    train_data=np.array(train_data).reshape(-1,data.shape[1])
    return test_data,train_data

随机梯度下降的逻辑回归进行5倍交叉验证

In [68]:
accuracy_history=[]
k=5
max_epoch=5000
lr=0.1
batch=8
for index in range(k):
    features_test_data,features_train_data=K_fold_split(features,k,index)
    labels_test_data,labels_train_data=K_fold_split(labels,k,index)
    lgr=LogisticRegression(features_train_data,labels_train_data)
    lgr.train(max_epoch,lr,batch)
    acc=lgr.compute_accuracy(features_test_data,labels_test_data)
    
    accuracy_history.append(acc)


In [69]:
print("-----------------Logistic Regression------------------")
for index,acc in enumerate(accuracy_history):
    print("index:{}  accuracy:{}".format(index,acc))

print("--------------average--------------")
print("average accuracy:{}".format(np.mean(accuracy_history)))

-----------------Logistic Regression------------------
index:0  accuracy:1.0
index:1  accuracy:0.5625
index:2  accuracy:0.9375
index:3  accuracy:0.6875
index:4  accuracy:0.625
--------------average--------------
average accuracy:0.7625


接下来使用tensorflow

In [70]:
import tensorflow as tf

In [71]:
class Ann(tf.keras.Model):
    def __init__(self, input_size, hidden_size, output_size, activator):
        super(Ann, self).__init__()
        self.input_layer = tf.keras.layers.Dense(hidden_size, input_shape=(input_size,))
        self.hidden_layer = tf.keras.layers.Dense(output_size)
        self.activator = activator
        
    def call(self, x):
        x = self.input_layer(x)
        x = self.activator(x)
        x = self.hidden_layer(x)
        x = tf.sigmoid(x)
        return x

In [72]:
def train(model:Ann, max_epoch, lr, features_train_data, labels_train_data):
    optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
    losses = []
    loss_fn=tf.nn.sigmoid_cross_entropy_with_logits
    for epoch in range(max_epoch):
        with tf.GradientTape() as tape:
            output = model(features_train_data)
            loss = loss_fn(labels=labels_train_data, logits=output)
            losses.append(loss.numpy())

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return losses

In [73]:
def normalize_data(data):
    res = np.array(data.T)
    means = np.mean(res, axis=1)
    stds = np.std(res, axis=1)
    for i in range(res.shape[0]):
        res[i] = (res[i] - means[i]) / stds[i]
    res = res.T
    return res, means, stds

In [74]:
def normalize_test_data(test_data,means,stds):
    res=np.array(test_data.T)
    
    for i in range(res.shape[0]):
        res[i]=(res[i]-means[i])/stds[i]
    
    res=res.T
    return res

In [75]:
def classify(data,threshold=0.5):
    res=np.array(data)
    for i in range(res.shape[0]):
        if res[i][0]>=threshold:
            res[i][0]=1
        else:
            res[i][0]=0
    return res

In [76]:
def accuracy(prediction,labels_test_data):
    equal_count=0
    total=prediction.shape[0]
    for i in range(prediction.shape[0]):
        if prediction[i][0]==labels_test_data[i][0]:
            equal_count+=1
    return equal_count/total

In [77]:
hidden_size=2
input_size=2
output_size=1
max_epoch=5000
accuracy_history=[]
k=5
lr=0.1
for index in range(k):
    # 数据处理阶段
    features_test_data,features_train_data=K_fold_split(features,5,index) # 训练集与测试集的切分
    labels_test_data,labels_train_data=K_fold_split(labels,5,index)
    features_train_norm,means,stds=normalize_data(features_train_data) # 训练集标准化
    features_train_norm=tf.constant(features_train_norm)
    labels_train_data=tf.constant(labels_train_data,dtype=tf.float32)
    # 模型训练
    model=Ann(input_size,hidden_size,output_size,tf.nn.relu)# 隐藏层使用relu进行激活
    losses=train(model,max_epoch,lr,features_train_norm,labels_train_data)
    # 测试集数据处理
    features_test_data_norm=normalize_test_data(features_test_data,means,stds) # 测试集标准化
    features_test_data_norm=tf.constant(features_test_data_norm)
    y_pred=model(features_test_data_norm)
    y_pred=classify(y_pred)
    acc=accuracy(y_pred,labels_test_data)
    accuracy_history.append(acc)
average_accuracy=np.mean(accuracy_history)

In [78]:
print("-----------------Ann------------------")
for index,acc in enumerate(accuracy_history):
    print("index:{}  accuracy:{}".format(index,acc))

print("--------------average--------------")
print("average accuracy:{}".format(np.mean(accuracy_history)))

-----------------Ann------------------
index:0  accuracy:0.875
index:1  accuracy:0.4375
index:2  accuracy:0.75
index:3  accuracy:0.8125
index:4  accuracy:0.6875
--------------average--------------
average accuracy:0.7125


接下来使用SVM进行预测

In [79]:
from libsvm.svmutil import *

转独热编码

In [80]:
def to_onehot(data,type_num):
    sample_num=data.shape[0]
    onehot=np.zeros((sample_num,type_num))
    for i,cls in enumerate(data):
        onehot[i,int(cls[0])]=1
    return onehot

将dataframe转为libsvm的格式

In [81]:
def dataframe_to_libsvm(df:pd.DataFrame,labels_col:int,output_file):
    with open(output_file,'w') as f:
        for _,row in df.iterrows():
            label=row[labels_col]
            features=row.drop(labels_col)
            feature_str=' '.join([f"{index}:{value}" for index,value in features.items()])
            line=f'{label} {feature_str}\n'
            f.write(line)
    return output_file

In [82]:
def dataframe_k_fold(df,k,selected_index):
    n=len(df)
    chunk_size=n//k
    remainder=n%k
    if remainder == 0:
        chunks=np.array_split(df,k)
    else:
        chunks=np.array_split(df.iloc[:-remainder],k)
    
    selected_chunk=chunks[selected_index]
    remaining_chunks=np.concatenate(chunks[:selected_index]+chunks[selected_index+1:])
    return pd.DataFrame(selected_chunk,columns=df.columns),pd.DataFrame(remaining_chunks,columns=df.columns)

In [83]:
all_data_raw=pd.concat([labels_raw,features_raw],axis=1)
all_data_raw.columns=['y','1','2']
all_data_raw['y']=all_data_raw['y'].replace(0,-1)
all_data_raw

Unnamed: 0,y,1,2
0,1.0,55.5,69.5
1,1.0,41.0,81.5
2,1.0,53.5,86.0
3,1.0,46.0,84.0
4,1.0,41.0,73.5
...,...,...,...
75,-1.0,20.0,65.5
76,-1.0,38.0,65.0
77,-1.0,18.5,74.5
78,-1.0,16.0,72.5


直接处理出5倍交叉分割所需的数据,生成过后注释掉了

In [84]:
# k=5
# for index in range(k):
#     df_test,df_train=dataframe_k_fold(all_data_raw,5,index)
#     train_libsvm=dataframe_to_libsvm(df_train,'y',f'./data/data_train{index}.txt')
#     test_libsvm=dataframe_to_libsvm(df_test,'y',f'./data/data_test{index}.txt')

In [85]:
def normalize(data):
    values_1=[item[1] for item in data]
    values_2=[item[2] for item in data]
    means=[np.mean(values_1),np.mean(values_2)]
    stds=[np.std(values_1),np.std(values_2)]
    for item in data:
        item[1]=(item[1]-means[0])/stds[0]
        item[2]=(item[2]-means[1])/stds[1]
    return data,means,stds
    

In [86]:
def normalize_test(data,means,stds):
    for item in data:
        item[1]=(item[1]-means[0])/stds[0]
        item[2]=(item[2]-means[1])/stds[1]
    return data

接下来使用不同的核函数以及不同的c值分别进行5次交叉验证

In [87]:
k=5
svm_types=['0']
kernal_types=['0','1','2','3']
cs=[round(x*0.1,1) for x in range(1,51)]
svm_type_mp={
    '0':'软间隔',
}
kernal_type_mp={
    '0':'线性核函数',
    '1':'多项式核函数',
    '2':'径向基核函数',
    '3':'Sigmoid核函数'
}
res=[]
for svm_type in svm_types:
    for kernal_type in kernal_types:
        for c in cs:
            accuracys=[]
            for index in range(k):
                train_y,train_x=svm_read_problem(f'./data/data_train{index}.txt')
                test_y,test_x=svm_read_problem(f'./data/data_test{index}.txt')
                train_x_norm,means,stds=normalize(train_x)
                model=svm_train(train_y,train_x_norm,f'-s {svm_type} -t {kernal_type} -c {c}')
                test_x=normalize_test(test_x,means,stds)
                _,accuracy,_=svm_predict(test_y,test_x,model)
                accuracys.append(accuracy[0])
            average_accuracy=np.mean(accuracys)
            res.append({
                'svm_type':svm_type_mp[svm_type],
                'kernal_type':kernal_type_mp[kernal_type],
                "c":c,
                'average_accuracy':average_accuracy
            })

Accuracy = 87.5% (14/16) (classification)
Accuracy = 43.75% (7/16) (classification)
Accuracy = 87.5% (14/16) (classification)
Accuracy = 68.75% (11/16) (classification)


Accuracy = 50% (8/16) (classification)
Accuracy = 93.75% (15/16) (classification)
Accuracy = 43.75% (7/16) (classification)
Accuracy = 87.5% (14/16) (classification)
Accuracy = 68.75% (11/16) (classification)
Accuracy = 50% (8/16) (classification)
Accuracy = 93.75% (15/16) (classification)
Accuracy = 37.5% (6/16) (classification)
Accuracy = 87.5% (14/16) (classification)
Accuracy = 68.75% (11/16) (classification)
Accuracy = 50% (8/16) (classification)
Accuracy = 93.75% (15/16) (classification)
Accuracy = 50% (8/16) (classification)
Accuracy = 87.5% (14/16) (classification)
Accuracy = 68.75% (11/16) (classification)
Accuracy = 50% (8/16) (classification)
Accuracy = 93.75% (15/16) (classification)
Accuracy = 50% (8/16) (classification)
Accuracy = 87.5% (14/16) (classification)
Accuracy = 68.75% (11/16) (classification)
Accuracy = 50% (8/16) (classification)
Accuracy = 93.75% (15/16) (classification)
Accuracy = 50% (8/16) (classification)
Accuracy = 87.5% (14/16) (classification)
Accuracy

In [88]:
max_accuracy=0
best_item=None
for item in res:
    if item['average_accuracy']>max_accuracy:
        max_accuracy=item['average_accuracy']
        best_item=item
    print(item)
print("表现最好的是:")
print(best_item)

{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.1, 'average_accuracy': 67.5}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.2, 'average_accuracy': 68.75}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.3, 'average_accuracy': 67.5}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.4, 'average_accuracy': 70.0}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.5, 'average_accuracy': 70.0}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.6, 'average_accuracy': 70.0}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.7, 'average_accuracy': 71.25}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.8, 'average_accuracy': 70.0}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 0.9, 'average_accuracy': 72.5}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 1.0, 'average_accuracy': 73.75}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 1.1, 'average_accuracy': 71.25}
{'svm_type': '软间隔', 'kernal_type': '线性核函数', 'c': 1.2, 'average_accuracy': 71.25}
{'svm_type': '软间隔', 'kernal_type': 

通过以上分析发现使用线性核函数，c为1.0时，进行5次交叉验证拥有最好的平均准确率

通过使用基于随机梯度下降的logistic回归，三层前向神经网络，支持向量机发现，三者的预测效果在该数据集上相差不大，但是支持向量机的预测效果最稳定，经过多次实验发现其平均准确率始终稳定在一个值，而logistic和三层前向神经网络的准确率有时高有时低，但总体发现logistic回归的效果比三层前向神经网络要好。
对于训练速度而言，支持向量机快于Logistc快于三层前向神经网络