# regression with neriol networks 

---
### backgroud & goal

通过对比简单神经网络与回归方程，加深对两者的理解与印象

平台基于pytorch与sklearn构建，前期数据处理流程尽量保存一致。

---

### 建立的回归方程 The type Of regression：

1. 多元回归
2. 逻辑回归
3. 多分类

----

### 数据来源包括 data source：

1. 信用卡数据
2. 汽车数据

---

### 模型过程

数据读取->EDA->数据整理（edit data）->模型建立->模型验证->模型部署

不论是回归抑或神经网络都依照这一流程

> pytorch 部署时数据处理使用 dataset，dataloader包装。
> 分类判断 

## 实验1 波士顿房价数据 

In [3]:
import pandas as pd
df=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",header=None,sep='\s+')


df.columns=('CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PROATIO','B','LSTAT','MEDV')
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PROATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [4]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PROATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


数据基本说明：
其中除了RAD为分类数据，其他都是数值数据
![](https://ai-studio-static-online.cdn.bcebos.com/541592bee2b245c4af0dd26f3d77720823bb71f8103f43bf82a528a7fa5e4abd)


实验步骤：
1 首先考虑全部使用数值数据，预测MEDV
2 加入类别数据，预测MEDV

In [1]:
import os 
import pandas as pd 
import numpy as np 
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import os
from sklearn.model_selection import KFold
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

#在处理类别数据时，相关数据在dataset中处理，包括对数据分割为数值列，和类别列，并分别返回
class HgDataset(Dataset):
    def __init__(self,data,tranform=None,traget_tranform=None):#(self,file_path,tranform=None,traget_tranform=None):
        """_summary_

        Args:
            file_path (_type_): _description_
            tranform (_type_, optional): _description_. Defaults to None.
            traget_tranform (_type_, optional): _description_. Defaults to None.
        """
        #self.data=pd.read_csv(file_path)
        self.data=data
        self.y=self.data['MEDV'].to_numpy().reshape(-1,1)# 设置因变量
        self.x=self.data.drop('MEDV',axis=1,inplace=False) 
        #数值数据与分类数据的划分
       # self.numerical_x=self.x.drop('RAD',axis=1,inplace=False).to_numpy()
        #self.category_x=self.x['RAD'].to_numpy()

        if tranform:
            self.x=tranform(self.x)
        if traget_tranform:
            self.y=traget_tranform(self.y)
    
    def __len__(self):
        return len(self.data)
    
    def getLabels(self):#返回分类数据分类标签的数量
        unique_labels = np.unique(self.category_x)
        num_labels = len(unique_labels)
        return unique_labels,num_labels
    
    def getNumerAttribut(self):
        return int(self.numerical_x.shape[1])

    def __getitem__(self,index):
        '''torch.is_tensor(idx): 这个函数检查idx是否是一个PyTorch张量（torch.Tensor）。在某些情况下，
        尤其是在使用高级索引或者批处理时，索引可能会以张量的形式给出。这个检查是为了确定是否需要将张量索引转换成Python列表，以便后续处理。

         idx.tolist(): 如果idx确实是一个张量，tolist()方法将这个张量转换成一个Python列表。这是必要的，
         因为在接下来的数据检索过程中，通常需要索引作为整数或者整数列表来使用，而不是张量。'''
        if torch.is_tensor(index):
            index = index.tolist()

        y_i=self.y[index]
        x_i=self.x[index]
        #nx_i=self.numerical_x[index]
        #cx_i=self.category_x[index]
        #return nx_i,cx_i,y_i
        return x_i,y_i
def reset_weights(m:nn.Module):
    """try to reset the weights to avoid weight leakage

    Args:
        m (_type_): model
    """
    for layer in m.children():
        if hasattr(layer,'reset_weights'):
            print(f'Reset trainable parameters of layer = {layer}')
            layer.reset_parameters()

def my_transform(batch,device):
    #nx,cx,y=zip(*batch)
    x,y=zip(*batch)
    #print(type(x))
    #nx=np.array(nx)
   # cx=np.array(cx)
    x=np.array(x)
    y=np.array(y)

    scaler=StandardScaler()
    #nx=scaler.fit_transform(nx)
    #nx=torch.tensor(nx,dtype=torch.float32).to(device)
    #cx=torch.tensor(cx,dtype=torch.long).to(device)#category数据类型应该是int
    x=scaler.fit_transform(x)
    x=torch.tensor(x,dtype=torch.float32).to(device)
    y=torch.tensor(y,dtype=torch.float32).to(device)
    return nx,cx,y
################################################################
class Mymodel(nn.Module):
    def __init__(self,numercial_at,category_label,embed_dim,drop_p):
        super(Mymodel, self).__init__()
       
        #self.embedd=nn.Embedding(num_embeddings=category_label,embedding_dim=embed_dim)
        self.l1=nn.Linear(numercial_at,100)
        #self.l2=nn.Linear(embed_dim+100,50)
        self.l2=nn.Linear(100,50)
        self.b1=nn.BatchNorm1d(50)
        self.d1=nn.Dropout(p=drop_p)
        self.r1=nn.ReLU()
        self.l3=nn.Linear(50,1)
             
    '''
    torch.flatten(x_categorical, start_dim=1): 将嵌入向量展平。
    start_dim=1意味着保留批次维度（假设批次维度是第0维），
    将嵌入向量的所有维度（从第1维开始）展平为一维。
    这一步骤是为了将每个样本的所有嵌入向量合并成一个单一的向量，以便与数值数据拼接。

    torch.cat([x_numeric, x_categorical], dim=1):
      将处理过的数值数据和展平后的类别数据嵌入向量沿着特征维度（dim=1）进行拼接。
      这样，模型就可以同时考虑类别特征和数值特征。
    '''
    def forward(self,numercial_input,category_input):
       # cx=self.embedd(category_input)
        #nx=self.l1(numercial_input)
        x=self.l1(numercial_input)
        #cx=torch.flatten(cx,start_dim=1)
        #x=torch.cat([nx,cx],dim=1)# 注意保持数值和类别数据的顺序一致
        x=self.l2(x)
        x=self.b1(x)
        x=self.d1(x)
        x=self.r1(x)
        x=self.l3(x)
        return x

         
if __name__ == "__main__":
    torch.cuda.init()
    ################################加入k-fold数目#############################################################
      # Set fixed random number seed
    torch.manual_seed(42)

    #k_folds=5 #相等于20%为test数据
    #kfold = KFold(n_splits=k_folds, shuffle=True)#实例化fold,注意在此已经进行样本洗牌，实现随机


    #########################
    #filename='data/Hg_notime_nocooked.csv'
    # 得到categories 的labes数，以及numerical 
    df=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",header=None,sep='\s+')


    df.columns=('CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PROATIO','B','LSTAT','MEDV')
    hdataset=HgDataset(df)
    
    unique_labels,categories_num=hdataset.getLabels()
    print(unique_labels)
    numerical_dim=hdataset.getNumerAttribut()
    print("the labels of category attribute is {} and the numerical attribute dim is {}".format(categories_num,numerical_dim,))
    

   
    ################################超参等设置###################################
    lr=0.02
    category_dim=5
    drop_p=0.3
    epochs=500
    batch=20
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("the current device is",device)
            #记录
    fold_test_loss=[]
    fold_train_loss=[]
    fold_acc=[]
    ################################################################
    #for fold, (train_ids,test_ids)in enumerate(kfold.split(hdataset)):


        #######################分割train，test############
    #dataset_len=len(hdataset)
    #print('the lenght of dataset',dataset_len)
    #train_size=int(0.8*dataset_len)
    #test_size=dataset_len-train_size
        #print("k-fold方法时，train与test集的划分基于kfold.split循环划分方法，本次是第{} fold".format(fold+1))
        #train_dataset, test_dataset = random_split(hdataset, [train_size, test_size])

    train_subsampler=torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler=torch.utils.data.SubsetRandomSampler(test_ids)
    train_dataloader=DataLoader(hdataset,batch_size=batch,collate_fn=lambda x:my_transform(x,device),sampler=train_subsampler)
    test_dataloader=DataLoader(hdataset,batch_size=batch,collate_fn=lambda x:my_transform(x,device),sampler=test_subsampler)
        #############################实例化模型，以及优化模型，损失函数######################################
    model=Mymodel(numercial_at=numerical_dim,category_label= categories_num+1,embed_dim=5,drop_p=drop_p)
        #model.apply(reset_weights)
    model.to(device)
    loss_fn=nn.MSELoss()
    opt=torch.optim.Adam(params=model.parameters(),lr=lr) 
    #########################实现循环梯度下降################################################################
    train_losses = []
    test_losses = []
    epoch_r2_scores = []
    for i in range(epochs):
        train_loss=0
        model.train()#在每个 epoch 结束后，应该使用 model.train() 将模型设置为训练模式，以确保 BatchNormalization 和 Dropout 层正常工作。
        for nx_batch,cx_batch,y_batch in train_dataloader:
            yp_batch=model.forward(nx_batch,cx_batch)
            loss=loss_fn(yp_batch,y_batch)
            opt.zero_grad()
            loss.backward()
            opt.step()
                # 用于记录训练和测试损失的列表
            train_loss += loss.item() * nx_batch.size(0)
        train_loss /= len(train_dataloader.dataset)
        train_losses.append(train_loss)
        
        model.eval()#进入评估模式，停止dropout和bathnorm
        test_loss=0
        y_true=[]
        y_pred_list=[]
        with torch.no_grad():
            for nx_batch,cx_batch,y_batch in test_dataloader:
                yp_batch=model.forward(nx_batch,cx_batch)
                loss=loss_fn(yp_batch,y_batch)
                test_loss += loss.item() * nx_batch.size(0)
                y_true.append(y_batch.cpu().numpy())  # 收集真实值
                y_pred_list.append(yp_batch.cpu().numpy())  # 收集预测值，注意此处应使用yp_batch

                # 由于对应的是回归方程，故计算r
        test_loss /= len(test_dataloader.dataset)
        test_losses.append(test_loss)

        y_pred_list=np.concatenate(y_pred_list,axis=0)
        y_true=np.concatenate(y_true,axis=0)
        r2=r2_score(y_true,y_pred_list)
        epoch_r2_scores.append(r2)
        if i==epochs-1:
            print(f'R^2 Score: {r2}')
      
            fold_acc.append(r2)
            # 打印每个epoch的损失
            print(f"Epoch {i+1}/{epochs}.. Train loss: {train_loss:.4f}.. Test loss: {test_loss:.4f}")
            fold_test_loss.append(test_loss)
            fold_train_loss.append(train_loss)
        # 绘制训练和测试损失
    plt.figure(figsize=(5, 3))
 
    plt.plot(test_losses, label='Test loss')
    plt.title('Loss vs. Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # 绘制批次R²分数曲线
# 所有epoch完成后，绘制epoch R²分数曲线
    plt.figure(figsize=(5, 3))
    plt.plot(epoch_r2_scores, marker='o', linestyle='-', color='blue')
    #plt.title('Epoch-wise R² Score of {} fold'.format(fold))
    plt.xlabel('Epoch')
    plt.ylabel('R² Score')
    plt.grid(True)
    plt.show()
 

[ 1  2  3  4  5  6  7  8 24]
the labels of category attribute is 9 and the numerical attribute dim is 12
the current device is cuda
k-fold方法时，train与test集的划分基于kfold.split循环划分方法，本次是第1 fold


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [3]:
import os 
import pandas as pd 
import numpy as np 
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import os
from sklearn.model_selection import KFold
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

#在处理类别数据时，相关数据在dataset中处理，包括对数据分割为数值列，和类别列，并分别返回
class HgDataset(Dataset):
    def __init__(self,data,tranform=None,traget_tranform=None):#(self,file_path,tranform=None,traget_tranform=None):
        """_summary_

        Args:
            file_path (_type_): _description_
            tranform (_type_, optional): _description_. Defaults to None.
            traget_tranform (_type_, optional): _description_. Defaults to None.
        """
        #self.data=pd.read_csv(file_path)
        self.data=data
        self.y=self.data['MEDV'].to_numpy().reshape(-1,1)# 设置因变量
        self.x=self.data.drop('MEDV',axis=1,inplace=False) 
        #数值数据与分类数据的划分
        self.numerical_x=self.x.drop('RAD',axis=1,inplace=False).to_numpy()
        self.category_x=self.x['RAD'].to_numpy()

        if tranform:
            self.x=tranform(self.x)
        if traget_tranform:
            self.y=traget_tranform(self.y)
    
    def __len__(self):
        return len(self.data)
    
    def getLabels(self):#返回分类数据分类标签的数量
        unique_labels = np.unique(self.category_x)
        num_labels = len(unique_labels)
        return unique_labels,num_labels
    
    def getNumerAttribut(self):
        return int(self.numerical_x.shape[1])

    def __getitem__(self,index):
        '''torch.is_tensor(idx): 这个函数检查idx是否是一个PyTorch张量（torch.Tensor）。在某些情况下，
        尤其是在使用高级索引或者批处理时，索引可能会以张量的形式给出。这个检查是为了确定是否需要将张量索引转换成Python列表，以便后续处理。

         idx.tolist(): 如果idx确实是一个张量，tolist()方法将这个张量转换成一个Python列表。这是必要的，
         因为在接下来的数据检索过程中，通常需要索引作为整数或者整数列表来使用，而不是张量。'''
        if torch.is_tensor(index):
            index = index.tolist()

        y_i=self.y[index]
        nx_i=self.numerical_x[index]
        cx_i=self.category_x[index]
        return nx_i,cx_i,y_i
def reset_weights(m:nn.Module):
    """try to reset the weights to avoid weight leakage

    Args:
        m (_type_): model
    """
    for layer in m.children():
        if hasattr(layer,'reset_weights'):
            print(f'Reset trainable parameters of layer = {layer}')
            layer.reset_parameters()

def my_transform(batch,device):
    nx,cx,y=zip(*batch)
    #print(type(x))
    nx=np.array(nx)
    cx=np.array(cx)
    y=np.array(y)

    scaler=StandardScaler()
    nx=scaler.fit_transform(nx)
    nx=torch.tensor(nx,dtype=torch.float32).to(device)
    cx=torch.tensor(cx,dtype=torch.long).to(device)#category数据类型应该是int
    y=torch.tensor(y,dtype=torch.float32).to(device)
    return nx,cx,y
################################################################
class Mymodel(nn.Module):
    def __init__(self,numercial_at,category_label,embed_dim,drop_p):
        super(Mymodel, self).__init__()
       
        #self.embedd=nn.Embedding(num_embeddings=category_label,embedding_dim=embed_dim)
        self.l1=nn.Linear(numercial_at,100)
        #self.l2=nn.Linear(embed_dim+100,50)
        self.l2=nn.Linear(100,50)
        self.b1=nn.BatchNorm1d(50)
        self.d1=nn.Dropout(p=drop_p)
        self.r1=nn.ReLU()
        self.l3=nn.Linear(50,1)
             
    '''
    torch.flatten(x_categorical, start_dim=1): 将嵌入向量展平。
    start_dim=1意味着保留批次维度（假设批次维度是第0维），
    将嵌入向量的所有维度（从第1维开始）展平为一维。
    这一步骤是为了将每个样本的所有嵌入向量合并成一个单一的向量，以便与数值数据拼接。

    torch.cat([x_numeric, x_categorical], dim=1):
      将处理过的数值数据和展平后的类别数据嵌入向量沿着特征维度（dim=1）进行拼接。
      这样，模型就可以同时考虑类别特征和数值特征。
    '''
    def forward(self,numercial_input,category_input):
        #cx=self.embedd(category_input)
        nx=self.l1(numercial_input)

        #cx=torch.flatten(cx,start_dim=1)
        #x=torch.cat([nx,cx],dim=1)# 注意保持数值和类别数据的顺序一致
        x=self.l2(x)
        x=self.b1(x)
        x=self.d1(x)
        x=self.r1(x)
        x=self.l3(x)
        return x

         
if __name__ == "__main__":
    torch.cuda.init()
    ################################加入k-fold数目#############################################################
      # Set fixed random number seed
   # torch.manual_seed(42)

    k_folds=5 #相等于20%为test数据
    kfold = KFold(n_splits=k_folds, shuffle=True)#实例化fold,注意在此已经进行样本洗牌，实现随机


    #########################
    #filename='data/Hg_notime_nocooked.csv'
    # 得到categories 的labes数，以及numerical 
    df=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",header=None,sep='\s+')


    df.columns=('CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PROATIO','B','LSTAT','MEDV')
    hdataset=HgDataset(df)
    
    unique_labels,categories_num=hdataset.getLabels()
    print(unique_labels)
    numerical_dim=hdataset.getNumerAttribut()
    print("the labels of category attribute is {} and the numerical attribute dim is {}".format(categories_num,numerical_dim,))
    

   
    ################################超参等设置###################################
    lr=0.02
    category_dim=5
    drop_p=0.3
    epochs=500
    batch=20
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("the current device is",device)
            #记录
    fold_test_loss=[]
    fold_train_loss=[]
    fold_acc=[]
    ################################################################
    for fold, (train_ids,test_ids)in enumerate(kfold.split(hdataset)):


        #######################分割train，test############
    #dataset_len=len(hdataset)
    #print('the lenght of dataset',dataset_len)
    #train_size=int(0.8*dataset_len)
    #test_size=dataset_len-train_size
        print("k-fold方法时，train与test集的划分基于kfold.split循环划分方法，本次是第{} fold".format(fold+1))
        #train_dataset, test_dataset = random_split(hdataset, [train_size, test_size])

        train_subsampler=torch.utils.data.SubsetRandomSampler(train_ids)
        test_subsampler=torch.utils.data.SubsetRandomSampler(test_ids)
        train_dataloader=DataLoader(hdataset,batch_size=batch,collate_fn=lambda x:my_transform(x,device),sampler=train_subsampler)
        test_dataloader=DataLoader(hdataset,batch_size=batch,collate_fn=lambda x:my_transform(x,device),sampler=test_subsampler)
        #############################实例化模型，以及优化模型，损失函数######################################
        model=Mymodel(numercial_at=numerical_dim,category_label= categories_num+1,embed_dim=5,drop_p=drop_p)
        #model.apply(reset_weights)
        model.to(device)
        loss_fn=nn.MSELoss()
        opt=torch.optim.Adam(params=model.parameters(),lr=lr) 
    #########################实现循环梯度下降################################################################
        train_losses = []
        test_losses = []
        epoch_r2_scores = []
        for i in range(epochs):
            train_loss=0
            model.train()#在每个 epoch 结束后，应该使用 model.train() 将模型设置为训练模式，以确保 BatchNormalization 和 Dropout 层正常工作。
            for nx_batch,cx_batch,y_batch in train_dataloader:
                yp_batch=model.forward(nx_batch,cx_batch)
                loss=loss_fn(yp_batch,y_batch)
                opt.zero_grad()
                loss.backward()
                opt.step()
                    # 用于记录训练和测试损失的列表
                train_loss += loss.item() * nx_batch.size(0)
            train_loss /= len(train_dataloader.dataset)
            train_losses.append(train_loss)
            
            model.eval()#进入评估模式，停止dropout和bathnorm
            test_loss=0
            y_true=[]
            y_pred_list=[]
            with torch.no_grad():
                for nx_batch,cx_batch,y_batch in test_dataloader:
                    yp_batch=model.forward(nx_batch,cx_batch)
                    loss=loss_fn(yp_batch,y_batch)
                    test_loss += loss.item() * nx_batch.size(0)
                    y_true.append(y_batch.cpu().numpy())  # 收集真实值
                    y_pred_list.append(yp_batch.cpu().numpy())  # 收集预测值，注意此处应使用yp_batch

                    # 由于对应的是回归方程，故计算r
            test_loss /= len(test_dataloader.dataset)
            test_losses.append(test_loss)

            y_pred_list=np.concatenate(y_pred_list,axis=0)
            y_true=np.concatenate(y_true,axis=0)
            r2=r2_score(y_true,y_pred_list)
            epoch_r2_scores.append(r2)
            if i==epochs-1:
               print(f'R^2 Score: {r2}')
               print(fold)
               fold_acc.append(r2)
                # 打印每个epoch的损失
               print(f"Epoch {i+1}/{epochs}.. Train loss: {train_loss:.4f}.. Test loss: {test_loss:.4f}")
               fold_test_loss.append(test_loss)
               fold_train_loss.append(train_loss)
            # 绘制训练和测试损失
        plt.figure(figsize=(5, 3))
        plt.plot(train_losses, label='Training loss of {} fold'.format( fold))
        plt.plot(test_losses, label='Test loss')
        plt.title('Loss vs. Epochs')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

        # 绘制批次R²分数曲线
    # 所有epoch完成后，绘制epoch R²分数曲线
        plt.figure(figsize=(5, 3))
        plt.plot(epoch_r2_scores, marker='o', linestyle='-', color='blue')
        plt.title('Epoch-wise R² Score of {} fold'.format(fold))
        plt.xlabel('Epoch')
        plt.ylabel('R² Score')
        plt.grid(True)
        plt.show()
    for i in range(k_folds):
        print("-----------------------")
        print("The K-fold crosss validation results of {} folds:".format(k_folds))
        print("the r2 of {}  folds:{}".format(i+1,fold_acc[i]))
        print("the test loss of {}  folds:{}".format(i+1,fold_test_loss[i]))
        print("the train loss of {}  folds:{}".format(i+1,fold_train_loss[i]))
        print("-----------------------")
        print("the average R² score is {} , the average train losss is {} ,the average test loss is {}".format(np.mean(fold_acc),np.mean(fold_train_loss),np.mean(fold_test_loss)))

[ 1  2  3  4  5  6  7  8 24]
the labels of category attribute is 9 and the numerical attribute dim is 12
the current device is cuda
k-fold方法时，train与test集的划分基于kfold.split循环划分方法，本次是第1 fold


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## 实验1：汽车价格数据
### overview:

In this lab you will apply regression to some realistic data. In this lab you will work with the automotive price dataset. Your goal is to construct a linear regression model to predict the price of automobiles from their characteristics. 

In this lab will learn to:

1. Use categorical data with scikit-learn. 
2. Apply transformations to features and labels to improve model performance. 
3. Compare regression models to improve model performance. 
4. Apply neorl network to the same data.

----

critial points:

1，EDA
2. Data processing 中 dumy数据处理
3. Embedding 层的使用
4. 使用sklearn pipeline



[](https://towardsdatascience.com/deep-learning-for-tabular-data-using-pytorch-1807f2858320)

In [17]:
import sys,os 
import common.util
sys.path.append(os.pardir)
print(os.getcwd())
print(os.path.abspath('.'))
print(os.pardir)
current_path = os.path.abspath('.')
print("current",current_path)
file=os.path.join(current_path,'/data/Automobile price data _Raw_.csv')
print(file)
 

c:\Users\tom\OneDrive\文档\GitHub\IRM_class\Deep Learning
c:\Users\tom\OneDrive\文档\GitHub\IRM_class\Deep Learning
..
current c:\Users\tom\OneDrive\文档\GitHub\IRM_class\Deep Learning
c:/data/Automobile price data _Raw_.csv


1.1 eda 
[参考](https://towardsdatascience.com/exploratory-data-analysis-eda-visualization-using-pandas-ca5a04271607)

In [18]:
import numpy as np 
import pandas as pd 
####################Preping Data###################### 
# 1. get data
current_path = os.getcwd()
#file=os.path.join(current_path,'/data/Automobile price data _Raw_.csv')
file='data/Automobile price data _Raw_.csv'
print(file)
auto_price_df=pd.read_csv(file)
# 2. eda base on pandas functions 基于pandas 对数据eda 
print("数据基本信息:")
print(auto_price_df.info())
print("****************************************************************")
print("数据描述信息:")
print(auto_price_df.describe())
print("****************************************************************")
head=5
print("前{}行数据：".format(head))
print(auto_price_df.head(head))
print("****************************************************************")


data/Automobile price data _Raw_.csv
数据基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-nu

对数据中Null值，缺失值进行分析，

In [19]:
import pandas as pd 
def datainfo_null(df:pd.DataFrame)->None:
    if df.isnull().values.any(): 
        null_count=df.isnull().sum()
        al_count=df.shape[0]
        print("数据框存在null行{},占比{:.2f}".format(null_count, null_count/al_count))
    else:
        print("数据框没有非空行")

def datainfo_duplicate(df:pd.DataFrame)->None:
    if auto_price_df.duplicated().values.any(): 
        null_count=auto_price_df.duplicated().sum()
        al_count=df.shape[0]
        print("数据框存在重复行{},占比{:.2f}".format(null_count, null_count/al_count))
    else:
        print("数据框没有重复行")
if __name__=="__main__":
    datainfo_null(auto_price_df)
    datainfo_duplicate(auto_price_df)
 

数据框没有非空行
数据框没有重复行


In [None]:
!pip install pandas-profiling

数据分布分析

为了简化，可以采用 pandas-profiling 对Dataframe数据进行DEA

在以上数据基础上，再通过箱图观察，得到float的具体极端值数据。

In [1]:
import pandas_profiling as pp
pp.ProfileReport(auto_price_df)



ModuleNotFoundError: No module named 'pandas_profiling'

根据数据报告，得到：
1. 部分属性出现 high cardinality 高基数

high-cardinality categorical attributes，从字面上理解，即对于某个category特征，不同值的数量非常多，这里暂且把它叫做高数量类别属性。反之，即低数量类别属性（low-cardinality）

对于低数量类别属性，通常在data science中采用的方式是将其转化为one-hot编码，即给每一个类别增加一个特征。但是当类别数量增加的时候，ont-hot编码增加的特征也在增加。所以， **one-hot编码无法适用于高数量特征属性**。

2. 部分数据出现 imbalance现象

---

单纯的依靠对数据观察无法指定下一步的数据清洗与分析计划，例如normalized-losses出现了high cardinality 现象，并存在大量的'?'字符，但该列代表含义不明确的情况下难以对其进行处理。如果normalized-losses出现问题数据过多，则直接删除该列。

---



数据观察得到，其中price作为因变量数据类型为obejct(str)对象类型，应该转换称为float浮点型

[参考](https://sparkbyexamples.com/pandas/pandas-convert-string-to-float-type-dataframe/)

但转换时，数据中出现'?'符号，不能直接通过astype转换，需要首先，找到并去除出现'?'的行。

同时其他列中也可能出现'?'或空白字符串'  '为此，自定义函数，首先进行统计，并去掉该列中无效行。
---

同时由于列名中出现'-'字符，会影响后期数据处理，将其替换为'_'

> Recode names

Notice that several of the column names contain the '-' character. Python will not correctly recognize character strings containing '-'.  Rather, such a name will be recognized as two character strings. The same problem will occur with column values containing many special characters including, '-', ',', '*', '/', '|', '>', '<', '@', '!' etc. If such characters appear in column names of values, they must be replaced with another character. 

Execute the code in the cell below to replace the '-' characters by '_':

In [20]:
def error_info(df:pd.DataFrame):
    
    #all=df.__len__
    #print(all)
    
        
    for col in df.columns:
        #if df[col].dtype == object:
            count=0
            for row in df[col]:
                if  row=='?':
                    count+=count
    print('列{}中含有非法字符数据{}行'.format(col,count))
                    
 
if __name__ == "__main__":
    error_info(auto_price_df)
    try:
        auto_price_df['price']=auto_price_df['price'].astype(float)
        #print(auto_price_df.dtypes)
        print("price数据类型转换成功")
    except Exception as e:
        print("出现异常：",e)
        print("通过异常处理程序，price数据类型转换成功")
        #print(auto_price_df.loc[auto_price_df['price']=='?'])
        auto_price_df.drop(auto_price_df[auto_price_df['price']=='?'].index, inplace=True)
        auto_price_df['price']=auto_price_df['price'].astype(float)
        print(auto_price_df.dtypes)

################################列改名################################
    auto_price_df.columns = auto_price_df.columns = [str.replace('-', '_') for str in auto_price_df.columns]
    print(type(auto_price_df.columns.name))

列price中含有非法字符数据0行
出现异常： could not convert string to float: '?'
通过异常处理程序，price数据类型转换成功
symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object
<class 'NoneType'>


In [21]:
auto_price_df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [None]:
auto_price_df['']

step.数据预处理 data processing

* 基于pandas sklearn 的数据预处理
由于numpy单纯为数据向量，没有列名，故在分割数据时，可以先基于Dataframe更为直观方便，包括


1. 因变量自变量数据分割，

```python
from sklearn.model_selection import train_test_split
```

2. category数据处理

3. trani,test 数据分割

4. 对train数据的标准化

 



In [58]:
from sklearn.model_selection import train_test_split
################################因变量自变量分割，转换numpy################################
dependentVars=auto_price_df['price'].to_numpy()
print("因变量price为向量，形状为{}".format(dependentVars.shape))
#删除因变量，创建自变量DataFrame
independentVars=auto_price_df.drop('price',axis=1)
#train_data,test_data = train_test_split(auto_price_df, test_size=0.33, random_state=42)
################################实现category类型变量处理###################################
print("通过数据观察，数据集确定其中nominal 类型包括：")

因变量price为向量，形状为(205,)


Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,four,109,mpfi,3.19,3.40,10.0,102,5500,24,30
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,five,136,mpfi,3.19,3.40,8.0,115,5500,18,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,four,141,mpfi,3.78,3.15,9.5,114,5400,23,28
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,four,141,mpfi,3.78,3.15,8.7,160,5300,19,25
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,six,173,mpfi,3.58,2.87,8.8,134,5500,18,23
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,six,145,idi,3.01,3.40,23.0,106,4800,26,27


pandas.core.series.Series