In [19]:
# -*- coding: utf-8 -*-
import random
import torch
import math
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold,GridSearchCV

# 获取数据

In [2]:
# Columns in which a literal 0 is parsed as a missing value (NaN) on load.
_ZERO_AS_MISSING = ('dist_out_in', 'dist_out_mid', 'dist_mid_in', 'dist_in_hard')
# Files read when getData() is called without arguments.
_DEFAULT_DATA_FILES = ('./data/data_1110.csv', './data/data_1205.csv')


def getData(paths=_DEFAULT_DATA_FILES):
    """Load the CSV files, stack them row-wise and split features from the label.

    Args:
        paths: iterable of CSV file paths to read and concatenate in order.
            Defaults to the two files the notebook originally hard-coded.

    Returns:
        (features, label): ``features`` is the concatenated frame without its
        first column and without its last column; ``label`` is the ``'HIC15'``
        column of the concatenated frame.
    """
    na_values = {column: 0 for column in _ZERO_AS_MISSING}
    frames = [pd.read_csv(path, na_values=na_values) for path in paths]
    # Drop the first column and rebuild a contiguous 0..n-1 index.
    raw_data = pd.concat(frames, axis=0).iloc[:, 1:].reset_index(drop=True)
    # NOTE(review): features are selected by position (all but the last column)
    # while the label is selected by name; this assumes 'HIC15' is the last
    # column of the CSV files -- confirm against the data.
    return raw_data.iloc[:, :-1], raw_data['HIC15']

In [3]:
# Load the feature frame and the 'HIC15' label column returned by getData().
raw_data, raw_label = getData()

In [4]:
# Inspect the (rows, columns) size of the raw feature frame.
raw_data.shape

(357, 58)

# 缺失值处理与方差筛选 (Missing-value imputation and variance filtering)

In [12]:
def imputerVar(data, label, threshold=1):
    '''Fill in missing values, then filter the features by variance.

    Args:
        data: feature DataFrame; its column names are carried through.
        label: target values; returned unchanged.
        threshold: value passed to ``VarianceThreshold``.

    Returns:
        (data, label): the imputed and filtered features as a new DataFrame,
        and the untouched ``label``.

    Side effects:
        Prints the set of column names removed by the variance filter.
    '''
    # Impute with a default-configured SimpleImputer, keeping the column names.
    imputed_values = SimpleImputer().fit_transform(data)
    imputed = pd.DataFrame(imputed_values, columns=data.columns)

    selector = VarianceThreshold(threshold=threshold)
    selector.fit(imputed)
    kept_names = selector.get_feature_names_out()

    # Report which input columns did not survive the filter.
    print(set(selector.feature_names_in_) - set(kept_names))

    selected = pd.DataFrame(selector.transform(imputed), columns=kept_names)
    return selected, label

In [13]:
# Impute missing values and filter features with a variance threshold of 0.
data, label = imputerVar(raw_data, raw_label, 0)

{'delta_in', 'T_out', 'Y_S_out', 'delta_mid_R', 'T_S_out', 'T_in', 'T_mid_F', 'Y_S_mid_R', 'JL_Y', 'Y_S_mid_F', 'JL_X', 'delta_out', 'Y_S_in', 'T_S_mid_F', 'T_mid_R', 'head_V', 'JR_X', 'JR_Z', 'delta_mid_F', 'T_S_mid_R', 'T_S_in', 'JR_Y'}


# 数据标准化与拆分

In [18]:
def stdSplitData(data, label, test_size=0.1, random_state=420):
    '''Split into train/test sets, then standardize using training statistics only.

    The scaler is fit on the training rows alone and then applied to both
    partitions, so no information from the held-out rows leaks into the
    preprocessing step.

    Args:
        data: feature DataFrame.
        label: target values aligned with ``data``.
        test_size: fraction (or count) of rows held out for testing.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        (x_train, x_test, y_train, y_test): the feature partitions are
        DataFrames with the original column names and row index; the label
        partitions are returned as produced by ``train_test_split``.
    '''
    x_train, x_test, y_train, y_test = train_test_split(
        data, label, test_size=test_size, random_state=random_state
    )
    scaler = StandardScaler().fit(x_train)
    x_train = pd.DataFrame(
        scaler.transform(x_train), columns=x_train.columns, index=x_train.index
    )
    x_test = pd.DataFrame(
        scaler.transform(x_test), columns=x_test.columns, index=x_test.index
    )
    return x_train, x_test, y_train, y_test
xTrain,xTest,yTrain,yTest = stdSplitData(data, label)

# Custom Dataset

In [None]:
class CustomDataset(Dataset):
    '''Map-style dataset that pairs each feature row with its label.

    Implements ``__len__`` and ``__getitem__`` so that it can be handed to a
    ``torch.utils.data.DataLoader``.

    Args:
        features: array-like, pandas object or tensor of samples (first axis is
            the sample axis). ``None`` gives an empty dataset.
        labels: array-like, pandas object or tensor with one entry per sample.
            ``None`` gives an empty dataset.

    Raises:
        ValueError: if ``features`` and ``labels`` hold different numbers of
            samples.
    '''

    def __init__(self, features=None, labels=None):
        self.features = self._to_float_tensor(features)
        self.labels = self._to_float_tensor(labels)
        if len(self.features) != len(self.labels):
            raise ValueError(
                "features and labels must contain the same number of samples "
                f"(got {len(self.features)} and {len(self.labels)})"
            )

    @staticmethod
    def _to_float_tensor(values):
        '''Convert array-likes / pandas objects / tensors to a float32 tensor.'''
        if values is None:
            return torch.empty(0, dtype=torch.float32)
        # pandas DataFrame/Series expose to_numpy(); convert them explicitly.
        if hasattr(values, 'to_numpy'):
            values = values.to_numpy()
        return torch.as_tensor(values, dtype=torch.float32)

    def __len__(self):
        '''Number of samples in the dataset.'''
        return len(self.features)

    def __getitem__(self, index):
        '''Return the ``(features, label)`` pair stored at ``index``.'''
        return self.features[index], self.labels[index]

In [18]:
# Notebook-level hyperparameters.
# input_size / hidden_size / output_size are read by DynamicNet when it builds
# its two linear layers.
input_size = 1
hidden_size = 128
output_size = 1
# NOTE(review): batch_size is not referenced by the visible cells
# (prepareData hard-codes batch_size = 8) -- confirm which value is intended.
batch_size = 16

In [19]:
class DynamicNet(torch.nn.Module):
    """Two-layer fully connected network plus a randomly sized polynomial term.

    The output is ``fc2(sigmoid(fc1(x)))`` plus, depending on a random draw made
    on every forward pass, ``e * x**2`` and possibly ``e * x**3``, where ``e``
    is a single learnable scalar shared by both powers.
    """

    def __init__(self, in_features=None, hidden_features=None, out_features=None):
        """Create the two linear layers and the shared scalar parameter ``e``.

        Args:
            in_features: input width of the first layer. Defaults to the
                notebook-level ``input_size``.
            hidden_features: width of the hidden layer. Defaults to the
                notebook-level ``hidden_size``.
            out_features: output width of the second layer. Defaults to the
                notebook-level ``output_size``.
        """
        super().__init__()
        # Fall back to the notebook globals so DynamicNet() keeps working as before.
        if in_features is None:
            in_features = input_size
        if hidden_features is None:
            hidden_features = hidden_size
        if out_features is None:
            out_features = output_size
        # First layer: fully connected.
        self.fc1 = torch.nn.Linear(in_features, hidden_features)
        # Second layer: fully connected.
        self.fc2 = torch.nn.Linear(hidden_features, out_features)
        # Scalar coefficient reused for every polynomial order.
        self.e = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """Run the network on ``x``.

        ``random.randint(2, 4)`` picks the exclusive upper bound of the loop, so
        a pass adds no polynomial term, only ``e * x**2``, or both ``e * x**2``
        and ``e * x**3``. Because the computation graph is rebuilt on every
        call, ordinary Python control flow and reuse of the same parameter are
        both fine here.

        NOTE(review): the draw also happens in eval mode, so repeated
        evaluations of the same input can differ. The polynomial term is added
        element-wise, so ``x`` must broadcast against the layer output --
        confirm this holds for the intended input width.
        """
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
        y = torch.sigmoid(self.fc1(x))
        y = self.fc2(y)
        for exp in range(2, random.randint(2, 4)):
            y = y + self.e * x ** exp
        return y

    def string(self):
        """Return a human-readable description of the polynomial coefficient."""
        return f'{self.e.item()} x^2 ? + {self.e.item()} x^3 ?'

# 数据

In [None]:
def prepareData(features=None, labels=None, test_size=0.1, random_state=42,
                batch_size=8, num_workers=1, return_test=False):
    """Split the data, convert it to float32 tensors and build the training loader.

    Args:
        features: samples to split. Defaults to the notebook-level ``x``.
        labels: targets aligned with ``features``. Defaults to the
            notebook-level ``y``.
        test_size: fraction (or count) of samples held out for testing.
        random_state: seed forwarded to ``train_test_split``.
        batch_size: mini-batch size of the training loader.
        num_workers: number of loader worker processes.
        return_test: when True, also return the held-out tensors.

    Returns:
        The training ``DataLoader``; or, when ``return_test`` is True, the
        tuple ``(train_loader, x_test_t, y_test_t)``.
    """
    # Local imports: the notebook's import cell provides neither name.
    import numpy as np
    from torch.utils.data import DataLoader, TensorDataset

    # Fall back to the notebook globals the original zero-argument call relied on.
    if features is None:
        features = x
    if labels is None:
        labels = y

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    def as_float_tensor(values):
        # Accepts numpy arrays as well as pandas objects; torch.from_numpy
        # itself only takes ndarrays.
        return torch.from_numpy(np.ascontiguousarray(values, dtype=np.float32))

    x_train_t = as_float_tensor(x_train)
    y_train_t = as_float_tensor(y_train)
    x_test_t = as_float_tensor(x_test)
    y_test_t = as_float_tensor(y_test)

    train_data = TensorDataset(x_train_t, y_train_t)
    train_loader = DataLoader(
        dataset=train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
    if return_test:
        return train_loader, x_test_t, y_test_t
    return train_loader

In [None]:
def validate(net, inputs=None, targets=None):
    """Return the mean squared error of ``net`` on a held-out set.

    Args:
        net: the model to evaluate.
        inputs: tensor fed to the model. Defaults to the notebook-level
            ``X_test_t``.
        targets: expected values (tensor, array-like or pandas object) with as
            many elements as the model output. Defaults to the notebook-level
            ``y_test``.

    Returns:
        The MSE over all elements, as a Python float.

    The model is switched to eval mode for the forward pass and its previous
    train/eval mode is restored afterwards, so calling this inside a training
    loop does not leave the model in eval mode.
    """
    if inputs is None:
        inputs = X_test_t
    if targets is None:
        targets = y_test
    # pandas objects expose to_numpy(); convert them before building a tensor.
    if hasattr(targets, 'to_numpy'):
        targets = targets.to_numpy()
    expected = torch.as_tensor(targets).detach().cpu().to(torch.float64).reshape(-1)

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            predicted = net(inputs).detach().cpu().to(torch.float64).reshape(-1)
    finally:
        net.train(was_training)

    # Both sides are flattened so an (n, 1) output compares element-wise with
    # (n,) targets instead of broadcasting to (n, n).
    return torch.mean((predicted - expected) ** 2).item()

In [26]:
def train(loader=None, epochs=1000, lr=1e-8, momentum=0.9):
    """Train a fresh ``DynamicNet`` with SGD and report per-epoch losses.

    Args:
        loader: iterable yielding ``(inputs, targets)`` batches. Defaults to
            the notebook-level ``train_loader``.
        epochs: number of passes over ``loader``.
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        The trained network (the original version discarded it).

    Side effects:
        Prints the mean training loss and the ``validate(net)`` loss after
        every epoch.
    """
    if loader is None:
        loader = train_loader
    # Construct the model, the loss function and the optimizer.
    net = DynamicNet()
    criterion = torch.nn.MSELoss()
    # NOTE(review): the default lr of 1e-8 is kept from the original cell; it is
    # very small -- confirm it is intended for this data.
    optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=momentum)
    for epoch in range(epochs):
        # Re-enter training mode each epoch in case validation changed it.
        net.train()
        batch_losses = []
        for inputs, targets in loader:
            output = net(inputs)
            # Give the targets the output's shape so MSELoss compares
            # element-wise instead of broadcasting (n,) against (n, 1).
            loss = criterion(output, targets.reshape(output.shape))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        avg_loss = sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')
        validate_loss = validate(net)
        print("Epoch {}, train loss:{}, val loss:{}".format(epoch, avg_loss, validate_loss))
    return net


1999 0.6851100325584412
3999 0.6805182099342346
5999 0.8272582292556763
7999 0.6900101900100708
9999 0.6829481720924377
11999 0.6630676984786987
13999 0.6687527894973755
15999 0.6622170209884644
17999 0.7855096459388733
19999 0.7789382338523865
21999 0.6432156562805176
23999 0.6370040774345398
25999 0.6310117244720459
27999 0.6252623200416565
29999 0.748222291469574
Result: 0.05724804475903511 x^2 ? + 0.05724804475903511 x^3 ?


In [None]:
class Linear_Model():
    """Small training/evaluation wrapper around ``DynamicNet``."""

    def __init__(self):
        """
        Set the hyperparameters and the loss, then build the model and optimizer.
        """
        self.learning_rate = 0.001
        self.epoches = 10000
        self.loss_function = torch.nn.MSELoss()
        self.create_model()

    def create_model(self):
        """Instantiate the network and an SGD optimizer over its parameters."""
        self.model = DynamicNet()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.learning_rate)

    def train(self, data, model_save_path="linear.pth"):
        """
        Train the model on the full batch and save its parameters.

        Args:
            data: mapping with tensors under the keys "x" (inputs) and "y"
                (targets).
            model_save_path: file the state dict is written to. The default
                matches the file the original code always wrote, and the
                default of ``test``.
        Returns:
            None
        """
        x = data["x"]
        y = data["y"]
        self.model.train()
        for epoch in range(self.epoches):
            prediction = self.model(x)
            loss = self.loss_function(prediction, y)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if epoch % 500 == 0:
                print("epoch: {}, loss is: {}".format(epoch, loss.item()))
        # Honor the caller's path instead of a hard-coded file name.
        torch.save(self.model.state_dict(), model_save_path)

    def test(self, x, model_path="linear.pth"):
        """
        Reload the saved parameters and plot the predictions over the data.

        Args:
            x: despite its name (kept for backward compatibility), the data
                mapping with tensors under the keys "x" and "y".
            model_path: file holding the saved state dict.
        Returns:
            None
        """
        samples = x
        inputs = samples["x"]
        targets = samples["y"]
        # torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()
        with torch.no_grad():
            prediction = self.model(inputs)

        plt.scatter(inputs.numpy(), targets.numpy(), c=inputs.numpy())
        plt.plot(inputs.numpy(), prediction.numpy(), color="r")
        plt.show()

# CNN

In [None]:
class Cnn(torch.nn.Module):
    """Two conv + max-pool stages followed by a two-layer fully connected head.

    Args:
        out_node: number of output units.
        flat_features: size of the flattened convolutional output fed to the
            first linear layer. The default 3240 (10 channels x 18 x 18)
            corresponds to a single-channel 24 x 24 input with the layer
            settings below.
    """

    def __init__(self, out_node, flat_features=3240):
        super().__init__()
        # torch.nn is spelled out because the notebook never imports ``nn``.
        # NOTE(review): there is no activation between the convolution stages
        # (the ReLU layers were commented out in the original) -- confirm
        # that is intended.
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=1, out_channels=10, kernel_size=3, stride=1),  # convolution
            torch.nn.MaxPool2d(kernel_size=2, stride=1),  # max pooling
            torch.nn.Conv2d(in_channels=10, out_channels=10, kernel_size=3, stride=1),
            torch.nn.MaxPool2d(kernel_size=2, stride=1),
            torch.nn.Flatten(),  # flatten to match the fully connected layers
        )
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(flat_features, 200),  # linear layer
            torch.nn.ReLU(inplace=True),  # ReLU activation
            torch.nn.Linear(200, out_node),
            # The final ReLU clamps the outputs to be non-negative.
            torch.nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Return the head's output for a batch ``x`` of shape (N, 1, H, W)."""
        out = self.conv(x)
        out = self.fc(out)
        return out
