In [104]:
import torch
from torch import nn
from torch import Tensor
from torch import optim
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

In [105]:
EPOCH = 100

batch_size = 2048

In [106]:
import pandas as pd

dataset = pd.read_csv("../archive/data.csv")

dataset.drop(columns=["title", "movieId", "year"], inplace=True)
dataset.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),rating
0,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.92093
1,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.431818
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.259615
3,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2.357143
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.071429


In [107]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

data_train, data_val = train_test_split(dataset, test_size=0.2, random_state=42)

# 训练集特征和标签
X_train = torch.FloatTensor(data_train.iloc[:, :20].values)
y_train = torch.FloatTensor(data_train['rating'].values)

# 验证集特征和标签
X_val = torch.FloatTensor(data_val.iloc[:, :20].values)
y_val = torch.FloatTensor(data_val['rating'].values)

# 将数据转换为 DataLoader
train_dataset = TensorDataset(X_train, y_train)

val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# train_loader

In [108]:
class MyModule(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MyModule, self).__init__()
        self.fc1= nn.Linear(input_size, hidden_size)
        self.fc2= nn.Linear(hidden_size, out_features=1)

    def forward(self, input: Tensor):
        x = self.fc1(input)
        x = self.fc2(x)
        x = torch.sigmoid(x) * 5
        return x

In [109]:
model = MyModule(input_size=20, hidden_size=50)

In [110]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(0, EPOCH):
    model.train()
    for inputs, labels in train_loader:
        # print(type(batch))
        # (x, y) = batch[:, :20], batch[:, 20:]
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
    
    #eval
    model.eval()
    val_predictions=[]
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            val_predictions.extend(outputs.squeeze().tolist())
    
    val_predictions = torch.FloatTensor(val_predictions)
    val_loss = mean_squared_error(y_val.numpy(), val_predictions.numpy())
    val_abs_loss = mean_absolute_error(y_val.numpy(), val_predictions.numpy())
    rmse = np.sqrt(((y_val.numpy() - val_predictions.numpy())**2).mean())
    print(f'Epoch {epoch+1}/{EPOCH}, Validation Loss: {val_loss:.4f}, abs Loss:{val_abs_loss:.4f}, RMSE: {rmse:.4f}')



Epoch 1/100, Validation Loss: 1.3261, abs Loss:0.9834, RMSE: 1.1516
Epoch 2/100, Validation Loss: 1.2022, abs Loss:0.9295, RMSE: 1.0965
Epoch 3/100, Validation Loss: 1.0968, abs Loss:0.8810, RMSE: 1.0473
Epoch 4/100, Validation Loss: 1.0093, abs Loss:0.8378, RMSE: 1.0047
Epoch 5/100, Validation Loss: 0.9397, abs Loss:0.8001, RMSE: 0.9694
Epoch 6/100, Validation Loss: 0.8864, abs Loss:0.7691, RMSE: 0.9415
Epoch 7/100, Validation Loss: 0.8482, abs Loss:0.7447, RMSE: 0.9210
Epoch 8/100, Validation Loss: 0.8226, abs Loss:0.7267, RMSE: 0.9070
Epoch 9/100, Validation Loss: 0.8067, abs Loss:0.7144, RMSE: 0.8982
Epoch 10/100, Validation Loss: 0.7973, abs Loss:0.7059, RMSE: 0.8929
Epoch 11/100, Validation Loss: 0.7917, abs Loss:0.7007, RMSE: 0.8898
Epoch 12/100, Validation Loss: 0.7875, abs Loss:0.6965, RMSE: 0.8874
Epoch 13/100, Validation Loss: 0.7833, abs Loss:0.6929, RMSE: 0.8851
Epoch 14/100, Validation Loss: 0.7788, abs Loss:0.6895, RMSE: 0.8825
Epoch 15/100, Validation Loss: 0.7738, abs 