In [1]:
!pip install torch optuna pandas numpy scikit-learn


Collecting torch
  Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl (66.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading optuna-4.2.1-py3-none-any.whl (383 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: sympy, colorlog, torch, optuna
  Attempting uninstall: sympy
    Found existing installation: sy

In [3]:
# Print the version string of the torch package that this kernel imports.
import torch
print(torch.__version__)


2.6.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Optional probe for the `micropip` module.
# NOTE(review): micropip appears to be the Pyodide (browser) package installer and
# is not referenced anywhere else in the visible notebook — confirm this check is
# still needed.
try:
    import micropip
except ModuleNotFoundError:
    # A missing module is treated as non-fatal: only a warning is printed.
    print("Warning: micropip module not found. Ensure the environment supports required dependencies.")


# Load the score table; the columns used below are movieId, tagId and relevance.
df = pd.read_csv("genome_scores.csv")

# Replace the raw movieId values with their integer category codes.
df['movieId'] = df['movieId'].astype('category').cat.codes

# Model inputs and regression target.
features = ['movieId', 'tagId']
target = 'relevance'

X = df[features].values
y = df[target].values

print("X shape:", X.shape)
print("y shape:", y.shape)

# Hold out 20% of the rows, then halve that hold-out: 80% train / 10% val / 10% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Standardize using statistics from the training split only, so nothing from the
# validation/test rows leaks into the scaler.
# NOTE(review): both inputs are identifier columns yet are standardized like
# continuous features — confirm this is intended.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, X_test = [scaler.transform(held_out) for held_out in (X_val, X_test)]


def _to_float_tensor(array):
    """Copy an array-like into a float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Every split becomes a float32 tensor (inputs first, then targets).
X_train, X_val, X_test = map(_to_float_tensor, (X_train, X_val, X_test))
y_train, y_val, y_test = map(_to_float_tensor, (y_train, y_val, y_test))

# Create Datasets and DataLoaders
class MovieRatingDataset(data.Dataset):
    """Map-style dataset that pairs row ``i`` of ``X`` with entry ``i`` of ``y``.

    Both containers are stored as given (no copy, no conversion); they only need
    to support ``len()`` (for ``y``) and integer indexing.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The number of samples is defined by the target container.
        return len(self.y)

    def __getitem__(self, idx):
        sample_inputs = self.X[idx]
        sample_target = self.y[idx]
        return sample_inputs, sample_target

# Wrap each (inputs, targets) split in a dataset object.
train_dataset, val_dataset, test_dataset = [
    MovieRatingDataset(split_inputs, split_targets)
    for split_inputs, split_targets in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
]

# Only the training loader shuffles; the evaluation loaders keep row order.
# All three use mini-batches of 64 samples.
train_loader = data.DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader, test_loader = [
    data.DataLoader(eval_dataset, shuffle=False, batch_size=64)
    for eval_dataset in (val_dataset, test_dataset)
]

# Define a simple feedforward neural network
class MovieRatingNN(nn.Module):
    """Feed-forward regressor: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim1: Width of the first hidden layer.
        hidden_dim2: Width of the second hidden layer.
        dropout_rate: Dropout probability applied after the first hidden layer.

    ``forward`` returns one value per sample, i.e. a tensor whose last
    dimension has size 1.
    """

    def __init__(self, input_dim, hidden_dim1=128, hidden_dim2=64, dropout_rate=0.2):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(in_features=hidden_dim1, out_features=hidden_dim2)
        self.fc3 = nn.Linear(in_features=hidden_dim2, out_features=1)

    def forward(self, x):
        # First hidden layer, regularized with dropout (active in train mode only).
        hidden = self.dropout(self.relu(self.fc1(x)))
        # Second hidden layer; no dropout here.
        hidden = self.relu(self.fc2(hidden))
        # Linear output head, no activation.
        return self.fc3(hidden)

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train ``model`` in place and print per-epoch train/validation loss.

    Args:
        model: Module mapping a batch of inputs to one prediction per sample
            in a trailing dimension of size 1.
        train_loader: Iterable of ``(X_batch, y_batch)`` pairs used for the
            gradient updates.
        val_loader: Iterable of ``(X_batch, y_batch)`` pairs evaluated after
            every epoch with gradients disabled.
        criterion: Loss callable taking ``(predictions, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The model is left in eval mode after the final validation pass.
    """
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            # Drop only the trailing size-1 axis. A bare squeeze() would also
            # drop the batch axis when the last batch holds a single sample,
            # handing the loss a 0-d prediction against a 1-d target.
            outputs = model(X_batch).squeeze(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch).squeeze(-1)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

        # Reported values are the mean of the per-batch losses.
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss/len(train_loader):.4f} - Val Loss: {val_loss/len(val_loader):.4f}")

# Define Optuna objective function
def objective(trial):
    """Optuna objective: train one sampled configuration and score it.

    Samples the two hidden-layer widths, the dropout rate and the Adam learning
    rate (log-uniform), trains a fresh ``MovieRatingNN`` for 10 epochs with MSE
    loss, and returns the mean per-batch validation loss.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Mean of the per-batch MSE losses over ``val_loader``.
    """
    hidden_dim1 = trial.suggest_int('hidden_dim1', 64, 256)
    hidden_dim2 = trial.suggest_int('hidden_dim2', 32, 128)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    model = MovieRatingNN(input_dim=X_train.shape[1], hidden_dim1=hidden_dim1, hidden_dim2=hidden_dim2, dropout_rate=dropout_rate)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)

    # Final validation pass whose mean loss is the value Optuna minimizes.
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            # squeeze(-1) keeps the batch axis even for a single-sample batch,
            # so predictions and targets always have matching shapes.
            outputs = model(X_batch).squeeze(-1)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
    return val_loss / len(val_loader)

# Make sure the data loads correctly *before* launching the (long) search
print(f"Number of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")

# Inspect the data once before training
for i, (X_batch, y_batch) in enumerate(train_loader):
    print(f"Batch {i}: X_batch shape = {X_batch.shape}, y_batch shape = {y_batch.shape}")
    break  # only check the first batch

# Run Optuna hyperparameter optimization (lower validation loss is better)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

# Print best parameters
print("Best hyperparameters:", study.best_params)



In [19]:
import os

# Show where the kernel is running and which entries sit next to the notebook,
# to confirm that relative paths resolve as expected.
working_dir = os.getcwd()
print(working_dir)
dir_entries = os.listdir()
print(dir_entries)

/Users/zhangyue/Desktop/LIS640
['Untitled5.ipynb', '.DS_Store', 'Untitled.ipynb', 'genome_scores.csv.zip', 'Intro2ADL', '.ipynb_checkpoints', 'genome_scores.csv']


In [21]:
# Read the CSV again from disk. This rebinds `df`, so any in-memory changes
# made to a previously loaded frame are discarded.
data_path = "genome_scores.csv"
df = pd.read_csv(filepath_or_buffer=data_path)


In [23]:
# Load the CSV and print its first five rows as a quick look at the schema.
data_path = "genome_scores.csv"
df = pd.read_csv(filepath_or_buffer=data_path)
first_rows = df.head()
print(first_rows)


   movieId  tagId  relevance
0        1      1    0.02500
1        1      2    0.02500
2        1      3    0.05775
3        1      4    0.09675
4        1      5    0.14675


In [29]:
# Print the column labels of the currently loaded frame.
print(df.columns)


Index(['movieId', 'tagId', 'relevance'], dtype='object')
