In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [5]:
SEQUENCE_LENGTH = 5
BATCH_SIZE = 128
EPOCHS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [35]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("retailrocket/ecommerce-dataset")

print("Path to dataset files:", path)

ModuleNotFoundError: No module named 'kagglehub'

In [6]:

df = pd.read_csv('events.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
print(df.columns)

Index(['timestamp', 'visitorid', 'event', 'itemid', 'transactionid'], dtype='object')


In [7]:
view_df = df[df['event'] == 'view'].sample(frac=0.08, random_state=42)
non_view_df = df[df['event'] != 'view']
df = pd.concat([view_df, non_view_df])
print(f"downsampling {len(df)}")

downsampling 304934


In [8]:
def create_features(df):
    df.sort_values(['visitorid', 'timestamp'], inplace=True)
    df['time_diff'] = df.groupby('visitorid')['timestamp'].diff().dt.total_seconds().fillna(0)
    df['action_count'] = df.groupby('visitorid').cumcount() + 1
    df['cart_abandon'] = ((df['event'] == 'addtocart') & (df['event'].shift(-1) != 'transaction')).astype(int).fillna(0)
    df['time_since_cart'] = df.groupby('visitorid')['timestamp'].diff().dt.total_seconds().fillna(0)
    df['long_cart_abandon'] = ((df['event'] == 'addtocart') & (df['time_since_cart'] > 300)).astype(int)

    for col in ['event', 'itemid']:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df

df = create_features(df)


In [9]:
FEATURE_COLUMNS = ['time_diff', 'action_count', 'cart_abandon', 'long_cart_abandon', 'itemid']
TARGET_COLUMN = 'event'

In [10]:
from tqdm import tqdm

def build_sequences(df, max_users=None):
    X, y = [], []
    grouped = df.groupby('visitorid')
    if max_users:
        grouped = list(grouped)[:max_users]

    for uid, user_df in tqdm(grouped, desc="Building sequences"):
        user_df = user_df.sort_values('timestamp')
        features = user_df[FEATURE_COLUMNS].values
        targets = user_df[TARGET_COLUMN].values
        for i in range(len(features) - SEQUENCE_LENGTH):
            X.append(features[i:i + SEQUENCE_LENGTH])
            y.append(targets[i + SEQUENCE_LENGTH])
    return np.array(X), np.array(y)

X, y = build_sequences(df)

Building sequences: 100%|██████████| 205155/205155 [01:56<00:00, 1767.52it/s]


In [14]:
class EventDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_loader = DataLoader(EventDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(EventDataset(X_test, y_test), batch_size=BATCH_SIZE)
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = torch.tensor(weights, dtype=torch.float32).to(DEVICE)
print("✅ 类别权重:", class_weights)

✅ 类别权重: tensor([0.7818, 1.3461, 1.0224])


In [15]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        x = self.relu(self.fc1(h_n[-1]))
        return self.fc2(x)

model = LSTMClassifier(input_dim=X.shape[2], hidden_dim=64, output_dim=len(classes)).to(DEVICE)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [20]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"📈 Epoch {epoch+1}, Loss: {total_loss:.4f}")

# 11. 验证效果
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(yb.numpy())

print(" Accuracy:", accuracy_score(all_labels, all_preds))
print(" F1 Score:", f1_score(all_labels, all_preds, average='macro'))
print(" Classification Report:\n", classification_report(all_labels, all_preds))

# 12. 导出为 ONNX
model.eval()
dummy_input = torch.randn(1, SEQUENCE_LENGTH, X.shape[2]).to(DEVICE)
torch.onnx.export(
    model, dummy_input, "user_behavior_predictor.onnx",
    input_names=["input"], output_names=["output"],
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
    opset_version=11
)

print(" model saved user_behavior_predictor.onnx")

📈 Epoch 1, Loss: 259.5338
📈 Epoch 2, Loss: 260.1338
📈 Epoch 3, Loss: 259.6490
📈 Epoch 4, Loss: 258.9954
📈 Epoch 5, Loss: 259.1168
 Accuracy: 0.4703432609793034
 F1 Score: 0.3943581898233086
 Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.70      0.56      3354
           1       0.28      0.09      0.13      1944
           2       0.52      0.46      0.49      2626

    accuracy                           0.47      7924
   macro avg       0.42      0.42      0.39      7924
weighted avg       0.44      0.47      0.43      7924

 model saved user_behavior_predictor.onnx




In [5]:
import onnx
model = onnx.load("user_behavior_predictor.onnx")
onnx.checker.check_model(model)