# In [None]:
#TZXR
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


def read_feature_file(file_path):
    """Load a comma-separated vector of floats from a UTF-8 text file.

    The file content may be wrapped in square brackets, e.g.
    ``[0.1, 0.2, 0.3]``; surrounding whitespace and brackets are stripped
    before the values are split on commas.

    Args:
        file_path: Path of the feature file.

    Returns:
        A 1-D ``numpy`` array of floats, or ``None`` when the file cannot
        be opened or any token fails to parse (every exception is treated
        as "no features available").
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw = handle.read()
        tokens = raw.strip().strip('[]').split(',')
        return np.array([float(tok.strip()) for tok in tokens])
    except Exception:
        # Best-effort reader: a missing or malformed file marks the sample
        # as unusable instead of aborting the whole extraction run.
        return None

def get_layoutxlm_features(product_id, province, base_path):
    """Load and concatenate the two LayoutXLM feature vectors of a product.

    Args:
        product_id: Product identifier; converted with ``str()``.
        province: Province directory name; converted with ``str()``.
        base_path: Root directory of the feature tree.

    Returns:
        The two vectors concatenated into one 1-D array, or ``None`` when
        the product directory is missing or either file cannot be read.
    """
    # str() on both components: the values come from DataFrame cells and may
    # be non-strings (e.g. NaN for a blank cell), which os.path.join rejects
    # with TypeError. A bogus name simply fails the existence check below.
    product_path = os.path.join(base_path, str(province), str(product_id), '')
    if not os.path.exists(product_path):
        return None
    # NOTE(review): both file names are empty strings here, so both paths
    # resolve to the product directory itself — confirm the intended names.
    cls_path = os.path.join(product_path, '')
    img_path = os.path.join(product_path, '')
    cls_features = read_feature_file(cls_path)
    img_features = read_feature_file(img_path)
    if cls_features is None or img_features is None:
        return None
    return np.concatenate([cls_features, img_features])

def get_visual_features(product_id, province, base_path):
    """Load a product's visual feature vector, forced to length 768.

    Args:
        product_id: Product identifier; converted with ``str()``.
        province: Province directory name; converted with ``str()``.
        base_path: Root directory of the feature tree.

    Returns:
        A 1-D array of exactly 768 values (zero-padded at the end or
        truncated as needed), or ``None`` when the path does not exist or
        the file cannot be read.
    """
    target_dim = 768
    # str(province): a non-string cell value (NaN, int) would otherwise make
    # os.path.join raise TypeError instead of yielding "no features".
    visual_path = os.path.join(base_path, str(province), str(product_id), '', '')
    if not os.path.exists(visual_path):
        return None
    visual_features = read_feature_file(visual_path)
    if visual_features is None:
        return None
    # Visual features are 768-dimensional; pad or crop to that size so every
    # sample can be stacked into one matrix.
    length = len(visual_features)
    if length < target_dim:
        visual_features = np.pad(visual_features, (0, target_dim - length), 'constant')
    elif length > target_dim:
        visual_features = visual_features[:target_dim]
    return visual_features

def extract_all_features(df, base_path=""):
    """Build LayoutXLM and visual feature matrices for every usable row.

    A row is kept only when both its LayoutXLM and its visual vector load.
    The stacked matrices are then NaN-imputed with column means, stripped of
    near-constant columns (variance <= 1e-6) and standardised.

    Args:
        df: DataFrame with an 'ID' and a '省份' column.
        base_path: Root directory of the feature tree.

    Returns:
        ``(layoutxlm_matrix, visual_matrix, valid_indices)`` where
        ``valid_indices`` holds the 0-based row *positions* in ``df`` of the
        kept rows, suitable for ``df.iloc[...]`` and NumPy row indexing.

    Raises:
        ValueError: If no row has both feature vectors.
    """
    layoutxlm_features = []
    visual_features = []
    valid_indices = []
    total = len(df)
    print(f"Processing {total} samples...")

    # Enumerate positions instead of trusting the index labels: callers feed
    # valid_indices to df.iloc and to NumPy indexing, both positional, and a
    # filtered or non-integer index would otherwise select the wrong rows
    # (or break the progress arithmetic below).
    for pos, (_, row) in enumerate(df.iterrows()):
        product_id = row['ID']
        province = row['省份']

        lxlm_feat = get_layoutxlm_features(product_id, province, base_path)
        vis_feat = get_visual_features(product_id, province, base_path)

        if lxlm_feat is not None and vis_feat is not None:
            layoutxlm_features.append(lxlm_feat)
            visual_features.append(vis_feat)
            valid_indices.append(pos)

        if (pos + 1) % 100 == 0:
            print(f"Processed {pos + 1}/{total} samples, {len(valid_indices)} valid features found")

    valid_count = len(valid_indices)
    print(f"\nFeature extraction completed:")
    print(f"Total samples: {total}")
    print(f"Valid samples: {valid_count}")
    print(f"Invalid samples: {total - valid_count}")

    if not layoutxlm_features or not visual_features:
        raise ValueError("No valid features found")

    layoutxlm_matrix = np.stack(layoutxlm_features)
    visual_matrix = np.stack(visual_features)

    # Replace NaNs column by column with that column's mean (in place).
    for arr in [layoutxlm_matrix, visual_matrix]:
        if np.isnan(arr).any():
            for col in range(arr.shape[1]):
                col_mean = np.nanmean(arr[:, col])
                arr[:, col] = np.nan_to_num(arr[:, col], nan=col_mean)

    # Drop (near-)constant columns; they carry no signal.
    non_constant_cols_lxlm = np.var(layoutxlm_matrix, axis=0) > 1e-6
    layoutxlm_matrix = layoutxlm_matrix[:, non_constant_cols_lxlm]
    print(f"Removed {np.count_nonzero(~non_constant_cols_lxlm)} constant features from layoutxlm")

    non_constant_cols_vis = np.var(visual_matrix, axis=0) > 1e-6
    visual_matrix = visual_matrix[:, non_constant_cols_vis]
    print(f"Removed {np.count_nonzero(~non_constant_cols_vis)} constant features from visual")

    # Standardise each modality separately.
    # NOTE(review): the scalers are fitted on every valid row, i.e. before any
    # train/test split performed by callers of this function.
    layoutxlm_matrix = StandardScaler().fit_transform(layoutxlm_matrix)
    visual_matrix = StandardScaler().fit_transform(visual_matrix)

    return layoutxlm_matrix, visual_matrix, valid_indices

# ----------- Data preprocessing -----------

# 
def prepare_baseline_only(df):
    """Build the standardised baseline feature matrix (8 columns).

    Uses three numeric columns ('价格', '评分', '点评数') and five binary
    flag columns. Missing numeric values are replaced by the column mean,
    missing flags by 0; '价格' and '点评数' are passed through ``log1p``.

    Args:
        df: DataFrame containing the eight source columns.

    Returns:
        A 2-D float array of standardised features with any remaining NaN
        replaced by 0.
    """
    numeric_cols = ['价格', '评分', '点评数']
    flag_cols = ['无购物', '无自费', '成团保障', '退改政策', '是否促销']

    numeric = df[numeric_cols].copy()
    # Mean-impute every numeric column first ...
    for col in numeric_cols:
        numeric[col] = numeric[col].fillna(numeric[col].mean())
    # ... then compress the two heavy-tailed ones.
    for col in ('价格', '点评数'):
        numeric[col] = np.log1p(numeric[col])

    flags = df[flag_cols].fillna(0).astype(float)

    combined = pd.concat([numeric, flags], axis=1)
    scaled = StandardScaler().fit_transform(combined)
    return np.nan_to_num(scaled, nan=0)

def prepare_baseline_data(df):
    """Build the standardised baseline + index feature matrix (13 columns).

    Combines three numeric columns, five binary flags and the five
    ``index_6`` .. ``index_10`` columns. Numeric and index columns are
    mean-imputed; '价格', '点评数' and all index columns go through
    ``log1p``; missing flags become 0.

    Args:
        df: DataFrame containing the thirteen source columns.

    Returns:
        A 2-D float array of standardised features with any remaining NaN
        replaced by 0. Models consuming it must expect 13 input columns.
    """
    numeric_cols = ['价格', '评分', '点评数']
    flag_cols = ['无购物', '无自费', '成团保障', '退改政策', '是否促销']
    index_cols = ['index_6', 'index_7', 'index_8', 'index_9', 'index_10']

    numeric = df[numeric_cols].copy()
    for col in numeric_cols:
        numeric[col] = numeric[col].fillna(numeric[col].mean())
    for col in ('价格', '点评数'):
        numeric[col] = np.log1p(numeric[col])

    flags = df[flag_cols].fillna(0).astype(float)

    index_block = df[index_cols].copy()
    index_block = np.log1p(index_block.fillna(index_block.mean()))

    combined = pd.concat([numeric, flags, index_block], axis=1)
    # Standardisation is kept, so the output stays 13 columns wide.
    scaled = StandardScaler().fit_transform(combined)
    return np.nan_to_num(scaled, nan=0)


def prepare_data_combined(df, base_path="", use_layoutxlm=True, use_visual=True):
    """Assemble one feature matrix from basic, LayoutXLM and visual features.

    Args:
        df: Source DataFrame.
        base_path: Root directory of the feature tree.
        use_layoutxlm: Append the LayoutXLM block when true.
        use_visual: Append the visual block when true.

    Returns:
        ``(X_final, df_valid, basic_dim, layoutxlm_dim, visual_dim)``.
        ``df_valid`` holds only the rows for which features were found.
        The two modality widths are reported even when the corresponding
        block was not appended.
    """
    # Width of the basic block as produced by prepare_baseline_data
    # (3 numeric + 5 binary + 5 index columns).
    basic_all = prepare_baseline_data(df)
    basic_dim = basic_all.shape[1]

    lxlm, vis, keep = extract_all_features(df, base_path)

    # Restrict the frame and the basic block to rows that have features.
    df_valid = df.iloc[keep].reset_index(drop=True)
    parts = [basic_all[keep]]
    if use_layoutxlm:
        parts.append(lxlm)
    if use_visual:
        parts.append(vis)

    return np.concatenate(parts, axis=1), df_valid, basic_dim, lxlm.shape[1], vis.shape[1]

def prepare_targets(df):
    """Return log1p-transformed monthly sales as an (n_rows, 5) array.

    Missing sales are treated as 0 and negative values are clipped to 0
    before the ``log1p`` transform.

    Args:
        df: DataFrame with the five monthly sales columns (June-October).

    Returns:
        A 2-D array with one column per month, in column order.
    """
    month_cols = ['6月月销量', '7月月销量', '8月月销量', '9月月销量', '10月月销量']
    raw = df[month_cols].fillna(0).values
    non_negative = np.clip(raw, 0, None)
    # nan_to_num is a final safety net after the transform.
    return np.nan_to_num(np.log1p(non_negative), nan=0)

# ----------- Dataset and models -----------

class TourismDataset(Dataset):
    """In-memory dataset pairing feature rows with their targets.

    Both arrays are converted to float32 tensors once, at construction.
    """

    def __init__(self, features, targets):
        """Store ``features`` and ``targets`` (NumPy arrays) as float32 tensors."""
        as_f32 = np.float32
        self.features = torch.tensor(features.astype(as_f32), dtype=torch.float32)
        self.targets = torch.tensor(targets.astype(as_f32), dtype=torch.float32)

    def __len__(self):
        """Number of samples (rows of the feature tensor)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

class HuberMSELoss(nn.Module):
    """Weighted blend of MSE and Huber loss.

    ``loss = alpha * MSE + (1 - alpha) * Huber(delta)``
    """

    def __init__(self, delta=1.0, alpha=0.7):
        """Create the loss.

        Args:
            delta: Threshold passed to ``nn.HuberLoss``.
            alpha: Weight of the MSE term; the Huber term gets ``1 - alpha``.
        """
        super().__init__()
        self.huber = nn.HuberLoss(delta=delta)
        self.mse = nn.MSELoss()
        self.alpha = alpha

    def forward(self, pred, target):
        """Return the blended scalar loss for ``pred`` against ``target``."""
        mse_term = self.mse(pred, target)
        huber_term = self.huber(pred, target)
        return self.alpha * mse_term + (1 - self.alpha) * huber_term

class FeatureFusionMLP(nn.Module):
    """MLP regressor that fuses up to three feature groups.

    The input row is expected to be the concatenation
    ``[basic | layoutxlm | visual]``. Each present group is projected to
    ``reduced_dim`` by its own linear layer; an absent group contributes a
    zero block, so the fused vector is always ``3 * reduced_dim`` wide.
    """

    def __init__(self, basic_dim, layoutxlm_dim, visual_dim, reduced_dim=64):
        """Create the network.

        Args:
            basic_dim: Width of the basic feature block.
            layoutxlm_dim: Width of the LayoutXLM block; ``None`` or ``0``
                disables the branch.
            visual_dim: Width of the visual block; ``None`` or ``0``
                disables the branch.
            reduced_dim: Output width of every projection layer.
        """
        super().__init__()
        self.reduced_dim = reduced_dim

        self.basic_reduce = nn.Linear(basic_dim, reduced_dim)
        # Treat 0 like None: callers signal "modality unused" with 0, and a
        # Linear(0, reduced_dim) would be a degenerate layer that has no
        # weights and only injects its bias into the fused vector.
        self.layoutxlm_reduce = nn.Linear(layoutxlm_dim, reduced_dim) if layoutxlm_dim else None
        self.visual_reduce = nn.Linear(visual_dim, reduced_dim) if visual_dim else None

        # Always three slots; disabled branches are filled with zeros.
        input_dim = reduced_dim * 3

        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.2),

            nn.Linear(32, 16),
            nn.ReLU(),
            nn.BatchNorm1d(16),

            nn.Linear(16, 1)
        )

    def forward(self, x):
        """Project each feature group, fuse them and return an (N, 1) output."""
        basic_width = self.basic_reduce.in_features
        basic_reduced = self.basic_reduce(x[:, :basic_width])
        offset = basic_width

        # Explicit `is not None` checks instead of relying on module truthiness.
        if self.layoutxlm_reduce is not None:
            width = self.layoutxlm_reduce.in_features
            lxlm_reduced = self.layoutxlm_reduce(x[:, offset:offset + width])
            offset += width
        else:
            lxlm_reduced = torch.zeros_like(basic_reduced)

        if self.visual_reduce is not None:
            width = self.visual_reduce.in_features
            visual_reduced = self.visual_reduce(x[:, offset:offset + width])
        else:
            visual_reduced = torch.zeros_like(basic_reduced)

        fused = torch.cat([basic_reduced, lxlm_reduced, visual_reduced], dim=1)
        return self.net(fused)


def train_eval(model, train_loader, val_loader, test_loader, device, epochs=100):
    """Train ``model`` with early stopping and evaluate it on the test set.

    Training uses AdamW with a OneCycleLR schedule, the blended
    ``HuberMSELoss`` and gradient-norm clipping at 1.0. Training stops once
    the validation loss has not improved for 10 consecutive epochs, and the
    weights with the lowest validation loss are restored before testing.

    Args:
        model: Network mapping a feature batch to an (N, 1) prediction.
        train_loader: Loader of training batches ``(features, targets)``.
        val_loader: Loader of validation batches.
        test_loader: Loader of test batches.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.

    Returns:
        Dict with the test-set ``'MSE'``, ``'RMSE'`` and ``'R2'``.
    """
    criterion = HuberMSELoss(delta=1.0, alpha=0.7)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=5e-4, epochs=epochs, steps_per_epoch=len(train_loader),
        pct_start=0.1, anneal_strategy='cos')

    best_val_loss = float('inf')
    best_model = None
    patience = 10
    counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for features, targets in train_loader:
            features, targets = features.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets.view(-1, 1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()  # OneCycleLR is stepped once per batch
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for features, targets in val_loader:
                features, targets = features.to(device), targets.to(device)
                outputs = model(features)
                loss = criterion(outputs, targets.view(-1, 1))
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        avg_train_loss = train_loss / len(train_loader)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back references to the live tensors, which
            # keep changing as training continues. Clone them so the snapshot
            # really is the best epoch and not simply the last one.
            best_model = {key: value.detach().clone()
                          for key, value in model.state_dict().items()}
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    if best_model is not None:
        model.load_state_dict(best_model)

    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for features, targets in test_loader:
            features, targets = features.to(device), targets.to(device)
            outputs = model(features)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(targets.cpu().numpy())

    predictions = np.array(predictions).reshape(-1)
    actuals = np.array(actuals).reshape(-1)
    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(actuals, predictions)

    return {'MSE': mse, 'RMSE': rmse, 'R2': r2}


def run_compare_five_models(data_path):
    """Train and compare five feature configurations on a shared sample set.

    Only rows with both LayoutXLM and visual features are used. Every row is
    repeated five times, once per monthly sales target, and each
    configuration is split 64/16/20 into train/validation/test with a fixed
    seed. One summary line per configuration (MSE, RMSE, R2) is printed.

    Args:
        data_path: Path of the Excel file holding the product table.
    """
    print("读取数据...")
    df = pd.read_excel(data_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("提取统一样本子集（layoutxlm+视觉）...")
    layoutxlm_features, visual_features, valid_indices = extract_all_features(df)
    df_common = df.iloc[valid_indices].reset_index(drop=True)

    print("准备Baseline数据...")
    X0 = prepare_baseline_only(df_common)
    # Row-major flattening lines up with np.repeat(X, 5): the five monthly
    # targets of a product follow its five repeated feature rows.
    y_repeat = prepare_targets(df_common).reshape(-1)

    print("准备Baseline+百度数据...")
    X1 = prepare_baseline_data(df_common)

    print("准备Baseline+百度+LayoutXLM数据...")
    X2 = np.concatenate([X1, layoutxlm_features], axis=1)

    print("准备Baseline+百度+LayoutXLM+视觉数据...")
    X3 = np.concatenate([X1, layoutxlm_features, visual_features], axis=1)

    print("准备Baseline+百度+视觉数据...")
    X4 = np.concatenate([X1, visual_features], axis=1)

    basic_dim = X1.shape[1]
    layoutxlm_dim = layoutxlm_features.shape[1]
    visual_dim = visual_features.shape[1]

    # (name, per-product features, FeatureFusionMLP dimension arguments)
    experiments = [
        ('Baseline', X0, (X0.shape[1], 0, 0)),
        ('Baseline+百度', X1, (basic_dim, 0, 0)),
        ('Baseline+百度+LayoutXLM', X2, (basic_dim, layoutxlm_dim, 0)),
        ('Baseline+百度+LayoutXLM+视觉', X3, (basic_dim, layoutxlm_dim, visual_dim)),
        ('Baseline+百度+视觉', X4, (basic_dim, 0, visual_dim)),
    ]

    results = {}
    for name, per_product, model_dims in experiments:
        print(f"\n训练模型: {name}")
        X = np.repeat(per_product, 5, axis=0)

        # NOTE(review): the split is row-wise, so the five repeated rows of
        # one product can land in different partitions.
        X_train_val, X_test, y_train_val, y_test = train_test_split(
            X, y_repeat, test_size=0.2, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=0.2, random_state=42)

        batch_size = min(32, len(X_train))
        train_loader = DataLoader(TourismDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(TourismDataset(X_val, y_val), batch_size=batch_size)
        test_loader = DataLoader(TourismDataset(X_test, y_test), batch_size=batch_size)

        model = FeatureFusionMLP(*model_dims).to(device)
        results[name] = train_eval(model, train_loader, val_loader, test_loader, device)

    for name, metrics in results.items():
        print(f"{name:<30} {metrics['MSE']:10.4f} {metrics['RMSE']:10.4f} {metrics['R2']:10.4f}")


if __name__ == '__main__':
    # Path handed to run_compare_five_models, which reads it with pd.read_excel.
    # NOTE(review): the path is an empty string here — set it before running.
    data_path = r""
    run_compare_five_models(data_path)


# In [None]:

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader



def read_feature_file(file_path):
    """Read a text file holding one comma-separated list of floats.

    Leading/trailing whitespace and square brackets around the list are
    ignored.

    Args:
        file_path: Path of the feature file.

    Returns:
        A 1-D ``numpy`` array, or ``None`` if opening or parsing the file
        raises any exception.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as src:
            text = src.read().strip()
        pieces = text.strip('[]').split(',')
        values = [float(piece.strip()) for piece in pieces]
        return np.array(values)
    except Exception:
        # Unreadable or malformed files mean "no features for this sample".
        return None

def get_layoutxlm_features(product_id, province, base_path):
    """Load and concatenate the two LayoutXLM feature vectors of a product.

    Args:
        product_id: Product identifier; converted with ``str()``.
        province: Province directory name; converted with ``str()``.
        base_path: Root directory of the feature tree.

    Returns:
        The two vectors concatenated into one 1-D array, or ``None`` when
        the product directory is missing or either file cannot be read.
    """
    # Convert both path components: DataFrame cells may hold non-strings
    # (NaN for blanks, ints), which os.path.join rejects with TypeError.
    product_path = os.path.join(base_path, str(province), str(product_id), '')
    if not os.path.exists(product_path):
        return None
    # NOTE(review): both file names are empty strings here, so both paths
    # resolve to the product directory itself — confirm the intended names.
    cls_path = os.path.join(product_path, '')
    img_path = os.path.join(product_path, '')
    cls_features = read_feature_file(cls_path)
    img_features = read_feature_file(img_path)
    if cls_features is None or img_features is None:
        return None
    return np.concatenate([cls_features, img_features])

def get_visual_features(product_id, province, base_path):
    """Load a product's visual feature vector, forced to length 768.

    Args:
        product_id: Product identifier; converted with ``str()``.
        province: Province directory name; converted with ``str()``.
        base_path: Root directory of the feature tree.

    Returns:
        A 1-D array of exactly 768 values (zero-padded at the end or
        truncated as needed), or ``None`` when the path does not exist or
        the file cannot be read.
    """
    target_dim = 768
    # str(province): a non-string cell value would otherwise make
    # os.path.join raise TypeError instead of yielding "no features".
    visual_path = os.path.join(base_path, str(province), str(product_id), '', '')
    if not os.path.exists(visual_path):
        return None
    visual_features = read_feature_file(visual_path)
    if visual_features is None:
        return None
    # Pad or crop so every sample stacks into one fixed-width matrix.
    length = len(visual_features)
    if length < target_dim:
        visual_features = np.pad(visual_features, (0, target_dim - length), 'constant')
    elif length > target_dim:
        visual_features = visual_features[:target_dim]
    return visual_features

def get_nonvisual_features(product_id, province, base_path):
    """Load a product's non-visual feature vector, forced to length 768.

    Args:
        product_id: Product identifier; converted with ``str()``.
        province: Province directory name; converted with ``str()``.
        base_path: Root directory of the feature tree.

    Returns:
        A 1-D array of exactly 768 values (zero-padded at the end or
        truncated as needed), or ``None`` when the path does not exist or
        the file cannot be read.
    """
    target_dim = 768
    # str(province): a non-string cell value would otherwise make
    # os.path.join raise TypeError instead of yielding "no features".
    nonvisual_path = os.path.join(base_path, str(province), str(product_id), '', '')
    if not os.path.exists(nonvisual_path):
        return None
    nonvisual_features = read_feature_file(nonvisual_path)
    if nonvisual_features is None:
        return None
    # Pad or crop so every sample stacks into one fixed-width matrix.
    length = len(nonvisual_features)
    if length < target_dim:
        nonvisual_features = np.pad(nonvisual_features, (0, target_dim - length), 'constant')
    elif length > target_dim:
        nonvisual_features = nonvisual_features[:target_dim]
    return nonvisual_features

def extract_all_features(df, base_path=""):
    """Build LayoutXLM, visual and non-visual matrices for every usable row.

    A row is kept only when all three feature vectors load. The stacked
    matrices are then NaN-imputed with column means, stripped of
    near-constant columns (variance <= 1e-6) and standardised.

    Args:
        df: DataFrame with an 'ID' and a '省份' column.
        base_path: Root directory of the feature tree.

    Returns:
        ``(layoutxlm_matrix, visual_matrix, nonvisual_matrix, valid_indices)``
        where ``valid_indices`` holds the 0-based row *positions* in ``df``
        of the kept rows, suitable for ``df.iloc[...]``.

    Raises:
        ValueError: If no row has all three feature vectors.
    """
    layoutxlm_features = []
    visual_features = []
    nonvisual_features = []
    valid_indices = []
    total = len(df)
    print(f"Processing {total} samples...")

    # Enumerate positions instead of trusting the index labels: the caller
    # selects rows with df.iloc[valid_indices], which is positional, and a
    # filtered or non-integer index would otherwise pick the wrong rows (or
    # break the progress arithmetic below).
    for pos, (_, row) in enumerate(df.iterrows()):
        product_id = row['ID']
        province = row['省份']

        lxlm_feat = get_layoutxlm_features(product_id, province, base_path)
        vis_feat = get_visual_features(product_id, province, base_path)
        nonvis_feat = get_nonvisual_features(product_id, province, base_path)

        if lxlm_feat is not None and vis_feat is not None and nonvis_feat is not None:
            layoutxlm_features.append(lxlm_feat)
            visual_features.append(vis_feat)
            nonvisual_features.append(nonvis_feat)
            valid_indices.append(pos)

        if (pos + 1) % 100 == 0:
            print(f"Processed {pos + 1}/{total} samples, {len(valid_indices)} valid features found")

    if not layoutxlm_features or not visual_features or not nonvisual_features:
        raise ValueError("No valid features found")

    layoutxlm_matrix = np.stack(layoutxlm_features)
    visual_matrix = np.stack(visual_features)
    nonvisual_matrix = np.stack(nonvisual_features)

    # Replace NaNs column by column with that column's mean (in place).
    for arr in [layoutxlm_matrix, visual_matrix, nonvisual_matrix]:
        if np.isnan(arr).any():
            for col in range(arr.shape[1]):
                col_mean = np.nanmean(arr[:, col])
                arr[:, col] = np.nan_to_num(arr[:, col], nan=col_mean)

    # Drop (near-)constant columns; they carry no signal.
    non_constant_cols_lxlm = np.var(layoutxlm_matrix, axis=0) > 1e-6
    layoutxlm_matrix = layoutxlm_matrix[:, non_constant_cols_lxlm]
    print(f"Removed {np.count_nonzero(~non_constant_cols_lxlm)} constant features from layoutxlm")

    non_constant_cols_vis = np.var(visual_matrix, axis=0) > 1e-6
    visual_matrix = visual_matrix[:, non_constant_cols_vis]
    print(f"Removed {np.count_nonzero(~non_constant_cols_vis)} constant features from visual")

    non_constant_cols_nonvis = np.var(nonvisual_matrix, axis=0) > 1e-6
    nonvisual_matrix = nonvisual_matrix[:, non_constant_cols_nonvis]
    print(f"Removed {np.count_nonzero(~non_constant_cols_nonvis)} constant features from nonvisual")

    # Standardise each modality separately.
    # NOTE(review): the scalers are fitted on every valid row, i.e. before any
    # train/test split performed by callers of this function.
    layoutxlm_matrix = StandardScaler().fit_transform(layoutxlm_matrix)
    visual_matrix = StandardScaler().fit_transform(visual_matrix)
    nonvisual_matrix = StandardScaler().fit_transform(nonvisual_matrix)

    return layoutxlm_matrix, visual_matrix, nonvisual_matrix, valid_indices



def prepare_baseline_data(df):
    """Build the standardised baseline + index feature matrix (13 columns).

    Three numeric columns and the five ``index_*`` columns are mean-imputed;
    '价格', '点评数' and the index columns are ``log1p``-transformed; the
    five binary flags have missing values set to 0.

    Args:
        df: DataFrame containing the thirteen source columns.

    Returns:
        A 2-D float array of standardised features with any remaining NaN
        replaced by 0.
    """
    numeric_cols = ['价格', '评分', '点评数']
    flag_cols = ['无购物', '无自费', '成团保障', '退改政策', '是否促销']
    index_cols = ['index_6', 'index_7', 'index_8', 'index_9', 'index_10']

    numeric = df[numeric_cols].copy()
    for col in numeric_cols:
        numeric[col] = numeric[col].fillna(numeric[col].mean())
    for col in ('价格', '点评数'):
        numeric[col] = np.log1p(numeric[col])

    flags = df[flag_cols].fillna(0).astype(float)

    index_block = df[index_cols].copy()
    index_block = np.log1p(index_block.fillna(index_block.mean()))

    combined = pd.concat([numeric, flags, index_block], axis=1)
    scaled = StandardScaler().fit_transform(combined)
    return np.nan_to_num(scaled, nan=0)

def prepare_targets(df):
    """Return log1p-transformed monthly sales as an (n_rows, 5) array.

    Missing sales count as 0 and negative values are clipped to 0 before
    the ``log1p`` transform.

    Args:
        df: DataFrame with the five monthly sales columns (June-October).

    Returns:
        A 2-D array with one column per month, in column order.
    """
    month_cols = ['6月月销量', '7月月销量', '8月月销量', '9月月销量', '10月月销量']
    raw = df[month_cols].fillna(0).values
    logged = np.log1p(np.clip(raw, 0, None))
    # Final safety net in case the transform still produced a NaN.
    return np.nan_to_num(logged, nan=0)


class TourismDataset(Dataset):
    """In-memory dataset of (feature row, target) pairs as float32 tensors."""

    def __init__(self, features, targets):
        """Convert the NumPy ``features`` and ``targets`` to float32 tensors."""
        self.features = torch.tensor(features.astype(np.float32), dtype=torch.float32)
        self.targets = torch.tensor(targets.astype(np.float32), dtype=torch.float32)

    def __len__(self):
        """Number of samples (rows of the feature tensor)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        sample, label = self.features[idx], self.targets[idx]
        return sample, label

class HuberMSELoss(nn.Module):
    """Weighted blend of MSE and Huber loss.

    ``loss = alpha * MSE + (1 - alpha) * Huber(delta)``
    """

    def __init__(self, delta=1.0, alpha=0.7):
        """Create the loss.

        Args:
            delta: Threshold passed to ``nn.HuberLoss``.
            alpha: Weight of the MSE term; the Huber term gets ``1 - alpha``.
        """
        super().__init__()
        self.huber = nn.HuberLoss(delta=delta)
        self.mse = nn.MSELoss()
        self.alpha = alpha

    def forward(self, pred, target):
        """Return the blended scalar loss for ``pred`` against ``target``."""
        weighted_mse = self.alpha * self.mse(pred, target)
        weighted_huber = (1 - self.alpha) * self.huber(pred, target)
        return weighted_mse + weighted_huber

class FeatureFusionMLP(nn.Module):
    """MLP regressor that fuses up to four feature groups.

    The input row is expected to be the concatenation of the enabled groups
    in the order ``[basic | layoutxlm | visual | nonvisual]``. Each enabled
    group is projected to ``reduced_dim`` by its own linear layer and the
    projections are concatenated before the shared MLP head.
    """

    def __init__(self, basic_dim, layoutxlm_dim=None, visual_dim=None, nonvisual_dim=None, reduced_dim=64):
        """Create the network.

        Args:
            basic_dim: Width of the basic feature block.
            layoutxlm_dim: Width of the LayoutXLM block, ``None`` to disable.
            visual_dim: Width of the visual block, ``None`` to disable.
            nonvisual_dim: Width of the non-visual block, ``None`` to disable.
            reduced_dim: Output width of every projection layer.
        """
        super().__init__()
        self.reduced_dim = reduced_dim

        def projection(width):
            # A branch exists only when a width was supplied.
            return None if width is None else nn.Linear(width, reduced_dim)

        self.basic_reduce = nn.Linear(basic_dim, reduced_dim)
        self.layoutxlm_reduce = projection(layoutxlm_dim)
        self.visual_reduce = projection(visual_dim)
        self.nonvisual_reduce = projection(nonvisual_dim)

        # The head input grows by reduced_dim for each enabled branch.
        optional = (self.layoutxlm_reduce, self.visual_reduce, self.nonvisual_reduce)
        enabled = sum(1 for branch in optional if branch is not None)
        input_dim = reduced_dim * (1 + enabled)

        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.2),

            nn.Linear(32, 16),
            nn.ReLU(),
            nn.BatchNorm1d(16),

            nn.Linear(16, 1)
        )

    def forward(self, x):
        """Project each enabled feature group, fuse them and return (N, 1)."""
        branches = (self.basic_reduce, self.layoutxlm_reduce,
                    self.visual_reduce, self.nonvisual_reduce)
        offset = 0
        projected = []
        for branch in branches:
            if branch is None:
                continue
            # Each branch consumes the next in_features columns of x.
            width = branch.in_features
            projected.append(branch(x[:, offset:offset + width]))
            offset += width

        return self.net(torch.cat(projected, dim=1))



def train_eval(model, train_loader, val_loader, test_loader, device, epochs=100):
    """Train ``model`` with early stopping and evaluate it on the test set.

    Training uses AdamW with a OneCycleLR schedule, the blended
    ``HuberMSELoss`` and gradient-norm clipping at 1.0. Training stops once
    the validation loss has not improved for 10 consecutive epochs, and the
    weights with the lowest validation loss are restored before testing.

    Args:
        model: Network mapping a feature batch to an (N, 1) prediction.
        train_loader: Loader of training batches ``(features, targets)``.
        val_loader: Loader of validation batches.
        test_loader: Loader of test batches.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.

    Returns:
        Dict with the test-set ``'MSE'``, ``'RMSE'`` and ``'R2'``.
    """
    criterion = HuberMSELoss(delta=1.0, alpha=0.7)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=5e-4, epochs=epochs, steps_per_epoch=len(train_loader),
        pct_start=0.1, anneal_strategy='cos')

    best_val_loss = float('inf')
    best_model = None
    patience = 10
    counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for features, targets in train_loader:
            features, targets = features.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets.view(-1, 1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()  # OneCycleLR is stepped once per batch
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for features, targets in val_loader:
                features, targets = features.to(device), targets.to(device)
                outputs = model(features)
                loss = criterion(outputs, targets.view(-1, 1))
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        avg_train_loss = train_loss / len(train_loader)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back references to the live tensors, which
            # keep changing as training continues. Clone them so the snapshot
            # really is the best epoch and not simply the last one.
            best_model = {key: value.detach().clone()
                          for key, value in model.state_dict().items()}
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    if best_model is not None:
        model.load_state_dict(best_model)

    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for features, targets in test_loader:
            features, targets = features.to(device), targets.to(device)
            outputs = model(features)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(targets.cpu().numpy())

    predictions = np.array(predictions).reshape(-1)
    actuals = np.array(actuals).reshape(-1)
    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(actuals, predictions)

    return {'MSE': mse, 'RMSE': rmse, 'R2': r2}



def run_three_models_comparison(data_path):
    """Train and compare three multimodal feature configurations.

    Only rows with LayoutXLM, visual and non-visual features are used. Every
    row is repeated five times, once per monthly sales target, and each
    configuration is split 64/16/20 into train/validation/test with a fixed
    seed.

    Args:
        data_path: Path of the Excel file holding the product table.

    Returns:
        Dict mapping each configuration name to its test metrics dict
        (``'MSE'``, ``'RMSE'``, ``'R2'``). The same metrics are printed as a
        summary table; previously they were computed and then discarded.
    """
    print("读取数据...")
    df = pd.read_excel(data_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("提取统一样本子集（layoutxlm+视觉+非视觉）...")
    layoutxlm_features, visual_features, nonvisual_features, valid_indices = extract_all_features(df)
    df_common = df.iloc[valid_indices].reset_index(drop=True)

    print("准备Baseline+百度特征...")
    X_basic = prepare_baseline_data(df_common)
    basic_dim = X_basic.shape[1]
    layoutxlm_dim = layoutxlm_features.shape[1]
    visual_dim = visual_features.shape[1]
    nonvisual_dim = nonvisual_features.shape[1]

    # Row-major flattening lines up with np.repeat(X, 5): the five monthly
    # targets of a product follow its five repeated feature rows.
    y = prepare_targets(df_common)
    y_repeat = y.reshape(-1, 1)

    # (name, feature blocks in model input order, FeatureFusionMLP dim args)
    experiments = [
        ('Baseline+百度+LayoutXLM+视觉',
         [X_basic, layoutxlm_features, visual_features],
         (basic_dim, layoutxlm_dim, visual_dim)),
        ('Baseline+百度+LayoutXLM+视觉+非视觉',
         [X_basic, layoutxlm_features, visual_features, nonvisual_features],
         (basic_dim, layoutxlm_dim, visual_dim, nonvisual_dim)),
        ('Baseline+百度+LayoutXLM+非视觉',
         [X_basic, layoutxlm_features, nonvisual_features],
         (basic_dim, layoutxlm_dim, None, nonvisual_dim)),
    ]

    results = {}
    for number, (name, blocks, model_dims) in enumerate(experiments, start=1):
        print(f"\n训练模型{number}: {name}")
        X_repeat = np.repeat(np.concatenate(blocks, axis=1), 5, axis=0)

        # Split the dataset.
        # NOTE(review): the split is row-wise, so the five repeated rows of
        # one product can land in different partitions.
        X_train_val, X_test, y_train_val, y_test = train_test_split(
            X_repeat, y_repeat, test_size=0.2, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=0.2, random_state=42)

        batch_size = min(32, len(X_train))
        train_loader = DataLoader(TourismDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(TourismDataset(X_val, y_val), batch_size=batch_size)
        test_loader = DataLoader(TourismDataset(X_test, y_test), batch_size=batch_size)

        model = FeatureFusionMLP(*model_dims).to(device)
        results[name] = train_eval(model, train_loader, val_loader, test_loader, device)

    # Report the comparison instead of silently dropping the metrics.
    print()
    for name, metrics in results.items():
        print(f"{name:<36} {metrics['MSE']:10.4f} {metrics['RMSE']:10.4f} {metrics['R2']:10.4f}")

    return results




if __name__ == '__main__':
    # Path handed to run_three_models_comparison, which reads it with pd.read_excel.
    # NOTE(review): the path is an empty string here — set it before running.
    data_path = r""
    run_three_models_comparison(data_path)

# In [None]:

import os
import numpy as np
import pandas as pd
from typing import Optional, Tuple, List

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# ========= Basic I/O =========

def read_feature_file(file_path: str) -> Optional[np.ndarray]:
    """Read a text file holding one comma-separated list of floats.

    Surrounding whitespace and square brackets are stripped before parsing.

    Args:
        file_path: Path of the feature file.

    Returns:
        A flat float array, or ``None`` when the file is empty, cannot be
        opened, or contains a token that is not a number.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            body = handle.read().strip().strip('[]')
        if not body:
            return None
        values = [float(token.strip()) for token in body.split(',')]
        return np.array(values, dtype=float).ravel()
    except Exception:
        # Best-effort reader: any failure means "no features available".
        return None

def find_sensory_dir(base_path: str, province: str, product_id: str) -> Optional[str]:
    """Locate the sensory-feature directory of a product.

    Args:
        base_path: Root directory of the feature tree.
        province: Province directory name; converted with ``str()``.
        product_id: Product identifier; converted with ``str()``.

    Returns:
        The exact-name candidate when it is a directory; otherwise the first
        sub-directory (in ``os.listdir`` order) whose name contains "感官";
        ``None`` when the product directory or a match does not exist.
    """
    prod_dir = os.path.join(base_path, str(province), str(product_id))
    if not os.path.isdir(prod_dir):
        return None

    # NOTE(review): the exact directory name is an empty string here, so this
    # candidate is the product directory itself — confirm the intended name.
    exact = os.path.join(prod_dir, "")
    if os.path.isdir(exact):
        return exact

    # Fallback: any sub-directory whose name contains "感官".
    named = (os.path.join(prod_dir, entry) for entry in os.listdir(prod_dir) if "感官" in entry)
    return next((path for path in named if os.path.isdir(path)), None)

# ========= Feature loading =========

def get_layoutxlm_features(product_id, province, base_path) -> Optional[np.ndarray]:
    """Load and concatenate the two LayoutXLM feature vectors of a product.

    Both vectors are read with ``read_feature_file`` from the product's
    feature directory and joined end to end (first vector, then second).

    NOTE(review): the sub-directory and file names are empty strings in this
    copy — fill in the real names before use.

    Returns:
        The concatenated 1-D array, or ``None`` if the directory is missing
        or either vector could not be read.
    """
    feature_dir = os.path.join(base_path, str(province), str(product_id), '')
    if os.path.exists(feature_dir):
        cls_vec = read_feature_file(os.path.join(feature_dir, ''))
        img_vec = read_feature_file(os.path.join(feature_dir, ''))
        if cls_vec is not None and img_vec is not None:
            return np.concatenate((cls_vec, img_vec), axis=0)
    return None

def _to_768(vec: Optional[np.ndarray]) -> Optional[np.ndarray]:
    if vec is None:
        return None
    v = vec
    if len(v) < 768:
        v = np.pad(v, (0, 768 - len(v)), mode='constant')
    elif len(v) > 768:
        v = v[:768]
    return v

def get_visual_mean_vector(product_id, province, base_path, filename="") -> Optional[np.ndarray]:
    """Read ``filename`` from the product's sensory directory as a 768-length vector.

    Returns ``None`` when the sensory directory cannot be found or the file
    cannot be parsed; otherwise the vector is padded/truncated by ``_to_768``.
    """
    sensory_dir = find_sensory_dir(base_path, province, product_id)
    if sensory_dir is not None:
        raw_vec = read_feature_file(os.path.join(sensory_dir, filename))
        return _to_768(raw_vec)
    return None

def get_one_nonvisual_vector(product_id, province, base_path, filename="") -> Optional[np.ndarray]:
    """Read ``filename`` from the product's sensory directory as a 768-length vector.

    Same lookup as the visual-mean reader: locate the sensory directory,
    parse the file, then pad/truncate to 768 entries. Returns ``None`` when
    the directory is missing or the file cannot be parsed.
    """
    sensory_dir = find_sensory_dir(base_path, province, product_id)
    if sensory_dir is None:
        return None
    vector_path = os.path.join(sensory_dir, filename)
    return _to_768(read_feature_file(vector_path))



def extract_subset_for_A_B(df: pd.DataFrame, base_path=r""
                           ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, List[int]]:
    """Collect the rows for which all three feature vectors are available.

    For every row (columns ``ID`` and ``省份``) the LayoutXLM vector, the
    visual-mean vector and the single-non-visual vector are loaded; a row is
    kept only when none of them is ``None``.

    Args:
        df: Product table.
        base_path: Root directory of the per-product feature files.

    Returns:
        ``(lxlm_mat, vis_mean_mat, one_nonvis_mat, valid_idx)`` — the three
        stacked feature matrices and the *positional* row numbers of the kept
        rows, suitable for ``df.iloc[valid_idx]``.

    Raises:
        ValueError: If no row has all three vectors.
    """
    lxlm_list, vis_mean_list, one_nonvis_list, valid_idx = [], [], [], []
    total = len(df)
    valid = 0
    print(f".")

    # Enumerate positions explicitly: the caller indexes with .iloc, and index
    # labels are neither guaranteed to be integers nor to equal positions.
    for pos, (_, row) in enumerate(df.iterrows()):
        pid = row['ID']
        prov = row['省份']
        lxlm = get_layoutxlm_features(pid, prov, base_path)
        vis = get_visual_mean_vector(pid, prov, base_path, filename="")
        one = get_one_nonvisual_vector(pid, prov, base_path, filename="")
        if lxlm is not None and vis is not None and one is not None:
            lxlm_list.append(lxlm)
            vis_mean_list.append(vis)
            one_nonvis_list.append(one)
            valid_idx.append(pos)
            valid += 1
        if (pos + 1) % 100 == 0:
            print(f"  已处理 {pos+1}/{total}，有效 {valid}")

    if valid == 0:
        raise ValueError("没有找到满足条件的样本。")

    lxlm_mat       = np.stack(lxlm_list)
    vis_mean_mat   = np.stack(vis_mean_list)
    one_nonvis_mat = np.stack(one_nonvis_list)
    print(f"完成：有效样本 {valid} / {total}")
    return lxlm_mat, vis_mean_mat, one_nonvis_mat, valid_idx

# ========= 预处理：Baseline+百度 / 目标 =========

def prepare_baseline_data(df: pd.DataFrame) -> np.ndarray:
    """Build the standardized baseline + search-index feature matrix.

    Numeric columns are mean-imputed (price and review count are then
    log1p-transformed), binary columns have missing values set to 0, and the
    five index columns are mean-imputed and log1p-transformed. The combined
    table is standardized with a freshly fitted ``StandardScaler`` and any
    remaining NaN is replaced by 0.
    """
    numeric_features = ['价格', '评分', '点评数']
    binary_features  = ['无购物', '无自费', '成团保障', '退改政策', '是否促销']
    index_features   = ['index_6', 'index_7', 'index_8', 'index_9', 'index_10']

    numeric_part = df[numeric_features].copy()
    for col in numeric_features:
        filled = numeric_part[col].fillna(numeric_part[col].mean())
        # The rating column stays on its raw scale; the other two are log1p-compressed.
        numeric_part[col] = filled if col == '评分' else np.log1p(filled)

    binary_part = df[binary_features].fillna(0).astype(float)

    index_raw = df[index_features]
    index_part = np.log1p(index_raw.fillna(index_raw.mean()))

    combined = pd.concat([numeric_part, binary_part, index_part], axis=1)
    scaled = StandardScaler().fit_transform(combined)
    return np.nan_to_num(scaled, nan=0.0)

def prepare_targets(df: pd.DataFrame) -> np.ndarray:
    """Return log1p-transformed monthly sales for June through October.

    Missing values become 0 and negative values are clipped to 0 before the
    transform. The result has one row per input row and one column per month.
    """
    cols = ['6月月销量', '7月月销量', '8月月销量', '9月月销量', '10月月销量']
    raw_sales = df[cols].fillna(0).values
    non_negative = np.clip(raw_sales, a_min=0, a_max=None)
    return np.nan_to_num(np.log1p(non_negative), nan=0.0)

# ========= 清洗：NaN/常量列/标准化 =========

def clean_matrix(mat: np.ndarray, name: str) -> np.ndarray:
    """Impute, prune and standardize a feature matrix without touching the input.

    Steps:
      1. NaN entries are replaced by their column mean (ignoring NaN).
      2. Columns with variance <= 1e-6 are dropped, unless that would drop
         every column, in which case all columns are kept.
      3. The result is standardized with a freshly fitted ``StandardScaler``.

    Args:
        mat: 2-D feature matrix (rows are samples).
        name: Label used in the log line about dropped columns.

    Returns:
        A new standardized array; ``mat`` itself is left unmodified.
    """
    # Work on a float copy: the NaN imputation below assigns in place and
    # previously leaked into the caller's array.
    mat = np.array(mat, dtype=float)

    nan_mask = np.isnan(mat)
    if nan_mask.any():
        col_means = np.nanmean(mat, axis=0)
        rows, cols = np.where(nan_mask)
        mat[rows, cols] = col_means[cols]

    # Drop (near-)constant columns.
    var = np.var(mat, axis=0)
    keep = var > 1e-6
    dropped = int((~keep).sum())
    if dropped:
        print(f"{name}: 移除常量列 {dropped} 个")
    if keep.any():
        mat = mat[:, keep]

    return StandardScaler().fit_transform(mat)

# ========= 数据集 & 模型 =========

class TourismDataset(Dataset):
    """Dataset pairing feature rows with targets, both stored as float32 tensors."""

    def __init__(self, X, y):
        as_float = dict(dtype=torch.float32)
        self.X = torch.as_tensor(X, **as_float)
        self.y = torch.as_tensor(y, **as_float)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        features, target = self.X[i], self.y[i]
        return features, target

class HuberMSELoss(nn.Module):
    """Weighted blend of MSE and Huber loss.

    The result is ``alpha * MSE + (1 - alpha) * Huber(delta)``.
    """

    def __init__(self, delta=1.0, alpha=0.7):
        super().__init__()
        self.huber = nn.HuberLoss(delta=delta)
        self.mse = nn.MSELoss()
        self.alpha = alpha

    def forward(self, pred, target):
        mse_term = self.mse(pred, target)
        huber_term = self.huber(pred, target)
        return self.alpha * mse_term + (1 - self.alpha) * huber_term

class FeatureFusionMLP(nn.Module):
    """MLP regressor that projects each feature group before fusing them.

    The input row is the concatenation of the enabled groups in the fixed
    order basic | layoutxlm | visual | nonvisual. Each enabled group is
    projected to ``reduced_dim`` by its own linear layer; the projections are
    concatenated and passed through a 64-32-16-1 MLP.

    Args:
        basic_dim: Width of the always-present basic feature group.
        layoutxlm_dim: Width of the LayoutXLM group, or ``None``/0 to disable.
        visual_dim: Width of the visual group, or ``None``/0 to disable.
        nonvisual_dim: Width of the non-visual group, or ``None``/0 to disable.
        reduced_dim: Output width of every per-group projection.
    """

    def __init__(self, basic_dim, layoutxlm_dim=None, visual_dim=None, nonvisual_dim=None, reduced_dim=64):
        super().__init__()

        def _maybe_linear(in_dim):
            # A falsy width (None or 0) disables the branch.
            return nn.Linear(in_dim, reduced_dim) if in_dim else None

        # Construction order is kept fixed so seeded initialization is unchanged.
        self.basic_reduce     = nn.Linear(basic_dim, reduced_dim)
        self.layoutxlm_reduce = _maybe_linear(layoutxlm_dim)
        self.visual_reduce    = _maybe_linear(visual_dim)
        self.nonvisual_reduce = _maybe_linear(nonvisual_dim)

        active = sum(
            branch is not None
            for branch in (self.basic_reduce, self.layoutxlm_reduce,
                           self.visual_reduce, self.nonvisual_reduce)
        )
        input_dim = reduced_dim * active

        self.net = nn.Sequential(
            nn.Linear(input_dim, 64), nn.ReLU(), nn.BatchNorm1d(64), nn.Dropout(0.3),
            nn.Linear(64, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Dropout(0.2),
            nn.Linear(32, 16), nn.ReLU(), nn.BatchNorm1d(16),
            nn.Linear(16, 1),
        )

    def forward(self, x):
        """Slice ``x`` column-wise per enabled branch, project, fuse and regress."""
        offset = 0
        projected = []
        for branch in (self.basic_reduce, self.layoutxlm_reduce,
                       self.visual_reduce, self.nonvisual_reduce):
            if branch is None:
                # Disabled branches consume no input columns and add no features.
                continue
            width = branch.in_features
            projected.append(branch(x[:, offset:offset + width]))
            offset += width
        return self.net(torch.cat(projected, dim=1))

# ========= 训练/验证/测试 =========

def train_eval(model, train_loader, val_loader, test_loader, device, epochs=100):
    """Train ``model`` with early stopping and report test-set metrics.

    Uses the blended Huber/MSE loss, AdamW and a one-cycle learning-rate
    schedule stepped once per batch, with gradient-norm clipping at 1.0.
    Training stops after 10 consecutive epochs without a better mean
    validation loss; the best-validation weights are restored before the
    test pass.

    Args:
        model: Network to train (already on ``device``).
        train_loader, val_loader, test_loader: Loaders yielding ``(x, y)`` batches.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.

    Returns:
        Dict with keys ``'MSE'``, ``'RMSE'`` and ``'R2'`` computed on the test set.
    """
    criterion = HuberMSELoss(delta=1.0, alpha=0.7)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=5e-4, epochs=epochs, steps_per_epoch=len(train_loader),
        pct_start=0.1, anneal_strategy='cos'
    )
    best_val, best_state, patience, stale = float('inf'), None, 10, 0
    for ep in range(epochs):
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb.view(-1, 1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            train_loss += loss.item()
        train_loss /= max(1, len(train_loader))

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += criterion(model(xb), yb.view(-1, 1)).item()
        val_loss /= max(1, len(val_loader))

        if val_loss < best_val:
            # state_dict() hands back references to the live tensors, so they
            # must be cloned; otherwise later optimizer steps overwrite the
            # "best" snapshot and restoring it is a no-op.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            best_val, stale = val_loss, 0
        else:
            stale += 1
            if stale >= patience:
                print(f"Early stopping @ epoch {ep+1}")
                break
        if ep == 0 or (ep + 1) % 10 == 0:
            print(f"Epoch {ep+1:03d} | Train {train_loss:.4f} | Val {val_loss:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    # Test
    model.eval()
    preds, gts = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds.append(model(xb).cpu().numpy())
            gts.append(yb.cpu().numpy())
    y_pred = np.concatenate(preds, axis=0).reshape(-1)
    y_true = np.concatenate(gts, axis=0).reshape(-1)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return {'MSE': mse, 'RMSE': rmse, 'R2': r2}

# ========= 主流程（模型A vs 模型B） =========

def run_AB(data_path, base_path=r"D:\有效数据汇总", epochs=100, seed=42):
    """Train and compare model A (visual mean) against model B (one non-visual sense).

    Both models share the baseline + index features and the LayoutXLM
    features; they differ only in the third feature group. The same products
    and the same train/val/test indices are used for both so that the metrics
    are directly comparable.

    Args:
        data_path: Excel file read with ``pd.read_excel``.
        base_path: Root directory of the per-product feature files.
        epochs: Maximum training epochs per model.
        seed: Seed for NumPy, PyTorch and the data splits.

    Returns:
        ``{'A': metrics_A, 'B': metrics_B}`` where each value is the metrics
        dict returned by ``train_eval``.
    """
    np.random.seed(seed); torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("读取数据...")
    df_all = pd.read_excel(data_path)

    print("抽取统一子集：LayoutXLM + 视觉均值向量.txt + 视觉+仅一种.txt")
    lxlm_mat, vis_mean_mat, one_nonvis_mat, valid_idx = extract_subset_for_A_B(df_all, base_path)
    df = df_all.iloc[valid_idx].reset_index(drop=True)

    lxlm_mat       = clean_matrix(lxlm_mat,       "LayoutXLM")
    vis_mean_mat   = clean_matrix(vis_mean_mat,   "VisualMean")
    one_nonvis_mat = clean_matrix(one_nonvis_mat, "OneNonVisual")

    X_basic = prepare_baseline_data(df)
    y_mat   = prepare_targets(df)    # [N, 5]
    y_flat  = y_mat.reshape(-1, 1)   # [N*5, 1], row-major: matches np.repeat below

    basic_dim     = X_basic.shape[1]
    layoutxlm_dim = lxlm_mat.shape[1]
    visual_dim    = vis_mean_mat.shape[1]
    one_dim       = one_nonvis_mat.shape[1]

    X_A = np.concatenate([X_basic, lxlm_mat, vis_mean_mat], axis=1)     # model A
    X_B = np.concatenate([X_basic, lxlm_mat, one_nonvis_mat], axis=1)   # model B
    # Each product row is repeated once per target month.
    X_A_rep = np.repeat(X_A, 5, axis=0)
    X_B_rep = np.repeat(X_B, 5, axis=0)

    # NOTE(review): the split is drawn over the repeated rows, so copies of the
    # same product can land in both train and test — confirm this is intended.
    Nrep = X_A_rep.shape[0]
    all_idx = np.arange(Nrep)
    idx_train_val, idx_test = train_test_split(all_idx, test_size=0.2, random_state=seed, shuffle=True)
    idx_train, idx_val = train_test_split(idx_train_val, test_size=0.2, random_state=seed, shuffle=True)

    def make_loaders(Xrep):
        """Build train/val/test loaders for one feature matrix on the shared split."""
        X_train, X_val, X_test = Xrep[idx_train], Xrep[idx_val], Xrep[idx_test]
        y_train, y_val, y_test = y_flat[idx_train], y_flat[idx_val], y_flat[idx_test]
        bs = min(32, max(1, len(idx_train)//8)) if len(idx_train) > 0 else 16
        bs = max(8, min(64, bs))
        # BatchNorm1d raises in training mode on a batch of one sample, so a
        # trailing singleton batch is dropped.
        drop_tail = len(idx_train) > bs and len(idx_train) % bs == 1
        return (
            DataLoader(TourismDataset(X_train, y_train), batch_size=bs, shuffle=True, drop_last=drop_tail),
            DataLoader(TourismDataset(X_val,   y_val),   batch_size=bs),
            DataLoader(TourismDataset(X_test,  y_test),  batch_size=bs),
        )

    trA, vaA, teA = make_loaders(X_A_rep)
    trB, vaB, teB = make_loaders(X_B_rep)

    # Previously the function stopped after building the loaders; train and
    # compare the two models the same way run_A_C does.
    print("\n训练模型A：Baseline + 百度 + LayoutXLM + 视觉（来自 视觉均值向量.txt）")
    modelA = FeatureFusionMLP(basic_dim, layoutxlm_dim, visual_dim, None).to(device)
    resA = train_eval(modelA, trA, vaA, teA, device, epochs=epochs)

    print("\n训练模型B：Baseline + 百度 + LayoutXLM + 仅一种（来自 视觉+仅一种.txt）")
    modelB = FeatureFusionMLP(basic_dim, layoutxlm_dim, None, one_dim).to(device)
    resB = train_eval(modelB, trB, vaB, teB, device, epochs=epochs)

    print("\n模型性能对比：")
    print(f"{'模型':<55} {'MSE':>12} {'RMSE':>12} {'R2':>10}")
    print(f"{'A) Baseline+Index+LayoutXLM+Visual(mean)':<55} {resA['MSE']:12.4f} {resA['RMSE']:12.4f} {resA['R2']:10.4f}")
    print(f"{'B) Baseline+Index+LayoutXLM+One NV':<55} {resB['MSE']:12.4f} {resB['RMSE']:12.4f} {resB['R2']:10.4f}")
    return {'A': resA, 'B': resB}



# Script entry point: runs the model A vs. model B comparison.
# NOTE(review): data_path and base_path are empty strings in this copy — set
# them to the real Excel file and feature root before running.
if __name__ == "__main__":
    data_path = r""
    run_AB(data_path, base_path=r"", epochs=100, seed=42)


In [None]:

import os
import numpy as np
import pandas as pd
from typing import Optional, Tuple, List

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score



def read_feature_file(file_path: str) -> Optional[np.ndarray]:
    """Load a comma-separated float vector from a UTF-8 text file.

    Leading/trailing whitespace and bracket characters are removed first.

    Returns:
        A flat float array, or ``None`` if the stripped content is empty or
        anything goes wrong while opening or parsing the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            stripped = fh.read().strip().strip('[]')
            if not stripped:
                return None
            numbers = []
            for piece in stripped.split(','):
                numbers.append(float(piece.strip()))
            return np.array(numbers, dtype=float).ravel()
    except Exception:
        # Best-effort reader: failures mean "feature not available".
        return None

def find_sensory_dir(base_path: str, province: str, product_id: str) -> Optional[str]:
    """Locate the sensory-feature directory of one product.

    The product directory is ``base_path/province/product_id``. The preferred
    candidate is that directory joined with the configured sub-directory name
    (an empty string in this copy, so the candidate is the product directory
    itself with a trailing separator). Otherwise the first listed
    sub-directory whose name contains the fallback substring is returned.

    NOTE(review): the fallback substring is the empty string here, which
    every name contains — confirm the intended keyword.

    Returns:
        The directory path, or ``None`` when the product directory does not
        exist or nothing matches.
    """
    product_dir = os.path.join(base_path, str(province), str(product_id))
    if os.path.isdir(product_dir):
        candidate = os.path.join(product_dir, "")
        if os.path.isdir(candidate):
            return candidate
        # Fallback: scan the sub-directories for a name match.
        for entry in os.listdir(product_dir):
            sub_path = os.path.join(product_dir, entry)
            if ("" in entry) and os.path.isdir(sub_path):
                return sub_path
    return None

# ========= 特征读取 =========

def get_layoutxlm_features(product_id, province, base_path) -> Optional[np.ndarray]:
    """Load and concatenate the two LayoutXLM feature vectors of a product.

    Both vectors are read with ``read_feature_file`` from the product's
    feature directory and joined end to end (first vector, then second).

    NOTE(review): the sub-directory and file names are empty strings in this
    copy — fill in the real names before use.

    Returns:
        The concatenated 1-D array, or ``None`` if the directory is missing
        or either vector could not be read.
    """
    feature_dir = os.path.join(base_path, str(province), str(product_id), '')
    if not os.path.exists(feature_dir):
        return None
    vectors = (
        read_feature_file(os.path.join(feature_dir, '')),
        read_feature_file(os.path.join(feature_dir, '')),
    )
    if any(vec is None for vec in vectors):
        return None
    return np.concatenate(list(vectors), axis=0)

def _to_768(vec: Optional[np.ndarray]) -> Optional[np.ndarray]:
    if vec is None:
        return None
    v = vec
    if len(v) < 768:
        v = np.pad(v, (0, 768 - len(v)), mode='constant')
    elif len(v) > 768:
        v = v[:768]
    return v

def get_visual_mean_vector(product_id, province, base_path, filename="") -> Optional[np.ndarray]:
    """Read ``filename`` from the product's sensory directory as a 768-length vector.

    Returns ``None`` when the sensory directory cannot be found or the file
    cannot be parsed; otherwise the vector is padded/truncated by ``_to_768``.
    """
    sensory_dir = find_sensory_dir(base_path, province, product_id)
    if sensory_dir is not None:
        raw_vec = read_feature_file(os.path.join(sensory_dir, filename))
        return _to_768(raw_vec)
    return None

def get_two_or_more_vector(product_id, province, base_path, filename="") -> Optional[np.ndarray]:
    """Read ``filename`` from the product's sensory directory as a 768-length vector.

    Handled exactly like the visual-mean vector: the parsed file is
    padded/truncated to 768 entries. Returns ``None`` when the sensory
    directory is missing or the file cannot be parsed.
    """
    sensory_dir = find_sensory_dir(base_path, province, product_id)
    if sensory_dir is None:
        return None
    vector_path = os.path.join(sensory_dir, filename)
    return _to_768(read_feature_file(vector_path))

# ========= 子集抽取：确保三者同时存在 =========

def extract_subset_for_A_C(df: pd.DataFrame, base_path=r""
                           ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, List[int]]:
    """Collect the rows for which all three feature vectors are available.

    For every row (columns ``ID`` and ``省份``) the LayoutXLM vector, the
    visual-mean vector and the two-or-more-senses vector are loaded; a row is
    kept only when none of them is ``None``.

    Args:
        df: Product table.
        base_path: Root directory of the per-product feature files.

    Returns:
        ``(lxlm_mat, vis_mean_mat, two_or_more_mat, valid_idx)`` — the three
        stacked feature matrices and the *positional* row numbers of the kept
        rows, suitable for ``df.iloc[valid_idx]``.

    Raises:
        ValueError: If no row has all three vectors.
    """
    lxlm_list, vis_mean_list, two_or_more_list, valid_idx = [], [], [], []
    total = len(df)
    valid = 0

    # Enumerate positions explicitly: the caller indexes with .iloc, and index
    # labels are neither guaranteed to be integers nor to equal positions.
    for pos, (_, row) in enumerate(df.iterrows()):
        pid = row['ID']
        prov = row['省份']
        lxlm = get_layoutxlm_features(pid, prov, base_path)
        vis = get_visual_mean_vector(pid, prov, base_path, filename="")
        two = get_two_or_more_vector(pid, prov, base_path, filename="")
        if lxlm is not None and vis is not None and two is not None:
            lxlm_list.append(lxlm)
            vis_mean_list.append(vis)
            two_or_more_list.append(two)
            valid_idx.append(pos)
            valid += 1
        if (pos + 1) % 100 == 0:
            print(f"  已处理 {pos+1}/{total}，有效 {valid}")

    if valid == 0:
        raise ValueError("没有找到满足条件的样本。")

    lxlm_mat       = np.stack(lxlm_list)
    vis_mean_mat   = np.stack(vis_mean_list)
    two_or_more_mat= np.stack(two_or_more_list)
    print(f"完成：有效样本 {valid} / {total}")
    return lxlm_mat, vis_mean_mat, two_or_more_mat, valid_idx

# ========= 预处理：Baseline+百度 / 目标 =========

def prepare_baseline_data(df: pd.DataFrame) -> np.ndarray:
    """Build the standardized baseline + search-index feature matrix.

    Numeric columns are mean-imputed (price and review count are then
    log1p-transformed), binary columns have missing values set to 0, and the
    five index columns are mean-imputed and log1p-transformed. The combined
    table is standardized with a freshly fitted ``StandardScaler`` and any
    remaining NaN is replaced by 0.
    """
    numeric_features = ['价格', '评分', '点评数']
    binary_features  = ['无购物', '无自费', '成团保障', '退改政策', '是否促销']
    index_features   = ['index_6', 'index_7', 'index_8', 'index_9', 'index_10']
    log_scaled = ('价格', '点评数')

    numeric_part = df[numeric_features].copy()
    for col in numeric_features:
        imputed = numeric_part[col].fillna(numeric_part[col].mean())
        # Only price and review count are log1p-compressed; the rating is kept raw.
        numeric_part[col] = np.log1p(imputed) if col in log_scaled else imputed

    binary_part = df[binary_features].fillna(0).astype(float)

    index_raw = df[index_features]
    index_part = np.log1p(index_raw.fillna(index_raw.mean()))

    blocks = [numeric_part, binary_part, index_part]
    scaled = StandardScaler().fit_transform(pd.concat(blocks, axis=1))
    return np.nan_to_num(scaled, nan=0.0)

def prepare_targets(df: pd.DataFrame) -> np.ndarray:
    """Return log1p-transformed monthly sales for June through October.

    Missing values become 0 and negative values are clipped to 0 before the
    transform. The result has one row per input row and one column per month.
    """
    cols = ['6月月销量', '7月月销量', '8月月销量', '9月月销量', '10月月销量']
    sales = df[cols].fillna(0).values
    sales = np.clip(sales, a_min=0, a_max=None)
    logged = np.log1p(sales)
    return np.nan_to_num(logged, nan=0.0)

# ========= 清洗：NaN/常量列/标准化 =========

def clean_matrix(mat: np.ndarray, name: str) -> np.ndarray:
    """Impute, prune and standardize a feature matrix without touching the input.

    Steps:
      1. NaN entries are replaced by their column mean (ignoring NaN).
      2. Columns with variance <= 1e-6 are dropped, unless that would drop
         every column, in which case all columns are kept.
      3. The result is standardized with a freshly fitted ``StandardScaler``.

    Args:
        mat: 2-D feature matrix (rows are samples).
        name: Label used in the log line about dropped columns.

    Returns:
        A new standardized array; ``mat`` itself is left unmodified.
    """
    # Work on a float copy: the NaN imputation below assigns in place and
    # previously leaked into the caller's array.
    mat = np.array(mat, dtype=float)

    nan_mask = np.isnan(mat)
    if nan_mask.any():
        col_means = np.nanmean(mat, axis=0)
        rows, cols = np.where(nan_mask)
        mat[rows, cols] = col_means[cols]

    # Drop (near-)constant columns.
    var = np.var(mat, axis=0)
    keep = var > 1e-6
    dropped = int((~keep).sum())
    if dropped:
        print(f"{name}: 移除常量列 {dropped} 个")
    if keep.any():
        mat = mat[:, keep]

    return StandardScaler().fit_transform(mat)

# ========= 数据集 & 模型 =========

class TourismDataset(Dataset):
    """Dataset pairing feature rows with targets, both stored as float32 tensors."""

    def __init__(self, X, y):
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        sample = (self.X[i], self.y[i])
        return sample

class HuberMSELoss(nn.Module):
    """Weighted blend of MSE and Huber loss.

    The result is ``alpha * MSE + (1 - alpha) * Huber(delta)``.
    """

    def __init__(self, delta=1.0, alpha=0.7):
        super().__init__()
        self.huber = nn.HuberLoss(delta=delta)
        self.mse = nn.MSELoss()
        self.alpha = alpha

    def forward(self, pred, target):
        weighted_mse = self.alpha * self.mse(pred, target)
        weighted_huber = (1 - self.alpha) * self.huber(pred, target)
        return weighted_mse + weighted_huber

class FeatureFusionMLP(nn.Module):
    """MLP regressor that projects each feature group before fusing them.

    The input row is the concatenation of the enabled groups in the fixed
    order basic | layoutxlm | visual | nonvisual. Each enabled group is
    projected to ``reduced_dim`` by its own linear layer; the projections are
    concatenated and passed through a 64-32-16-1 MLP.

    Args:
        basic_dim: Width of the always-present basic feature group.
        layoutxlm_dim: Width of the LayoutXLM group, or ``None``/0 to disable.
        visual_dim: Width of the visual group, or ``None``/0 to disable.
        nonvisual_dim: Width of the non-visual group, or ``None``/0 to disable.
        reduced_dim: Output width of every per-group projection.
    """

    def __init__(self, basic_dim, layoutxlm_dim=None, visual_dim=None, nonvisual_dim=None, reduced_dim=64):
        super().__init__()

        def _maybe_linear(in_dim):
            # A falsy width (None or 0) disables the branch.
            return nn.Linear(in_dim, reduced_dim) if in_dim else None

        # Construction order is kept fixed so seeded initialization is unchanged.
        self.basic_reduce     = nn.Linear(basic_dim, reduced_dim)
        self.layoutxlm_reduce = _maybe_linear(layoutxlm_dim)
        self.visual_reduce    = _maybe_linear(visual_dim)
        self.nonvisual_reduce = _maybe_linear(nonvisual_dim)

        active = sum(
            branch is not None
            for branch in (self.basic_reduce, self.layoutxlm_reduce,
                           self.visual_reduce, self.nonvisual_reduce)
        )
        input_dim = reduced_dim * active

        self.net = nn.Sequential(
            nn.Linear(input_dim, 64), nn.ReLU(), nn.BatchNorm1d(64), nn.Dropout(0.3),
            nn.Linear(64, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Dropout(0.2),
            nn.Linear(32, 16), nn.ReLU(), nn.BatchNorm1d(16),
            nn.Linear(16, 1),
        )

    def forward(self, x):
        """Slice ``x`` column-wise per enabled branch, project, fuse and regress."""
        offset = 0
        projected = []
        for branch in (self.basic_reduce, self.layoutxlm_reduce,
                       self.visual_reduce, self.nonvisual_reduce):
            if branch is None:
                # Disabled branches consume no input columns and add no features.
                continue
            width = branch.in_features
            projected.append(branch(x[:, offset:offset + width]))
            offset += width
        return self.net(torch.cat(projected, dim=1))

# ========= 训练/验证/测试 =========

def train_eval(model, train_loader, val_loader, test_loader, device, epochs=100):
    """Train ``model`` with early stopping and report test-set metrics.

    Uses the blended Huber/MSE loss, AdamW and a one-cycle learning-rate
    schedule stepped once per batch, with gradient-norm clipping at 1.0.
    Training stops after 10 consecutive epochs without a better mean
    validation loss; the best-validation weights are restored before the
    test pass.

    Args:
        model: Network to train (already on ``device``).
        train_loader, val_loader, test_loader: Loaders yielding ``(x, y)`` batches.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.

    Returns:
        Dict with keys ``'MSE'``, ``'RMSE'`` and ``'R2'`` computed on the test set.
    """
    criterion = HuberMSELoss(delta=1.0, alpha=0.7)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=5e-4, epochs=epochs, steps_per_epoch=len(train_loader),
        pct_start=0.1, anneal_strategy='cos'
    )
    best_val, best_state, patience, stale = float('inf'), None, 10, 0
    for ep in range(epochs):
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb.view(-1, 1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            train_loss += loss.item()
        train_loss /= max(1, len(train_loader))

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += criterion(model(xb), yb.view(-1, 1)).item()
        val_loss /= max(1, len(val_loader))

        if val_loss < best_val:
            # state_dict() hands back references to the live tensors, so they
            # must be cloned; otherwise later optimizer steps overwrite the
            # "best" snapshot and restoring it is a no-op.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            best_val, stale = val_loss, 0
        else:
            stale += 1
            if stale >= patience:
                print(f"Early stopping @ epoch {ep+1}")
                break
        if ep == 0 or (ep + 1) % 10 == 0:
            print(f"Epoch {ep+1:03d} | Train {train_loss:.4f} | Val {val_loss:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    # Test
    model.eval()
    preds, gts = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds.append(model(xb).cpu().numpy())
            gts.append(yb.cpu().numpy())
    y_pred = np.concatenate(preds, axis=0).reshape(-1)
    y_true = np.concatenate(gts, axis=0).reshape(-1)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return {'MSE': mse, 'RMSE': rmse, 'R2': r2}

# ========= 主流程（模型A vs 模型C） =========

def run_A_C(data_path, base_path=r"", epochs=100, seed=42):
    """Train and compare model A (visual mean) against model C (two or more senses).

    Both models share the baseline + index features and the LayoutXLM
    features; they differ only in the third feature group. The same products
    and the same train/val/test indices are used for both so that the metrics
    are directly comparable.

    Args:
        data_path: Excel file read with ``pd.read_excel``.
        base_path: Root directory of the per-product feature files.
        epochs: Maximum training epochs per model.
        seed: Seed for NumPy, PyTorch and the data splits.

    Returns:
        ``{'A': metrics_A, 'C': metrics_C}`` where each value is the metrics
        dict returned by ``train_eval``.
    """
    np.random.seed(seed); torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("读取数据...")
    df_all = pd.read_excel(data_path)

    print("抽取统一子集：LayoutXLM + 视觉均值向量.txt + 视觉+两种感官及以上均值向量.txt")
    lxlm_mat, vis_mean_mat, two_more_mat, valid_idx = extract_subset_for_A_C(df_all, base_path)
    df = df_all.iloc[valid_idx].reset_index(drop=True)

    print("清洗 & 标准化...")
    lxlm_mat   = clean_matrix(lxlm_mat,   "LayoutXLM")
    vis_mean_mat   = clean_matrix(vis_mean_mat,   "VisualMean")
    two_more_mat   = clean_matrix(two_more_mat,   "TwoOrMore")

    print("准备 Baseline+百度 与 目标...")
    X_basic = prepare_baseline_data(df)
    y_mat   = prepare_targets(df)    # [N, 5]
    y_flat  = y_mat.reshape(-1, 1)   # [N*5, 1], row-major: matches np.repeat below

    basic_dim     = X_basic.shape[1]
    layoutxlm_dim = lxlm_mat.shape[1]
    visual_dim    = vis_mean_mat.shape[1]
    two_dim       = two_more_mat.shape[1]

    X_A = np.concatenate([X_basic, lxlm_mat, vis_mean_mat], axis=1)   # model A
    X_C = np.concatenate([X_basic, lxlm_mat, two_more_mat], axis=1)   # model C
    # Each product row is repeated once per target month.
    X_A_rep = np.repeat(X_A, 5, axis=0)
    X_C_rep = np.repeat(X_C, 5, axis=0)

    # NOTE(review): the split is drawn over the repeated rows, so copies of the
    # same product can land in both train and test — confirm this is intended.
    Nrep = X_A_rep.shape[0]
    all_idx = np.arange(Nrep)
    idx_train_val, idx_test = train_test_split(all_idx, test_size=0.2, random_state=seed, shuffle=True)
    idx_train, idx_val = train_test_split(idx_train_val, test_size=0.2, random_state=seed, shuffle=True)

    def make_loaders(Xrep):
        """Build train/val/test loaders for one feature matrix on the shared split."""
        X_train, X_val, X_test = Xrep[idx_train], Xrep[idx_val], Xrep[idx_test]
        y_train, y_val, y_test = y_flat[idx_train], y_flat[idx_val], y_flat[idx_test]
        bs = min(32, max(1, len(idx_train)//8)) if len(idx_train) > 0 else 16
        bs = max(8, min(64, bs))
        # BatchNorm1d raises in training mode on a batch of one sample, so a
        # trailing singleton batch is dropped.
        drop_tail = len(idx_train) > bs and len(idx_train) % bs == 1
        return (
            DataLoader(TourismDataset(X_train, y_train), batch_size=bs, shuffle=True, drop_last=drop_tail),
            DataLoader(TourismDataset(X_val,   y_val),   batch_size=bs),
            DataLoader(TourismDataset(X_test,  y_test),  batch_size=bs),
        )

    trA, vaA, teA = make_loaders(X_A_rep)
    trC, vaC, teC = make_loaders(X_C_rep)

    print("\n训练模型A：Baseline + 百度 + LayoutXLM + 视觉（来自 视觉均值向量.txt）")
    modelA = FeatureFusionMLP(basic_dim, layoutxlm_dim, visual_dim, None).to(device)
    resA = train_eval(modelA, trA, vaA, teA, device, epochs=epochs)

    print("\n训练模型C：Baseline + 百度 + LayoutXLM + 两种及以上（来自 视觉+两种感官及以上均值向量.txt）")
    modelC = FeatureFusionMLP(basic_dim, layoutxlm_dim, None, two_dim).to(device)
    resC = train_eval(modelC, trC, vaC, teC, device, epochs=epochs)

    print("\n模型性能对比：")
    print(f"{'模型':<55} {'MSE':>12} {'RMSE':>12} {'R2':>10}")
    print(f"{'A) Baseline+Index+LayoutXLM+Visual(mean)':<55} {resA['MSE']:12.4f} {resA['RMSE']:12.4f} {resA['R2']:10.4f}")
    print(f"{'C) Baseline+Index+LayoutXLM+Two-or-More NV':<55} {resC['MSE']:12.4f} {resC['RMSE']:12.4f} {resC['R2']:10.4f}")
    return {'A': resA, 'C': resC}

# Script entry point: runs the model A vs. model C comparison.
# NOTE(review): data_path and base_path are empty strings in this copy — set
# them to the real Excel file and feature root before running.
if __name__ == "__main__":
    data_path = r""
    run_A_C(data_path, base_path=r"", epochs=100, seed=42)


In [None]:
#3x3
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ========= Feature I/O =========

def read_feature_file(file_path):
    """Load a comma-separated float vector from a UTF-8 text file.

    Leading/trailing whitespace and bracket characters are removed first.
    Returns a float array, or ``None`` if the stripped content is empty or
    anything goes wrong while opening or parsing the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = handle.read().strip().strip('[]')
            if payload:
                parsed = [float(item.strip()) for item in payload.split(',')]
                return np.array(parsed, dtype=float)
            return None
    except Exception:
        # Best-effort reader: failures mean "feature not available".
        return None

def get_layoutxlm_features(product_id, province, base_path):
    """Load and concatenate the two LayoutXLM feature vectors of a product.

    Both vectors are read with ``read_feature_file`` from the product's
    feature directory and joined end to end (first vector, then second).

    NOTE(review): the sub-directory and file names are empty strings in this
    copy — fill in the real names before use.

    Returns the concatenated array, or ``None`` if the directory is missing
    or either vector could not be read.
    """
    feature_dir = os.path.join(base_path, str(province), str(product_id), '')
    if os.path.exists(feature_dir):
        cls_vec = read_feature_file(os.path.join(feature_dir, ''))
        img_vec = read_feature_file(os.path.join(feature_dir, ''))
        if not (cls_vec is None or img_vec is None):
            return np.concatenate([cls_vec, img_vec])
    return None

def get_visual_features(product_id, province, base_path):
    """Load a product's visual feature vector, aligned to 768 entries.

    Shorter vectors are zero-padded on the right and longer ones are cut to
    the first 768 values. Returns ``None`` when the path does not exist or
    the file cannot be parsed.

    NOTE(review): the sub-directory and file names are empty strings in this
    copy — fill in the real names before use.
    """
    feature_path = os.path.join(base_path, str(province), str(product_id), '', '')
    if not os.path.exists(feature_path):
        return None
    vector = read_feature_file(feature_path)
    if vector is None:
        return None
    # Align the visual vector to exactly 768 dimensions.
    shortfall = 768 - len(vector)
    if shortfall > 0:
        vector = np.pad(vector, (0, shortfall), 'constant')
    elif shortfall < 0:
        vector = vector[:768]
    return vector

def extract_all_features(df, base_path=r""):
    """
    Collect LayoutXLM and visual features for rows that have both.

    Post-processing applied to each of the two matrices:
    - NaN entries are filled with the column mean
    - columns with variance <= 1e-6 are removed
    - the matrix is standardized with its own freshly fitted scaler

    Returns ``(lxlm_mat, vis_mat, valid_indices)`` where ``valid_indices``
    holds the DataFrame index values of the kept rows. Raises ``ValueError``
    when no row has both feature types.
    """
    lxlm_rows, vis_rows, valid_indices = [], [], []
    total = len(df)
    print(f"[特征抽取] 处理 {total} 条样本...")

    for i, row in df.iterrows():
        pid, prov = row['ID'], row['省份']
        lxlm = get_layoutxlm_features(pid, prov, base_path)
        vis = get_visual_features(pid, prov, base_path)
        if not (lxlm is None or vis is None):
            lxlm_rows.append(lxlm)
            vis_rows.append(vis)
            valid_indices.append(i)
        if (i + 1) % 100 == 0:
            found = len(valid_indices)
            print(f"  已处理 {i + 1}/{total}，有效样本 {found}")

    # The two lists always grow together, so one emptiness check covers both.
    if not valid_indices:
        raise ValueError("未找到任何有效的（同时具备 layoutxlm 与 视觉）样本。")

    lxlm_mat = np.stack(lxlm_rows).astype(float)
    vis_mat = np.stack(vis_rows).astype(float)

    def fill_nan_with_column_mean(arr):
        # In-place imputation on the freshly stacked local array.
        nan_mask = np.isnan(arr)
        if nan_mask.any():
            col_means = np.nanmean(arr, axis=0)
            rows, cols = np.where(nan_mask)
            arr[rows, cols] = np.take(col_means, cols)

    def remove_constant_cols(mat, name):
        keep = np.var(mat, axis=0) > 1e-6
        removed = int((~keep).sum())
        if removed > 0:
            print(f"[特征抽取] {name} 去除常量列 {removed} 个")
        return mat[:, keep]

    # Stage 1: NaN imputation.
    fill_nan_with_column_mean(lxlm_mat)
    fill_nan_with_column_mean(vis_mat)

    # Stage 2: drop constant columns.
    lxlm_mat = remove_constant_cols(lxlm_mat, "LayoutXLM")
    vis_mat = remove_constant_cols(vis_mat, "Visual")

    # Stage 3: standardize each matrix with its own scaler.
    lxlm_mat = StandardScaler().fit_transform(lxlm_mat)
    vis_mat = StandardScaler().fit_transform(vis_mat)

    print(f"[特征抽取] 完成。有效样本: {len(valid_indices)}")
    return lxlm_mat, vis_mat, valid_indices


def prepare_baseline_only(df_sub):
    """Build the 8-column baseline matrix (3 numeric + 5 binary), standardized.

    Numeric columns are coerced to numbers (unparseable values become NaN),
    mean-imputed, and price / review count are log1p-transformed. Binary
    columns have missing values set to 0. Any NaN left after scaling is
    replaced by 0.
    """
    numeric = ['价格', '评分', '点评数']
    binary  = ['无购物', '无自费', '成团保障', '退改政策', '是否促销']

    num_block = df_sub[numeric].copy()
    for col in numeric:
        num_block[col] = pd.to_numeric(num_block[col], errors='coerce')
    for col in numeric:
        num_block[col] = num_block[col].fillna(num_block[col].mean())
    for col in ('价格', '点评数'):
        num_block[col] = np.log1p(num_block[col])

    bin_block = df_sub[binary].fillna(0).astype(float)

    features = pd.concat([num_block, bin_block], axis=1)
    scaled = StandardScaler().fit_transform(features)
    return np.nan_to_num(scaled, nan=0.0)

def prepare_baseline_plus_baidu(df_sub):
    """Build the 13-column matrix (3 numeric + 5 binary + 5 index), standardized.

    Numeric and index columns are coerced to numbers (unparseable values
    become NaN) and mean-imputed; price, review count and all index columns
    are log1p-transformed. Binary columns have missing values set to 0. Any
    NaN left after scaling is replaced by 0.
    """
    numeric = ['价格', '评分', '点评数']
    binary  = ['无购物', '无自费', '成团保障', '退改政策', '是否促销']
    index5  = ['index_6', 'index_7', 'index_8', 'index_9', 'index_10']

    num_block = df_sub[numeric].copy()
    for col in numeric:
        num_block[col] = pd.to_numeric(num_block[col], errors='coerce')
    for col in numeric:
        num_block[col] = num_block[col].fillna(num_block[col].mean())
    for col in ('价格', '点评数'):
        num_block[col] = np.log1p(num_block[col])

    bin_block = df_sub[binary].fillna(0).astype(float)

    idx_block = df_sub[index5].copy()
    for col in index5:
        idx_block[col] = pd.to_numeric(idx_block[col], errors='coerce')
    idx_block = np.log1p(idx_block.fillna(idx_block.mean()))

    features = pd.concat([num_block, bin_block, idx_block], axis=1)
    scaled = StandardScaler().fit_transform(features)
    return np.nan_to_num(scaled, nan=0.0)

def prepare_targets(df_sub):
    """Targets: June-October monthly sales, clipped at 0 and log1p-transformed.

    Values that cannot be parsed as numbers, and missing values, count as 0.
    Returns a float array of shape [n, 5].
    """
    cols = ['6月月销量', '7月月销量', '8月月销量', '9月月销量', '10月月销量']
    sales = df_sub[cols].copy()
    for month_col in cols:
        sales[month_col] = pd.to_numeric(sales[month_col], errors='coerce')
    non_negative = np.clip(sales.fillna(0.0).values, a_min=0.0, a_max=None)
    return np.log1p(non_negative).astype(float)



class TourismDataset(Dataset):
    """Dataset pairing feature rows with targets, both stored as float32 tensors."""

    def __init__(self, X, y):
        to_float32 = lambda data: torch.as_tensor(data, dtype=torch.float32)
        self.X = to_float32(X)
        self.y = to_float32(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target

class HuberMSELoss(nn.Module):
    """Weighted blend of MSE and Huber loss.

    The result is ``alpha * MSE + (1 - alpha) * Huber(delta)``.
    """

    def __init__(self, delta=1.0, alpha=0.7):
        super().__init__()
        self.huber = nn.HuberLoss(delta=delta)
        self.mse = nn.MSELoss()
        self.alpha = alpha

    def forward(self, pred, target):
        mse_part = self.mse(pred, target)
        huber_part = self.huber(pred, target)
        return self.alpha * mse_part + (1 - self.alpha) * huber_part

class FeatureFusionMLP(nn.Module):
    """MLP regressor that fuses up to three feature groups.

    The input row is laid out as ``[basic | layoutxlm | visual]``. Each group
    that is present is projected to ``reduced_dim`` by its own linear layer;
    the projections are concatenated and fed to a small MLP head that emits
    one value per row.

    Args:
        basic_dim: Width of the basic feature group (always present).
        layoutxlm_dim: Width of the LayoutXLM group; ``None`` or a
            non-positive value disables the branch.
        visual_dim: Width of the visual group; ``None`` or a non-positive
            value disables the branch.
        reduced_dim: Output width of every per-group projection.
    """

    def __init__(self, basic_dim, layoutxlm_dim=None, visual_dim=None, reduced_dim=64):
        super().__init__()

        def positive_or_none(width):
            # A missing or non-positive width switches the branch off.
            return int(width) if width is not None and width > 0 else None

        self.basic_dim = int(basic_dim)
        self.layoutxlm_dim = positive_or_none(layoutxlm_dim)
        self.visual_dim = positive_or_none(visual_dim)

        # Projection layers are created in a fixed order (basic, LayoutXLM,
        # visual) so seeded initialisation is reproducible.
        self.basic_reduce = nn.Linear(self.basic_dim, reduced_dim)
        self.layoutxlm_reduce = (
            nn.Linear(self.layoutxlm_dim, reduced_dim) if self.layoutxlm_dim else None
        )
        self.visual_reduce = (
            nn.Linear(self.visual_dim, reduced_dim) if self.visual_dim else None
        )

        optional_branches = (self.layoutxlm_reduce, self.visual_reduce)
        active = 1 + sum(layer is not None for layer in optional_branches)
        input_dim = reduced_dim * active

        # Head: three Linear -> ReLU -> BatchNorm stages (the first two are
        # followed by dropout), then the scalar output layer.
        stages = ((64, 0.3), (32, 0.2), (16, None))
        layers = []
        width = input_dim
        for hidden, drop_p in stages:
            layers += [nn.Linear(width, hidden), nn.ReLU(), nn.BatchNorm1d(hidden)]
            if drop_p is not None:
                layers.append(nn.Dropout(drop_p))
            width = hidden
        layers.append(nn.Linear(width, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Project each present group, concatenate, and run the MLP head."""
        offset = self.basic_dim
        projected = [self.basic_reduce(x[:, :offset])]

        optional_groups = (
            (self.layoutxlm_dim, self.layoutxlm_reduce),
            (self.visual_dim, self.visual_reduce),
        )
        for width, projector in optional_groups:
            if not width:
                continue
            chunk = x[:, offset:offset + width]
            offset += width
            if projector is not None:
                projected.append(projector(chunk))

        return self.net(torch.cat(projected, dim=1))



def train_eval(model, train_loader, val_loader, test_loader, device, epochs=100):
    """Train ``model`` with early stopping, then score it on the test loader.

    Training uses ``HuberMSELoss``, AdamW and a one-cycle learning-rate
    schedule stepped once per batch, with gradient norms clipped to 1.0.
    After every epoch the mean validation loss is computed; the weights of
    the best epoch are snapshotted and training stops once validation loss
    has failed to improve for 10 consecutive epochs. The best snapshot is
    restored before the test pass.

    Args:
        model: Module mapping a feature batch to a ``[batch, 1]`` prediction.
        train_loader: Loader yielding ``(features, target)`` training batches.
        val_loader: Loader used for early stopping / model selection.
        test_loader: Loader used for the final metrics.
        device: Device the batches are moved to.
        epochs: Maximum number of training epochs.

    Returns:
        Dict with keys ``'MSE'``, ``'RMSE'`` and ``'R2'`` computed on the
        test loader.
    """
    criterion = HuberMSELoss(delta=1.0, alpha=0.7)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=5e-4, epochs=epochs, steps_per_epoch=max(1, len(train_loader)), pct_start=0.1,
        anneal_strategy='cos'
    )

    best_val = float('inf')
    best_state = None
    patience, counter = 10, 0

    for ep in range(epochs):
        model.train()
        tr_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device).view(-1, 1)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            tr_loss += loss.item()

        model.eval()
        va_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device).view(-1, 1)
                pred = model(xb)
                va_loss += criterion(pred, yb).item()

        tr_avg = tr_loss / max(1, len(train_loader))
        va_avg = va_loss / max(1, len(val_loader))

        if va_avg < best_val:
            best_val = va_avg
            # clone() is required: on a CPU run ``.cpu()`` returns the very
            # same tensor, so without a copy the "snapshot" would alias the
            # live parameters and silently follow every later optimizer step.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            counter = 0
        else:
            counter += 1
        if counter >= patience:
            print(f"  [早停] epoch {ep+1}")
            break

        if (ep + 1) % 10 == 0 or ep == 0:
            print(f"  Epoch {ep+1:3d} | Train {tr_avg:.4f} | Val {va_avg:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    # Final evaluation on the held-out test loader.
    model.eval()
    preds, gts = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            pred = model(xb).cpu().numpy().reshape(-1)
            preds.append(pred)
            gts.append(yb.numpy().reshape(-1))
    yhat = np.concatenate(preds)
    y    = np.concatenate(gts)

    mse  = mean_squared_error(y, yhat)
    rmse = np.sqrt(mse)
    r2   = r2_score(y, yhat)
    return {'MSE': mse, 'RMSE': rmse, 'R2': r2}



def _build_model(tag, dims, device):
    if tag == 'Baseline':
        return FeatureFusionMLP(dims['basic_dim_x0'], None, None).to(device)
    elif tag == 'Baseline+百度':
        return FeatureFusionMLP(dims['basic_dim_x1'], None, None).to(device)
    elif tag == 'Baseline+百度+LayoutXLM':
        return FeatureFusionMLP(dims['basic_dim_x1'], dims['layoutxlm_dim'], None).to(device)
    elif tag == 'Baseline+百度+LayoutXLM+视觉':
        return FeatureFusionMLP(dims['basic_dim_x1'], dims['layoutxlm_dim'], dims['visual_dim']).to(device)
    elif tag == 'Baseline+百度+视觉':
        return FeatureFusionMLP(dims['basic_dim_x1'], None, dims['visual_dim']).to(device)
    else:
        raise ValueError(f"未知模型: {tag}")

def _eval_subset(name, mask, X0, X1, X2, X3, X4, y5, dims, device):
    """Train and evaluate all five model variants on one subset of products.

    Every selected product contributes five training rows (one per monthly
    target) that share the same feature vector. The train/validation/test
    split is therefore drawn at the product level and only then expanded to
    rows; splitting the expanded rows directly would place the same product
    on both sides of the split and leak information into the test metrics.

    Args:
        name: Human-readable subset label used in log output.
        mask: Boolean array selecting the rows that belong to the subset.
        X0, X1, X2, X3, X4: Feature matrices for the five model variants,
            row-aligned with ``y5``.
        y5: Target matrix of shape ``[n, 5]``.
        dims: Feature-width mapping forwarded to ``_build_model``.
        device: Device used for training.

    Returns:
        Dict mapping model tag to its metric dict, or ``None`` when the
        subset has fewer than 8 products or no model produced a result.
    """
    idx = np.where(mask)[0]
    n = len(idx)
    if n < 8:
        print(f"[跳过] {name} 样本不足（n={n}）")
        return None

    feature_sets = {
        'Baseline':                        X0[idx],
        'Baseline+百度':                   X1[idx],
        'Baseline+百度+LayoutXLM':         X2[idx],
        'Baseline+百度+LayoutXLM+视觉':     X3[idx],
        'Baseline+百度+视觉':              X4[idx],
    }
    y_s = y5[idx]  # [n, 5]

    # One product-level split shared by every variant. n >= 8 guarantees
    # that all three parts are non-empty.
    positions = np.arange(n)
    trv_pos, te_pos = train_test_split(positions, test_size=0.2, random_state=42)
    tr_pos, va_pos = train_test_split(trv_pos, test_size=0.2, random_state=42)

    def expand(X_sub, pos):
        # Repeat each product row once per month so it lines up with the
        # row-major flattening of that product's five targets.
        return np.repeat(X_sub[pos], 5, axis=0), y_s[pos].reshape(-1)

    results = {}
    for tag, X_sub in feature_sets.items():
        try:
            X_tr, y_tr = expand(X_sub, tr_pos)
            X_va, y_va = expand(X_sub, va_pos)
            X_te, y_te = expand(X_sub, te_pos)

            bs = max(1, min(32, len(X_tr)))
            # BatchNorm1d cannot normalise a single-sample batch in training
            # mode, so drop the trailing batch when it would hold one row.
            drop_tail = len(X_tr) > bs and len(X_tr) % bs == 1
            tr_loader = DataLoader(TourismDataset(X_tr, y_tr), batch_size=bs, shuffle=True,
                                   drop_last=drop_tail)
            va_loader = DataLoader(TourismDataset(X_va, y_va), batch_size=bs)
            te_loader = DataLoader(TourismDataset(X_te, y_te), batch_size=bs)

            model = _build_model(tag, dims, device)
            res = train_eval(model, tr_loader, va_loader, te_loader, device)
            results[tag] = res
        except Exception as e:
            # Best effort: one failing variant must not abort the others.
            print(f"[跳过] {name} - {tag} 发生异常：{e}")

    # Print the comparison table for this subset.
    if results:
        print(f"\n========== {name}（n={n}） ==========")
        print(f"{'模型名称':<30} {'MSE':>12} {'RMSE':>12} {'R2':>10}")
        for tag, m in results.items():
            print(f"{tag:<30} {m['MSE']:12.4f} {m['RMSE']:12.4f} {m['R2']:10.4f}")
    else:
        print(f"[提示] {name} 无可用结果。")
    return results if results else None

def run_compare_by_price_and_top3types_cross(data_path, base_path=r"D:\有效数据汇总"):
    """Compare the model variants on every (product type x price tier) cell.

    The Excel sheet is loaded, rows without extracted features are dropped,
    the five feature matrices and the target matrix are built, and each
    combination of the three most frequent product types with the three
    price tiers is evaluated through ``_eval_subset``.

    Args:
        data_path: Path of the Excel file passed to ``pd.read_excel``.
        base_path: Root directory handed to ``extract_all_features``.

    Returns:
        Dict mapping ``(product_type, price_tier_name)`` to the result dict
        of that cell; cells that were skipped are absent.
    """
    print("[读取] Excel 数据...")
    df = pd.read_excel(data_path)

    lxlm_mat, vis_mat, valid_idx = extract_all_features(df, base_path)
    dfc = df.iloc[valid_idx].reset_index(drop=True)

    print("[准备] Baseline / Baseline+百度 / 目标...")
    X0 = prepare_baseline_only(dfc)
    X1 = prepare_baseline_plus_baidu(dfc)
    X2 = np.concatenate([X1, lxlm_mat], axis=1)
    X3 = np.concatenate([X1, lxlm_mat, vis_mat], axis=1)
    X4 = np.concatenate([X1, vis_mat], axis=1)
    y5 = prepare_targets(dfc)  # [n, 5]

    dims = {
        'basic_dim_x0': X0.shape[1],
        'basic_dim_x1': X1.shape[1],
        'layoutxlm_dim': lxlm_mat.shape[1],
        'visual_dim': vis_mat.shape[1],
    }

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Price tiers; rows whose price cannot be parsed fall into no tier.
    price = pd.to_numeric(dfc['价格'], errors='coerce')
    masks_price = {
        '低价(<1340)':        (price < 1340).values,
        '中价(1340-3189)':    ((price >= 1340) & (price <= 3189)).values,
        '高价(>3189)':        (price > 3189).values,
    }

    # Fill missing values BEFORE casting to str: casting first turns them
    # into the literal text 'nan', which fillna can no longer see and which
    # could then be ranked as a real product type.
    type_series = dfc['类型'].fillna('').astype(str)
    skip_types = {'游学', '半自由行', '半自助游'}
    is_candidate = (type_series != '') & ~type_series.isin(skip_types)
    vc = type_series[is_candidate].value_counts()
    selected_types = list(vc.index[:3])

    if len(selected_types) < 3:
        print(f"[提示] 可用产品类型不足 3 个，实际：{selected_types}")

    print(f"\n[信息] 选取的产品类型 Top3：{selected_types}")

    results_cross = {}
    for t in selected_types:
        t_mask = (type_series == t).values
        for price_name, p_mask in masks_price.items():
            name = f"类型：{t} × 价格：{price_name}"
            mask = t_mask & p_mask
            r = _eval_subset(name, mask, X0, X1, X2, X3, X4, y5, dims, device)
            if r is not None:
                results_cross[(t, price_name)] = r

    return results_cross

# ------------------------ Entry point ------------------------

if __name__ == '__main__':
    # Both paths are empty placeholders and must be filled in before running:
    # DATA_PATH is the Excel file handed to pd.read_excel, BASE_PATH is the
    # root directory handed to extract_all_features.
    DATA_PATH = r""
    BASE_PATH = r""
    run_compare_by_price_and_top3types_cross(DATA_PATH, BASE_PATH)
