In [33]:
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.nn import DataParallel
from tqdm import tqdm
import torch.nn as nn
import os
import math
import time
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, FastICA
from sklearn.metrics import r2_score

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        X_item = self.X[index]
        y_item = self.y[index]
        return X_item, y_item


def SNPDataset(dataset_type, reduction=None):
    if dataset_type == 1:
        root_X = 'Dataset1/FileS1/genotypes.txt'
        root_Y = 'Dataset1/FileS1/ebvs.txt'
        X_row = 1
        X_column = 2
        Y_row = 1
        Y_column = 1

    elif dataset_type == 2:
        root_X = 'Dataset2/genotype.csv'
        root_Y = 'Dataset2/mortality_EBV.csv'
        X_row = 1
        X_column = 2
        Y_row = 2
        Y_column = 1

    elif dataset_type == 4:
        root_X = 'Dataset4/simulated.csv'
        root_Y = 'Dataset4/simulated.csv'
        X_row = 1
        X_column = 1
        Y_row = 0
        Y_column = 0


    X = pd.read_csv(root_X, header=None, low_memory=False)
    Y = pd.read_csv(root_Y, header=None, low_memory=False)
    X = X.iloc[X_row:, X_column:]
    Y = Y.iloc[Y_row:, Y_column]
    X = X.reset_index(drop=True)
    Y = Y.reset_index(drop=True)
    df = pd.concat([X, Y], axis=1)
    df = df.sample(frac=1)
    X = df.iloc[:, :-1]
    Y = df.iloc[:, -1]
    num_train = math.floor(X.shape[0]*0.8)

    Xtrain = X.iloc[:num_train]
    Xtrain = Xtrain.fillna(Xtrain.mean()).values
    Ytrain = Y.iloc[:num_train].values
    Xtest = X.iloc[num_train:].values
    Ytest = Y.iloc[num_train:].values

    if dataset_type == 4:
        Ytrain = (Ytrain-Ytrain.mean())/Ytrain.std()
        Ytest = (Ytest - Ytest.mean()) / Ytest.std()

    if reduction == "PCA":
        pca = PCA(n_components = Xtest.shape[0])
        Xtrain, Xtest = pca.fit_transform(Xtrain), pca.fit_transform(Xtest)
    elif reduction == "ICA":
        ica = FastICA(n_components = Xtest.shape[0])
        Xtrain, Xtest = ica.fit_transform(Xtrain), ica.fit_transform(Xtest)
    elif reduction == "TSNE":
        tsne = TSNE(n_components=3)
        Xtrain, Xtest = tsne.fit_transform(Xtrain), tsne.fit_transform(Xtest)
    # elif reduction == "AE":

    Xtrain = Xtrain.astype(np.float32)
    Ytrain = Ytrain.astype(np.float32)
    Xtest = Xtest.astype(np.float32)
    Ytest = Ytest.astype(np.float32)

    return Xtrain,Ytrain,Xtest,Ytest


In [31]:
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.nn import DataParallel
from tqdm import tqdm
import torch.nn as nn
import lightgbm as lgb
import os
import math
import time
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as mse

def run(data_type):
    


    # Dataset(not for Transformer)
    LGBM_mse = []
    LGBM_r_sqr = []


    for i in tqdm(range(5)):
        Xtrain, Ytrain, Xtest, Ytest = SNPDataset(data_type)        
        #LGBM
        # 将数据转换为LightGBM所需的数据格式
        train_data = lgb.Dataset(Xtrain, label=Ytrain)
        
        # 设置模型参数
        params = {
            'boosting_type': 'gbdt',  # 使用梯度提升树算法
            'objective': 'regression',  # 回归任务
            'metric': 'rmse',  # 使用均方根误差评估模型性能
            'num_leaves': 31,  # 每棵树的叶子节点数目
            'learning_rate': 0.05,  # 学习率
            'feature_fraction': 0.9,  # 每次迭代时随机选择特征的比例
            'bagging_fraction': 0.8,  # 每次迭代时随机选择数据的比例
            'bagging_freq': 5,  # bagging的频率
            'verbose': 0  # 控制训练过程中输出的信息
        }
        start_time = time.time()
        # 训练模型
        model = lgb.train(params, train_data, num_boost_round=100)
        end_time = time.time()
        train_time = end_time - start_time
        start_time = time.time()
        # 使用训练好的模型进行预测
        Ypred = model.predict(Xtest)
        end_time = time.time()
        test_time = end_time - start_time
        
        mse_result = mse(Ytest, Ypred)
        r_squared = r2_score(Ytest, Ypred)
        LGBM_mse.append(mse_result)
        LGBM_r_sqr.append(r_squared)
        print(mse_result)
        print(r_squared)
        
        test_log = []
        test_log.append([train_time, test_time, mse_result, r_squared])
        test_log = pd.DataFrame(test_log, columns=['train_time', 'test_time', 'MSE score', 'R-squared'])
        os.makedirs("Dataset" + str(data_type) + "/LGBM", exist_ok=True)
        test_log.to_csv("Dataset" + str(data_type) + "/LGBM/test_log_" + str(i) + ".csv", index=False)
        

    LGBM_mse_mean, LGBM_mse_std = np.mean(LGBM_mse), np.std(LGBM_mse)
    LGBM_r_mean, LGBM_r_std = np.mean(LGBM_r_sqr), np.std(LGBM_r_sqr)
    print("LGBM:")
    print(LGBM_mse_mean, LGBM_mse_std)
    print(LGBM_r_mean, LGBM_r_std)
    print('-' * 100)



    log = {
           'LGBM': [LGBM_mse_mean, LGBM_mse_std, LGBM_r_mean, LGBM_r_std],
           }
    log = pd.DataFrame(log)
    log.to_csv("Dataset" + str(data_type) + "/log.csv")
    


In [34]:
run(4)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


 20%|████████████████▊                                                                   | 1/5 [00:12<00:49, 12.27s/it]

0.8886254572366729
0.11137454834259164
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


 40%|█████████████████████████████████▌                                                  | 2/5 [00:23<00:35, 11.90s/it]

0.8694224832989218
0.13057751945811946
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:35<00:23, 11.79s/it]

0.8615537747482143
0.13844622543965668
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:47<00:11, 11.66s/it]

0.8614692561557966
0.1385307400630693
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:58<00:00, 11.68s/it]

0.8564238399412201
0.143576163451263
LGBM:
0.8674989622761651 0.011352441744539862
0.13250103935094001 0.011352439854487483
----------------------------------------------------------------------------------------------------



