In [1]:
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, T5EncoderModel
import torch
import torch.nn.functional as F


#Model and Tokenizer Paths:
#Loading the Dataset
df = pd.read_csv('/root/ACE/ACE.csv')
sequences = df['Sequence'].tolist()
sequences = [" ".join(seq) for seq in sequences]
label = df['label'].tolist()
label = np.array(label).reshape(-1, 1)
#Initializing the Tokenizer and Model
model = T5EncoderModel.from_pretrained("/root/prot_t5_xl_uniref50")
tokenizer = T5Tokenizer.from_pretrained('/root/prot_t5_xl_uniref50', do_lower_case=False)
#Tokenizing Sequences
max_length = 30
encoded_inputs = tokenizer(sequences, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
#Getting Sequence Embeddings
with torch.no_grad():
    outputs = model(**encoded_inputs)
sequence_output = outputs.last_hidden_state
batch_size, seq_len, features = sequence_output.shape
sequence_output_reshaped = sequence_output.transpose(1, 2).reshape(batch_size * features, 1, seq_len)
kernel_size = 30
#Reshaping Sequence Output
pooling_output = F.avg_pool1d(sequence_output_reshaped, kernel_size=kernel_size, stride=kernel_size)
pooled_output_reshaped = pooling_output.reshape(batch_size, features, -1).mean(dim=2)
data = np.hstack((label, pooled_output_reshaped.numpy()))
features_df = pd.DataFrame(data, columns=['label'] + [f'Feature_{i+1}' for i in range(pooled_output_reshaped.shape[1])])
#Saving the Data
output_file = '/root/ACE/data.csv'
features_df.to_csv(output_file, index=False)
print('finish')

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


finish


In [2]:
df = pd.read_csv('/root/ACE/data.csv')
df

Unnamed: 0,label,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_1015,Feature_1016,Feature_1017,Feature_1018,Feature_1019,Feature_1020,Feature_1021,Feature_1022,Feature_1023,Feature_1024
0,0.0,0.097270,-0.160380,-0.306435,0.115091,-0.175842,-0.142548,0.056739,-0.317077,0.067125,...,-0.033352,-0.077223,0.154114,-0.137103,0.185030,-0.118853,-0.022853,0.123305,-0.198724,0.023007
1,0.0,0.024480,-0.171068,-0.315933,0.110920,-0.139448,-0.054369,-0.004602,-0.140373,0.093441,...,-0.108888,-0.079401,0.086221,-0.037480,0.070703,-0.141716,0.043071,0.097196,-0.202709,-0.024630
2,0.0,0.066163,-0.147163,-0.201867,0.142262,-0.110307,-0.050303,-0.047773,-0.111885,0.040851,...,-0.020461,-0.050617,0.132370,-0.068138,0.041430,-0.067821,-0.004901,0.062818,-0.135880,0.051415
3,0.0,-0.001688,-0.120292,-0.180780,0.046487,0.072402,-0.126382,-0.048107,-0.161264,-0.002549,...,-0.014289,0.073087,0.073622,-0.076161,-0.053432,-0.163096,-0.032108,0.240706,-0.113759,0.054313
4,0.0,-0.071286,-0.159643,-0.150938,0.017294,-0.053570,-0.017484,0.007866,-0.182715,-0.067593,...,-0.079360,0.057019,0.069805,0.012587,0.011929,-0.159104,0.070402,0.271110,-0.180106,-0.067358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,1.0,-0.054012,-0.100391,-0.146257,0.078696,-0.099834,0.018065,-0.056856,-0.150547,-0.024929,...,-0.091248,-0.041773,0.000866,-0.020173,0.080982,-0.098517,-0.006587,0.097364,-0.194060,-0.011374
1016,1.0,-0.071987,-0.042983,-0.274542,0.137997,-0.128306,-0.003834,-0.083936,-0.023180,0.042224,...,-0.033112,-0.011224,0.137345,0.008093,0.074764,-0.087039,0.014975,0.138379,-0.229512,0.049529
1017,1.0,0.042786,-0.086126,-0.201720,0.141249,-0.112397,-0.051867,-0.063820,-0.020203,-0.024062,...,-0.001699,0.012831,0.109045,-0.017109,0.090615,-0.114299,0.007042,0.085257,-0.219106,0.127791
1018,1.0,-0.020559,-0.124248,-0.322803,0.050772,-0.112374,-0.006810,-0.004079,-0.124968,-0.008637,...,-0.051967,0.051020,0.148870,-0.060002,0.195638,-0.254247,0.032480,0.056365,-0.243037,0.057617


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split


# 使用分层抽样按照 'label' 列进行数据集的训练测试集划分
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# 保存训练集和测试集为CSV文件
train_df.to_csv("/root/ACE/train_data.csv", index=False)
test_df.to_csv("/root/ACE/test_data.csv", index=False)

print("Train and test datasets have been saved as train_data.csv and test_data.csv respectively.")


Train and test datasets have been saved as train_data.csv and test_data.csv respectively.
