# Create a test file 'PAAD_Model_TF_Expression_Features_Test' for machine learning

In [1]:
import pandas as pd

# Input locations: the TF/target mapping table and the PAAD expression-level table
tf_target_path = r'D:\project data\M-28\NTU_DATA_CLEANED\TF_Target.csv'
expression_data_path = r'D:\project data\M-28\NTU_DATA_CLEANED\PAAD_TOIL_RSEM_TPM_Levels.csv'

# Parse both CSV files into DataFrames (same order as the paths above)
tf_target_df, expression_df = (
    pd.read_csv(csv_path) for csv_path in (tf_target_path, expression_data_path)
)

# Preview the leading rows of each table; this tuple is the cell's displayed output
(tf_target_df.head(), expression_df.head())


(  HGNC_Symbol    Gene_ID TF_Symbol  TF_ID
 0         A2M          2    ARID3A   1820
 1        AAA1  100329167    ARID3A   1820
 2        AAAS       8086    ARID3A   1820
 3        AACS      65985    ARID3A   1820
 4       AADAC         13    ARID3A   1820,
         Ensembl_ID HGNC_Symbol  TCGA_HZ_7922  TCGA_Q3_A5QY  TCGA_FB_A545  \
 0  ENSG00000000003      TSPAN6        5.3615        3.1311        4.2533   
 1  ENSG00000000005        TNMD       -5.5735        1.9931       -9.9658   
 2  ENSG00000000419        DPM1        5.8020        4.6770        5.4627   
 3  ENSG00000000457       SCYL3        3.3577        1.8444        2.4544   
 4  ENSG00000000460       FIRRM        1.8762        1.0711        2.1313   
 
    TCGA_RL_AAAS  TCGA_F2_7276  TCGA_FB_A7DR  TCGA_YB_A89D  TCGA_IB_7889  ...  \
 0        4.7121        4.9814        4.0109        4.2389        4.4463  ...   
 1       -2.6349       -2.2447       -3.3076       -3.6259       -4.6082  ...   
 2        4.8167        5.2969    

In [None]:
# Build the TF feature table: every row of tf_target_df is left-joined to
# expression_df on HGNC_Symbol, then reduced to TF_Symbol plus the expression
# columns. Rows are processed in batches and the combined result is saved to CSV.

# Get the expression columns (everything except the identification columns).
# Index.difference returns the remaining labels sorted, so the sample columns in
# the output appear in sorted order rather than in file order.
expression_columns = expression_df.columns.difference(['Ensembl_ID', 'HGNC_Symbol'])
feature_columns = ['TF_Symbol'] + list(expression_columns)

# Only the join key and TF_Symbol from the TF table reach the output, so narrow
# the table once here instead of carrying unused columns through every merge.
tf_key_df = tf_target_df[['HGNC_Symbol', 'TF_Symbol']]

# List that collects the result of each batch
result_batches = []

# Set batch size (number of tf_target_df rows merged per iteration)
batch_size = 10000
total_rows = tf_target_df.shape[0]

# Batch processing
for start in range(0, total_rows, batch_size):
    end = min(start + batch_size, total_rows)
    print(f"process from {start} to {end} row...")

    # Current batch
    batch_df = tf_key_df.iloc[start:end]

    # Merge expression data (left join keeps every TF/target row, matched or not)
    batch_merged = pd.merge(batch_df, expression_df, on="HGNC_Symbol", how="left")

    # Extract TF_Symbol and the expression values
    batch_final = batch_merged[feature_columns]

    # Add to the results list
    result_batches.append(batch_final)

# Merge all batches. pd.concat raises ValueError on an empty list, which happens
# when tf_target_df has no rows; fall back to an empty frame with the same
# columns so a header-only CSV is still written.
if result_batches:
    final_df = pd.concat(result_batches, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=feature_columns)

# Save as a CSV file
output_path = r'D:\project data\M-28\NTU_DATA_CLEANED\PAAD_Model_TF_Expression_Features_Test.csv'
final_df.to_csv(output_path, index=False)

print("All batches are completed and saved as：", output_path)

In [None]:
# Single-pass variant: left-join the TF/target table onto the expression table,
# matching rows by HGNC_Symbol
merged_df = tf_target_df.merge(expression_df, how="left", on="HGNC_Symbol")

# Reduce to TF_Symbol followed by the expression columns (identifier columns excluded)
expression_columns = expression_df.columns.difference(['Ensembl_ID', 'HGNC_Symbol'])
final_df = merged_df.loc[:, ['TF_Symbol', *expression_columns]]

# Write the result to CSV without the row index
output_path = r'D:\project data\M-28\NTU_DATA_CLEANED\PAAD_Model_TF_Expression_Features_Test.csv'
final_df.to_csv(output_path, index=False)