# LUAD_Model_TF_Expression_Features.csv

## 处理表达

In [None]:
# Author: Shuojingrui He
# Description: Remove the Ensembl_ID column from the LUAD expression matrix.
#
# The cleaned table is written back over the input file, so this cell has to be
# safe to run more than once. The column is therefore dropped by NAME, and only
# when it is still present; dropping "the first column" by position would delete
# HGNC_Symbol (and then sample columns) on every subsequent run.

import pandas as pd

base_path = "/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/"

# Load the CSV file
prad_file = base_path + "LUAD_TOIL_RSEM_TPM_Levels_After_DEG_DMA.csv"
prad_df = pd.read_csv(prad_file)

# Drop Ensembl_ID and rewrite the file only if the column is actually there;
# on a file that was already cleaned this is a no-op and nothing is rewritten.
if "Ensembl_ID" in prad_df.columns:
    prad_df = prad_df.drop(columns=["Ensembl_ID"])
    prad_df.to_csv(prad_file, index=False)

# Print the first five rows to check the result
print(prad_df.head())


  HGNC_Symbol  TCGA_44_6778  TCGA_05_4420  TCGA_64_1679  TCGA_49_6744  \
0        TNMD       -9.9658       -5.5735       -4.2934       -0.8863   
1     SLC13A2       -2.3147       -9.9658       -4.2934       -2.3884   
2        FMO1       -0.9686        4.3723        2.8974        3.4277   
3         DCN        8.5880        7.9856        9.2946        8.8060   
4       SNAI2        2.1280        2.2663        3.5911        3.2080   

   TCGA_55_6982  TCGA_NJ_A4YI  TCGA_73_4666  TCGA_MN_A4N1  TCGA_55_7911  ...  \
0       -9.9658        2.2693       -3.1714       -3.3076       -9.9658  ...   
1       -4.2934       -9.9658       -5.0116       -0.9686       -5.0116  ...   
2       -0.4921       -0.2159        1.1897        0.0300       -2.0529  ...   
3        9.0189        7.7839        7.4630        6.2599        6.2728  ...   
4        2.9413        2.1345        1.1706        0.7493        1.6045  ...   

   TCGA_05_4424  TCGA_38_4632  TCGA_49_6761  TCGA_05_4432  TCGA_49_AAQV  \
0    

## 处理tf

In [None]:
# Author: Shuojingrui He
# Description: Remove the leftover secondary header row
#              (GeneSym,GeneID,GeneSym,GeneID) from TF_Target.csv.
#
# The file is overwritten in place, so the row is removed only when the first
# data row really is that secondary header. Dropping row 0 unconditionally
# would delete one genuine TF-target record every time the cell is re-run.

import pandas as pd

base_path = "/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/"

# Load the TF_Target CSV file (everything as text so IDs are not re-parsed)
tf_target_file = base_path + "TF_Target.csv"
tf_target_df = pd.read_csv(tf_target_file, dtype=str)

# Values of the stray header row, as given in the description above
secondary_header = ["GeneSym", "GeneID", "GeneSym", "GeneID"]

# Compare the first data row (whitespace-trimmed) against the stray header
has_secondary_header = (
    not tf_target_df.empty
    and tf_target_df.iloc[0].astype(str).str.strip().tolist() == secondary_header
)

if has_secondary_header:
    # Drop the stray row, renumber the index, and persist the cleaned table
    tf_target_df = tf_target_df.iloc[1:, :].reset_index(drop=True)
    tf_target_df.to_csv(tf_target_file, index=False)

# Print the first five rows to check the result
print(tf_target_df.head())

  HGNC_Symbol    Gene_ID TF_Symbol TF_ID
0    A1BG-AS1     503538    ARID3A  1820
1        A1CF      29974    ARID3A  1820
2         A2M          2    ARID3A  1820
3        AAA1  100329167    ARID3A  1820
4        AAAS       8086    ARID3A  1820


## 得到 LUAD_Model_TF_Expression_Features.csv

In [None]:
# Author: Shuojingrui He
# Description: Build LUAD_Model_TF_Expression_Features.csv. Every expression
#              row is inner-joined to TF_Target.csv on HGNC_Symbol, and the
#              HGNC_Symbol label is then replaced by the matching TF_Symbol.
#
# NOTE(review): each output row keeps the expression values of the HGNC_Symbol
# gene and is merely relabelled with a TF_Symbol paired with it, so one gene's
# row is repeated once per paired TF. Confirm this is the intended feature.

import pandas as pd

base_path = "/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/"

# Expression matrix, read with dtype=str so values are kept as text
prad_file = f"{base_path}LUAD_TOIL_RSEM_TPM_Levels_After_DEG_DMA.csv"
prad_df = pd.read_csv(prad_file, dtype=str)

# TF-target table, also read as text
tf_target_file = f"{base_path}TF_Target.csv"
tf_target_df = pd.read_csv(tf_target_file, dtype=str)

# Only the two symbol columns take part in the join
symbol_pairs = tf_target_df[['HGNC_Symbol', 'TF_Symbol']]

# Inner join on HGNC_Symbol, then discard the join key
merged_df = (
    prad_df
    .merge(symbol_pairs, on='HGNC_Symbol', how='inner')
    .drop(columns='HGNC_Symbol')
)

# Move TF_Symbol to the front; all other columns keep their relative order
cols = ['TF_Symbol'] + [name for name in merged_df.columns if name != 'TF_Symbol']
merged_df = merged_df[cols]

# Write the feature table to its own file
output_file = f"{base_path}LUAD_Model_TF_Expression_Features.csv"
merged_df.to_csv(output_file, index=False)

# Show the first five rows as a quick check
print(merged_df.head())

  TF_Symbol TCGA_44_6778 TCGA_05_4420 TCGA_64_1679 TCGA_49_6744 TCGA_55_6982  \
0      CTCF      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   
1      EZH2      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   
2     H2AFZ      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   
3    POLR2A      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   
4   SUPT20H      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   

  TCGA_NJ_A4YI TCGA_73_4666 TCGA_MN_A4N1 TCGA_55_7911  ... TCGA_05_4424  \
0       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   
1       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   
2       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   
3       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   
4       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   

  TCGA_38_4632 TCGA_49_6761 TCGA_05_4432 TCGA_49_AAQV TCGA_50_5072  

# LUAD_Model_Gene_Expression_Features.csv

In [None]:
# Author: Shuojingrui He
# Description: Re-save the LUAD expression matrix under the name
#              LUAD_Model_Gene_Expression_Features.csv. The table is written
#              out as loaded; no column is added or modified here.

import pandas as pd

base_path = "/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/"

# Source and destination paths
original_file = f"{base_path}LUAD_TOIL_RSEM_TPM_Levels_After_DEG_DMA.csv"
new_file = f"{base_path}LUAD_Model_Gene_Expression_Features.csv"

# Read the expression table and write it to the new file without the index
prad_df = pd.read_csv(original_file)
prad_df.to_csv(new_file, index=False)

# Show the first five rows as a quick check
print(prad_df.head())


  HGNC_Symbol  TCGA_44_6778  TCGA_05_4420  TCGA_64_1679  TCGA_49_6744  \
0        TNMD       -9.9658       -5.5735       -4.2934       -0.8863   
1     SLC13A2       -2.3147       -9.9658       -4.2934       -2.3884   
2        FMO1       -0.9686        4.3723        2.8974        3.4277   
3         DCN        8.5880        7.9856        9.2946        8.8060   
4       SNAI2        2.1280        2.2663        3.5911        3.2080   

   TCGA_55_6982  TCGA_NJ_A4YI  TCGA_73_4666  TCGA_MN_A4N1  TCGA_55_7911  ...  \
0       -9.9658        2.2693       -3.1714       -3.3076       -9.9658  ...   
1       -4.2934       -9.9658       -5.0116       -0.9686       -5.0116  ...   
2       -0.4921       -0.2159        1.1897        0.0300       -2.0529  ...   
3        9.0189        7.7839        7.4630        6.2599        6.2728  ...   
4        2.9413        2.1345        1.1706        0.7493        1.6045  ...   

   TCGA_05_4424  TCGA_38_4632  TCGA_49_6761  TCGA_05_4432  TCGA_49_AAQV  \
0    

# LUAD_Model_Methylation_Features.csv

In [None]:
# Author: Shuojingrui He
# Description: Re-save the LUAD methylation matrix under the name
#              LUAD_Model_Methylation_Features.csv. The table is written out
#              as loaded; no column is added or modified here.

import pandas as pd

base_path = "/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/"

# Source and destination paths
original_file = f"{base_path}LUAD_Methylation_Levels_After_DEG_DMA.csv"
new_file = f"{base_path}LUAD_Model_Methylation_Features.csv"

# Read the methylation table and write it to the new file without the index
methylation_df = pd.read_csv(original_file)
methylation_df.to_csv(new_file, index=False)

# Show the first five rows as a quick check
print(methylation_df.head())

     Probe_ID  TCGA-44-4112-01  TCGA-NJ-A4YP-01  TCGA-86-8278-01  \
0  cg02389084          0.61600           0.6787           0.5441   
1  cg21462633          0.64380           0.6613           0.5885   
2  cg02585906          0.54915           0.3414           0.0751   
3  cg05848579          0.79245           0.6754           0.8877   
4  cg05875421          0.51765           0.4898           0.2296   

   TCGA-62-A470-01  TCGA-44-6778-01  TCGA-49-AARQ-01  TCGA-97-A4M1-01  \
0           0.4044           0.5103           0.3348           0.4228   
1           0.4676           0.5715           0.5335           0.5880   
2           0.5727           0.3867           0.6304           0.3524   
3           0.3636           0.4412           0.4730           0.8029   
4           0.7417           0.5536           0.6592           0.6443   

   TCGA-55-6975-01  TCGA-99-8033-01  ...  TCGA-78-7539-01  TCGA-62-A46S-01  \
0           0.6210           0.6803  ...           0.3723           0.4079

In [None]:
# Author: Shuojingrui He
# Description: Add a 'tf' prefix to TF_Symbol in
#              LUAD_Model_TF_Expression_Features.csv and save as a new CSV.

import pandas as pd

base_path = "/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/"

# Load the LUAD TF expression feature table (as text, values untouched)
input_file = base_path + "LUAD_Model_TF_Expression_Features.csv"
df = pd.read_csv(input_file, dtype=str)

# Add 'tf' prefix to TF_Symbol column
df['TF_Symbol'] = 'tf' + df['TF_Symbol']

# Save under a LUAD_ name. The input is LUAD data, so writing it to a PRAD_
# file name would mislabel the result and could overwrite a PRAD output.
output_file = base_path + "LUAD_Model_TF_Expression_Features_Withtf.csv"
df.to_csv(output_file, index=False)

# Print the first five rows to check the result
print(df.head())

   TF_Symbol TCGA_44_6778 TCGA_05_4420 TCGA_64_1679 TCGA_49_6744 TCGA_55_6982  \
0     tfCTCF      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   
1     tfEZH2      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   
2    tfH2AFZ      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   
3   tfPOLR2A      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   
4  tfSUPT20H      -9.9658      -5.5735      -4.2934      -0.8863      -9.9658   

  TCGA_NJ_A4YI TCGA_73_4666 TCGA_MN_A4N1 TCGA_55_7911  ... TCGA_05_4424  \
0       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   
1       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   
2       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   
3       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   
4       2.2693      -3.1714      -3.3076      -9.9658  ...      -9.9658   

  TCGA_38_4632 TCGA_49_6761 TCGA_05_4432 TCGA_49_AAQV TCGA_50_

In [None]:
import pandas as pd

# Map every TF-target pair in TF_Target.csv onto the expression row of its
# HGNC_Symbol and save TF_Symbol plus the sample columns as a test CSV.

# Input files
tf_target_path = '/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/TF_Target.csv'
expression_data_path = '/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/LUAD_TOIL_RSEM_TPM_Levels.csv'

# Read CSV files
tf_target_df = pd.read_csv(tf_target_path)
expression_df = pd.read_csv(expression_data_path)

# Sample columns = every expression column except the two gene identifiers.
# Index.difference returns the names sorted, so the output columns come out in
# sorted order rather than in file order.
expression_columns = expression_df.columns.difference(['Ensembl_ID', 'HGNC_Symbol'])

# Left-join on HGNC_Symbol, keeping every TF-target pair (pairs without an
# expression row get NaN). Only the columns that reach the output are passed
# into the merge, so the unused ID columns are not replicated across the
# (potentially very large) joined table.
merged_df = pd.merge(
    tf_target_df[['HGNC_Symbol', 'TF_Symbol']],
    expression_df[['HGNC_Symbol'] + list(expression_columns)],
    on="HGNC_Symbol",
    how="left",
)

# Keep only TF_Symbol and the expression data columns
final_df = merged_df[['TF_Symbol'] + list(expression_columns)]

# Save the results as a CSV file
output_path = '/Users/yangqingdi/Desktop/M-28/NTU_DATA_CLEANED/LUAD_Model_TF_Expression_Features_Test.csv'
final_df.to_csv(output_path, index=False)

KeyboardInterrupt: 