In [40]:
import pandas as pd

In [41]:
#Load data
# Read the four input CSVs (two GEM tables, two leaf DEG tables) from the
# current working directory, in the same order as the names on the left.
GEM_X, GEM_Y, DEG_X, DEG_Y = (
    pd.read_csv(csv_path)
    for csv_path in (
        'GEM_X.csv',
        'GEM_Y.csv',
        'Leaf_DEGs_VarX_T1.csv',
        'Leaf_DEGs_VarY_T1.csv',
    )
)

In [42]:
#Clean function
def clean_dataframe(df):
    """Return a cleaned copy of ``df`` with trailing whitespace removed.

    Trailing whitespace is stripped from every column header and, when a
    ``gene_name`` column exists, from each of its values (which are converted
    to strings first).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is NOT modified; a copy is cleaned and returned, so
        re-running a cell never changes a frame an earlier cell produced.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.rstrip()
    if 'gene_name' in cleaned.columns:
        names = cleaned['gene_name']
        # astype(str) alone would turn missing names into the literal text
        # 'nan'; .where() puts a missing value back wherever one was present.
        cleaned['gene_name'] = names.astype(str).str.rstrip().where(names.notna())
    return cleaned

In [43]:
#Apply clean
# Rebind both expression tables to their cleaned versions in one step.
GEM_X, GEM_Y = map(clean_dataframe, (GEM_X, GEM_Y))

In [44]:
#Process Function
def process_variety(GEM_df, DEG_df, var_name, control_code, treat_code):
    """Write the two per-variety CSV exports built from a GEM table and a DEG table.

    Two files are written to the current working directory:

    * ``all_{var_name}_TwoTimePoints.csv`` - ``gene_name`` plus every column of
      ``GEM_df`` whose name contains ``control_code`` or ``treat_code``.
    * ``Leaf_DEGs_{var_name}.csv`` - the DEG annotation columns joined (inner
      join on ``gene_name``) to those same columns, so genes present in only
      one of the two tables are dropped.

    Parameters
    ----------
    GEM_df : pandas.DataFrame
        Table holding a ``gene_name`` column and the sample columns. It is used
        as passed in, so its headers are expected to be cleaned already.
    DEG_df : pandas.DataFrame
        Table holding a ``gene`` or ``gene_name`` column plus ``log2FoldChange``,
        ``padj``, ``Athaliana_geneID`` and ``Gene_Function``. Not modified.
    var_name : str
        Label inserted into both output file names.
    control_code, treat_code : str
        Substrings used to pick the control and treatment columns of ``GEM_df``.

    Raises
    ------
    KeyError
        If ``GEM_df`` lacks ``gene_name`` or ``DEG_df`` lacks a required column.
    ValueError
        If either code matches no column of ``GEM_df``.
    """
    DEG_info_cols = ['gene_name', 'log2FoldChange', 'padj', 'Athaliana_geneID', 'Gene_Function']

    #Clean DEG file
    # Headers are stripped BEFORE the rename check so that a 'gene ' header is
    # still recognised; the copy keeps the caller's DEG table untouched.
    DEG_df = clean_dataframe(DEG_df.copy())

    #Standardise 'gene' column name
    # Skipped when 'gene_name' already exists, which would otherwise yield two
    # columns with the same label.
    if 'gene' in DEG_df.columns and 'gene_name' not in DEG_df.columns:
        # Second cleaning pass: gene_name exists only now, so its values are
        # stripped here.
        DEG_df = clean_dataframe(DEG_df.rename(columns={'gene': 'gene_name'}))

    # Validate everything up front so no partial output is written on bad input.
    if 'gene_name' not in GEM_df.columns:
        raise KeyError("GEM_df has no 'gene_name' column")
    missing_cols = [col for col in DEG_info_cols if col not in DEG_df.columns]
    if missing_cols:
        raise KeyError(f"DEG_df is missing required columns: {missing_cols}")

    #Find correct columns
    # The key column is excluded so a code that happens to be a substring of
    # 'gene_name' cannot select it as a sample column.
    sample_candidates = [col for col in GEM_df.columns if col != 'gene_name']
    ctrl_cols = sorted(col for col in sample_candidates if control_code in col)
    treat_cols = sorted(col for col in sample_candidates if treat_code in col)
    if not ctrl_cols:
        raise ValueError(f"No GEM_df column contains control code {control_code!r}")
    if not treat_cols:
        raise ValueError(f"No GEM_df column contains treatment code {treat_code!r}")

    #Create TwoTimePoints
    wanted_cols = ['gene_name'] + ctrl_cols + treat_cols
    subset_df = GEM_df[wanted_cols]
    subset_df.to_csv(f'all_{var_name}_TwoTimePoints.csv', index=False)

    #Create Leaf_DEGs
    # Only the annotation columns take part in the merge: any other column the
    # DEG table shares with the subset would be renamed with _x/_y suffixes and
    # break the column selection below.
    merged_df = pd.merge(DEG_df[DEG_info_cols], subset_df, on='gene_name')

    final_cols = DEG_info_cols + ctrl_cols + treat_cols

    merged_df[final_cols].to_csv(f'Leaf_DEGs_{var_name}.csv', index=False)

In [46]:
#Run analysis
# One argument tuple per variety, in process_variety's parameter order:
# (GEM_df, DEG_df, var_name, control_code, treat_code).
variety_runs = [
    (GEM_X, DEG_X, 'VarX', 'XC', 'X1'),
    (GEM_Y, DEG_Y, 'VarY', 'YC', 'Y1'),
]
for run_args in variety_runs:
    process_variety(*run_args)

print("Done")

Done
