### Get gene symbols to run the DAVID pipeline

In [8]:
import pandas as pd

for species in ['human','mouse']:
    # Load the per-species table of predicted essential genes.
    prediction_ess = pd.read_csv(f"../../results/{species}/{species}_essential_genes_union.csv")

    # Keep only rows that carry a usable symbol. A bare `!= '-'` comparison
    # lets missing values (NaN) through, and those end up as blank lines in
    # the exported list, so they are excluded explicitly here.
    has_symbol = prediction_ess['gene_name'].notna() & (prediction_ess['gene_name'] != '-')
    prediction_ess = prediction_ess[has_symbol]

    # A single cell may hold several ';'-separated symbols: split them and
    # emit one symbol per row, trimming stray whitespace around each one.
    ess_name = (
        prediction_ess['gene_name']
        .str.split(';')
        .explode()
        .dropna()
        .str.strip()
    )

    # Discard empty / placeholder fragments (e.g. from "A;;B" or "A;-") and
    # repeated symbols so the file really is a union; first-seen order is kept.
    ess_name = (
        ess_name[(ess_name != '') & (ess_name != '-')]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # One-column frame of unique symbols.
    ess_name_df = ess_name.to_frame(name='gene_name')

    # Write one symbol per line, without header or index.
    ess_name_df.to_csv(f"ess_name_{species}.txt", index=False, header=False)

    print("Union of essential gene names saved successfully.")


Union of essential gene names saved successfully.
Union of essential gene names saved successfully.


In [9]:
import pandas as pd

for species in ['human','mouse']:
    # Load the per-species lncRNA table.
    all_lnc = pd.read_csv(f"../../data/LPI/{species}/lncRNA.csv")

    # Keep only rows that carry a usable symbol. A bare `!= '-'` comparison
    # lets missing values (NaN) through, and those end up as blank lines in
    # the exported list, so they are excluded explicitly here.
    has_symbol = all_lnc['gene_name'].notna() & (all_lnc['gene_name'] != '-')
    all_lnc = all_lnc[has_symbol]

    # A single cell may hold several ';'-separated symbols: split them and
    # emit one symbol per row, trimming stray whitespace around each one.
    name = (
        all_lnc['gene_name']
        .str.split(';')
        .explode()
        .dropna()
        .str.strip()
    )

    # Discard empty / placeholder fragments (e.g. from "A;;B" or "A;-") and
    # repeated symbols so the file really is a union; first-seen order is kept.
    name = (
        name[(name != '') & (name != '-')]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # One-column frame of unique symbols.
    name_df = name.to_frame(name='gene_name')

    # Write one symbol per line, without header or index.
    name_df.to_csv(f"name_{species}.txt", index=False, header=False)

    print("Union of gene names saved successfully.")


Union of gene names saved successfully.
Union of gene names saved successfully.


### Statistical results of GO terms (BP, CC, MF)

In [13]:
import pandas as pd

species = 'human'

# Tab-separated enrichment chart for this species (presumably the DAVID
# functional-annotation chart export -- confirm against how the file was made).
file_path = f'chart_{species}_all.txt'

# The three GO categories of interest and the CSV each one is written to.
go_outputs = {
    'GOTERM_BP_DIRECT': f'GOTERM_BP_{species}.csv',
    'GOTERM_CC_DIRECT': f'GOTERM_CC_{species}.csv',
    'GOTERM_MF_DIRECT': f'GOTERM_MF_{species}.csv',
}
go_term = list(go_outputs)

# Load the chart, keep only the reporting columns, and restrict to GO rows.
report_columns = ['Category', 'Term', 'Count', '%', 'PValue',
                  'Fold Enrichment', 'Bonferroni', 'Benjamini', 'FDR']
df = pd.read_csv(file_path, sep='\t')[report_columns]
df = df.loc[df['Category'].isin(go_term)]

# One table per category, ordered by the '%' column, largest first.
sorted_tables = {
    category: df.loc[df['Category'] == category].sort_values(by='%', ascending=False)
    for category in go_term
}
df_bp = sorted_tables['GOTERM_BP_DIRECT']
df_cc = sorted_tables['GOTERM_CC_DIRECT']
df_mf = sorted_tables['GOTERM_MF_DIRECT']

# Write each category to its own CSV (no index column).
for category, out_path in go_outputs.items():
    sorted_tables[category].to_csv(out_path, index=False)
