### Get gene symbols to run the DAVID pipeline

#### tissue specific

In [3]:
import pandas as pd

# Export one gene-symbol list per tissue for the selected species.
# The tissue set and the top-percent cutoff `k` used in the prediction file
# names both depend on the species.
species = "mouse"

if species == "human":
    tissues = ['heart', 'lung', 'stomach']
    k = 30
else:
    tissues = ['heart', 'lung', 'brain']
    k = 50

# Annotation table; only its 'lncRNA_id' and 'gene_name' columns are used below.
all_ess = pd.read_csv(f"../../results/{species}/{species}_essential_genes_union.csv")

for t in tissues:
    # Predicted essential lncRNAs for this tissue. The file is treated as a
    # single column of IDs, which is renamed so it can be matched to all_ess.
    prediction_ess = pd.read_csv(f"../ess_number/filtered/{species}/BC_top{k}pct_{species}_{t}_esslnc.csv")
    prediction_ess.columns = ['lncRNA_id']
    p_ess = all_ess[all_ess['lncRNA_id'].isin(prediction_ess['lncRNA_id'])]

    # Drop rows without a usable gene name: '-' (presumably the "no symbol"
    # placeholder -- confirm against the annotation file) and missing values,
    # which would otherwise end up as blank lines in the output.
    p_ess = p_ess[p_ess['gene_name'].notna() & (p_ess['gene_name'] != '-')]

    # One symbol per row: split ';'-separated entries, trim whitespace and
    # discard empty fragments left by trailing or doubled separators.
    # astype(str) keeps the .str accessor usable when the selection is empty.
    ess_name = p_ess['gene_name'].astype(str).str.split(';').explode().str.strip()
    ess_name = ess_name[ess_name != ''].reset_index(drop=True)

    # NOTE: duplicates are not removed, despite the wording of the message below.
    ess_name_df = ess_name.to_frame(name='gene_name')

    # Plain text, one symbol per line, no header and no index column.
    ess_name_df.to_csv(f"ess_name_{species}_{t}.txt", index=False, header=False)

    print("Union of essential gene names saved successfully.")


Union of essential gene names saved successfully.
Union of essential gene names saved successfully.
Union of essential gene names saved successfully.


#### intersection

In [8]:
import pandas as pd

# Export the gene symbols of the lncRNAs listed in the species'
# "common essential genes" file.
species = "human"

# Annotation table; only its 'lncRNA_id' and 'gene_name' columns are used below.
all_ess = pd.read_csv(f"../../results/{species}/{species}_essential_genes_union.csv")

# The prediction file is treated as a single column of IDs, which is renamed
# so it can be matched against all_ess.
prediction_ess = pd.read_csv(f"../ess_number/filtered/{species}/common_essential_genes_{species}.csv")
prediction_ess.columns = ['lncRNA_id']
p_ess = all_ess[all_ess['lncRNA_id'].isin(prediction_ess['lncRNA_id'])]

# Drop rows without a usable gene name: '-' (presumably the "no symbol"
# placeholder -- confirm against the annotation file) and missing values,
# which would otherwise end up as blank lines in the output.
p_ess = p_ess[p_ess['gene_name'].notna() & (p_ess['gene_name'] != '-')]

# One symbol per row: split ';'-separated entries, trim whitespace and discard
# empty fragments left by trailing or doubled separators.
# astype(str) keeps the .str accessor usable when the selection is empty.
ess_name = p_ess['gene_name'].astype(str).str.split(';').explode().str.strip()
ess_name = ess_name[ess_name != ''].reset_index(drop=True)

# NOTE: duplicates are not removed, despite the wording of the message below.
ess_name_df = ess_name.to_frame(name='gene_name')

# Plain text, one symbol per line, no header and no index column.
ess_name_df.to_csv(f"ess_name_{species}_intersection.txt", index=False, header=False)

print("Union of essential gene names saved successfully.")


Union of essential gene names saved successfully.


In [None]:
import pandas as pd

# Export the gene symbols of every lncRNA in each species' LPI table,
# one text file per species.
for species in ['human','mouse']:
    # Full lncRNA table for this species; only 'gene_name' is used below.
    all_lnc = pd.read_csv(f"../../data/LPI/{species}/lncRNA.csv")

    # Drop rows without a usable gene name: '-' (presumably the "no symbol"
    # placeholder -- confirm against the data file) and missing values,
    # which would otherwise end up as blank lines in the output.
    all_lnc = all_lnc[all_lnc['gene_name'].notna() & (all_lnc['gene_name'] != '-')]

    # One symbol per row: split ';'-separated entries, trim whitespace and
    # discard empty fragments left by trailing or doubled separators.
    # astype(str) keeps the .str accessor usable when the table is empty.
    name = all_lnc['gene_name'].astype(str).str.split(';').explode().str.strip()
    name = name[name != ''].reset_index(drop=True)

    # Duplicates are kept; the list is written in table order.
    name_df = name.to_frame(name='gene_name')

    # Plain text, one symbol per line, no header and no index column.
    name_df.to_csv(f"name_{species}.txt", index=False, header=False)

    print("Gene names saved successfully.")


Union of gene names saved successfully.
Union of gene names saved successfully.


### Statistical results of GO terms (BP, CC, MF)

In [22]:
import pandas as pd

# Select the species; the third tissue is brain for mouse and stomach otherwise.
species = 'human'
item = ['heart', 'lung', 'brain', 'inter'] if species == "mouse" else ['heart', 'lung', 'stomach', 'inter']

# Categories are processed (and stacked in the output) in this order.
go_categories = ('GOTERM_BP_DIRECT', 'GOTERM_CC_DIRECT', 'GOTERM_MF_DIRECT')

for i in item:
    # Load the chart CSV for this species/item and keep only the columns used below.
    df = pd.read_csv(f'./chart/{species}_{i}.csv')
    df = df[['Category', 'Term', 'P-Value', 'Fold Enrichment', 'FDR']]

    # Retain terms that are both significant (FDR < 0.05) and enriched (fold > 1).
    is_significant = df['FDR'] < 0.05
    is_enriched = df['Fold Enrichment'] > 1
    significant_df = df[is_significant & is_enriched]

    # Within each category, take the five rows with the smallest P-Value.
    top_terms = [
        significant_df[significant_df['Category'] == category].sort_values('P-Value').head(5)
        for category in go_categories
    ]

    # Stack the per-category tables with a fresh index and write the summary.
    df_go = pd.concat(top_terms, ignore_index=True)
    df_go.to_csv(f"./enrich_go/{species}_{i}.csv", index=False)



In [24]:
import pandas as pd
import os

# 目录下的八个CSV文件
# The eight per-species summary CSVs to merge, human first and then mouse.
csv_files = [
    f"./enrich_go/{organism}_{label}.csv"
    for organism, labels in (
        ("human", ("heart", "lung", "stomach", "inter")),
        ("mouse", ("heart", "lung", "brain", "inter")),
    )
    for label in labels
]

# Path of the Excel workbook to create.
output_excel_file = "go_data.xlsx"

# Write every CSV into its own sheet of a single workbook.
with pd.ExcelWriter(output_excel_file, engine='xlsxwriter') as writer:
    for csv_file in csv_files:
        # The sheet is named after the file, without directory or extension.
        base_name = os.path.basename(csv_file)
        sheet_name = os.path.splitext(base_name)[0]

        df = pd.read_csv(csv_file)
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"✅ Merged {len(csv_files)} CSV files into {output_excel_file}")


✅ Merged 8 CSV files into go_data.xlsx
