I have a csv that contains these columns:
Sample
associated_gene
Noncyclo_Z_Score

For each unique Sample, I want to do this:
For each antisense gene (gene whose name ends with -AS#, where # is any number, or _AS) that has a Noncyclo_Z_Score of >2, I want to know what is the Noncyclo_Z_Score of the sense gene. If the antisense gene name ends with -AS#, then the sense gene name is everything in front of -. If the antisense gene name ends with _AS, then the sense gene name is in between "novelGene_" and "_AS" (like this novelGene_SenseGeneName_AS)

Write this script in python jupyternotebook

In [1]:
# Import necessary libraries
import pandas as pd

# Load the CSV file
file_path = "/mmfs1/gscratch/stergachislab/yhhc/projects/Iso-seq_public/Cyclo_noncyclo_comparison/Analysis/10.12.24/3.Compare_samples/2.Gene/data_combined_full.csv"  # Replace with your file path
data = pd.read_csv(file_path)

# Initialize an empty list to store results
results = []

# Iterate through unique samples
for sample in data['Sample'].unique():
    sample_data = data[data['Sample'] == sample]
    
    # Filter antisense genes based on the naming pattern and Z_Score > 2
    antisense_genes = sample_data[
        (sample_data['associated_gene'].str.endswith("-AS")) |
        (sample_data['associated_gene'].str.contains("_AS$"))
    ]
    antisense_genes = antisense_genes[antisense_genes['Noncyclo_Z_Score'] > 2]
    
    for _, row in antisense_genes.iterrows():
        antisense_gene = row['associated_gene']
        antisense_z_score = row['Noncyclo_Z_Score']
        
        # Identify the corresponding sense gene
        if "-AS" in antisense_gene:
            sense_gene = antisense_gene.split("-AS")[0]
        elif "_AS" in antisense_gene:
            sense_gene = antisense_gene.split("novelGene_")[1].split("_AS")[0]
        else:
            continue  # Skip if it doesn't match the pattern
        
        # Find the Noncyclo_Z_Score for the sense gene
        sense_gene_data = sample_data[sample_data['associated_gene'] == sense_gene]
        if not sense_gene_data.empty:
            sense_z_score = sense_gene_data['Noncyclo_Z_Score'].values[0]
        else:
            sense_z_score = None  # If the sense gene is not found
        
        # Append the result
        results.append({
            'Sample': sample,
            'Antisense_Gene': antisense_gene,
            'Antisense_Z_Score': antisense_z_score,
            'Sense_Gene': sense_gene,
            'Sense_Z_Score': sense_z_score
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file
output_path = "antisense_sense_analysis.csv"  # Replace with your desired output path
results_df.to_csv(output_path, index=False)

# Display the results
results_df.head()


  data = pd.read_csv(file_path)


Unnamed: 0,Sample,Antisense_Gene,Antisense_Z_Score,Sense_Gene,Sense_Z_Score
0,BCH_0252-01,PTENP1-AS,3.590593,PTENP1,1.998326
1,BCH_0252-01,novelGene_7SK_AS,4.509518,7SK,-0.43105
2,BCH_0252-01,novelGene_AAK1_AS,2.537296,AAK1,2.306796
3,BCH_0252-01,novelGene_AASS_AS,2.217679,AASS,0.769069
4,BCH_0252-01,novelGene_ABAT_AS,4.381759,ABAT,1.7321


In [2]:
# Further filter results to include only cases where the sense gene has Noncyclo_Z_Score < -2
filtered_results = results_df[
    (results_df['Sense_Z_Score'].notnull()) & (results_df['Sense_Z_Score'] < -2)
]

# Save the filtered results to a new CSV file
filtered_output_path = "filtered_antisense_sense_analysis.csv"  # Replace with your desired output path
filtered_results.to_csv(filtered_output_path, index=False)

# Display the filtered results
filtered_results.head()

Unnamed: 0,Sample,Antisense_Gene,Antisense_Z_Score,Sense_Gene,Sense_Z_Score
1264,BCH_0252-01,novelGene_RAB1A_AS,3.124363,RAB1A,-2.212805
1834,BCH_1199-01,novelGene_AP3S2_AS_novelGene_ARPIN-AP3S2_AS,3.230244,AP3S2,-2.015269
1836,BCH_1199-01,novelGene_ARF1_AS,4.059751,ARF1,-2.634571
1845,BCH_1199-01,novelGene_ATG3_AS,4.064019,ATG3,-2.14684
1890,BCH_1199-01,novelGene_CCT7_AS,2.752725,CCT7,-2.633242


In [9]:
import pandas as pd

# File paths
file_antisense = "/mmfs1/gscratch/stergachislab/yhhc/projects/Iso-seq_public/Cyclo_noncyclo_comparison/Analysis/10.12.24/5.Test_statistics/added_HPO_and_seqr_Hyp2GOE_Gene.csv"
file_sense = "/mmfs1/gscratch/stergachislab/yhhc/projects/Iso-seq_public/Cyclo_noncyclo_comparison/Analysis/10.12.24/5.Test_statistics/added_HPO_and_seqr_Hyp2LOE_Gene.csv"

# Load files
df_antisense = pd.read_csv(file_antisense)
df_sense = pd.read_csv(file_sense)

# Function to identify the sense gene name based on antisense gene naming pattern
def get_sense_gene_name(antisense_gene):
    if "-AS" in antisense_gene:
        return antisense_gene.split("-AS")[0]
    elif "_AS" in antisense_gene:
        return antisense_gene.split("novelGene_")[1].split("_AS")[0]
    return None

# Process antisense file
results = []

for sample in df_antisense["Sample"].unique():
    sample_data = df_antisense[df_antisense["Sample"] == sample]
    antisense_genes = sample_data[sample_data["rank_top_95_percentile"] > 0]

    for _, antisense_row in antisense_genes.iterrows():
        antisense_gene = antisense_row["original_associated_gene"]
        antisense_data = antisense_row.add_prefix("Antisense_").to_dict()  # Add prefix to antisense columns
        sense_gene = get_sense_gene_name(antisense_gene)

        # Find the sense gene data in the sense file
        sense_gene_data = df_sense[(df_sense["Sample"] == sample) & (df_sense["original_associated_gene"] == sense_gene)]
        if not sense_gene_data.empty:
            sense_row = sense_gene_data.iloc[0].add_prefix("Sense_").to_dict()  # Add prefix to sense columns
            sense_rank = sense_row["Sense_rank_top_95_percentile"]

            # Include only if sense_rank > 0
            if sense_rank > 0:
                combined_data = {**antisense_data, **sense_row}
                results.append(combined_data)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Output results to a CSV file
output_file = "filtered_senseLOE_antisenseGOE_combined_with_prefix.csv"
results_df.to_csv(output_file, index=False)

# Display the first few rows of the results
results_df.head()


  df_antisense = pd.read_csv(file_antisense)
  df_sense = pd.read_csv(file_sense)


Unnamed: 0,Antisense_Sample,Antisense_associated_gene,Antisense_Total_bin_cyclo_count_Bin1_le,Antisense_Total_bin_cyclo_count_Bin2_g,Antisense_Total_bin_noncyclo_count_Bin1_le,Antisense_Total_bin_noncyclo_count_Bin2_g,Antisense_proportion_in_Bin1_cyclo,Antisense_proportion_in_Bin1_noncyclo,Antisense_Isoform_PBid,Antisense_cyclo_count,...,Sense_rank_top_99_5_percentile,Sense_rank_top_99_percentile,Sense_rank_top_98_percentile,Sense_rank_top_95_percentile,Sense_original_associated_gene,Sense_Proband_HPO,Sense_PhenotypesExtracted_OMIMnum,Sense_OMIM_HPO,Sense_similarity,Sense_Associated_with_genetic_variant
0,BCH_2401-01,MBD5,0,0,0,6,,0.0,novelGene_MBD5_AS,0,...,,,,432.0,MBD5,"Abnormality of the kidney, Hematuria, Proteinu...",OMIM:156200,"['Astigmatism', 'Short foot', 'Language impair...",0.09697,False
1,BCH_2401-01,novelGene,0,0,0,6,,0.0,novelGene_MBD5_AS,0,...,,,,432.0,MBD5,"Abnormality of the kidney, Hematuria, Proteinu...",OMIM:156200,"['Astigmatism', 'Short foot', 'Language impair...",0.09697,False
2,BCH_2401-01,AS,0,0,0,6,,0.0,novelGene_MBD5_AS,0,...,,,,432.0,MBD5,"Abnormality of the kidney, Hematuria, Proteinu...",OMIM:156200,"['Astigmatism', 'Short foot', 'Language impair...",0.09697,False
3,UDN204349,novelGene,0,0,0,1,,0.0,novelGene_METTL25_AS,0,...,38.0,57.0,69.0,73.0,METTL25,"Recurrent pneumonia, Recurrent sinusitis, Ecze...",,,,False
4,UDN204349,METTL25,0,0,0,1,,0.0,novelGene_METTL25_AS,0,...,38.0,57.0,69.0,73.0,METTL25,"Recurrent pneumonia, Recurrent sinusitis, Ecze...",,,,False


In [10]:
import pandas as pd

# File paths
file_sense = "/mmfs1/gscratch/stergachislab/yhhc/projects/Iso-seq_public/Cyclo_noncyclo_comparison/Analysis/10.12.24/5.Test_statistics/added_HPO_and_seqr_Hyp2GOE_Gene.csv"
file_antisense = "/mmfs1/gscratch/stergachislab/yhhc/projects/Iso-seq_public/Cyclo_noncyclo_comparison/Analysis/10.12.24/5.Test_statistics/added_HPO_and_seqr_Hyp2LOE_Gene.csv"

# Load files
df_antisense = pd.read_csv(file_antisense)
df_sense = pd.read_csv(file_sense)

# Function to identify the sense gene name based on antisense gene naming pattern
def get_sense_gene_name(antisense_gene):
    if "-AS" in antisense_gene:
        return antisense_gene.split("-AS")[0]
    elif "_AS" in antisense_gene:
        return antisense_gene.split("novelGene_")[1].split("_AS")[0]
    return None

# Process antisense file
results = []

for sample in df_antisense["Sample"].unique():
    sample_data = df_antisense[df_antisense["Sample"] == sample]
    antisense_genes = sample_data[sample_data["rank_top_95_percentile"] > 0]

    for _, antisense_row in antisense_genes.iterrows():
        antisense_gene = antisense_row["original_associated_gene"]
        antisense_data = antisense_row.add_prefix("Antisense_").to_dict()  # Add prefix to antisense columns
        sense_gene = get_sense_gene_name(antisense_gene)

        # Find the sense gene data in the sense file
        sense_gene_data = df_sense[(df_sense["Sample"] == sample) & (df_sense["original_associated_gene"] == sense_gene)]
        if not sense_gene_data.empty:
            sense_row = sense_gene_data.iloc[0].add_prefix("Sense_").to_dict()  # Add prefix to sense columns
            sense_rank = sense_row["Sense_rank_top_95_percentile"]

            # Include only if sense_rank > 0
            if sense_rank > 0:
                combined_data = {**antisense_data, **sense_row}
                results.append(combined_data)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Output results to a CSV file
output_file = "filtered_senseGOE_antisenseLOE_combined_with_prefix.csv"
results_df.to_csv(output_file, index=False)

# Display the first few rows of the results
results_df.head()


  df_antisense = pd.read_csv(file_antisense)
  df_sense = pd.read_csv(file_sense)


Unnamed: 0,Antisense_Sample,Antisense_associated_gene,Antisense_Total_bin_cyclo_count_Bin1_le,Antisense_Total_bin_cyclo_count_Bin2_g,Antisense_Total_bin_noncyclo_count_Bin1_le,Antisense_Total_bin_noncyclo_count_Bin2_g,Antisense_proportion_in_Bin1_cyclo,Antisense_proportion_in_Bin1_noncyclo,Antisense_Isoform_PBid,Antisense_cyclo_count,...,Sense_rank_top_99_5_percentile,Sense_rank_top_99_percentile,Sense_rank_top_98_percentile,Sense_rank_top_95_percentile,Sense_original_associated_gene,Sense_Proband_HPO,Sense_PhenotypesExtracted_OMIMnum,Sense_OMIM_HPO,Sense_similarity,Sense_Associated_with_genetic_variant
0,BCH_2401-01,PLXNB3-AS1,1,0,0,2,1.0,0.0,PLXNB3-AS1,1,...,,,,289.0,PLXNB3,"Abnormality of the kidney, Hematuria, Proteinu...",,,,False
1,UDN687128,TSPYL1,0,1,0,1,0.0,0.0,novelGene_TSPYL1_AS,1,...,,,,55.0,TSPYL1,"Asthenia, Oral-pharyngeal dysphagia, Dysphagia...",OMIM:608800,"['Ambiguous genitalia, male', 'Exaggerated sta...",0.216431,False
2,UDN687128,novelGene,0,1,0,1,0.0,0.0,novelGene_TSPYL1_AS,1,...,,,,55.0,TSPYL1,"Asthenia, Oral-pharyngeal dysphagia, Dysphagia...",OMIM:608800,"['Ambiguous genitalia, male', 'Exaggerated sta...",0.216431,False
3,UDN687128,AS,0,1,0,1,0.0,0.0,novelGene_TSPYL1_AS,1,...,,,,55.0,TSPYL1,"Asthenia, Oral-pharyngeal dysphagia, Dysphagia...",OMIM:608800,"['Ambiguous genitalia, male', 'Exaggerated sta...",0.216431,False
4,BCH_310-01,ZNF516-AS1,0,0,0,1,,0.0,ZNF516-AS1,0,...,,,,249.0,ZNF516,"Ptosis, Spasticity, Progressive spasticity, Lo...",,,,False
