In [16]:
import pandas as pd
import re

# Load the CSV file
file_path = '/content/cnv_segments_summary_14.csv'
df = pd.read_csv(file_path)


# Function to parse the 'segment' column
def parse_segment(segment):
    pattern = r"(\d+):(\d+)-(\d+)\s+\(CN\s+(\d+)\)"
    match = re.match(pattern, segment)
    if match:
        chromosome = int(match.group(1))
        start = int(match.group(2))
        end = int(match.group(3))
        cn = int(match.group(4))
        cnv_type = 'gain' if cn > 2 else 'loss'
        return pd.Series([chromosome, start, end, cn, cnv_type])
    else:
        return pd.Series([None, None, None, None, None])

# Apply parsing function to 'segment' column
parsed_columns = df['segment'].apply(parse_segment)
parsed_columns.columns = ['chromosome', 'start', 'end', 'CN', 'CNV_type']

# Combine original n_cells with parsed data
final_df = pd.concat([df[['segment', 'n_cells']], parsed_columns], axis=1)

#  Function to count segments per chromosome (sorted descending)
def count_segments_per_chromosome(df):
    counts = df['chromosome'].value_counts().sort_values(ascending=False)
    return counts

#  Function to count gains
def count_gains(df):
    return (df['CNV_type'] == 'gain').sum()

#  Function to count losses
def count_losses(df):
    return (df['CNV_type'] == 'loss').sum()

#  Function total n_cells per chromosome (sorted descending)
def total_cells_per_chromosome(df):
    totals = df.groupby('chromosome')['n_cells'].sum().sort_values(ascending=False)
    return totals

# Run the functions
segments_per_chromosome = count_segments_per_chromosome(final_df)
total_gains = count_gains(final_df)
total_losses = count_losses(final_df)
cells_per_chromosome = total_cells_per_chromosome(final_df)

# Print cleaned DataFrame
print("\nParsed DataFrame:")
print(final_df)

# Print summary stats (sorted)
print("\nSegments per chromosome (most to least):")
for chrom, count in segments_per_chromosome.items():
    print(f"Chromosome {chrom}: {count} segments")

print(f"\nTotal number of gain segments: {total_gains}")
print(f"Total number of loss segments: {total_losses}")

print("\nTotal number of cells affected per chromosome (most to least):")
for chrom, total_cells in cells_per_chromosome.items():
    print(f"Chromosome {chrom}: {total_cells} cells have CNVs")

# Optionally, save cleaned data
final_df.to_csv('parsed_cnv_data_with_ncells.csv', index=False)





Parsed DataFrame:
                          segment  n_cells  chromosome        start  \
0        Y:4993372-5000226 (CN 3)     5265         NaN          NaN   
1     19:52031378-58590710 (CN 3)     3869        19.0   52031378.0   
2    1:151327811-160945025 (CN 3)     3854         1.0  151327811.0   
3      1:19312326-43525187 (CN 3)     3838         1.0   19312326.0   
4     16:68245304-88663298 (CN 3)     3818        16.0   68245304.0   
..                            ...      ...         ...          ...   
160   19:51415724-51639478 (CN 3)     3011        19.0   51415724.0   
161    1:11054584-11054584 (CN 3)     3005         1.0   11054584.0   
162   17:35744511-35744511 (CN 3)     3005        17.0   35744511.0   
163   20:58818893-58863466 (CN 3)     3001        20.0   58818893.0   
164   17:35760887-35908474 (CN 3)     3000        17.0   35760887.0   

             end   CN CNV_type  
0            NaN  NaN     None  
1     58590710.0  3.0     gain  
2    160945025.0  3.0     gai

In [17]:
def find_cnv_overlap(df, chromosome=None, start=None, end=None, cn=None):
    """
    Checks if a CNV overlaps with the specified chromosome and region (start to end).
    Optional: also filter by CN value.

    A match occurs if:
      - same chromosome
      - input region overlaps with CNV region
      - (optional) CN matches
    """
    query = df
    if chromosome is not None:
        query = query[query['chromosome'] == chromosome]
    if start is not None and end is not None:
        # overlap condition: start <= segment_end and end >= segment_start
        query = query[(query['start'] <= end) & (query['end'] >= start)]
    if cn is not None:
        query = query[query['CN'] == cn]

    if not query.empty:
        print("\n✅ Found the following overlapping CNV(s):")
        print(query)
    else:
        print("\n❌ No overlapping CNV found.")

    return query



In [18]:
find_cnv_overlap(final_df, chromosome=20, start=31216079, end=35871578, cn=3)


✅ Found the following overlapping CNV(s):
                     segment  n_cells  chromosome     start         end   CN  \
6  20:267186-47501887 (CN 3)     3734        20.0  267186.0  47501887.0  3.0   

  CNV_type  
6     gain  


Unnamed: 0,segment,n_cells,chromosome,start,end,CN,CNV_type
6,20:267186-47501887 (CN 3),3734,20.0,267186.0,47501887.0,3.0,gain
