In [8]:
import pandas as pd
import re

# Load the CSV file
file_path = '/content/cnv_segments_summary_16.csv'
df = pd.read_csv(file_path)


# Function to parse the 'segment' column
def parse_segment(segment):
    pattern = r"(\d+):(\d+)-(\d+)\s+\(CN\s+(\d+)\)"
    match = re.match(pattern, segment)
    if match:
        chromosome = int(match.group(1))
        start = int(match.group(2))
        end = int(match.group(3))
        cn = int(match.group(4))
        cnv_type = 'gain' if cn > 2 else 'loss'
        return pd.Series([chromosome, start, end, cn, cnv_type])
    else:
        return pd.Series([None, None, None, None, None])

# Apply parsing function to 'segment' column
parsed_columns = df['segment'].apply(parse_segment)
parsed_columns.columns = ['chromosome', 'start', 'end', 'CN', 'CNV_type']

# Combine original n_cells with parsed data
final_df = pd.concat([df[['segment', 'n_cells']], parsed_columns], axis=1)

#  Function to count segments per chromosome (sorted descending)
def count_segments_per_chromosome(df):
    counts = df['chromosome'].value_counts().sort_values(ascending=False)
    return counts

#  Function to count gains
def count_gains(df):
    return (df['CNV_type'] == 'gain').sum()

#  Function to count losses
def count_losses(df):
    return (df['CNV_type'] == 'loss').sum()

#  Function total n_cells per chromosome (sorted descending)
def total_cells_per_chromosome(df):
    totals = df.groupby('chromosome')['n_cells'].sum().sort_values(ascending=False)
    return totals

# Run the functions
segments_per_chromosome = count_segments_per_chromosome(final_df)
total_gains = count_gains(final_df)
total_losses = count_losses(final_df)
cells_per_chromosome = total_cells_per_chromosome(final_df)

# Print cleaned DataFrame
print("\nParsed DataFrame:")
print(final_df)

# Print summary stats (sorted)
print("\nSegments per chromosome (most to least):")
for chrom, count in segments_per_chromosome.items():
    print(f"Chromosome {chrom}: {count} segments")

print(f"\nTotal number of gain segments: {total_gains}")
print(f"Total number of loss segments: {total_losses}")

print("\nTotal number of cells affected per chromosome (most to least):")
for chrom, total_cells in cells_per_chromosome.items():
    print(f"Chromosome {chrom}: {total_cells} cells have CNVs")

# Optionally, save cleaned data
final_df.to_csv('parsed_cnv_data_with_ncells.csv', index=False)





Parsed DataFrame:
                         segment  n_cells  chromosome        start  \
0       16:77007-30761745 (CN 3)    10339        16.0      77007.0   
1   6:109492855-170575295 (CN 3)     9938         6.0  109492855.0   
2       4:53286-189940855 (CN 3)     9841         4.0      53286.0   
3     11:126990-134378504 (CN 3)     9785        11.0     126990.0   
4      20:270863-64083376 (CN 3)     9772        20.0     270863.0   
..                           ...      ...         ...          ...   
79    8:62248591-62248591 (CN 3)     6047         8.0   62248591.0   
80  X:101009346-101009346 (CN 3)     6045         NaN          NaN   
81  X:115003975-119236245 (CN 3)     6044         NaN          NaN   
82    X:49250438-49269793 (CN 3)     6022         NaN          NaN   
83  X:112774503-112774503 (CN 3)     6005         NaN          NaN   

            end   CN CNV_type  
0    30761745.0  3.0     gain  
1   170575295.0  3.0     gain  
2   189940855.0  3.0     gain  
3   13437850