In [1]:
# Parse the genomicLocation column of each variations CSV for PredictSNP
import os
import pandas as pd
import re

# Folder configuration -- edit both paths to match your own layout.
input_folder = "data/Variations"  # directory scanned for input CSV files
output_folder = "data/Pr"  # directory that receives the rewritten CSV files

# Create the output directory up front; an already existing one is not an error.
os.makedirs(name=output_folder, exist_ok=True)

# Function to parse a single genomicLocation
def parse_genomic_location(location):
    """Convert one HGVS-style genomic location into a compact string.

    Two notations are recognised:

    * single-base substitutions, e.g. ``NC_000007.14:g.55019281C>T``,
      returned as ``"7,55019281,C,T"``;
    * range deletions, e.g. ``NC_000007.14:g.55019311_55019331del``,
      returned as ``"7,55019311-55019331,del"``.

    The first field is the numeric part of the ``NC_`` accession with leading
    zeros removed (so ``NC_000023`` yields ``23``).
    NOTE(review): confirm the downstream tool accepts numeric identifiers for
    the sex chromosomes rather than ``X``/``Y``.

    Args:
        location: Raw cell value. Anything that is not a ``str`` (for example
            the ``NaN`` pandas uses for empty cells) is treated as missing.

    Returns:
        The formatted string, or ``None`` when the value is missing or does
        not match either supported notation.
    """
    if not isinstance(location, str):  # e.g. NaN from an empty CSV cell
        return None

    # Tolerate stray whitespace around the value; re.match anchors at index 0,
    # so a leading space would otherwise make a valid location unparseable.
    location = location.strip()

    # Substitution, e.g. NC_000007.14:g.55019281C>T
    match = re.match(r"NC_(\d+)\.\d+:g\.(\d+)([ACGT])>([ACGT])", location)
    if match:
        chromosome, position, ref_base, alt_base = match.groups()
        return f"{int(chromosome)},{position},{ref_base},{alt_base}"

    # Range deletion, e.g. NC_000007.14:g.55019311_55019331del
    # The negative lookahead rejects deletion-insertions ("...delinsAGC"),
    # which would otherwise be misreported as plain deletions.
    match_del = re.match(r"NC_(\d+)\.\d+:g\.(\d+)_(\d+)del(?!ins)", location)
    if match_del:
        chromosome, start_position, end_position = match_del.groups()
        return f"{int(chromosome)},{start_position}-{end_position},del"

    return None

# Function to handle cases with multiple values or empty lists
def process_genomic_location(cell):
    """Parse a genomicLocation cell that holds one location or a list of them.

    A cell that is a string wrapped in square brackets (for example
    ``"['NC_000007.14:g.1A>C', 'NC_000007.14:g.2G>T']"``) is treated as a
    list: the brackets are removed, the content is split on commas, and each
    item is parsed with ``parse_genomic_location``. Items may be wrapped in
    single or double quotes. Any other value is parsed as a single location.

    Args:
        cell: Raw cell value from the ``genomicLocation`` column.

    Returns:
        For list cells, the successfully parsed items joined with ``","``
        (an empty string when nothing parsed, e.g. for ``"[]"``). For all
        other cells, whatever ``parse_genomic_location`` returns.
    """
    looks_like_list = (
        isinstance(cell, str) and cell.startswith('[') and cell.endswith(']')
    )
    if not looks_like_list:
        return parse_genomic_location(cell)

    parsed_locations = []
    for raw_item in cell[1:-1].split(','):
        # Strip whitespace first, then either quote style, so both Python
        # repr lists ('...') and JSON-style lists ("...") are accepted.
        parsed = parse_genomic_location(raw_item.strip().strip("'\""))
        if parsed:
            parsed_locations.append(parsed)

    # NOTE(review): each parsed location itself contains commas, so the
    # boundaries between entries in the joined string are ambiguous.
    return ','.join(parsed_locations)

# Process each CSV file in the input folder
for file_name in os.listdir(input_folder):
    # Only CSV files are of interest; everything else in the folder is ignored.
    if not file_name.endswith(".csv"):
        continue

    # The output keeps the same file name, just in the output folder.
    input_path = os.path.join(input_folder, file_name)
    output_path = os.path.join(output_folder, file_name)

    df = pd.read_csv(input_path)

    # Files without the expected column are reported and not written out.
    if "genomicLocation" not in df.columns:
        print(f"Skipping {file_name}: 'genomicLocation' column not found.")
        continue

    # Replace every cell of the column with its parsed representation.
    df["genomicLocation"] = df["genomicLocation"].apply(process_genomic_location)

    df.to_csv(output_path, index=False)
    print(f"Processed and saved: {output_path}")

Processed and saved: data/Pr/P10721_variations.csv
