In [52]:
import pandas as pd

# Path to the case-variant genotype table, relative to the notebook's working directory
file_path = 'variants_case_vcf.csv'  # Update the path as necessary if the file is located elsewhere

# Read the file into a DataFrame. No `sep` is passed, so pandas uses its
# default comma separator (the file is parsed as comma-separated, not tab-separated).
df = pd.read_csv(file_path)

# Display the DataFrame to understand its structure. A bare `df` renders the
# whole frame (pandas truncates the middle rows/columns of a large frame);
# use `df.head()` instead to show only the first few rows.
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,col_1,...,col_1499,col_1500,col_1501,col_1502,col_1503,col_1504,col_1505,col_1506,col_1507,col_1508
0,4,1807371,16337,C,A,.,.,PR,GT,0/1,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1,7,138391446,1344704,T,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2,1,154574727,2942810,G,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
3,2,163128866,812537,G,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
4,16,1505761,1012215,A,G,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,15,52425575,2575918,C,T,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/0
1504,17,8076848,929285,A,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0
1505,1,150990382,932732,T,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0
1506,11,17167409,599543,C,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0


In [86]:
def case_variants(n, m, i):
    """
    Simulate causative variants of samples for AD (Autosomal Dominant) and AR (Autosomal Recessive) inheritance.

    The function reads ``variants_case_vcf.csv`` from the current working
    directory and keeps:

    * the first ``n`` rows together with the first ``n`` sample columns, and
    * the last ``m`` rows together with the last ``m`` sample columns.

    Sample columns are the columns that follow the ``FORMAT`` column. If the
    file has no ``FORMAT`` column, the first 5 columns are treated as
    annotation columns. Finally, ``i`` extra columns named ``new_col_1`` ..
    ``new_col_i`` are appended with every genotype set to ``"0/0"``.

    Args:
        n (int): A non-negative integer specifying the number of Autosomal Dominant (AD) samples to simulate.
            Values above 814 are clamped to 814.
        m (int): A non-negative integer specifying the number of Autosomal Recessive (AR) samples to simulate.
            Values above 694 are clamped to 694.
        i (int): A non-negative integer specifying the number of healthy samples to simulate.

    Returns:
        pandas.DataFrame: The extracted table (annotation columns, selected
        sample columns and the healthy ``new_col_*`` columns). As a side
        effect two tab-separated files without header or index are written:
            1. `case_variants.txt`: Contains the simulated causative variants for each sample.
            2. `case_chrom_pos.txt`: Contains the chromosome and position information for the causative variants.

    Raises:
        ValueError: If ``n``, ``m`` or ``i`` is negative.
    """
    # Upper bounds applied to n and m.
    # NOTE(review): presumably the number of AD / AR samples available in
    # variants_case_vcf.csv -- confirm against the data.
    max_ad_samples = 814
    max_ar_samples = 694
    # Number of leading annotation columns assumed when no FORMAT column exists.
    fallback_meta_columns = 5

    # Negative counts would silently turn head()/tail()/slices into
    # "everything except the last/first k", so reject them up front.
    for label, count in (("n", n), ("m", m), ("i", i)):
        if count < 0:
            raise ValueError(f"{label} must be a non-negative integer, got {count!r}")

    # Read the file into a DataFrame
    df = pd.read_csv('variants_case_vcf.csv')

    # Ensure n and m are within the specified limits
    n = min(n, max_ad_samples)
    m = min(m, max_ar_samples)

    total_rows, total_cols = df.shape

    # Everything up to and including FORMAT is variant annotation; the
    # per-sample genotype columns start right after it.
    column_names = list(df.columns)
    if 'FORMAT' in column_names:
        meta_cols = column_names.index('FORMAT') + 1
    else:
        meta_cols = min(fallback_meta_columns, total_cols)

    # Column positions: annotation + first n sample columns, then the last m
    # columns. Explicit start/stop positions are used instead of a `-m:` slice
    # because `-0:` selects *every* column when m == 0. The tail never starts
    # before the head ends, so no column is selected twice.
    head_col_stop = min(meta_cols + n, total_cols)
    tail_col_start = max(total_cols - m, head_col_stop)
    col_positions = list(range(head_col_stop)) + list(range(tail_col_start, total_cols))

    # Row positions: first n rows and last m rows, again without overlap.
    head_row_stop = min(n, total_rows)
    tail_row_start = max(total_rows - m, head_row_stop)
    row_positions = list(range(head_row_stop)) + list(range(tail_row_start, total_rows))

    # Combine extracted rows and columns
    extracted_df = df.iloc[row_positions, col_positions].copy()

    # Add `i` new columns with all values set to "0/0". They are built in one
    # frame and attached with a single concat rather than inserted one by one,
    # which would fragment the DataFrame for large `i`.
    if i > 0:
        healthy_df = pd.DataFrame(
            "0/0",
            index=extracted_df.index,
            columns=[f'new_col_{col_index}' for col_index in range(1, i + 1)],
        )
        extracted_df = pd.concat([extracted_df, healthy_df], axis=1)

    # Save the result to a txt file
    extracted_df.to_csv('case_variants.txt', sep='\t', index=False, header=False)

    # Extract the chromosome and position columns (CHROM and POS)
    chrom_pos_columns = extracted_df[['CHROM', 'POS']]

    # Save to a text file
    chrom_pos_columns.to_csv("case_chrom_pos.txt", index=False, header=False, sep='\t')

    return extracted_df

In [88]:
# Example usage
# case_variants() reads 'variants_case_vcf.csv' itself, so no DataFrame is passed in.
# It also writes case_variants.txt and case_chrom_pos.txt to the working directory.
# Adjust `n`, `m` and `i` as needed
df_case_variants = case_variants(n=20, m=20, i=10)

In [90]:
# Show the DataFrame returned by the case_variants() call
df_case_variants

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,col_1,...,new_col_1,new_col_2,new_col_3,new_col_4,new_col_5,new_col_6,new_col_7,new_col_8,new_col_9,new_col_10
0,4,1807371,16337,C,A,.,.,PR,GT,1/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1,7,138391446,1344704,T,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2,1,154574727,2942810,G,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
3,2,163128866,812537,G,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
4,16,1505761,1012215,A,G,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
5,17,42992619,16170,C,T,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
6,2,227914783,1029644,C,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
7,12,88532921,236464,C,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
8,1,173881080,18029,G,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
9,10,123279677,13272,G,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
