In [None]:
import pandas as pd

# Read the combined phenotype/genotype table.
# The csv file has two rows of phenotypic data (binary and 5 class groupings)
# followed by multiple rows copied from the VCF file.
file_path = "phenotypes_genotypes_Chr09_409.csv"  # Update with the path to your file

# Transposed right after reading, so the CSV's rows become columns that can be filtered on.
df = pd.read_csv(file_path, index_col=0).T

# Make the phenotype columns numeric; entries that cannot be parsed become NaN.
bc_columns = ["BC_Binary", "BC_5_Class"]
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in bc_columns}
)

# Total number of rows after transposing.
print(len(df))

# Number of rows whose genotype at Chr09_40934664 equals the genotype at each
# of the other three markers.
reference_genotype = df["Chr09_40934664"]
same_genotype = (
    (reference_genotype == df["Chr09_40934714"])
    & (reference_genotype == df["Chr09_40934715"])
    & (reference_genotype == df["Chr09_41182684"])
)
print(len(df[same_genotype]))

116
116


In [None]:
## Both counts above are 116, so every sample has the same genotype at the markers Chr09_40934664 through Chr09_41182684. Next, double-check that these genotypes match up with the phenotypes.

In [16]:
# Genotype/phenotype combinations to summarise. Each key is the label shown in
# the results table; each value is a boolean row mask over df built from the
# genotype at Chr09_40934664 and the BC_Binary phenotype.
conditions = {
    "1/1_BC_Binary=1": (df["Chr09_40934664"] == "1/1") & (df["BC_Binary"] == 1),
    "0/1_BC_Binary=1": (df["Chr09_40934664"] == "0/1") & (df["BC_Binary"] == 1),
    # This mask selects BC_Binary == 2, so the label says "=2" (it previously
    # read "=1", which mislabelled the row in the results table).
    "0/0_BC_Binary=2": (df["Chr09_40934664"] == "0/0") & (df["BC_Binary"] == 2),
}

# For every condition, record the mean BC_5_Class and the number of matching rows.
results = []
for condition_name, condition in conditions.items():
    subset = df[condition]
    results.append(
        {
            "Condition": condition_name,
            "Average_BC_5_Class": subset["BC_5_Class"].mean(),
            "Count": subset.shape[0],
        }
    )

# One row per condition; results_df is reused by the following cell.
results_df = pd.DataFrame(results)

# Display results
print(results_df)


         Condition  Average_BC_5_Class  Count
0  1/1_BC_Binary=1            1.000000     33
1  0/1_BC_Binary=1            1.000000     53
2  0/0_BC_Binary=1            3.535714     28


In [17]:
# Total number of rows covered by the conditions summarised in results_df.
results_df["Count"].sum()

114

In [None]:
## These two rows are the "incorrect" ones (0/1 at Chr09_40934664 but BC_Binary == 2).
## They are barely a blush color, so I think it's the normal blushing of the hets.
is_het = df["Chr09_40934664"] == "0/1"
is_binary_two = df["BC_Binary"] == 2
df.loc[is_het & is_binary_two]

Unnamed: 0,BC_Binary,BC_5_Class,Chr09_40934664,Chr09_40934714,Chr09_40934715,Chr09_41182684,Chr09_41873463
F2-06-02,2,2,0/1,0/1,0/1,0/1,0/1
F2-06-266,2,2,0/1,0/1,0/1,0/1,0/1


In [None]:
# Next step: gather representative SNPs for the full locus into one dataframe,
# starting from the full Chr09 phenotype/genotype file.

# Read the data
file_path = "phenotypes_genotypes_Chr09.csv"  # Update with the path to your file

# Transposed right after reading, for easier column-based filtering.
df_og = pd.read_csv(file_path, index_col=0).T

# Make the phenotype columns numeric; entries that cannot be parsed become NaN.
bc_columns = ["BC_Binary", "BC_5_Class"]
df_og = df_og.assign(
    **{name: pd.to_numeric(df_og[name], errors="coerce") for name in bc_columns}
)

Unnamed: 0,BC_Binary,BC_5_Class,Chr09_46098776,Chr09_49257768
F2-06-79W,1,1,0/0,1/1
F2-05-116,1,1,0/0,1/1
F2-06-02,2,2,1/1,0/1
F2-06-03,1,1,0/0,1/1
F2-06-04,1,1,0/0,1/1
...,...,...,...,...
F2-06-94,1,1,0/0,0/1
F2-06-96,1,1,0/1,0/1
F2-06-97,1,1,0/0,0/1
F2-06-98,1,1,0/0,1/1


In [31]:
# Copy representative SNP columns from df_og into df (values align on the
# sample index). Each entry maps a SNP column to its intended position in df.
#
# A bare df.insert is not safe to re-run: it raises ValueError when the column
# already exists, and it fails when loc is larger than the current number of
# columns (as happens on a fresh kernel). Each insert is therefore skipped if
# the column is already present, and the position is clamped to the end of df.
#
# NOTE(review): the table displayed after this step also contains
# Chr09_47501485 and Chr09_48129837; no cell visible here adds them, so they
# are not handled in this cell - confirm where they come from.
snp_positions = {
    "Chr09_46098776": 7,
    "Chr09_49257768": 10,
}
for snp, loc in snp_positions.items():
    if snp in df.columns:
        continue  # already inserted by an earlier run of this cell
    df.insert(loc=min(loc, len(df.columns)), column=snp, value=df_og[snp])

In [None]:
# Show every genotype column within the locus of interest for the rows where
# BC_Binary equals 2.
is_binary_two = df["BC_Binary"] == 2
df.loc[is_binary_two]

Unnamed: 0,BC_Binary,BC_5_Class,Chr09_40934664,Chr09_40934714,Chr09_40934715,Chr09_41182684,Chr09_41873463,Chr09_46098776,Chr09_47501485,Chr09_48129837,Chr09_49257768
F2-06-02,2,2,0/1,0/1,0/1,0/1,0/1,1/1,0/0,0/1,0/1
F2-06-07,2,4,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0
F2-06-112,2,4,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0
F2-06-117,2,5,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0
F2-06-118,2,5,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1
F2-06-129,2,4,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0
F2-06-130,2,4,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0
F2-06-132,2,3,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/1,0/0
F2-06-137,2,4,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0
F2-06-138,2,5,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0
