Notes
Two SNPs are identical if
- they share the same position
AND
- they have the same substitution.

When you compute the difference X-Y, consider only SNPs whose frequency in X is at least 80%, and treat a SNP as present in Y only if its frequency in Y is at least 27%.

Given
- A = BdWA1
- B = FAT_R_P1
- C = FAT_R_P2
- D = FAT_R_C1
- E = FAT_R_C2
- F = FAT_R_C3
- G = Fos_FAT_R_P
- H = Fos_FAT_R_C1
- I = Fos_FAT_R_C2
- J = Fos_FAT_R_C3

produce:
- Unique SNPs for FAT_R_P1 = (B-A-C)
- Unique SNPs for FAT_R_P2 = (C-A-B)
- Unique SNPs for FAT_R_C1 = (D-A-B-C-E-F)
- Unique SNPs for FAT_R_C2 = (E-A-B-C-D-F)
- Unique SNPs for FAT_R_C3 = (F-A-B-C-D-E)
- Common SNPs in FAT_R_Clones = (D+E+F)
- Unique SNPs for Fos_FAT_R_P = (G-A-B-C-D-E-F)
- Unique SNPs for Fos_FAT_R_C1 = (H-A-B-C-D-E-F-G-I-J)
- Unique SNPs for Fos_FAT_R_C2 = (I-A-B-C-D-E-F-G-H-J)
- Unique SNPs for Fos_FAT_R_C3 = (J-A-B-C-D-E-F-G-H-I)
- Common SNPs in Fos_FAT_R_Clones = (H+I+J)
- SNPs in the intersection between (D+E+F) and (H+I+J)

In [None]:
###
### support functions
###
import io
import pandas as pd
from google.colab import files

def read_file(fname):
  """Upload files through the Colab picker and index the SNPs of one of them.

  Parameters:
    fname: name of the uploaded spreadsheet to parse. It must match the name
      of one of the uploaded files exactly.

  Returns:
    A dictionary mapping (chromosome, minimum, change) to the tuple
    (frequency as float, polymorphism type, amino acid change, CDS,
    CDS codon number, protein effect). Rows that share the same key
    overwrite earlier ones, so the reported record count is the number of
    distinct keys.

  Raises:
    KeyError: if no uploaded file is called `fname`.
    ValueError: if a row has different 'Minimum' and 'Maximum' values.
  """
  u = files.upload()
  for fn in u.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(u[fn])))
  if fname not in u:
    # Fail with the list of names actually received instead of a bare KeyError.
    raise KeyError('expected an uploaded file named "{}", got: {}'.format(fname, sorted(u.keys())))
  # keep_default_na=False keeps empty cells as empty strings instead of NaN.
  tab = pd.read_excel(io.BytesIO(u[fname]),keep_default_na=False)
  my_dict = {}
  for index, row in tab.iterrows():
    minimum = row['Minimum']
    maximum = row['Maximum']
    # Explicit check instead of `assert`, which is skipped when Python runs
    # with -O: the dictionary key uses a single position per record.
    if minimum != maximum:
      raise ValueError('row {}: Minimum ({}) differs from Maximum ({})'.format(index, minimum, maximum))
    key = (row['Chromosome'], minimum, row['Change'])
    my_dict[key] = (
      float(row['Variant Frequency']),
      row['Polymorphism Type'],
      row['Amino Acid Change'],
      row['CDS'],
      row['CDS Codon Number'],
      row['Protein Effect'],
    )
  print('Read', len(my_dict), 'records')
  return my_dict

def dict_union(X_dict,Y_dict):
  """Return X u Y as a new dictionary, leaving both inputs untouched.

  For a key present in both inputs, the record whose first field (the
  frequency) is strictly higher is kept; on a tie the record from Y_dict
  is kept.
  """
  # Start from a copy of Y, then let X add its own keys or override Y's
  # record wherever X has the strictly higher frequency.
  merged = dict(Y_dict)
  for key, x_record in X_dict.items():
    if key not in Y_dict or x_record[0] > Y_dict[key][0]:
      merged[key] = x_record
  return merged

def dict_print(my_dict):
  """Print a SNP dictionary as comma-separated text, header line first.

  Keys are read as (chromosome, position, change) and values as six-field
  records; the position is printed twice, once for the 'Minimum' column and
  once for the 'Maximum' column. A blank separator is printed at the end.
  """
  print('Change, Chromosome, Minimum, Maximum, Variant Frequency, Polymorphism Type, Amino Acid Change, CDS, CDS Codon Number, Protein Effect')
  for key, record in my_dict.items():
    chromosome, position, change = key[0], key[1], key[2]
    columns = [change, chromosome, position, position]
    columns.extend(record[i] for i in range(6))
    print(", ".join(str(value) for value in columns))
  print('\n')

def dict_difference(X_dict,Y_dict,min_x_freq=0.8,min_y_freq=0.27):
  """Compute X - Y and print the kept and the discarded SNPs as CSV text.

  A SNP of X_dict is kept when its frequency (first field of its record) is
  at least `min_x_freq` and it is either absent from Y_dict or present there
  with a frequency below `min_y_freq`. Every other SNP of X_dict is listed
  as discarded together with the reason.

  Parameters:
    X_dict: dictionary the SNPs are taken from.
    Y_dict: dictionary of SNPs to subtract.
    min_x_freq: minimum frequency for a SNP of X_dict to be considered
      (default 0.8, i.e. 80%).
    min_y_freq: minimum frequency for a SNP of Y_dict to count as shared
      (default 0.27, i.e. 27%).

  Returns:
    None; the result is only printed.
  """
  difference_dict = {}
  discarded_dict = {}
  # With the default threshold this reads exactly 'below 80%'.
  below_reason = 'below {:g}%'.format(min_x_freq * 100)
  for x in X_dict:
    if (X_dict[x][0] >= min_x_freq): # consider the SNP only if frequent enough in X
      if x in Y_dict:
        if (Y_dict[x][0] >= min_y_freq): # it counts as shared only if frequent enough in Y
          discarded_dict[x] = 'SNP in common'
        else:
          difference_dict[x] = X_dict[x]
      else:
        difference_dict[x] = X_dict[x]
    else:
      discarded_dict[x] = below_reason
  # print the CSVs: first the kept SNPs, then the discarded ones with the reason
  dict_print(difference_dict)
  print('Change, Chromosome, Minimum, Maximum, Reason Discarded')
  for x in discarded_dict:
    print(str(x[2])+", "+str(x[0])+", "+str(x[1])+", "+str(x[1])+", "+str(discarded_dict[x]))

In [None]:
###
### READ A (BdWA1)
###
# read_file opens the Colab upload prompt and then looks the file up by this
# exact name, so the uploaded file must be called the same.
A_dict = read_file('A_BdWA_1 SNPs.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(A_dict)

In [None]:
###
### READ B (FAT_R_P1)
###
# The uploaded file must carry exactly this name (note the space before
# '.xlsx'), because read_file looks it up by name.
B_dict = read_file('B_FAT_R_P1 SNPs .xlsx')
# Dump the parsed dictionary for a quick visual check.
print(B_dict)

In [None]:
###
### READ C (FAT_R_P2)
###
# The uploaded file must carry exactly this name; read_file looks it up by name.
C_dict = read_file('C_FAT_R_P2.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(C_dict)

In [None]:
###
### COMPUTE Unique SNPs for FAT_R_P1 = B - (A u C)
###
# Requires A_dict, B_dict and C_dict from the READ cells.
# Merge the samples to subtract; shared SNPs keep the highest-frequency record.
AunionC_dict = dict_union(A_dict,C_dict)
# Prints the SNPs of B that survive the subtraction, then the discarded ones.
dict_difference(B_dict,AunionC_dict)

In [None]:
###
### COMPUTE Unique SNPs for FAT_R_P2 = C - (A u B)
###
# Requires A_dict, B_dict and C_dict from the READ cells.
# Merge the samples to subtract; shared SNPs keep the highest-frequency record.
AunionB_dict = dict_union(A_dict,B_dict)
# Prints the SNPs of C that survive the subtraction, then the discarded ones.
dict_difference(C_dict, AunionB_dict)

In [None]:
###
### READ D (FAT_R_C1)
###
# The uploaded file must carry exactly this name; read_file looks it up by name.
D_dict = read_file('D_FAT_RC1 SNPs.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(D_dict)

In [None]:
###
### READ E (FAT_R_C2)
###
# The uploaded file must carry exactly this name; read_file looks it up by name.
E_dict = read_file('E_FAT_R_C2.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(E_dict)

In [None]:
###
### READ F (FAT_R_C3)
###
# The uploaded file must carry exactly this name; read_file looks it up by name.
F_dict = read_file('F_FAT_R_C3.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(F_dict)

In [None]:
###
### COMPUTE Unique SNPs for FAT_R_C1 = D - (A u B u C u E u F)
###
# dict_union merges two dictionaries at a time, so the union of all the
# other samples (everything except D) is built up pairwise.
AuB_dict = dict_union(A_dict, B_dict)
CuE_dict = dict_union(C_dict, E_dict)
AuBuCuE_dict = dict_union(AuB_dict, CuE_dict)
AuBuCuEuF_dict = dict_union(AuBuCuE_dict, F_dict)
# Prints the SNPs of D that survive the subtraction, then the discarded ones.
dict_difference(D_dict, AuBuCuEuF_dict)

In [None]:
###
### COMPUTE Unique SNPs for FAT_R_C2 = E - (A u B u C u D u F)
###
# dict_union merges two dictionaries at a time, so the union of all the
# other samples (everything except E) is built up pairwise.
AuB_dict = dict_union(A_dict, B_dict)
CuD_dict = dict_union(C_dict, D_dict)
AuBuCuD_dict = dict_union(AuB_dict, CuD_dict)
AuBuCuDuF_dict = dict_union(AuBuCuD_dict, F_dict)
# Prints the SNPs of E that survive the subtraction, then the discarded ones.
dict_difference(E_dict, AuBuCuDuF_dict)

In [None]:
###
### COMPUTE Unique SNPs for FAT_R_C3 = F - (A u B u C u D u E)
###
# dict_union merges two dictionaries at a time, so the union of all the
# other samples (everything except F) is built up pairwise.
AuB_dict = dict_union(A_dict, B_dict)
CuD_dict = dict_union(C_dict, D_dict)
AuBuCuD_dict = dict_union(AuB_dict, CuD_dict)
AuBuCuDuE_dict = dict_union(AuBuCuD_dict, E_dict)
# Prints the SNPs of F that survive the subtraction, then the discarded ones.
dict_difference(F_dict, AuBuCuDuE_dict)

In [None]:
###
### COMPUTE Common SNPs in FAT_R_Clones = D u E u F
###
# NOTE: this is a union, i.e. every SNP seen in at least one of the three
# clones; for a SNP seen in several clones the highest-frequency record is kept.
DuE_dict = dict_union(D_dict, E_dict)
# DuEuF_dict is reused by the later cells that combine both clone groups.
DuEuF_dict = dict_union(DuE_dict, F_dict)
dict_print(DuEuF_dict)

In [None]:
###
### READ G (Fos_FAT_R_P)
###
# The uploaded file must carry exactly this name; read_file looks it up by name.
G_dict = read_file('G_Fos_FAT_R_P.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(G_dict)

In [None]:
###
### READ H (Fos_FAT_R_C1)
###
# The uploaded file must carry exactly this name; read_file looks it up by name.
H_dict = read_file('H_Fos_FAT_R_C1.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(H_dict)

In [None]:
###
### READ I (Fos_FAT_R_C2)
###
# The uploaded file must carry exactly this name; read_file looks it up by name.
I_dict = read_file('I_Fos_FAT_R_C2.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(I_dict)

In [None]:
###
### READ J (Fos_FAT_R_C3)
###
# The uploaded file must carry exactly this name (upper-case 'FOS' here,
# unlike the other Fos files); read_file looks it up by name.
J_dict = read_file('J_FOS_FAT_R_C3.xlsx')
# Dump the parsed dictionary for a quick visual check.
print(J_dict)

In [None]:
###
### COMPUTE Unique SNPs for Fos_FAT_R_P = G - (A u B u C u D u E u F)
###
# dict_union merges two dictionaries at a time, so the union of samples
# A..F is built up pairwise.
AuB_dict = dict_union(A_dict, B_dict)
CuD_dict = dict_union(C_dict, D_dict)
EuF_dict = dict_union(E_dict, F_dict)
AuBuCuD_dict = dict_union(AuB_dict, CuD_dict)
AuBuCuDuEuF_dict = dict_union(AuBuCuD_dict,EuF_dict)
# Prints the SNPs of G that survive the subtraction, then the discarded ones.
dict_difference(G_dict, AuBuCuDuEuF_dict)

In [None]:
###
### COMPUTE Unique SNPs for Fos_FAT_R_C1 = H - (A u B u C u D u E u F u G u I u J)
###
# dict_union merges two dictionaries at a time, so the union of all the
# other samples (everything except H) is built up pairwise.
AuB_dict = dict_union(A_dict, B_dict)
CuD_dict = dict_union(C_dict, D_dict)
AuBuCuD_dict = dict_union(AuB_dict, CuD_dict)
EuF_dict = dict_union(E_dict, F_dict)
GuI_dict = dict_union(G_dict, I_dict)
EuFuGuI_dict = dict_union(EuF_dict,GuI_dict)
AuBuCuDuEuFuGuI_dict = dict_union(AuBuCuD_dict,EuFuGuI_dict)
AuBuCuDuEuFuGuIuJ_dict = dict_union(AuBuCuDuEuFuGuI_dict,J_dict)
# Prints the SNPs of H that survive the subtraction, then the discarded ones.
dict_difference(H_dict,AuBuCuDuEuFuGuIuJ_dict)

In [None]:
###
### COMPUTE Unique SNPs for Fos_FAT_R_C2 = I - (A u B u C u D u E u F u G u H u J)
###
# dict_union merges two dictionaries at a time, so the union of all the
# other samples (everything except I) is built up pairwise.
AuB_dict = dict_union(A_dict, B_dict)
CuD_dict = dict_union(C_dict, D_dict)
AuBuCuD_dict = dict_union(AuB_dict, CuD_dict)
EuF_dict = dict_union(E_dict, F_dict)
GuH_dict = dict_union(G_dict, H_dict)
EuFuGuH_dict = dict_union(EuF_dict,GuH_dict)
AuBuCuDuEuFuGuH_dict = dict_union(AuBuCuD_dict,EuFuGuH_dict)
AuBuCuDuEuFuGuHuJ_dict = dict_union(AuBuCuDuEuFuGuH_dict,J_dict)
# Prints the SNPs of I that survive the subtraction, then the discarded ones.
dict_difference(I_dict,AuBuCuDuEuFuGuHuJ_dict)

In [None]:
###
### COMPUTE Unique SNPs for Fos_FAT_R_C3 = J - (A u B u C u D u E u F u G u H u I)
###
# dict_union merges two dictionaries at a time, so the union of all the
# other samples (everything except J) is built up pairwise.
AuB_dict = dict_union(A_dict, B_dict)
CuD_dict = dict_union(C_dict, D_dict)
AuBuCuD_dict = dict_union(AuB_dict, CuD_dict)
EuF_dict = dict_union(E_dict, F_dict)
GuH_dict = dict_union(G_dict, H_dict)
EuFuGuH_dict = dict_union(EuF_dict,GuH_dict)
AuBuCuDuEuFuGuH_dict = dict_union(AuBuCuD_dict,EuFuGuH_dict)
AuBuCuDuEuFuGuHuI_dict = dict_union(AuBuCuDuEuFuGuH_dict,I_dict)
# Prints the SNPs of J that survive the subtraction, then the discarded ones.
dict_difference(J_dict,AuBuCuDuEuFuGuHuI_dict)

In [None]:
###
### COMPUTE Common SNPs in Fos_FAT_R_Clones = H u I u J
###
# NOTE: this is a union, i.e. every SNP seen in at least one of the three
# clones; for a SNP seen in several clones the highest-frequency record is kept.
HuI_dict = dict_union(H_dict,I_dict)
# HuIuJ_dict is reused by the later cells that combine both clone groups.
HuIuJ_dict = dict_union(HuI_dict,J_dict)
dict_print(HuIuJ_dict)

In [None]:
###
### COMPUTE union of both clone groups = D u E u F u H u I u J
###
# Requires DuEuF_dict and HuIuJ_dict from the two "Common SNPs in ... Clones"
# cells; SNPs present in both groups keep the highest-frequency record.
DuEuFuHuIuJ_dict = dict_union(DuEuF_dict,HuIuJ_dict)
dict_print(DuEuFuHuIuJ_dict)

In [None]:
###
### COMPUTE Common SNPs = (D u E u F) intersection (H u I u J)
###
# Requires DuEuF_dict and HuIuJ_dict from the two clone-group union cells.
DuEuFintHuIuJ_dict = {}
for x in DuEuF_dict:
  if x not in HuIuJ_dict:
    continue # only SNPs present in both clone groups are kept
  # keep the record with the higher frequency; on a tie the H u I u J record wins
  DuEuFintHuIuJ_dict[x] = DuEuF_dict[x] if DuEuF_dict[x][0] > HuIuJ_dict[x][0] else HuIuJ_dict[x]
dict_print(DuEuFintHuIuJ_dict)