In [2]:
import re
import pandas as pd
import streamlit as st
import base64

# identifying karyotype reports

In [None]:
st.write('# Cytogenetics calculator')

In [3]:
def load_file(file= 'Cytogenetics_TS_Apr2021.xlsx', streamlit=False):
    '''
    Loads the excel file and drops the ID column
    
    Params:
    -------
    
    file: excel filepath
        the file containing the cytogenetic reports, with columns
        that contain information about details to fill
        
    streamlit: bool, default False
        whether the streamlit interface should be used
    '''
    if streamlit:
        file = st.file_uploader("Upload excel here")
    karyotypes = pd.read_excel(file)    
    karyotypes = karyotypes.drop(columns='ID')
    
    return karyotypes

In [4]:
load_file().head()

Unnamed: 0,Cytogenetics,Number of cytogenetic abnormalities,Monosomy,Structural,"""-Y""","""-X""",del11q,del12p,del13q,del5q,...,t(9;22),t(16;16),inv(16),t(8;21),t(15;17),t(9;11),t(6;11),t(10;11),t(v;11),abnormal(17p)
0,"46,XX[25]_x000D_",,,,,,,,,,...,,,,,,,,,,
1,"46,XY,del(11)(q14q23)[10]",,,,,,,,,,...,,,,,,,,,,
2,"46,XX,del(5)(q22q35)[8/46,XX[2]",,,,,,,,,,...,,,,,,,,,,
3,"45,X,-Y[17]/46,XY[3]",,,,,,,,,,...,,,,,,,,,,
4,"47,XY,+8[9]/46,XY[1]",,,,,,,,,,...,,,,,,,,,,


In [9]:
#karyotypes = pd.read_excel('Cytogenetics_TS_Apr2021.xlsx', )
#file = st.file_uploader("Upload excel here")
#karyotypes = pd.read_excel(file)
#karyotypes = karyotypes.drop(columns='ID')
karyotypes.head()

Unnamed: 0,Cytogenetics,Number of cytogenetic abnormalities,Monosomy,Structural,"""-Y""","""-X""",del11q,del12p,del13q,del5q,...,t(16;16),inv(16),t(8;21),t(15;17),t(9;11),t(6;11),t(10;11),t(v;11),abnormal(17p),Error description
0,"46,XX[25]_x000D_",,,,,,,,,,...,,,,,,,,,,False
1,"46,XY,del(11)(q14q23)[10]",,,,,,,,,,...,,,,,,,,,,False
2,"46,XX,del(5)(q22q35)[8/46,XX[2]",,,,,,,,,,...,,,,,,,,,,False
3,"45,X,-Y[17]/46,XY[3]",,,,,,,,,,...,,,,,,,,,,False
4,"47,XY,+8[9]/46,XY[1]",,,,,,,,,,...,,,,,,,,,,False


In [6]:
karyotypes = load_file()
properties = karyotypes.columns[4:-1].to_list()

In [7]:
properties

['"-Y"',
 '"-X"',
 'del11q',
 'del12p',
 'del13q',
 'del5q',
 'del7q',
 'idic(X)(q13)',
 'isochromosome17q',
 'Monosomy13',
 'Monosomy17',
 'Monosomy5',
 'Monosomy7',
 't(1;3)',
 't(11;16)(q23.3;p13.3)',
 't(12p)',
 't(17p)',
 't(2;11)',
 't(3;21)',
 't(3;5)',
 't(5;10)',
 't(5;12)',
 't(5;17)',
 't(5;7)',
 't(5q)',
 't(1;22)',
 'inv(3)',
 't(3;3)',
 't(6;9)',
 't(9;22)',
 't(16;16)',
 'inv(16)',
 't(8;21)',
 't(15;17)',
 't(9;11)',
 't(6;11)',
 't(10;11)',
 't(v;11)']

In [11]:
def properties_dict(properties=properties):
    d = {p:p for p in properties}
    for k,v in d.items():
        #removing quotation marks at start and end
        if v[0] == '"':
            if v[-1] == v[0]:
                v = v[1:-1]
        
        #recognising monosomies
        if v.startswith('Monosomy'):
            v = '-' + v[8:]
            
        matches = re.finditer('(\d+)(p|q)',v)
        for m in matches:
            f = v[:m.start()] + '(' + m.group()[:-1] + \
                ')(' + m.group()[-1]+ v[m.end():]
            v=f
            
        #creating escape characters for strings
        v = re.escape(v)
        
        if v == 't\\(v;11\\)':
            v = 't\\(\d+;11\\)'
        
        d[k] = v
    return {v:k for k,v in d.items()}

In [12]:
prop_dict = properties_dict()
prop_dict

{'\\-Y': '"-Y"',
 '\\-X': '"-X"',
 'del\\(11\\)\\(q': 'del11q',
 'del\\(12\\)\\(p': 'del12p',
 'del\\(13\\)\\(q': 'del13q',
 'del\\(5\\)\\(q': 'del5q',
 'del\\(7\\)\\(q': 'del7q',
 'idic\\(X\\)\\(q13\\)': 'idic(X)(q13)',
 'isochromosome\\(17\\)\\(q': 'isochromosome17q',
 '\\-13': 'Monosomy13',
 '\\-17': 'Monosomy17',
 '\\-5': 'Monosomy5',
 '\\-7': 'Monosomy7',
 't\\(1;3\\)': 't(1;3)',
 't\\(11;16\\)\\(q23\\.3;p13\\.3\\)': 't(11;16)(q23.3;p13.3)',
 't\\(\\(12\\)\\(p\\)': 't(12p)',
 't\\(\\(17\\)\\(p\\)': 't(17p)',
 't\\(2;11\\)': 't(2;11)',
 't\\(3;21\\)': 't(3;21)',
 't\\(3;5\\)': 't(3;5)',
 't\\(5;10\\)': 't(5;10)',
 't\\(5;12\\)': 't(5;12)',
 't\\(5;17\\)': 't(5;17)',
 't\\(5;7\\)': 't(5;7)',
 't\\(\\(5\\)\\(q\\)': 't(5q)',
 't\\(1;22\\)': 't(1;22)',
 'inv\\(3\\)': 'inv(3)',
 't\\(3;3\\)': 't(3;3)',
 't\\(6;9\\)': 't(6;9)',
 't\\(9;22\\)': 't(9;22)',
 't\\(16;16\\)': 't(16;16)',
 'inv\\(16\\)': 'inv(16)',
 't\\(8;21\\)': 't(8;21)',
 't\\(15;17\\)': 't(15;17)',
 't\\(9;11\\)': 't(9;11

In [13]:
def remove_artefact(row):
    if re.search('_x000D_$', row['Cytogenetics']):
        return row['Cytogenetics'].strip()[:-7]
    return row['Cytogenetics'].strip()

In [14]:
karyotypes['Cytogenetics'] = karyotypes.apply(remove_artefact, axis=1)

In [15]:
karyotypes['Error'] = False

In [16]:
karyotypes['Error description'] = None

In [17]:
punct = r'()[]/'
punct_dict = {s:0 for s in punct}

In [18]:
def gram_error(string):
    missing = list()
    for k in punct_dict:
        punct_dict[k] = len(re.findall(re.escape(k), string))
    if punct_dict['/'] != punct_dict['[']-1:
        return 'incorrrect ratio of \'\\\' to \'[]\' '
    if punct_dict['['] != punct_dict[']']:
        missing.append(min(['[', ']'], key=punct_dict.get))
    if punct_dict['('] != punct_dict[')']:
        missing.append(min(['(',')'], key=punct_dict.get))
    if len(missing)!=0:
        return f'missing grammar: {", ".join(missing)}'
    substring = re.split('/', string)
    for i, s in enumerate(substring):
        chrom = s[:s.index(',')] #chromsome string
        if not re.search('idem', s):
            expected = 46
        expected -= len(re.findall('\-', s))
        expected += len(re.findall('\+', s))
        if re.search('[~]', s):
            low_num, high_num = int(chrom[:2]), int(chrom[-2:]) 
            if low_num <= expected <= high_num:
                pass
            elif expected < low_num:
                return f'chromsome number lower than expected in {i+1} subsection'
            else:
                return f'chromsome number higher than expected in {i+1} subsection'
        else:
            num = int(chrom[:2])
            if expected == num:
                pass
            elif expected > num:
                return f'chromsome number higher than expected in {i+1} subsection'
            else:
                return f'chromsome number lower than expected in {i+1} subsection'
    pass

In [20]:
def parse_karyotype(row):
    error = gram_error(row['Cytogenetics'])
    if error:
        row['Error'] = True
        row['Error description'] = error
        if not error.startswith('chromsome'):
            return row
    abnorms = set(re.split('/|,|\[', row['Cytogenetics']))
    removed = set()
    col_true = set()
    mono = 0
    struc = 0
    der = 0
    seventeen_p = False
    for a in abnorms:
        if re.fullmatch('\d\d([~-]\d\d)?|X[XY]?|(cp)?\d\d?\]|idem', a):
            removed.add(a)
#         if re.search('mar', a): #no longer the case to remove mar
#             removed.add(a)
        
        if a not in removed:
            
            if re.fullmatch('-\d+|-[XY]', a):
                mono += 1
            if re.search('[inv|t].*\).*\)', a):
                struc += 1
            
            if re.search('der', a):
                der += 1

            if re.search('17.*p|-17',a):
                m = re.search('\d\d?;\d\d?', a)
                if m:
                    split = re.split(';', m.group())
                    if re.findall('[pq]', a)[split.index('17')] == 'p':
                        seventeen_p = True
                    else:
                        seventeen_p = False
                else:
                    seventeen_p = True
            
            for p in prop_dict:
                if re.search(p, a):
                    col_true.add(p)
    abnorms = abnorms.difference(removed)
    row['Number of cytogenetic abnormalities'] = len(abnorms) + der
    row['Monosomy'] = mono
    row['Structural'] = struc
    row['abnormal(17p)'] = seventeen_p
    for c in col_true:
        row[prop_dict[c]] = True
    return row

In [21]:
results = karyotypes.apply(parse_karyotype, axis=1, )
results.head()

Unnamed: 0,Cytogenetics,Number of cytogenetic abnormalities,Monosomy,Structural,"""-Y""","""-X""",del11q,del12p,del13q,del5q,...,inv(16),t(8;21),t(15;17),t(9;11),t(6;11),t(10;11),t(v;11),abnormal(17p),Error description,Error
0,"46,XX[25]",0.0,0.0,0.0,,,,,,,...,,,,,,,,False,,False
1,"46,XY,del(11)(q14q23)[10]",1.0,0.0,0.0,,,True,,,,...,,,,,,,,False,,False
2,"46,XX,del(5)(q22q35)[8/46,XX[2]",,,,,,,,,,...,,,,,,,,,missing grammar: ],True
3,"45,X,-Y[17]/46,XY[3]",1.0,1.0,0.0,True,,,,,,...,,,,,,,,False,,False
4,"47,XY,+8[9]/46,XY[1]",1.0,0.0,0.0,,,,,,,...,,,,,,,,False,,False


In [22]:
results.loc[results['Error'].astype(bool)]

Unnamed: 0,Cytogenetics,Number of cytogenetic abnormalities,Monosomy,Structural,"""-Y""","""-X""",del11q,del12p,del13q,del5q,...,inv(16),t(8;21),t(15;17),t(9;11),t(6;11),t(10;11),t(v;11),abnormal(17p),Error description,Error
2,"46,XX,del(5)(q22q35)[8/46,XX[2]",,,,,,,,,,...,,,,,,,,,missing grammar: ],True
14,"45,X,-Y[6] 46,XY[14]",,,,,,,,,,...,,,,,,,,,incorrrect ratio of '\' to '[]',True
33,"45-47,XY,add(3)(p13),-5,add(6)(p22),del(7)(q22...",10.0,3.0,0.0,,,,,,,...,,,,,,,,True,chromsome number lower than expected in 1 subs...,True
69,"46,XY,+1,der(1;7)(q10;p10)[9]/46,XY[4]",3.0,0.0,0.0,,,,,,,...,,,,,,,,False,chromsome number higher than expected in 1 sub...,True
71,"46,XY,-11,add(18)(q21),-20,add(20)(p12),+2mar[10]",5.0,2.0,0.0,,,,,,,...,,,,,,,,False,chromsome number lower than expected in 1 subs...,True
78,"43~47,X,-Y,add(1)(p36.3),-5,add(5)(q11.2),-8,a...",15.0,8.0,0.0,True,,,,,,...,,,,,,,,True,chromsome number lower than expected in 1 subs...,True
115,"45,XX,-7[22]/46,idem,+12[3]/47,idem,+12,+20[5]",3.0,1.0,0.0,,,,,,,...,,,,,,,,False,chromsome number higher than expected in 3 sub...,True
127,"45,XX,add(4)(q31),-5,-6,-17,del(20)(q11.2q13.1...",6.0,3.0,0.0,,,,,,,...,,,,,,,,True,chromsome number lower than expected in 1 subs...,True
147,"46,XY,add(5)(q14),del(7)(q31q36),+8,?dup(10)(q...",13.0,3.0,0.0,,,,,,,...,,,,,,,,False,chromsome number lower than expected in 1 subs...,True


In [27]:
results.loc[results['Error']==False] = results.loc[results['Error']==False].fillna(False)

In [28]:
results.head()

Unnamed: 0,Cytogenetics,Number of cytogenetic abnormalities,Monosomy,Structural,"""-Y""","""-X""",del11q,del12p,del13q,del5q,...,inv(16),t(8;21),t(15;17),t(9;11),t(6;11),t(10;11),t(v;11),abnormal(17p),Error description,Error
0,"46,XX[25]",0.0,0.0,0.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"46,XY,del(11)(q14q23)[10]",1.0,0.0,0.0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,"46,XX,del(5)(q22q35)[8/46,XX[2]",,,,,,,,,,...,,,,,,,,,missing grammar: ],True
3,"45,X,-Y[17]/46,XY[3]",1.0,1.0,0.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,"47,XY,+8[9]/46,XY[1]",1.0,0.0,0.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
#results.any()

Cytogenetics                            True
Number of cytogenetic abnormalities     True
Monosomy                                True
Structural                              True
"-Y"                                    True
"-X"                                   False
del11q                                  True
del12p                                 False
del13q                                  True
del5q                                   True
del7q                                   True
idic(X)(q13)                           False
isochromosome17q                       False
Monosomy13                              True
Monosomy17                              True
Monosomy5                               True
Monosomy7                               True
t(1;3)                                 False
t(11;16)(q23.3;p13.3)                  False
t(12p)                                 False
t(17p)                                 False
t(2;11)                                False
t(3;21)   

In [115]:
#results.to_excel('Cytogenetics_output_V3.xlsx')

In [29]:
global

{'\\-Y': '"-Y"',
 '\\-X': '"-X"',
 'del\\(11\\)\\(q': 'del11q',
 'del\\(12\\)\\(p': 'del12p',
 'del\\(13\\)\\(q': 'del13q',
 'del\\(5\\)\\(q': 'del5q',
 'del\\(7\\)\\(q': 'del7q',
 'idic\\(X\\)\\(q13\\)': 'idic(X)(q13)',
 'isochromosome\\(17\\)\\(q': 'isochromosome17q',
 '\\-13': 'Monosomy13',
 '\\-17': 'Monosomy17',
 '\\-5': 'Monosomy5',
 '\\-7': 'Monosomy7',
 't\\(1;3\\)': 't(1;3)',
 't\\(11;16\\)\\(q23\\.3;p13\\.3\\)': 't(11;16)(q23.3;p13.3)',
 't\\(\\(12\\)\\(p\\)': 't(12p)',
 't\\(\\(17\\)\\(p\\)': 't(17p)',
 't\\(2;11\\)': 't(2;11)',
 't\\(3;21\\)': 't(3;21)',
 't\\(3;5\\)': 't(3;5)',
 't\\(5;10\\)': 't(5;10)',
 't\\(5;12\\)': 't(5;12)',
 't\\(5;17\\)': 't(5;17)',
 't\\(5;7\\)': 't(5;7)',
 't\\(\\(5\\)\\(q\\)': 't(5q)',
 't\\(1;22\\)': 't(1;22)',
 'inv\\(3\\)': 'inv(3)',
 't\\(3;3\\)': 't(3;3)',
 't\\(6;9\\)': 't(6;9)',
 't\\(9;22\\)': 't(9;22)',
 't\\(16;16\\)': 't(16;16)',
 'inv\\(16\\)': 'inv(16)',
 't\\(8;21\\)': 't(8;21)',
 't\\(15;17\\)': 't(15;17)',
 't\\(9;11\\)': 't(9;11

In [None]:
def download_link(object_to_download, download_filename, download_link_text):
    """
    Generates a link to download the given object_to_download.

    object_to_download (str, pd.DataFrame):  The object to be downloaded.
    download_filename (str): filename and extension of file. e.g. mydata.csv, some_txt_output.txt
    download_link_text (str): Text to display for download link.

    Examples:
    download_link(YOUR_DF, 'YOUR_DF.csv', 'Click here to download data!')
    download_link(YOUR_STRING, 'YOUR_STRING.txt', 'Click here to download your text!')

    """
    if isinstance(object_to_download,pd.DataFrame):
        object_to_download = object_to_download.to_csv(index=False)

    # some strings <-> bytes conversions necessary here
    b64 = base64.b64encode(object_to_download.encode()).decode()

    return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'

In [None]:
st.write(results)

In [None]:
if st.button('Download Dataframe as CSV'):
    tmp_download_link = download_link(results, 'YOUR_DF.csv', 'Click here to download your data!')
    st.markdown(tmp_download_link, unsafe_allow_html=True)