In [1]:
import re
import pandas as pd
import streamlit as st
import base64

# identifying karyotype reports

In [2]:
st.write('# Cytogenetics calculator')

  command:

    streamlit run C:\Users\tamir\anaconda\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [3]:
def load_file(file= 'Cytogenetics_TS_Apr2021.xlsx', streamlit=False):
    '''
    Loads the excel file and drops the ID column
    
    Params:
    -------
    
    file: excel filepath
        the file containing the cytogenetic reports, with columns
        that contain information about details to fill
        
    streamlit: bool, default False
        whether the streamlit interface should be used
    '''
    if streamlit:
        file = st.file_uploader("Upload excel here")
    karyotypes = pd.read_excel(file)    
    karyotypes = karyotypes.drop(columns='ID')
    
    return karyotypes

In [4]:
load_file('Cytogenetics_TS_Apr2021.xlsx').head()

Unnamed: 0,Cytogenetics,Number of cytogenetic abnormalities,Monosomy,Structural,"""-Y""","""-X""",del11q,del12p,del13q,del5q,...,t(9;22),t(16;16),inv(16),t(8;21),t(15;17),t(9;11),t(6;11),t(10;11),t(v;11),abnormal(17p)
0,"46,XX[25]_x000D_",,,,,,,,,,...,,,,,,,,,,
1,"46,XY,del(11)(q14q23)[10]",,,,,,,,,,...,,,,,,,,,,
2,"46,XX,del(5)(q22q35)[8/46,XX[2]",,,,,,,,,,...,,,,,,,,,,
3,"45,X,-Y[17]/46,XY[3]",,,,,,,,,,...,,,,,,,,,,
4,"47,XY,+8[9]/46,XY[1]",,,,,,,,,,...,,,,,,,,,,


In [5]:
karyotypes = load_file('Cytogenetics_TS_Apr2021.xlsx')
properties = karyotypes.columns[4:-1].to_list()

In [6]:
#karyotypes = pd.read_excel('Cytogenetics_TS_Apr2021.xlsx', )
#file = st.file_uploader("Upload excel here")
#karyotypes = pd.read_excel(file)
#karyotypes = karyotypes.drop(columns='ID')
karyotypes.head()

Unnamed: 0,Cytogenetics,Number of cytogenetic abnormalities,Monosomy,Structural,"""-Y""","""-X""",del11q,del12p,del13q,del5q,...,t(9;22),t(16;16),inv(16),t(8;21),t(15;17),t(9;11),t(6;11),t(10;11),t(v;11),abnormal(17p)
0,"46,XX[25]_x000D_",,,,,,,,,,...,,,,,,,,,,
1,"46,XY,del(11)(q14q23)[10]",,,,,,,,,,...,,,,,,,,,,
2,"46,XX,del(5)(q22q35)[8/46,XX[2]",,,,,,,,,,...,,,,,,,,,,
3,"45,X,-Y[17]/46,XY[3]",,,,,,,,,,...,,,,,,,,,,
4,"47,XY,+8[9]/46,XY[1]",,,,,,,,,,...,,,,,,,,,,


In [7]:
properties

['"-Y"',
 '"-X"',
 'del11q',
 'del12p',
 'del13q',
 'del5q',
 'del7q',
 'idic(X)(q13)',
 'isochromosome17q',
 'Monosomy13',
 'Monosomy17',
 'Monosomy5',
 'Monosomy7',
 't(1;3)',
 't(11;16)(q23.3;p13.3)',
 't(12p)',
 't(17p)',
 't(2;11)',
 't(3;21)',
 't(3;5)',
 't(5;10)',
 't(5;12)',
 't(5;17)',
 't(5;7)',
 't(5q)',
 't(1;22)',
 'inv(3)',
 't(3;3)',
 't(6;9)',
 't(9;22)',
 't(16;16)',
 'inv(16)',
 't(8;21)',
 't(15;17)',
 't(9;11)',
 't(6;11)',
 't(10;11)',
 't(v;11)']

In [52]:
def properties_dict(karyotypes, properties = None):
    '''
    1. Transforms column names in input file into relevant 
    string formats to be used in regex report extraction
    2. Returns a dictionary having these strings as keys,
    with the original column names as values
    
    Params:
    -------
    karyotypes: pd.DataFrame
        dataframe obtained in load_file
        
    properties: list, default None
        option to obtain string names for abnormalities directly.
    '''
    if not properties:
        properties = karyotypes.columns[4:-1].to_list()
    d = {p:p for p in properties}
    for k,v in d.items():
        #removing quotation marks at start and end
        if v[0] == '"':
            if v[-1] == v[0]:
                v = v[1:-1]
        
        #Formatting monosomies
        if v.startswith('Monosomy'):
            v = '-' + v[8:]
        
        #
        matches = re.finditer('(\d+)(p|q)',v)
        for m in matches:
            f = v[:m.start()] + '(' + m.group()[:-1] + \
                ')(' + m.group()[-1]+ v[m.end():]
            v=f
            
        #creating escape characters for strings
        v = re.escape(v)
        
        #creating special case for for t(v;11)
        if v == 't\\(v;11\\)':
            v = '(t\\(\d+;11\\))|(t\\(11;\d+\\))'
        
        d[k] = v
    return {v:k for k,v in d.items()}

In [9]:
prop_dict = properties_dict()
prop_dict

{'\\-Y': '"-Y"',
 '\\-X': '"-X"',
 'del\\(11\\)\\(q': 'del11q',
 'del\\(12\\)\\(p': 'del12p',
 'del\\(13\\)\\(q': 'del13q',
 'del\\(5\\)\\(q': 'del5q',
 'del\\(7\\)\\(q': 'del7q',
 'idic\\(X\\)\\(q13\\)': 'idic(X)(q13)',
 'isochromosome\\(17\\)\\(q': 'isochromosome17q',
 '\\-13': 'Monosomy13',
 '\\-17': 'Monosomy17',
 '\\-5': 'Monosomy5',
 '\\-7': 'Monosomy7',
 't\\(1;3\\)': 't(1;3)',
 't\\(11;16\\)\\(q23\\.3;p13\\.3\\)': 't(11;16)(q23.3;p13.3)',
 't\\(\\(12\\)\\(p\\)': 't(12p)',
 't\\(\\(17\\)\\(p\\)': 't(17p)',
 't\\(2;11\\)': 't(2;11)',
 't\\(3;21\\)': 't(3;21)',
 't\\(3;5\\)': 't(3;5)',
 't\\(5;10\\)': 't(5;10)',
 't\\(5;12\\)': 't(5;12)',
 't\\(5;17\\)': 't(5;17)',
 't\\(5;7\\)': 't(5;7)',
 't\\(\\(5\\)\\(q\\)': 't(5q)',
 't\\(1;22\\)': 't(1;22)',
 'inv\\(3\\)': 'inv(3)',
 't\\(3;3\\)': 't(3;3)',
 't\\(6;9\\)': 't(6;9)',
 't\\(9;22\\)': 't(9;22)',
 't\\(16;16\\)': 't(16;16)',
 'inv\\(16\\)': 'inv(16)',
 't\\(8;21\\)': 't(8;21)',
 't\\(15;17\\)': 't(15;17)',
 't\\(9;11\\)': 't(9;11

In [10]:
def remove_artefact(row):
    if pd.notna(row['Cytogenetics']):
        if re.search('_x000D_$', row['Cytogenetics']):
            return row['Cytogenetics'].replace(' ', '')[:-7]
        return row['Cytogenetics'].replace(' ', '')
    return 'Error'

In [11]:
karyotypes['Cytogenetics'] = karyotypes.apply(remove_artefact, axis=1)

In [12]:
karyotypes['Error'] = False

In [13]:
karyotypes['Error description'] = None

In [14]:
punct = r'()[]/'
punct_dict = {s:0 for s in punct}

In [15]:
def gram_error(string):
    error = []
    if string == 'Error':
        error.append('String report missing')
    if re.search('fail', string.lower()):
        error.append('String report indicates failure')
    if not re.search(',', string):
        error.append('Missing comma')
    if re.search('[^a-z]?c[^p]', string) or re.search('[^a-z]c$', string):
        error.append('constitutional changes present')
    missing = list()
    for k in punct_dict:
        punct_dict[k] = len(re.findall(re.escape(k), string))
    if punct_dict['/'] != punct_dict['[']-1:
        error.append('incorrrect ratio of \'\\\' to \'[]\' ')
    if punct_dict['['] != punct_dict[']']:
        missing.append(min(['[', ']'], key=punct_dict.get))
    if punct_dict['('] != punct_dict[')']:
        missing.append(min(['(',')'], key=punct_dict.get))
    if len(missing)!=0:
        error.append(f'missing grammar: {", ".join(missing)}')
    substring = re.split('/', string)
    for i, s in enumerate(substring):
        try:
            chrom = s[:s.index(',')] #chromosome string
        except ValueError:
            error.append('Part of report missing comma')
        if not re.search('idem', s):
            expected = 46
        expected -= len(re.findall('\-', s))
        expected += len(re.findall('\+', s))
        if re.search('mar', s):
            mar_plural = re.search('\+(\d)(~\d)?mar',s)
            if mar_plural:
                expected += int(mar_plural.groups()[0]) - 1
                tilda_present =  re.search('\+(\d)~(\d)mar', s)
                if tilda_present:
                    d = tilda_present.group(1)
                    error.append(f'Variable number of markers in report. Minimum number ({d}) used by default')

        if re.search('[~]', chrom):
            try:
                low_num, high_num = int(chrom[:2]), int(chrom[-2:])
            except ValueError:
                error.append('Part of report not clearly defined by two chromosome numbers followed by comma (e.g. "43~45,")')
            if low_num <= expected <= high_num:
                pass
            elif expected > high_num:
                error.append(f'chromosome number lower than expected in subsection number {i+1}')
            else:
                error.append(f'chromosome number higher than expected in subsection number {i+1}')
        else:
            try:
                num = int(chrom[:2])
            except ValueError:
                error.append('Start of report missing clear chromosome number followed by comma (e.g. "46,")')
            if expected == num:
                pass
            elif expected < num:
                error.append(f'chromosome number higher than expected in subsection number {i+1}')
            else:
                error.append(f'chromosome number lower than expected in subsection number {i+1}')
    return error

In [16]:
#gram_error('45-47,XY,add(3)(p13),-5,add(6)(p22),del(7)(q22q36),add(9)(q34),add(12)(p12),-13,-17,+r,+1~4mar[cp8]/46,XY[2]')

In [77]:
def make_multi_translocation_dict(string):
    exp = '(\d\d?);(\d\d?)'
    while re.search(exp, string):
        chr_groups = list(re.search(exp, string).groups())
        exp = exp + ';(\d\d?)'
    if re.search('[pq]', string):
        exp2 = '([pq]\d\d?\.?\d?\d?)'
        for g in range(len(chr_groups)-1):
            exp2 = exp2 + ';([pq]\d\d?\.?\d?\d?)'
        arm_groups = list(re.search(exp2, string).groups())
        return {int(c):a for c,a in zip(chr_groups, arm_groups)}
    return chr_groups

In [78]:
def check_trans_dict(trans_dict, col_true):
    chr_keys = list(trans_dict)
    for i in range(len(chr_keys)-1):
        sorted_list = sorted([chr_keys[i],chr_keys[i+1]])
        first_chr, second_chr = sorted_list[0], sorted_list[1]
        for p in prop_dict:
            if re.search(p, f't({first_chr};{second_chr})'):
                col_true.add(p)
    return col_true

In [84]:
def parse_karyotype(row, prop_dict):
    error = gram_error(row['Cytogenetics'])
    if error:
        row['Error description'] = error
        for e in error:
            if not e.startswith('chromosome'):
                if not e.startswith('Variable'):
                    if not e.startswith('constitutional'):
                        row['Error'] = True
                        return row

    abnorms = set(re.split('/|,|\[', row['Cytogenetics']))
    removed = set()
    col_true = set()
    mono = 0
    struc = 0
    der = 0
    mar = 0
    seventeen_p = False
    for a in abnorms:
        if re.fullmatch('(\d\d([~-]\d\d)?|X[XY]?|(cp)?\d\d?\]|idem)(\??c)?', a):
            removed.add(a)
#         if re.search('mar', a): #no longer the case to remove mar
#             removed.add(a)
        
        if a not in removed:
            
            if re.search('t\(\d\d?;\d\d?;\d\d?',a):
                trans_dict = make_multi_translocation_dict(a)
                col_true = check_trans_dict(trans_dict, col_true)
                
            if re.search('mar', a):
                mar_plural = re.search('\+(\d)',a)
                if mar_plural:
                    mar += int(mar_plural.groups()[0])
                else:
                    mar += 1
            
            if re.fullmatch('-\d+|-[XY]', a):
                mono += 1
            if re.search('[inv|t].*\).*\)', a):
                struc += 1
            
            if re.search('der', a):
                der += 1

            if re.search('17.*p|-17',a):
                m = re.search('\d\d?;\d\d?', a)
                if m:
                    split = re.split(';', m.group())
                    if re.findall('[pq]', a)[split.index('17')] == 'p':
                        seventeen_p = True
                    else:
                        seventeen_p = False
                else:
                    seventeen_p = True
            
            for p in prop_dict:
                if re.search(p, a):
                    col_true.add(p)
    abnorms = abnorms.difference(removed)
    row['Number of cytogenetic abnormalities'] = len(abnorms) + der + mar
    row['Monosomy'] = mono
    row['Structural'] = struc
    row['abnormal(17p)'] = seventeen_p
    for c in col_true:
        row[prop_dict[c]] = True
    return row

In [85]:
karyotypes.head()

Unnamed: 0,Cytogenetics,Number of cytogenetic abnormalities,Monosomy,Structural,"""-Y""","""-X""",del11q,del12p,del13q,del5q,...,inv(16),t(8;21),t(15;17),t(9;11),t(6;11),t(10;11),t(v;11),abnormal(17p),Error,Error description
0,"46,XX[25]",,,,,,,,,,...,,,,,,,,,False,
1,"46,XY,del(11)(q14q23)[10]",,,,,,,,,,...,,,,,,,,,False,
2,"46,XX,del(5)(q22q35)[8/46,XX[2]",,,,,,,,,,...,,,,,,,,,False,
3,"45,X,-Y[17]/46,XY[3]",,,,,,,,,,...,,,,,,,,,False,
4,"47,XY,+8[9]/46,XY[1]",,,,,,,,,,...,,,,,,,,,False,


In [86]:
rows_done = 0
while True:
     parse_karyotype(karyotypes.iloc[rows_done], prop_dict)
     rows_done += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

IndexError: single positional indexer is out-of-bounds

In [87]:
rows_done

159

In [88]:
karyotypes.iloc[rows_done]

Cytogenetics                           46,XX,t(2;7)(p1?3;q36)[3]/47,XX,+8[3]/46,XX[4]
Number of cytogenetic abnormalities                                               NaN
Monosomy                                                                          NaN
Structural                                                                        NaN
"-Y"                                                                              NaN
"-X"                                                                              NaN
del11q                                                                            NaN
del12p                                                                            NaN
del13q                                                                            NaN
del5q                                                                             NaN
del7q                                                                             NaN
idic(X)(q13)                                          

In [45]:
results = karyotypes.apply(parse_karyotype, axis=1, )
results.head()

<class 'set'>
<re.Match object; span=(9, 16), match='q24;q21'>
<class 'set'>
<re.Match object; span=(9, 16), match='q27;q23'>
<class 'set'>
<re.Match object; span=(8, 15), match='q21;p13'>
<class 'set'>
<re.Match object; span=(8, 15), match='q34;q11'>
<class 'set'>
<re.Match object; span=(8, 15), match='q21;q23'>
<class 'set'>
<re.Match object; span=(8, 15), match='q21;q23'>
<class 'set'>
<re.Match object; span=(8, 15), match='p16;q11'>
<class 'set'>
<re.Match object; span=(8, 15), match='p16;q11'>
<class 'set'>
<re.Match object; span=(8, 15), match='q34;q11'>
<class 'set'>
<re.Match object; span=(8, 15), match='q34;q11'>
<class 'set'>
<re.Match object; span=(9, 16), match='p14;q22'>
<class 'set'>
<re.Match object; span=(8, 15), match='q22;q22'>
<class 'set'>
<re.Match object; span=(9, 16), match='q24;q21'>
<class 'set'>
<re.Match object; span=(9, 16), match='q24;q21'>
<class 'set'>
<re.Match object; span=(8, 15), match='q34;q11'>
<class 'set'>
None


AttributeError: 'NoneType' object has no attribute 'groups'

In [None]:
results.loc[results['Error'].astype(bool)]

In [None]:
results.loc[results['Error']==False] = results.loc[results['Error']==False].fillna(False)

In [None]:
results.any()

In [None]:
#results.to_excel('Cytogenetics_output_V4.xlsx')

In [50]:
#Dan's code
def setup(abnormalities):
    """
    convert a list of abnormalities into a config for the extractor
    """
    prop_dict = properties_dict(karyotypes=None, properties=abnormalities)
    return prop_dict

def extract_from_string(karyotype, prop_dict):
    """
    Run extraction on a single karyotype string, extraction based on prop_dict
    prop_dict can be created with setup()
    This function guarantees the output will have a property key for every abnormality value in prop_dict
    as well as some additional created by parse_karyotype
    Anything in prop_dict that is not detected will default to False
    """
    input = {
        'Cytogenetics': karyotype.strip(),
        'Error': False,
        'Error description': ""
    }
    result = parse_karyotype(input, prop_dict)
    for abn in prop_dict.values():
        if abn not in result:
            result[abn] = False
    output = {'error': result['Error'], 'error_message': result['Error description'], 'result': result}
    return output

def base_extraction():
    ex = ["-Y", "-X", 'del11q', 'del12p', 
    'del13q', 'del5q', 'del7q', 'idic(X)(q13)', 'isochromosome17q', 'Monosomy13', 
    'Monosomy17', 'Monosomy5', 'Monosomy7', 't(1;3)', 't(11;16)(q23.3;p13.3)', 
    't(12p)', 't(17p)', 't(2;11)', 't(3;21)', 't(3;5)', 't(5;10)', 't(5;12)', 
    't(5;17)', 't(5;7)', 't(5q)', 't(1;22)', 'inv(3)', 't(3;3)', 't(6;9)', 
    't(9;22)', 't(16;16)', 'inv(16)', 't(8;21)', 't(15;17)', 't(9;11)', 
    't(6;11)', 't(10;11)', 't(v;11)']
    return ex

In [58]:
karyotypes['Cytogenetics']

0                            46,XX[25]
1            46,XY,del(11)(q14q23)[10]
2      46,XX,del(5)(q22q35)[8/46,XX[2]
3                 45,X,-Y[17]/46,XY[3]
4                 47,XY,+8[9]/46,XY[1]
                    ...               
154                          46,XY[30]
155                          46,XX[20]
156                          46,XY[20]
157                          46,XX[20]
158                          46,XX[20]
Name: Cytogenetics, Length: 159, dtype: object

In [62]:
#Dan's example
for report in karyotypes['Cytogenetics'].iloc[109:]:
    abn = base_extraction()
    props = setup(abn)
    result = extract_from_string(report, props)
#print(report)
#print(result)

<class 'set'>
<re.Match object; span=(8, 15), match='q34;q11'>
<class 'set'>
<re.Match object; span=(8, 15), match='q34;q11'>
<class 'set'>
None


AttributeError: 'NoneType' object has no attribute 'groups'

In [None]:
def download_link(object_to_download, download_filename, download_link_text):
    """
    Generates a link to download the given object_to_download.

    object_to_download (str, pd.DataFrame):  The object to be downloaded.
    download_filename (str): filename and extension of file. e.g. mydata.csv, some_txt_output.txt
    download_link_text (str): Text to display for download link.

    Examples:
    download_link(YOUR_DF, 'YOUR_DF.csv', 'Click here to download data!')
    download_link(YOUR_STRING, 'YOUR_STRING.txt', 'Click here to download your text!')

    """
    if isinstance(object_to_download,pd.DataFrame):
        object_to_download = object_to_download.to_csv(index=False)

    # some strings <-> bytes conversions necessary here
    b64 = base64.b64encode(object_to_download.encode()).decode()

    return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'

In [None]:
st.write(results)

In [None]:
if st.button('Download Dataframe as CSV'):
    tmp_download_link = download_link(results, 'YOUR_DF.csv', 'Click here to download your data!')
    st.markdown(tmp_download_link, unsafe_allow_html=True)