In [1]:
import pandas as pd
import ast
from tqdm import tqdm
pd.set_option('display.max_rows', None)
tqdm.pandas()

In [2]:
df = pd.read_excel('parsed_df.xlsx', engine='openpyxl')

In [3]:
df.head()

Unnamed: 0,pmcs,patient_description,Clinical characteristics,"C-reactive protein, mg/dL",D-Dimer,Age,Prothrombin time,Sex,Creatinine mg/dl,White blood cell count,...,Hematocrit (%),ICU admission,Race,Hospital LOS (d),"Highest temperature, °C",Monocytes,GFR (ml/min),Duration of symptoms,Potassium (mEq/L),Risc factors
0,PMC7200342,"Case 1 is a 45-year-old woman, with no obvious...",,142,803,,,,,,...,,,,,,,,,,
1,PMC7200342,Case 2 is a 67-year-old woman who suffered fro...,,4,1040,,,,,,...,,,,,,,,,,
2,PMC7200342,Case 3 is a 72-year-old male with a history of...,,33,644,,,,,,...,,,,,,,,,,
3,PMC7200342,Case 4 is a 77-year-old male admitting to emer...,,366,378,,,,,,...,,,,,,,,,,
4,PMC7303641,A 55-year old male was referred to the hospita...,,12,82,56.0,,Male,,,...,,,,,,,,≈ 14,,


In [4]:
df_ner = pd.read_excel('thesis_dataframe.xlsx', engine='openpyxl')

In [5]:
df_ner[df_ner.pmcs == 'PMC7200342']

Unnamed: 0,pmcs,patient_description,markup
65,PMC7200342,Case 4 is a 77-year-old male admitting to emer...,"[(12, 14, 'Age'), (24, 28, 'Sex'), (62, 67, 'H..."
66,PMC7200342,Case 3 is a 72-year-old male with a history of...,"[(12, 14, 'Age'), (24, 28, 'Sex'), (47, 59, 'C..."
67,PMC7200342,Case 2 is a 67-year-old woman who suffered fro...,"[(12, 14, 'Age'), (24, 29, 'Sex'), (48, 58, 'C..."
68,PMC7200342,"Case 1 is a 45-year-old woman, with no obvious...","[(12, 14, 'Age'), (24, 29, 'Sex'), (113, 132, ..."


In [6]:
df_with_markup = df.merge(df_ner[['pmcs', 'patient_description', 'markup']], on=['pmcs', 'patient_description'])

In [7]:
def get_relevant_entity(text, labels):
    res = {}
    labels = ast.literal_eval(labels)
    if labels:
        for ent in labels:        
            entity = text[ent[0]:ent[1]]
            entity = entity.strip()

            if ent[2] not in res:
                res[ent[2]] = [entity]
            else:
                res[ent[2]].append(entity)
    return res

In [8]:
df_with_markup['ners_true'] = df_with_markup.progress_apply(lambda x: get_relevant_entity(
    text=x['patient_description'], 
    labels=x['markup']
), axis=1)

100%|██████████| 247/247 [00:00<00:00, 4938.36it/s]


In [9]:
col_list = list(df.columns)

col_list.remove('pmcs')
col_list.remove('patient_description')

In [10]:
df_with_markup.fillna(value='NaN', inplace=True)

In [11]:
for i, row in df_with_markup.iterrows():
    for col in col_list:
        if row[col] == 'NaN' and col in row['ners_true']:
            df_with_markup.at[i, col] = '; '.join(row['ners_true'][col])

In [12]:
def fill_death(x):
    if x == 'NaN':
        return 0
    x = x.lower()
    
    markers = ['died', 'expired', 'passed', 'terminal', 'death', 'yes', 'diseaced']
    for m in markers:
        if m in x:
            return 1
    return 0

In [13]:
df_with_markup['Death'] = df_with_markup['Death'].apply(fill_death)

In [14]:
df_with_markup.head()

Unnamed: 0,pmcs,patient_description,Clinical characteristics,"C-reactive protein, mg/dL",D-Dimer,Age,Prothrombin time,Sex,Creatinine mg/dl,White blood cell count,...,Race,Hospital LOS (d),"Highest temperature, °C",Monocytes,GFR (ml/min),Duration of symptoms,Potassium (mEq/L),Risc factors,markup,ners_true
0,PMC7200342,"Case 1 is a 45-year-old woman, with no obvious...",left facial paresis; dysarthria; left hemipare...,142,803,45,,woman,,,...,,,high fever,,,,,,"[(12, 14, 'Age'), (24, 29, 'Sex'), (113, 132, ...","{'Age': ['45'], 'Sex': ['woman'], 'Clinical ch..."
1,PMC7200342,Case 2 is a 67-year-old woman who suffered fro...,dysarthria; right hemiparesis; upper respirato...,4,1040,67,,woman,,,...,,,intermittent fever,,,,,,"[(12, 14, 'Age'), (24, 29, 'Sex'), (48, 58, 'C...","{'Age': ['67'], 'Sex': ['woman'], 'Clinical ch..."
2,PMC7200342,Case 3 is a 72-year-old male with a history of...,hypertension; loss of consciousness; dysarthri...,33,644,72,,male,,,...,,,,,,,,,"[(12, 14, 'Age'), (24, 28, 'Sex'), (47, 59, 'C...","{'Age': ['72'], 'Sex': ['male'], 'Clinical cha..."
3,PMC7200342,Case 4 is a 77-year-old male admitting to emer...,cough; shortness of breath; left hemi-hypoesth...,366,378,77,,male,,,...,,,fever,,,,,,"[(12, 14, 'Age'), (24, 28, 'Sex'), (62, 67, 'H...","{'Age': ['77'], 'Sex': ['male'], 'Highest temp..."
4,PMC7303641,A 55-year old male was referred to the hospita...,suspected deep vein thrombosis (DVT); unilater...,12,82,56,,Male,,,...,,,,,,≈ 14,,,"[(2, 4, 'Sex'), (14, 18, 'Age'), (53, 89, 'Cli...","{'Sex': ['55'], 'Age': ['male'], 'Clinical cha..."


In [15]:
# df_with_markup.to_csv('df_with_markup.csv', index=False)

In [16]:
df_for_modelling = df_with_markup.loc[:, ~df_with_markup.columns.isin(['pmcs', 'markup', 'ners_true'])]

In [17]:
X = df_for_modelling.loc[:, df_for_modelling.columns != 'Death']
y = df_for_modelling['Death']

In [18]:
col_list_X = list(X.columns)
col_list_X.remove('patient_description')

In [19]:
def collect_vectors(row, col_list=col_list_X):
    text = []
    for col in col_list:
        if row[col] != 'NaN':
            text.append(str(row[col]))
    return ' '.join(text)

In [20]:
X['vectors'] = X.progress_apply(
    lambda row: collect_vectors(row=row, col_list=col_list_X),
    axis=1
)

100%|██████████| 247/247 [00:00<00:00, 5735.60it/s]


# Tf-idf + Logreg

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import eli5

In [26]:
import spacy

nlp = spacy.load('ru_core_news_md')

In [60]:
def get_clean_sents(text):
    doc = nlp(text)
    new_text = []
    markers = ['died', 'expired', 'passed', 'terminal', 'death', 'diseaced', 'discharged']
    for sent in doc.sents:
        sent = sent.text
        
        flag = True
        for m in markers:
            if m in sent:
                flag = False
                break
        if flag:
            new_text.append(sent)
    return ' '.join(new_text)

In [61]:
X['patient_description_wo_death'] = X['patient_description'].progress_apply(get_clean_sents)

100%|██████████| 247/247 [00:06<00:00, 39.86it/s]


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [63]:
X_patient_train = X_train['patient_description']
X_patient_test = X_test['patient_description']

In [64]:
X_patient_train_honest = X_train['patient_description_wo_death']
X_patient_test_honest = X_test['patient_description_wo_death']

In [65]:
X_rest_train = X_train['vectors']
X_rest_test = X_test['vectors']

In [66]:
X_all_train = X_train['patient_description_wo_death'] + ' ' + X_train['vectors']
X_all_test = X_test['patient_description_wo_death'] + ' ' + X_test['vectors']

In [87]:
steps = [
    ('vectorizer', TfidfVectorizer(min_df=0.1, max_df=0.5)),
    ('logreg', LogisticRegression(C=25))
]

pipe = Pipeline(steps=steps)

In [88]:
pipe.fit(X_patient_train, y_train)

preds = pipe.predict(X_patient_test)

print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93        60
           1       0.77      0.67      0.71        15

    accuracy                           0.89        75
   macro avg       0.84      0.81      0.82        75
weighted avg       0.89      0.89      0.89        75



In [91]:
report = classification_report(y_test, preds, output_dict=True)
df_classification_report = pd.DataFrame(report).transpose()
df_classification_report = df_classification_report.round(decimals=2)

df_classification_report = df_classification_report.astype(str)
df_classification_report.at['accuracy', 'precision'] = ''
df_classification_report.at['accuracy', 'recall'] = ''
df_classification_report.at['accuracy', 'support'] = df_classification_report.at['macro avg', 'support']
df_classification_report['support'] = df_classification_report['support'].str.replace('[.]0', '')

In [93]:
print(df_classification_report.to_latex())

\begin{tabular}{lllll}
\toprule
{} & precision & recall & f1-score & support \\
\midrule
0            &      0.92 &   0.95 &     0.93 &      60 \\
1            &      0.77 &   0.67 &     0.71 &      15 \\
accuracy     &           &        &     0.89 &      75 \\
macro avg    &      0.84 &   0.81 &     0.82 &      75 \\
weighted avg &      0.89 &   0.89 &     0.89 &      75 \\
\bottomrule
\end{tabular}



In [94]:
eli5.show_weights(pipe['logreg'], top=30, feature_names=pipe['vectorizer'].get_feature_names())



Weight?,Feature
+10.407,died
+4.379,scan
+3.784,despite
+3.780,by
+3.701,demonstrated
+3.478,presentation
+3.411,intubated
+3.285,diagnosed
+3.280,icu
+3.251,progressive


In [40]:
len(pipe['vectorizer'].vocabulary_)

249

In [95]:
steps = [
    ('vectorizer', TfidfVectorizer(min_df=0.05, max_df=0.7)),
    ('logreg', LogisticRegression(C=8))
]

pipe = Pipeline(steps=steps)

In [98]:
pipe.fit(X_patient_train_honest, y_train)

preds = pipe.predict(X_patient_test_honest)

print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90        60
           1       0.67      0.40      0.50        15

    accuracy                           0.84        75
   macro avg       0.77      0.68      0.70        75
weighted avg       0.82      0.84      0.82        75



In [97]:
eli5.show_weights(pipe['logreg'], top=30, feature_names=pipe['vectorizer'].get_feature_names())



Weight?,Feature
+2.128,rrt
+2.115,despite
+2.069,local
+2.034,patient
+1.917,scan
+1.811,demonstrated
+1.810,via
+1.804,ratio
+1.801,75
+1.781,by


In [99]:
report = classification_report(y_test, preds, output_dict=True)
df_classification_report = pd.DataFrame(report).transpose()
df_classification_report = df_classification_report.round(decimals=2)

df_classification_report = df_classification_report.astype(str)
df_classification_report.at['accuracy', 'precision'] = ''
df_classification_report.at['accuracy', 'recall'] = ''
df_classification_report.at['accuracy', 'support'] = df_classification_report.at['macro avg', 'support']
df_classification_report['support'] = df_classification_report['support'].str.replace('[.]0', '')
print(df_classification_report.to_latex())

In [103]:
steps = [
    ('vectorizer', TfidfVectorizer(min_df=0.005, max_df=0.95)),
    ('logreg', LogisticRegression(C=25))
]

pipe = Pipeline(steps=steps)

In [104]:
pipe.fit(X_rest_train, y_train)

preds = pipe.predict(X_rest_test)

print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91        60
           1       0.70      0.47      0.56        15

    accuracy                           0.85        75
   macro avg       0.79      0.71      0.74        75
weighted avg       0.84      0.85      0.84        75



In [105]:
report = classification_report(y_test, preds, output_dict=True)
df_classification_report = pd.DataFrame(report).transpose()
df_classification_report = df_classification_report.round(decimals=2)

df_classification_report = df_classification_report.astype(str)
df_classification_report.at['accuracy', 'precision'] = ''
df_classification_report.at['accuracy', 'recall'] = ''
df_classification_report.at['accuracy', 'support'] = df_classification_report.at['macro avg', 'support']
df_classification_report['support'] = df_classification_report['support'].str.replace('[.]0', '')
print(df_classification_report.to_latex())

\begin{tabular}{lllll}
\toprule
{} & precision & recall & f1-score & support \\
\midrule
0            &      0.88 &   0.95 &     0.91 &      60 \\
1            &       0.7 &   0.47 &     0.56 &      15 \\
accuracy     &           &        &     0.85 &      75 \\
macro avg    &      0.79 &   0.71 &     0.74 &      75 \\
weighted avg &      0.84 &   0.85 &     0.84 &      75 \\
\bottomrule
\end{tabular}



In [30]:
len(pipe['vectorizer'].vocabulary_)

2563

In [108]:
steps = [
    ('vectorizer', TfidfVectorizer(min_df=0.01)),
    ('logreg', LogisticRegression(C=25))
]

pipe = Pipeline(steps=steps)

In [109]:
pipe.fit(X_all_train, y_train)

preds = pipe.predict(X_all_test)

print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92        60
           1       0.78      0.47      0.58        15

    accuracy                           0.87        75
   macro avg       0.83      0.72      0.75        75
weighted avg       0.86      0.87      0.85        75

