In [None]:
import os
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from Levenshtein import distance
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Absolute path of the directory the notebook is executed from
workingDir = os.path.abspath('')

### Read the training and test data

In [None]:
# Load the train and test splits from the data/ folder below the working directory
df_train = pd.read_csv(os.path.join(workingDir, 'data', 'train.csv'))
df_test = pd.read_csv(os.path.join(workingDir, 'data', 'test.csv'))

In [None]:
def spell_correction(word, corpus):
    """Replace ``word`` with its closest corpus entry when the match is close enough.

    Parameters
    ----------
    word : str
        Token to check.
    corpus : sequence of str
        Known vocabulary; must support ``len``, ``in`` and integer indexing.

    Returns
    -------
    str
        ``word`` unchanged when it is already in ``corpus``, when ``corpus`` is
        empty, or when the nearest entry is too far away; otherwise the first
        corpus entry with the smallest Levenshtein distance to ``word``.
    """
    # Nothing to correct: the word is known, or there are no candidates at all
    # (min() over an empty list would raise ValueError).
    if len(corpus) == 0 or word in corpus:
        return word

    distances = [distance(word, candidate) for candidate in corpus]
    best_distance = min(distances)

    # Tolerance grows with word length: one edit, plus one more per ten characters.
    if len(word) / 10 + 1 >= best_distance:
        # list.index returns the first minimum, so ties go to the earliest corpus entry.
        return corpus[distances.index(best_distance)]
    return word

# German

In [None]:
# Language suffix of the text column (input_<language>) processed in the cells below
language = 'german'

In [None]:
# Every whitespace-separated token of the training texts
all_words = [word for sentence in df_train[f"input_{language}"] for word in sentence.split()]
# Unique vocabulary as a sorted list. The iteration order of a set of strings can change
# between kernel runs, and spell_correction breaks distance ties by corpus position,
# so sorting keeps the corrections reproducible.
corpus = sorted(set(all_words))

In [None]:
# Spell-correct every test sentence against the training vocabulary
documents = []

# Iterate over the column values themselves, so the loop does not depend on
# df_test having a default 0..n-1 index
for sentence in df_test[f"input_{language}"]:
    # str() keeps non-string cells from breaking .split()
    words = str(sentence).split()
    # Correct each word and join the sentence back into a single string
    documents.append(' '.join(spell_correction(word, corpus) for word in words))

# Assign the list directly: values are written positionally instead of being
# aligned on the index of an intermediate DataFrame
df_test[f"input_{language}"] = documents

# Topology

In [None]:
# Text input and Topology label for the train and test splits
X_train = df_train[f"input_{language}"]
y_train = df_train["Topology"]
X_test = df_test[f"input_{language}"]
y_test = df_test["Topology"]

In [None]:
# Training texts split by whether the Topology label is 'none'
doc_cp_train = X_train[y_train != 'none']
doc_not_cp_train = X_train[y_train == 'none']

In [None]:
# Bilateral texts versus texts with any other label except 'none'
doc_cp_bilateral_train = X_train[y_train == 'Bilateral']
doc_cp_not_bilateral_train = X_train[~y_train.isin(['Bilateral', 'none'])]

In [None]:
# Unilateral texts versus texts with any other label except 'none'
doc_cp_unilateral_train = X_train[y_train == 'Unilateral']
doc_cp_not_unilateral_train = X_train[~y_train.isin(['Unilateral', 'none'])]

## Build rules

In [None]:
# Rules for detecting a CP mention in the German text. Each inner list is one rule:
# a document matches the rule when it contains every substring in that list, and it
# counts as a CP document when at least one rule matches (see the any/all checks below).
cp_dict = [['cp '], [' cp '], [' cp'], ['cerebral', 'pares'], ['cerebral', 'palsy'], ['cererbralparese'], ['zerebralpares'], ['diplegie'], ['hemiplegie'], ['hemisyndrom'],
           ['hemipar'], ['tetra', 'pares'], ['tetraplegie'], ['cerebral', 'bewegun', 'störung'], ['zerebral', 'bewegun', 'störung']]

In [None]:
# Training docs whose label is not 'none' but which no cp_dict rule matches
[doc for doc in doc_cp_train if not any(all(t in doc for t in term) for term in cp_dict)]

In [None]:
# Training docs labelled 'none' that are nevertheless matched by a cp_dict rule
[doc for doc in doc_not_cp_train if any(all(t in doc for t in term) for term in cp_dict)]

In [None]:
# Rules for the 'Bilateral' class; each inner list holds substrings that must all occur in a document
bilateral_dict = [['bilateral'], ['bein betonen spastisch'], ['diplegie'], ['diparese']]

In [None]:
# Bilateral-labelled training docs that no bilateral_dict rule matches
[doc for doc in doc_cp_bilateral_train if not any(all(t in doc for t in term) for term in bilateral_dict)]

In [None]:
# Training docs with another non-'none' label that a bilateral_dict rule still matches
[doc for doc in doc_cp_not_bilateral_train if any(all(t in doc for t in term) for term in bilateral_dict)]

In [None]:
# Rules for the 'Unilateral' class; each inner list holds substrings that must all occur in a document.
# Terms with a trailing space (e.g. 'hemi ') only match when a space follows the term.
unilateral_dict = [['recht cp'], ['link cp'], ['cp recht'], ['cp link'], 
                   ['recht betont spastisch'], ['link betont spastisch'], 
                   ['cerebral parese recht'], ['cerebral parese link'], 
                   ['hemiparese'], ['hemiplegie'], ['hemisyndrom'], ['hemi '], ['hemipar '], ['hemisymptomatik'], ['hemispastisch'], 
                   ['zerebralparese link'], ['zerebralparese recht'],
                   ['unilateral spastisch'], ['unilateral bein betonen spastisch'], ['unilateral cerebral'], ['unilateraler spastischer'], 
                   ['unilaterale armbetonte spastisch'], ['unilateraler spastisch'],]

In [None]:
# Unilateral-labelled training docs that no unilateral_dict rule matches
[doc for doc in doc_cp_unilateral_train if not any(all(t in doc for t in term) for term in unilateral_dict)]

In [None]:
# Training docs with another non-'none' label that a unilateral_dict rule still matches
[doc for doc in doc_cp_not_unilateral_train if any(all(t in doc for t in term) for term in unilateral_dict)]

In [None]:
# One boolean per training doc: does any cp_dict rule match?
cp_pred = [any(all(t in doc for t in term) for term in cp_dict) for doc in X_train]

In [None]:
# One boolean per training doc: does any bilateral_dict rule match?
bilateral_pred = [any(all(t in doc for t in term) for term in bilateral_dict) for doc in X_train]

In [None]:
# One boolean per training doc: does any unilateral_dict rule match?
unilateral_pred = [any(all(t in doc for t in term) for term in unilateral_dict) for doc in X_train]

In [None]:
# Collect the three rule flags, one row per document
preds = pd.DataFrame({
    'cp_pred': cp_pred,
    'bilateral_pred': bilateral_pred,
    'unilateral_pred': unilateral_pred,
})

# Later assignments overwrite earlier ones, so a CP document that matches both
# the bilateral and the unilateral rules ends up as 'Unilateral'.
preds['y_pred'] = 'none'
preds.loc[preds['cp_pred'], 'y_pred'] = 'Undefined'
preds.loc[preds['cp_pred'] & preds['bilateral_pred'], 'y_pred'] = 'Bilateral'
preds.loc[preds['cp_pred'] & preds['unilateral_pred'], 'y_pred'] = 'Unilateral'

In [None]:
# Evaluate the rule-based predictions on the training split
y_pred = preds.y_pred
# Sorted union of true and predicted labels. confusion_matrix builds its rows/columns
# from both, so taking the labels from y_train alone mislabels (or breaks) the heatmap
# whenever a predicted class such as 'Undefined' never occurs in the ground truth.
classes = np.union1d(y_train, y_pred)

## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_train, y_pred)
print("Accuracy:",  round(accuracy,2))
print("Detail:")
print(metrics.classification_report(y_train, y_pred))

## Plot confusion matrix
# Pass the labels explicitly so the matrix order is guaranteed to match the tick labels
cm = metrics.confusion_matrix(y_train, y_pred, labels=classes)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
            cbar=False)
ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
       yticklabels=classes, title="Confusion matrix")
plt.yticks(rotation=0)
plt.show()

# Keep the training metrics under split-specific names (accuracy is reused, not recomputed)
accuracy_train = accuracy
# Calculate precision, recall, and F1 score
precision_micro_train, recall_micro_train, f1_micro_train, _ = metrics.precision_recall_fscore_support(y_train, y_pred, average='micro')
precision_macro_train, recall_macro_train, f1_macro_train, _ = metrics.precision_recall_fscore_support(y_train, y_pred, average='macro')

## Make predictions

In [None]:
# Apply the three rule sets to the test texts: one boolean per document and rule set
cp_pred = [any(all(t in doc for t in term) for term in cp_dict) for doc in X_test]
bilateral_pred = [any(all(t in doc for t in term) for term in bilateral_dict) for doc in X_test]
unilateral_pred = [any(all(t in doc for t in term) for term in unilateral_dict) for doc in X_test]

preds = pd.DataFrame({
    'cp_pred': cp_pred,
    'bilateral_pred': bilateral_pred,
    'unilateral_pred': unilateral_pred,
})

# Later assignments overwrite earlier ones, so a CP document that matches both
# the bilateral and the unilateral rules ends up as 'Unilateral'.
preds['y_pred'] = 'none'
preds.loc[preds['cp_pred'], 'y_pred'] = 'Undefined'
preds.loc[preds['cp_pred'] & preds['bilateral_pred'], 'y_pred'] = 'Bilateral'
preds.loc[preds['cp_pred'] & preds['unilateral_pred'], 'y_pred'] = 'Unilateral'

In [None]:
# Evaluate the rule-based predictions on the test split
y_pred = preds.y_pred
# Sorted union of true and predicted labels. confusion_matrix builds its rows/columns
# from both, so taking the labels from y_test alone mislabels (or breaks) the heatmap
# whenever a predicted class such as 'Undefined' never occurs in the ground truth.
classes = np.union1d(y_test, y_pred)

## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:",  round(accuracy,2))
print("Detail:")
print(metrics.classification_report(y_test, y_pred))

## Plot confusion matrix
# Pass the labels explicitly so the matrix order is guaranteed to match the tick labels
cm = metrics.confusion_matrix(y_test, y_pred, labels=classes)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
            cbar=False)
ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
       yticklabels=classes, title="Confusion matrix")
plt.yticks(rotation=0)
plt.show()


# Keep the test metrics under split-specific names (accuracy is reused, not recomputed)
accuracy_test = accuracy
# Calculate precision, recall, and F1 score
precision_micro_test, recall_micro_test, f1_micro_test, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='micro')
precision_macro_test, recall_macro_test, f1_macro_test, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')

# English

In [None]:
# Language suffix of the text column (input_<language>) processed in the cells below
language = 'english'

In [None]:
# Every whitespace-separated token of the training texts
all_words = [word for sentence in df_train[f"input_{language}"] for word in sentence.split()]
# Unique vocabulary as a sorted list. The iteration order of a set of strings can change
# between kernel runs, and spell_correction breaks distance ties by corpus position,
# so sorting keeps the corrections reproducible.
corpus = sorted(set(all_words))

In [None]:
# Spell-correct every test sentence against the training vocabulary
documents = []

# Iterate over the column values themselves, so the loop does not depend on
# df_test having a default 0..n-1 index
for sentence in df_test[f"input_{language}"]:
    # str() keeps non-string cells from breaking .split()
    words = str(sentence).split()
    # Correct each word and join the sentence back into a single string
    documents.append(' '.join(spell_correction(word, corpus) for word in words))

In [None]:
# Write the corrected sentences back positionally. Assigning the list directly avoids
# the index alignment that happens when a DataFrame is assigned to a column.
df_test[f"input_{language}"] = documents

# Topology

In [None]:
# Text input and Topology label for the train and test splits
X_train = df_train[f"input_{language}"]
y_train = df_train["Topology"]
X_test = df_test[f"input_{language}"]
y_test = df_test["Topology"]

In [None]:
# Training texts split by whether the Topology label is 'none'
doc_cp_train = X_train[y_train != 'none']
doc_not_cp_train = X_train[y_train == 'none']

In [None]:
# Bilateral texts versus texts with any other label except 'none'
doc_cp_bilateral_train = X_train[y_train == 'Bilateral']
doc_cp_not_bilateral_train = X_train[~y_train.isin(['Bilateral', 'none'])]

In [None]:
# Unilateral texts versus texts with any other label except 'none'
doc_cp_unilateral_train = X_train[y_train == 'Unilateral']
doc_cp_not_unilateral_train = X_train[~y_train.isin(['Unilateral', 'none'])]

## Build rules

In [None]:
# Rules for detecting a CP mention in the English text. Each inner list is one rule:
# a document matches the rule when it contains every substring in that list, and it
# counts as a CP document when at least one rule matches (see the any/all checks below).
cp_dict = [['cp '], [' cp '], [' cp'], ['cerebral', 'pares'], ['cerebral', 'palsy'], ['hemiparesis'], ['hemiplegia'], ['hemiparous'], 
           ['hemisyndrome'], ['tetraparesis'], ['tetraplegia'], ['diplegia'], ['cerebral', 'movement', 'disorder']]

In [None]:
# Training docs whose label is not 'none' but which no cp_dict rule matches
[doc for doc in doc_cp_train if not any(all(t in doc for t in term) for term in cp_dict)]

In [None]:
# Training docs labelled 'none' that are nevertheless matched by a cp_dict rule
[doc for doc in doc_not_cp_train if any(all(t in doc for t in term) for term in cp_dict)]

In [None]:
# Rules for the 'Bilateral' class; each inner list holds substrings that must all occur in a document
bilateral_dict = [['bilateral'], ['diplegia'], ['diparesis'], ['leg stressed spastic'], ['arm stressed spastic'], ['leg stressed cerebral'], ['arm stressed cerebral']]

In [None]:
# Bilateral-labelled training docs that no bilateral_dict rule matches
[doc for doc in doc_cp_bilateral_train if not any(all(t in doc for t in term) for term in bilateral_dict)]

In [None]:
# Training docs with another non-'none' label that a bilateral_dict rule still matches
[doc for doc in doc_cp_not_bilateral_train if any(all(t in doc for t in term) for term in bilateral_dict)]

In [None]:
# Rules for the 'Unilateral' class. Each inner list is one rule whose substrings must all
# occur in a document; multi-element rules such as ['unilateral', 'spastic'] match the
# substrings anywhere in the text, in any order.
unilateral_dict = [['right cerebral'], ['left cerebral'], ['right cp'], ['left cp'], ['cp on the right'], ['cp on the left'], ['right sided spastic'], ['left sided spastic'],
                   ['cerebral palsy right'], ['cerebral palsy left'], 
                   ['hemiparesis'], ['hemiplegia'], ['hemi '], ['hemiplegic'], ['hemisyndrome'], ['hemiparous'], ['hemispastic'], ['hemisymptomatic'],
                   ['unilateral', 'spastic'], ['unilateral', 'cp'], ['unilateral cerebral'], ['unilateral arm stressed spastic'], ['unilateral leg stressed spastic'],
                   ['right hand cerebral'], ['left hand cerebral']

]

In [None]:
# Unilateral-labelled training docs that no unilateral_dict rule matches
[doc for doc in doc_cp_unilateral_train if not any(all(t in doc for t in term) for term in unilateral_dict)]

In [None]:
# Training docs with another non-'none' label that a unilateral_dict rule still matches
[doc for doc in doc_cp_not_unilateral_train if any(all(t in doc for t in term) for term in unilateral_dict)]

In [None]:
# One boolean per training doc: does any cp_dict rule match?
cp_pred = [any(all(t in doc for t in term) for term in cp_dict) for doc in X_train]

In [None]:
# One boolean per training doc: does any bilateral_dict rule match?
bilateral_pred = [any(all(t in doc for t in term) for term in bilateral_dict) for doc in X_train]

In [None]:
# One boolean per training doc: does any unilateral_dict rule match?
unilateral_pred = [any(all(t in doc for t in term) for term in unilateral_dict) for doc in X_train]

In [None]:
# Collect the three rule flags, one row per document
preds = pd.DataFrame({
    'cp_pred': cp_pred,
    'bilateral_pred': bilateral_pred,
    'unilateral_pred': unilateral_pred,
})

# Later assignments overwrite earlier ones, so a CP document that matches both
# the bilateral and the unilateral rules ends up as 'Unilateral'.
preds['y_pred'] = 'none'
preds.loc[preds['cp_pred'], 'y_pred'] = 'Undefined'
preds.loc[preds['cp_pred'] & preds['bilateral_pred'], 'y_pred'] = 'Bilateral'
preds.loc[preds['cp_pred'] & preds['unilateral_pred'], 'y_pred'] = 'Unilateral'

In [None]:
# Evaluate the rule-based predictions on the training split
y_pred = preds.y_pred
# Sorted union of true and predicted labels. confusion_matrix builds its rows/columns
# from both, so taking the labels from y_train alone mislabels (or breaks) the heatmap
# whenever a predicted class such as 'Undefined' never occurs in the ground truth.
classes = np.union1d(y_train, y_pred)

## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_train, y_pred)
print("Accuracy:",  round(accuracy,2))
print("Detail:")
print(metrics.classification_report(y_train, y_pred))

## Plot confusion matrix
# Pass the labels explicitly so the matrix order is guaranteed to match the tick labels
cm = metrics.confusion_matrix(y_train, y_pred, labels=classes)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
            cbar=False)
ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
       yticklabels=classes, title="Confusion matrix")
plt.yticks(rotation=0)
plt.show()

# Keep the training metrics under split-specific names (accuracy is reused, not recomputed)
accuracy_train = accuracy
# Calculate precision, recall, and F1 score
precision_micro_train, recall_micro_train, f1_micro_train, _ = metrics.precision_recall_fscore_support(y_train, y_pred, average='micro')
precision_macro_train, recall_macro_train, f1_macro_train, _ = metrics.precision_recall_fscore_support(y_train, y_pred, average='macro')

## Make predictions

In [None]:
# Apply the three rule sets to the test texts: one boolean per document and rule set
cp_pred = [any(all(t in doc for t in term) for term in cp_dict) for doc in X_test]
bilateral_pred = [any(all(t in doc for t in term) for term in bilateral_dict) for doc in X_test]
unilateral_pred = [any(all(t in doc for t in term) for term in unilateral_dict) for doc in X_test]

preds = pd.DataFrame({
    'cp_pred': cp_pred,
    'bilateral_pred': bilateral_pred,
    'unilateral_pred': unilateral_pred,
})

# Later assignments overwrite earlier ones, so a CP document that matches both
# the bilateral and the unilateral rules ends up as 'Unilateral'.
preds['y_pred'] = 'none'
preds.loc[preds['cp_pred'], 'y_pred'] = 'Undefined'
preds.loc[preds['cp_pred'] & preds['bilateral_pred'], 'y_pred'] = 'Bilateral'
preds.loc[preds['cp_pred'] & preds['unilateral_pred'], 'y_pred'] = 'Unilateral'

In [None]:
# Evaluate the rule-based predictions on the test split
y_pred = preds.y_pred
# Sorted union of true and predicted labels. confusion_matrix builds its rows/columns
# from both, so taking the labels from y_test alone mislabels (or breaks) the heatmap
# whenever a predicted class such as 'Undefined' never occurs in the ground truth.
classes = np.union1d(y_test, y_pred)

## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:",  round(accuracy,2))
print("Detail:")
print(metrics.classification_report(y_test, y_pred))

## Plot confusion matrix
# Pass the labels explicitly so the matrix order is guaranteed to match the tick labels
cm = metrics.confusion_matrix(y_test, y_pred, labels=classes)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
            cbar=False)
ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
       yticklabels=classes, title="Confusion matrix")
plt.yticks(rotation=0)
plt.show()


# Keep the test metrics under split-specific names (accuracy is reused, not recomputed)
accuracy_test = accuracy
# Calculate precision, recall, and F1 score
precision_micro_test, recall_micro_test, f1_micro_test, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='micro')
precision_macro_test, recall_macro_test, f1_macro_test, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')