In [None]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import re
from compound_split import doc_split
import string
import spacy
from deep_translator import GoogleTranslator
from nltk.stem import WordNetLemmatizer

In [None]:
# Absolute path of the directory the notebook is running in; abspath('')
# resolves the empty path against the current working directory.
workingDir = os.path.abspath('')

### Read the diagnoses data into a DataFrame

In [None]:
# Load <workingDir>/data/diagnoses.csv as-is; nothing is filtered at this
# point, so any missing values are still present in df.
diagnoses_csv = os.path.join(workingDir, 'data', 'diagnoses.csv')
df = pd.read_csv(diagnoses_csv)

In [None]:
# Map a diagnosis code to its topology label.
def CPCode2Topology(CPCode):
    """Return the topology label associated with a diagnosis code.

    Parameters
    ----------
    CPCode : object
        The code to classify (compared by equality against the known
        'G80.x' / 'G81.x' strings; presumably ICD-10 codes - confirm).

    Returns
    -------
    str
        'Bilateral', 'Unilateral' or 'Undefined' for a listed code, and
        'none' for anything else (unknown codes, None, NaN, ...).
    """
    # Ordered (codes, label) groups; the first group containing the code wins.
    code_groups = (
        (('G80.0',), 'Undefined'),
        (('G80.1',), 'Bilateral'),
        (('G80.2',), 'Unilateral'),
        (('G80.3', 'G80.4', 'G80.8', 'G80.9'), 'Undefined'),
        (('G81.0', 'G81.1', 'G81.9'), 'Unilateral'),
    )
    for codes, label in code_groups:
        if CPCode in codes:
            return label
    # Code is not one of the listed G80/G81 codes.
    return 'none'

# df is the pandas DataFrame loaded above. Run every value of its 'Output'
# column through CPCode2Topology and store the labels as column 'Topology'.
topology_labels = df['Output'].apply(CPCode2Topology)
df['Topology'] = topology_labels

### Tokenize and clean the German text

In [None]:
# X holds the 'Input' column of df, y the matching 'Topology' column.
X = df['Input']
y = df['Topology']

In [None]:
# German spaCy pipeline (used for lemmatization) and NLTK German stop words.
nlp = spacy.load('de_core_news_md')
stopw = stopwords.words('german')
# Set copy for constant-time membership tests; `stopw` itself stays a list.
stopw_lookup = set(stopw)

documents = []   # cleaned text for every row of X, in row order
doc_cp = []      # cleaned texts whose topology label is not 'none'
doc_not_cp = []  # cleaned texts whose topology label is 'none'

# X and y are iterated in parallel, row by row.
for raw_text, topology in zip(X, y):
    # str() also turns non-string cells (e.g. NaN) into text ('nan').
    document = str(raw_text)

    # Split German compounds. capwords() capitalizes the first letter of each
    # whitespace-separated word (lowercasing the rest) before the split.
    document = doc_split.doc_split(string.capwords(document))

    # Replace every digit with a space. A single pass is enough: once each
    # digit is gone, multi-digit patterns can never match.
    document = re.sub(r'[0-9]', ' ', document)

    # Replace every non-word character with a space
    document = re.sub(r'\W', ' ', document)

    # Remove single ASCII letters that stand between whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single ASCII letter at the very start of the text. The pattern
    # must use the anchor '^'; an escaped '\^' would look for a literal caret,
    # which cannot exist after the \W substitution above.
    document = re.sub(r'^[a-zA-Z]\s+', '', document)

    # Collapse runs of whitespace into one space
    document = re.sub(r'\s+', ' ', document)

    # Strip a standalone lowercase 'b' at the start of the text
    # NOTE(review): purpose not evident from this notebook - confirm it is needed.
    document = re.sub(r'^b\s+', '', document)

    # Lemmatize with spaCy, then lowercase the joined lemmas
    lemmas = ' '.join(token.lemma_ for token in nlp(document)).lower()

    # Drop stop words
    document = ' '.join(w for w in lemmas.split() if w not in stopw_lookup)

    documents.append(document)

    # Labels are non-empty strings, so a bare truth test is always True;
    # compare against the 'none' sentinel returned by CPCode2Topology instead.
    # NOTE(review): assumes "not CP" means topology == 'none' - confirm.
    if topology != 'none':
        doc_cp.append(document)
    else:
        doc_not_cp.append(document)

In [None]:
# Attach the cleaned German texts as column 'input_german'. The helper frame
# has a default 0..n-1 index, so values are aligned to df by index label.
german_clean = pd.DataFrame({'input_german': documents})
df["input_german"] = german_clean['input_german']

In [None]:
# Lightly normalized German texts: each entry of X is converted to str and
# has every run of whitespace collapsed to a single space; nothing else is
# altered.
documents = [
    re.sub(r'\s+', ' ', str(X[row]), flags=re.I)
    for row in range(len(X))
]
    

In [None]:
# Attach the whitespace-normalized German texts as column 'input_german_bert'
# (aligned to df by the helper frame's default 0..n-1 index).
german_bert = pd.DataFrame({'input_german_bert': documents})
df["input_german_bert"] = german_bert['input_german_bert']

## Using translation

In [None]:
# Re-bind X to the 'Input' column and y to the 'Topology' column of df.
X = df['Input']
y = df['Topology']

In [None]:
# English spaCy pipeline (used for lemmatization) and NLTK English stop words.
nlp = spacy.load('en_core_web_md')
stopw = stopwords.words('english')
# Set copy for constant-time membership tests; `stopw` itself stays a list.
stopw_lookup = set(stopw)

documents = []   # cleaned English text for every row of X, in row order
doc_cp = []      # cleaned texts whose topology label is not 'none'
doc_not_cp = []  # cleaned texts whose topology label is 'none'

# Not used by the loop below (lemmatization goes through spaCy); kept so the
# name stays defined for any other cell that may rely on it.
lemmatizer = WordNetLemmatizer()

# One translator instance is enough; it is reused for every row.
translator = GoogleTranslator(source='de', target='en')

# X and y are iterated in parallel, row by row.
for raw_text, topology in zip(X, y):
    # str() also turns non-string cells (e.g. NaN) into text ('nan').
    document = translator.translate(str(raw_text))

    # Replace every digit with a space. A single pass is enough: once each
    # digit is gone, multi-digit patterns can never match.
    document = re.sub(r'[0-9]', ' ', document)

    # Replace every non-word character with a space
    document = re.sub(r'\W', ' ', document)

    # Remove single ASCII letters that stand between whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single ASCII letter at the very start of the text. The pattern
    # must use the anchor '^'; an escaped '\^' would look for a literal caret,
    # which cannot exist after the \W substitution above.
    document = re.sub(r'^[a-zA-Z]\s+', '', document)

    # Collapse runs of whitespace into one space
    document = re.sub(r'\s+', ' ', document)

    # Strip a standalone lowercase 'b' at the start of the text
    # NOTE(review): purpose not evident from this notebook - confirm it is needed.
    document = re.sub(r'^b\s+', '', document)

    # Lemmatize with spaCy, then lowercase the joined lemmas
    lemmas = ' '.join(token.lemma_ for token in nlp(document)).lower()

    # Drop stop words
    document = ' '.join(w for w in lemmas.split() if w not in stopw_lookup)

    documents.append(document)

    # Labels are non-empty strings, so a bare truth test is always True;
    # compare against the 'none' sentinel returned by CPCode2Topology instead.
    # NOTE(review): assumes "not CP" means topology == 'none' - confirm.
    if topology != 'none':
        doc_cp.append(document)
    else:
        doc_not_cp.append(document)


In [None]:
# Attach the cleaned English texts as column 'input_english'. The helper frame
# has a default 0..n-1 index, so values are aligned to df by index label.
english_clean = pd.DataFrame({'input_english': documents})
df["input_english"] = english_clean['input_english']

In [None]:
# Lightly normalized English texts: each entry of X is translated from German
# to English and has runs of whitespace collapsed; nothing else is altered.
# NOTE(review): every row is sent to the translation service again here, in
# addition to the translation done for the cleaned English column.
documents = []

# One translator instance is enough; it is reused for every row.
translator = GoogleTranslator(source='de', target='en')

for raw_text in X:
    # str() also turns non-string cells (e.g. NaN) into text ('nan').
    document = translator.translate(str(raw_text))

    # Collapse runs of whitespace into one space
    document = re.sub(r'\s+', ' ', document)

    documents.append(document)


In [None]:
# Attach the whitespace-normalized English texts as column 'input_english_bert'
# (aligned to df by the helper frame's default 0..n-1 index).
english_bert = pd.DataFrame({'input_english_bert': documents})
df["input_english_bert"] = english_bert['input_english_bert']

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on y so
# both parts keep the same label proportions, and seeded for repeatability.
df_train, df_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
# Write both partitions to the current directory as UTF-8 CSV files,
# leaving out the index column.
for split_frame, csv_name in ((df_train, "train.csv"), (df_test, "test.csv")):
    split_frame.to_csv(csv_name, encoding='utf-8', index=False)