In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
import tensorflowjs as tfjs
import nltk

# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Read the data from CSV file
try:
    data = pd.read_csv("ICD10_openmed_UTF8.csv", delimiter=";", encoding="utf-8")
except FileNotFoundError:
    print("Error: CSV file not found. Please provide the correct path.")
    exit()
except pd.errors.ParserError:
    print("Error: CSV parsing error. Check the format of your data.")
    exit()

# Function for text preprocessing
def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text)
    
    # Convert to lower case
    tokens = [token.lower() for token in tokens]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

# Apply preprocessing to the text data
data['processed_term'] = data['term'].apply(preprocess_text)

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_term']) 
sequences = tokenizer.texts_to_sequences(data['processed_term'])
max_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_length)

# Convert labels to numeric indices
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data["icd10"])

# Split the data into training, test, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Check the shapes of the data
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of y_val: {y_val.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")


[nltk_data] Downloading package punkt to /Users/tizander/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tizander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tizander/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Shape of X_train: (39232, 17)
Shape of y_train: (39232,)
Shape of X_val: (8407, 17)
Shape of y_val: (8407,)
Shape of X_test: (8408, 17)
Shape of y_test: (8408,)


In [5]:

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
num_heads = 4
ff_dim = 128
num_classes = len(set(y))
max_len = X.shape[1]

# Model architecture with multi-head attention and causal masking
inputs = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)(inputs)
attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(embedding_layer, embedding_layer)
attention_output = LayerNormalization(epsilon=1e-6)(attention_output)
attention_output = Dropout(0.1)(attention_output)
ff_output = Dense(ff_dim, activation='relu')(attention_output)
ff_output = LayerNormalization(epsilon=1e-6)(ff_output)
ff_output = Dropout(0.1)(ff_output)
flat_output = GlobalAveragePooling1D()(ff_output)
outputs = Dense(num_classes, activation='softmax')(flat_output)

# Compile the model
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=128)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

# Save the model
model.save('meddra_model3.h5')

# Convert the model to TensorFlow.js format
tfjs.converters.save_keras_model(model, 'tfjs_meddra_model3')


Epoch 1/100
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 148ms/step - accuracy: 0.1096 - loss: 7.3004 - val_accuracy: 0.2096 - val_loss: 5.7925
Epoch 2/100
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 151ms/step - accuracy: 0.2569 - loss: 5.4257 - val_accuracy: 0.3235 - val_loss: 4.7631
Epoch 3/100
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 147ms/step - accuracy: 0.3736 - loss: 4.4435 - val_accuracy: 0.4511 - val_loss: 4.0662
Epoch 4/100
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 145ms/step - accuracy: 0.4729 - loss: 3.7047 - val_accuracy: 0.5036 - val_loss: 3.5984
Epoch 5/100
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 146ms/step - accuracy: 0.5315 - loss: 3.2240 - val_accuracy: 0.5357 - val_loss: 3.2624
Epoch 6/100
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 146ms/step - accuracy: 0.5747 - loss: 2.8517 - val_accuracy: 0.5694 - val_loss: 3.0092
Epoc



Test Loss: 2.587895154953003
Test Accuracy: 0.6406993269920349


ValueError: Unable to create dataset (name already exists)