In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from gensim.models import Word2Vec
import numpy as np

# Compatibility shim for libraries that still reference the legacy `np.object`
# alias. NumPy deprecated that alias in 1.20 and removed it in 1.24, so on a
# current NumPy the attribute is missing and must be re-created as the builtin
# `object` (which is what the alias always pointed to). The check uses the
# module dict so that merely probing the name does not trigger NumPy's
# deprecation warning on the versions that still resolve it lazily.
if 'object' not in vars(np):
    np.object = object

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling1D, LSTM

import pandas as pd

# Define the paths to your CSV files
csv_file_path_1 = '../data.csv'
csv_file_path_2 = '../AEcodiert240430_UTF8.csv'
csv_file_path_3 = '../meddra_zkls.csv'
csv_file_path_4 = '../meddra_zkls2.csv'

# Read the CSV files into DataFrames. All four files share the same
# semicolon-delimited, UTF-8 layout, so they are read with identical options.
try:
    df1, df2, df3, df4 = (
        pd.read_csv(csv_path, delimiter=';', encoding='utf-8')
        for csv_path in (
            csv_file_path_1,
            csv_file_path_2,
            csv_file_path_3,
            csv_file_path_4,
        )
    )
except FileNotFoundError as e:
    # Re-raise instead of calling exit(): inside a notebook exit() tears down
    # the kernel, whereas an exception stops this cell (and "Run All") while
    # keeping the session and the traceback available for inspection.
    print(e)
    raise


# Concatenate all the DataFrames into one, renumbering the rows from 0.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
# Data Cleaning
# Fetch the NLTK resources used below. Recent NLTK releases load the Punkt
# tokenizer tables from the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are requested; nltk.download() reports a failure
# without raising, which keeps this cell working on older installations too.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# German stop words as a set for O(1) membership tests in remove_stopwords().
stop_words = set(stopwords.words('german'))

def clean_text(text):
    """Strip markup tags and punctuation from a piece of text.

    Args:
        text: The raw text. Missing values (``None``, ``NaN``, ``pd.NA``) are
            treated as empty text; other non-string values are converted with
            ``str()`` before cleaning.

    Returns:
        str: The text with every ``<...>`` tag removed and every character
        that is neither a word character nor whitespace removed.
    """
    if not isinstance(text, str):
        # Cells of a concatenated DataFrame can be NaN; re.sub() would raise
        # TypeError on them, so map missing values to an empty string.
        if text is None or pd.isna(text):
            return ''
        text = str(text)
    # Non-greedy match so that each tag is removed individually.
    text = re.sub(r'<.*?>', '', text)
    # Drop punctuation/symbols; letters, digits, underscore and whitespace stay.
    text = re.sub(r'[^\w\s]', '', text)
    return text

def tokenize_text(text, language='german'):
    """Split text into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.
        language: Name of the Punkt model used for the sentence-splitting
            step of ``word_tokenize``. Defaults to ``'german'`` so that it
            matches the German stop-word list used by this notebook
            (``word_tokenize`` itself would default to English).

    Returns:
        list[str]: The tokens in their original order.
    """
    return word_tokenize(text, language=language)

def remove_stopwords(words):
    """Filter out stop words from a token list.

    A token is dropped when its lower-cased form is contained in the
    module-level ``stop_words`` collection. Surviving tokens keep their
    original casing and order.
    """
    kept = []
    for token in words:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def to_lowercase(words):
    """Return a new list holding the lower-cased form of every word, in order."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered


# Rows without a code or without a description cannot be used for training.
# Drop them BEFORE the string cast below: astype(str) would otherwise turn a
# missing code into the literal text 'nan', which would later be treated as a
# regular class label.
df = df.dropna(subset=['llt_code', 'ae_description']).reset_index(drop=True)

# Ensure labels are in string format and split correctly
df['llt_code'] = df['llt_code'].astype(str)
# One list of codes per row (a row may carry several comma-separated codes).
labels = df['llt_code'].apply(lambda x: x.split(','))

# Expand the DataFrame by splitting rows with multiple llt_codes
def split_and_expand_df(df):
    """Return one row per (llt_code, ae_description) pair.

    A row whose ``llt_code`` holds several comma-separated codes is repeated
    once per code, each copy carrying the same ``ae_description``.

    Args:
        df: DataFrame with the columns ``llt_code`` (comma-separated codes)
            and ``ae_description``.

    Returns:
        pandas.DataFrame: Columns ``llt_code`` and ``ae_description`` with a
        fresh 0..n-1 index. Codes are stripped of surrounding whitespace;
        empty codes (e.g. from a trailing comma) and rows with a missing
        ``llt_code`` are skipped. An empty input yields an empty frame with
        the same two columns.
    """
    # Collect plain tuples and build the frame once, instead of creating one
    # small DataFrame per input row and concatenating them afterwards.
    records = []
    for codes, description in zip(df['llt_code'], df['ae_description']):
        if pd.isna(codes):
            continue
        for code in str(codes).split(','):
            code = code.strip()
            if code:
                records.append((code, description))
    return pd.DataFrame(records, columns=['llt_code', 'ae_description'])

# Expand multi-code rows, then derive every preprocessing stage as its own
# column. assign() evaluates the keyword arguments in order, so each stage can
# read the column produced by the previous one.
expanded_df = split_and_expand_df(df).assign(
    cleaned_text=lambda frame: frame['ae_description'].apply(clean_text),
    tokenized_text=lambda frame: frame['cleaned_text'].apply(tokenize_text),
    filtered_text=lambda frame: frame['tokenized_text'].apply(remove_stopwords),
    lowercase_text=lambda frame: frame['filtered_text'].apply(to_lowercase),
)

# Text Representation
# Train Word2Vec on the lower-cased, stop-word-free token lists.
sentences = expanded_df['lowercase_text'].to_list()
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def text_to_vector(words):
    """Average the Word2Vec embeddings of the given tokens.

    Args:
        words: Iterable of tokens. Tokens unknown to ``word2vec_model`` are
            ignored.

    Returns:
        numpy.ndarray: The element-wise mean of the known tokens' vectors, or
        an all-zero vector when none of the tokens is known. The zero vector
        takes its length from the trained model rather than from a hard-coded
        constant, so it stays consistent if ``vector_size`` is changed.
    """
    keyed_vectors = word2vec_model.wv
    vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

# One averaged Word2Vec vector per row (computed once; the original cell
# repeated this assignment and the stacking below).
expanded_df['text_vector'] = expanded_df['lowercase_text'].apply(text_to_vector)

# Stack the per-row vectors into a single 2-D array, one row per sample.
text_vectors = np.vstack(expanded_df['text_vector'].values)

# Tokenization and Sequencing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(expanded_df['cleaned_text'])

# Map every cleaned text to its list of integer token ids.
sequences = tokenizer.texts_to_sequences(expanded_df['cleaned_text'])
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

# Pad/truncate every sequence to a fixed length so they form one array.
max_sequence_length = 100
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Encoding Labels
# Each row carries exactly one code after the expansion, so wrapping it in a
# single-element list makes the binarizer emit one indicator column per code.
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(expanded_df['llt_code'].apply(lambda x: [x]))

In [None]:
# Split the data into training, test, and validation sets (70% / 15% / 15%).
# The features are the padded integer token-id sequences in `data`: the model
# trained on these splits starts with an Embedding layer sized from the
# tokenizer vocabulary, which looks up integer ids. The averaged Word2Vec
# vectors in `text_vectors` are floats and are not valid Embedding input.
X_train, X_temp, y_train, y_temp = train_test_split(data, binary_labels, test_size=0.3, random_state=42)
# Halve the held-out 30% into validation and test parts.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Check the shapes of the data
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of y_val: {y_val.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

In [None]:
# Define model parameters
# The Keras Tokenizer never assigns index 0 to a word, so the Embedding needs
# one more row than there are words.
vocab_size = len(tokenizer.word_index) + 1
print(f'Embedding vocabulary size (unique tokens + reserved index 0): {vocab_size}')

embedding_dim = 128
num_heads = 4
ff_dim = 128
# y_train is a one-hot indicator matrix with one column per class, so the
# number of classes is its column count. (Counting the distinct values inside
# the matrix, as done previously, only ever yields the two values 0 and 1.)
num_classes = y_train.shape[1]
max_len = X_train.shape[1]


# Model architecture: embedding -> multi-head self-attention -> position-wise
# dense layer -> average pooling over the sequence -> softmax classifier.
# No attention mask (causal or padding) is passed to the attention layer.
# NOTE(review): the Embedding layer looks up integer token ids in
# [0, vocab_size); confirm that X_train holds padded token-id sequences.
inputs = Input(shape=(max_len,))
# `input_length` is omitted: the sequence length is already fixed by `inputs`,
# and the argument is deprecated in current Keras releases.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)
# Self-attention: the embedded sequence is both query and value.
attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(embedding_layer, embedding_layer)
attention_output = LayerNormalization(epsilon=1e-6)(attention_output)
attention_output = Dropout(0.1)(attention_output)
ff_output = Dense(ff_dim, activation='relu')(attention_output)
ff_output = LayerNormalization(epsilon=1e-6)(ff_output)
ff_output = Dropout(0.1)(ff_output)
# Collapse the sequence axis so each sample is a single feature vector.
flat_output = GlobalAveragePooling1D()(ff_output)
outputs = Dense(num_classes, activation='softmax')(flat_output)

# Compile the model
model = Model(inputs=inputs, outputs=outputs)
# The labels are one-hot rows, which is the format 'categorical_crossentropy'
# expects; the sparse variant requires integer class ids instead.
model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=128)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

# Save the model
model.save('meddra_model3.h5')