In [10]:
import pandas as pd

In [26]:
def create_graph_words_lubm1():
    """
    Reads the LUBM1 RDF data from ./data/jupyter/all_lubm.nt, extracts and processes the graph data using pandas.

    Returns:
    - A pandas DataFrame with two columns: 'input_graph_words' and 'inference_graph_words',
      where each row contains a sequence of words representing a graph
    """

    col_names=["subject", "predicate", "object"]
    data = pd.read_csv("./data/jupyter/all_lubm.nt", delimiter=' ', names=col_names)
    data['object'] = data['object'].apply(lambda x: x.rstrip('.'))

    # Create a new column 'graph' by concatenating the subject, predicate, and object columns
    data["graph"] = data["subject"] + " " + data["predicate"] + " " + data["object"]

    # Group the data by 'graph' and aggregate the results into a single row
    grouped_data = data.groupby("graph").apply(
    lambda x: pd.Series({
        "input_sequence": x.iloc[:-1]["object"].str.cat(sep=" "),
        "target_sequence": x.iloc[1:]["object"].str.cat(sep=" ")
    })).reset_index(level=1, drop=True).reset_index()

    input_sequences = grouped_data["input_sequence"]
    inference_sequences = grouped_data["target_sequence"]

    df = pd.DataFrame({"input_graph_words": input_sequences, "inference_graph_words": inference_sequences})

    return df

In [27]:
df = create_graph_words_lubm1()

IndexError: Too many levels: Index has only 1 level, not 2

In [None]:
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from sklearn.model_selection import train_test_split


class GraphTranslationModel:
    def __init__(self, df, latent_dim=256, batch_size=64, epochs=50):
        self.input_seqs, self.target_seqs, self.input_chars, self.target_chars, self.num_encoder_tokens, \
        self.num_decoder_tokens, self.input_token_index, self.target_token_index, self.encoder_input_data, \
        self.decoder_input_data, self.decoder_target_data = self.prepare_data(df)

        self.encoder, self.decoder, self.model = self.build_model(latent_dim)
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.epochs = epochs

    def prepare_data(self, df):
        input_seqs = df.apply(lambda row: row['subject'] + ' ' + row['predicate'], axis=1)
        target_seqs = df['object']

        # Create input and target character index mappings
        input_chars = sorted(set(' '.join(input_seqs)))
        target_chars = sorted(set(' '.join(target_seqs)))
        num_encoder_tokens = len(input_chars)
        num_decoder_tokens = len(target_chars)

        input_token_index = dict([(char, i) for i, char in enumerate(input_chars)])
        target_token_index = dict([(char, i) for i, char in enumerate(target_chars)])

        # Encode input and target sequences as one-hot vectors
        max_encoder_seq_length = max([len(txt) for txt in input_seqs])
        max_decoder_seq_length = max([len(txt) for txt in target_seqs])

        encoder_input_data = np.zeros((len(input_seqs), max_encoder_seq_length, num_encoder_tokens), dtype='float32')
        decoder_input_data = np.zeros((len(input_seqs), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
        decoder_target_data = np.zeros((len(input_seqs), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

        for i, (input_seq, target_seq) in enumerate(zip(input_seqs, target_seqs)):
            for t, char in enumerate(input_seq):
                encoder_input_data[i, t, input_token_index[char]] = 1.
            for t, char in enumerate(target_seq):
                decoder_input_data[i, t, target_token_index[char]] = 1.
                if t > 0:
                    decoder_target_data[i, t - 1, target_token_index[char]] = 1.

        return input_seqs, target_seqs, input_chars, target_chars, num_encoder_tokens, num_decoder_tokens, \
               input_token_index, target_token_index, encoder_input_data, decoder_input_data, decoder_target_data

    def build_model(self, input_dim=100, output_dim=50, hidden_units=256):
        # Define the encoder layer
        encoder_inputs = Input(shape=(None, input_dim))
        encoder = LSTM(hidden_units, return_state=True)
        encoder_outputs, state_h, state_c = encoder(encoder_inputs)
        encoder_states = [state_h, state_c]

        # Define the decoder layer
        decoder_inputs = Input(shape=(None, output_dim))
        decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
        repeat_d_layer = RepeatVector(decoder_outputs.shape[1])
        repeat_d = repeat_d_layer(encoder_outputs)
        decoder_outputs = decoder_lstm(repeat_d, initial_state=encoder_states)[0]
        decoder_dense = Dense(output_dim, activation='softmax')
        decoder_outputs = decoder_dense(decoder_outputs)

        # Concatenate the encoder and decoder layers
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

        return model

    def translate(self, input_seq):
        # Encode the input graph sequence as a one-hot vector
        encoder_input_seq = np.zeros((1, len(input_seq), self.num_encoder_tokens), dtype='float32')
        for t, char in enumerate(input_seq):
            encoder_input_seq[0, t, self.input_token_index[char]] = 1.

        # Get the initial encoder states
        states_value = self.encoder.predict(encoder_input_seq)

        # Generate the target text sequence using the decoder
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.target_token_index['\t']] = 1.

        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = self.decoder.predict([target_seq] + states_value)

            # Sample a token from the output distribution
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = self.target_chars[sampled_token_index]
            decoded_sentence += sampled_char

            # Exit condition: either hit max length or find stop character
            if (sampled_char == '\n' or len(decoded_sentence) > self.num_decoder_tokens):
                stop_condition = True

            # Update the target sequence with the sampled token
            target_seq = np.zeros((1, 1, self.num_decoder_tokens))
            target_seq[0, 0, sampled_token_index] = 1.

            # Update the encoder states
            states_value = [h, c]

        return decoded_sentence

    def train(self, val_split=0.2):
        # Split data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(self.encoder_input_data, self.decoder_input_data,
                                                          test_size=val_split, random_state=42)

        # Train the NMT model
        history = self.model.fit([X_train, y_train], y_train, batch_size=self.batch_size, epochs=self.epochs,
                                 validation_data=([X_val, y_val], y_val))