In [9]:
import requests
import json

def fetch_data(url):
    response = requests.get(url)
    data = response.json()
    return data

def parse_data(data):
    conversations = {}

    for entry in data['rows']:
        conver_id = entry['row']['conversation_id']
        if conver_id not in conversations:
            conversations[conver_id] = []
        
        conversations[conver_id].append({
            'speaker': entry['row']['speaker'],
            'datetime': entry['row']['date_time'],
            'text': entry['row']['text']

        })
    return conversations

def preprocess_text(text):
    return text.lower()

def prepare_training_data(conversations):
    processed_data = []
    for convo_id, messages in conversations.items():
        processed_convo = []
        for message in messages:
            processed_text = preprocess_text(message['text'])
            processed_convo.append((message['speaker'], processed_text))
        processed_data.append(processed_convo)
    return processed_data


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
import numpy as np

def train_model(training_data):
    inputs, outputs = [], []


    for data in training_data:
        for i in range(len(data) -1):
            inputs.append(data[i][1])
            outputs.append(data[i + 1][1])
    
    # Tokenize text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(inputs + outputs)
    input_sequences = tokenizer.texts_to_sequences(inputs)
    output_sequences = tokenizer.texts_to_sequences(outputs)


    # Pad sequences
    max_sequence_len = max([len(x) for x in input_sequences])
    input_padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='post')
    output_padded = pad_sequences(output_sequences, maxlen=max_sequence_len, padding='post')

    # Model parameters
    vocab_size = len(tokenizer.word_index) + 1
    embedding_dim = 50
    lstm_units = 128

    # Define an encoder
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Define a decoder
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    # Compile the model
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    decoder_input_data = np.zeros_like(output_padded)
    decoder_input_data[:, 1:] = output_padded[:, :-1]
    decoder_input_data[:, 0] = tokenizer.word_index['<start>']  # Assuming you have <start> token

    decoder_target_data = np.expand_dims(output_padded, -1)

    model.fit([input_padded, decoder_input_data], decoder_target_data,
          batch_size=64,
          epochs=50,
          validation_split=0.2)
    
    model.save('seq2seq_conversation_model.h5')




url = 'https://datasets-server.huggingface.co/rows?dataset=talkmap%2Fbanking-conversation-corpus&config=default&split=train&offset=1&length=100'

data = fetch_data(url)  
conversations = parse_data(data)
#print(json.dumps(conversations, indent=2))  # This prints the JSON data with indentation for readability
training_data = prepare_training_data(conversations)
#print('training_data', json.dumps(training_data, indent=2))
train_model(training_data)


KeyError: '<start>'