In [2]:
import json
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

def load_data(claims_file, evidence_file, seed=None):
    """Build labelled (claim, evidence) text pairs from two JSON files.

    For every claim, each listed evidence id that exists in the evidence
    file yields a positive pair (label 1). The same number of negative
    pairs (label 0) as the claim lists evidence ids is then drawn at random,
    without replacement, from the evidence ids the claim does not list.

    Args:
        claims_file: Path to a JSON object mapping claim id to a dict with
            at least the keys 'claim_text' and 'evidences' (a list of
            evidence ids).
        evidence_file: Path to a JSON object mapping evidence id to
            evidence text.
        seed: Optional seed for the negative sampler. Pass an int to make
            the sampled negatives reproducible; None draws fresh entropy.

    Returns:
        A tuple (pairs, labels) where pairs is a list of
        (claim_text, evidence_text) tuples and labels is a parallel list
        of ints (1 = positive, 0 = negative).

    Notes:
        If fewer non-listed evidence ids exist than negatives requested,
        all of them are used instead of raising.
    """
    with open(claims_file, 'r', encoding='utf-8') as file:
        claims_data = json.load(file)
    with open(evidence_file, 'r', encoding='utf-8') as file:
        evidence_data = json.load(file)

    rng = np.random.default_rng(seed)

    pairs = []
    labels = []

    # Positional view of the ids so negatives can be sampled by index
    # without materialising a per-claim candidate list.
    all_evidence_ids = list(evidence_data)
    n_evidence = len(all_evidence_ids)

    for info in claims_data.values():
        claim_text = info['claim_text']
        positive_evidences = info['evidences']
        positive_ids = set(positive_evidences)

        # Positive pairs: dict membership is O(1), unlike scanning the id list.
        for evidence_id in positive_evidences:
            if evidence_id in evidence_data:
                pairs.append((claim_text, evidence_data[evidence_id]))
                labels.append(1)  # Positive label

        # Negative pairs: draw enough distinct indices that, even if every
        # listed positive is hit, the requested number of negatives remains.
        n_negative = len(positive_evidences)
        if n_negative == 0 or n_evidence == 0:
            continue
        n_draw = min(n_evidence, n_negative + len(positive_ids))
        drawn = rng.choice(n_evidence, size=n_draw, replace=False)
        negative_ids = [
            all_evidence_ids[idx]
            for idx in drawn
            if all_evidence_ids[idx] not in positive_ids
        ][:n_negative]

        for neg_id in negative_ids:
            pairs.append((claim_text, evidence_data[neg_id]))
            labels.append(0)  # Negative label

    return pairs, labels

# Build labelled (claim, evidence) pairs for the training and validation splits;
# both splits draw their evidence texts from the same filtered evidence file.
train_pairs, train_labels = load_data(
    claims_file='train-claims.json',
    evidence_file='filtered_evidence.json',
)
dev_pairs, dev_labels = load_data(
    claims_file='dev-claims.json',
    evidence_file='filtered_evidence.json',
)


KeyboardInterrupt: 

In [9]:
# Sanity check: show the second (claim_text, evidence_text) tuple in train_pairs
print(train_pairs[1])

('Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.')


In [5]:
def preprocess_data(pairs, tokenizer=None, max_length=None):
    """Turn (claim, evidence) text pairs into two padded integer matrices.

    Args:
        pairs: Non-empty sequence of (claim_text, evidence_text) tuples.
        tokenizer: Optional already-fitted Keras Tokenizer. When given it is
            reused as-is, so the token ids agree with the data it was fitted
            on. When None, a new Tokenizer is fitted on all texts in `pairs`.
        max_length: Optional padded sequence length. When None, the length
            of the longest tokenised claim or evidence in `pairs` is used.

    Returns:
        (claim_padded, evidence_padded, tokenizer, max_length): the two
        padded arrays, plus the tokenizer and length that were used, so a
        later call can reuse them.

    Raises:
        ValueError: If `pairs` is empty.
    """
    if len(pairs) == 0:
        raise ValueError("preprocess_data requires at least one (claim, evidence) pair")

    # Unzip pairs
    claim_texts, evidence_texts = zip(*pairs)

    # Fit a vocabulary only when the caller did not supply one
    if tokenizer is None:
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(claim_texts + evidence_texts)

    # Convert texts to sequences of token ids
    claim_seqs = tokenizer.texts_to_sequences(claim_texts)
    evidence_seqs = tokenizer.texts_to_sequences(evidence_texts)

    # Pad both sides to one common length
    if max_length is None:
        max_length = max(len(seq) for seq in claim_seqs + evidence_seqs)
    # NOTE: with pad_sequences' defaults, sequences longer than max_length are
    # truncated rather than rejected (see the Keras pad_sequences documentation).
    claim_padded = pad_sequences(claim_seqs, maxlen=max_length)
    evidence_padded = pad_sequences(evidence_seqs, maxlen=max_length)

    return claim_padded, evidence_padded, tokenizer, max_length

# Process training data: this fits the vocabulary and fixes the padded length
claim_train, evidence_train, tokenizer, max_length = preprocess_data(train_pairs)
# Process validation data with the training tokenizer and length, so token ids
# match the training vocabulary and the arrays have the same width as the
# training inputs.
claim_val, evidence_val, _, _ = preprocess_data(
    dev_pairs, tokenizer=tokenizer, max_length=max_length
)

train_labels = np.array(train_labels)
dev_labels = np.array(dev_labels)


In [6]:
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Lambda, Dense
def create_siamese_model(vocab_size, embedding_dim, max_length):
    """Build and compile a siamese network that scores a (claim, evidence) pair.

    Both inputs pass through the same Embedding layer and the same 64-unit
    SimpleRNN. The element-wise absolute difference of the two encodings is
    fed to a single sigmoid unit.

    Args:
        vocab_size: Size of the embedding table (input_dim of the Embedding).
        embedding_dim: Width of each embedding vector.
        max_length: Number of token ids per input sequence.

    Returns:
        A Keras Model with inputs named 'claim_input' and 'evidence_input',
        compiled with Adam, binary cross-entropy loss and an accuracy metric.
    """
    # One token-id input per side of the pair
    pair_inputs = [
        Input(shape=(max_length,), name=branch_name)
        for branch_name in ('claim_input', 'evidence_input')
    ]

    # A single embedding table and a single recurrent encoder shared by both sides
    token_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)
    sentence_encoder = SimpleRNN(64)

    # Claim is encoded first, then evidence
    encodings = [sentence_encoder(token_embedding(branch)) for branch in pair_inputs]

    # |claim - evidence| per feature, then a one-unit sigmoid classifier
    abs_difference = Lambda(lambda tensors: tf.abs(tensors[0] - tensors[1]))(encodings)
    match_probability = Dense(1, activation='sigmoid')(abs_difference)

    siamese = Model(inputs=pair_inputs, outputs=match_probability)
    siamese.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return siamese

# Instantiate the siamese network
embedding_dim = 100
# Embedding table size is one more than the number of words the tokenizer knows
vocab_size = 1 + len(tokenizer.word_index)
model = create_siamese_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_length=max_length,
)



In [7]:
# Train for 10 epochs with batches of 16, validating on the dev pairs
model.fit(
    x=[claim_train, evidence_train],
    y=train_labels,
    batch_size=16,
    epochs=10,
    validation_data=([claim_val, evidence_val], dev_labels),
)



Epoch 1/10


: 