In [72]:
import pandas as pd
import numpy as np
import torch

from ClaimEvicenceDataset import ClaimEvidenceLstmDataset
from SimpleTokenizer import SimpleTokenizer

In [44]:
def calc_label_distribution(dataset, num_classes=2):
    """Count how many rows of ``dataset`` carry each integer class label.

    Args:
        dataset: Any object whose ``dataset["label"]`` is iterable and yields
            integer labels (e.g. a pandas DataFrame with a ``label`` column).
        num_classes: Number of distinct classes; labels must lie in
            ``[0, num_classes)``. Defaults to 2 (binary neg/pos).

    Returns:
        A list of length ``num_classes`` whose ``i``-th entry is the number of
        rows labelled ``i``.

    Raises:
        IndexError: If a label is negative or ``>= num_classes``.
    """
    label_distribution = [0] * num_classes
    for label in dataset["label"]:
        # A negative label would otherwise wrap around via Python's negative
        # indexing and be silently counted in the wrong slot.
        if label < 0:
            raise IndexError(f"label {label} is outside the range [0, {num_classes})")
        label_distribution[label] += 1
    return label_distribution

In [45]:
# Read the raw training split and count how many rows carry each label.
train_dataset = pd.read_csv("./data/train.csv")
train_label_distrib = calc_label_distribution(train_dataset)
print("Training data distribution:", train_label_distrib)

# Inverse-frequency class weights: the total row count divided by each class's
# row count, so the rarer class ends up with the larger weight.
total = np.asarray(train_label_distrib).sum()
class_weights = total / np.asarray(train_label_distrib)
print("Weight for neg/pos class:", class_weights)

Training data distribution: [15654, 5854]
Weight for neg/pos class: [1.37396193 3.67406901]


In [46]:
# Load the development split and report its per-label row counts.
dev_dataset = pd.read_csv("./data/dev.csv")
dev_label_distrib = calc_label_distribution(dev_dataset)
print("Development data distribution:",dev_label_distrib)

# Build a class-balanced ("uniformed") subset: keep every row labelled 1, plus
# the first N rows (in file order) carrying any other label, where N is the
# number of label-1 rows.
# NOTE(review): taking the *first* N negatives is deterministic but is not a
# random sample of the negative class.
max_neg_labels = dev_label_distrib[1]
selected_idx = []
num_neg_labels = 0
for idx, label in enumerate(dev_dataset["label"]):
    if label == 1:
        selected_idx.append(idx)
    elif num_neg_labels < max_neg_labels:
        selected_idx.append(idx)
        num_neg_labels += 1
uniform_dev_dataset = dev_dataset.iloc[selected_idx]
uniformed_dev_label_distrib = calc_label_distribution(uniform_dev_dataset)
print("Uniformed development data distribution:",uniformed_dev_label_distrib)

# index=False keeps the saved file's columns the same as dev.csv; without it
# pandas also writes the row index as an extra unnamed leading column.
uniform_dev_dataset.to_csv("./data/dev_uniformed.csv", index=False)

Development data distribution: [4286, 1640]
Uniformed development data distribution: [1640, 1640]


In [74]:
def _export_lstm_datasets(train_kwargs, train_pt, embedding_pt, vocab_pt, dev_pt, dev_uniformed_pt):
    """Build and save the train / dev / uniformed-dev datasets for one tokenizer setup.

    The caller prints the "Loading Training Set" banner before calling this,
    because that banner differs between tokenizer setups.

    Args:
        train_kwargs: Extra keyword arguments forwarded to the training
            ``ClaimEvidenceLstmDataset`` constructor (e.g. ``tokenizer=...``).
        train_pt: Path passed to ``save_dataset`` for the training set.
        embedding_pt: Path passed to ``save_embedding_mat``.
        vocab_pt: Path passed to ``save_vocab``.
        dev_pt: Path passed to ``save_dataset`` for the development set.
        dev_uniformed_pt: Path passed to ``save_dataset`` for the uniformed
            development set.

    Returns:
        ``(train_set, uniformed_dev_set)`` — the training dataset and the
        last validation dataset that was built.
    """
    train_set = ClaimEvidenceLstmDataset("./data/train.csv", type="train", **train_kwargs)
    train_set.save_dataset(train_pt)
    train_set.save_embedding_mat(embedding_pt)
    train_set.save_vocab(vocab_pt)

    # Both validation sets reuse the vocab of the training set built above.
    print("\n==> Loading Development Set from file:")
    dev_set = ClaimEvidenceLstmDataset("./data/dev.csv", vocab=train_set.vocab, type="validation")
    dev_set.save_dataset(dev_pt)

    print("\n==> Loading Uniformed Development Set from file:")
    dev_set = ClaimEvidenceLstmDataset("./data/dev_uniformed.csv", vocab=train_set.vocab, type="validation")
    dev_set.save_dataset(dev_uniformed_pt)

    return train_set, dev_set


print("==> Loading Training Set from file, using simple tokenizer:")
train_dataset, val_dataset = _export_lstm_datasets(
    train_kwargs={"tokenizer": SimpleTokenizer(to_lower=False, keep_punctuation=False)},
    train_pt="./data/pt/train_dataset.pt",
    embedding_pt="./data/pt/default_embedding_mat.pt",
    vocab_pt="./data/default_vocab.pt",
    dev_pt="./data/pt/dev_dataset.pt",
    # NOTE(review): unlike every other dataset file, this one is saved directly
    # under ./data/ rather than ./data/pt/ — confirm downstream loaders expect that.
    dev_uniformed_pt="./data/dev_uniformed_dataset.pt",
)

print("\n\n==> Loading Training Set from file, using POS tokenizer:")
train_dataset, val_dataset = _export_lstm_datasets(
    # No tokenizer argument is passed, so the dataset class uses its own default
    # (presumably the POS tokenizer named in the banner — TODO confirm).
    train_kwargs={},
    train_pt="./data/pt/pos_train_dataset.pt",
    embedding_pt="./data/pt/pos_embedding_mat.pt",
    vocab_pt="./data/pos_vocab.pt",
    dev_pt="./data/pt/pos_dev_dataset.pt",
    dev_uniformed_pt="./data/pt/pos_dev_uniformed_dataset.pt",
)

==> Loading Training Set from file, using simple tokenizer:
Loading 'word2vec-google-news-300' as pretrained word embeddings...
Loading training dataset and building vocab...  row (21507/21508)
Building Word Embeddings...
Converting Text to Sequences...
Done!
Dataset saved to: ./data/pt/train_dataset.pt
Word Embedding Matrix saved to: ./data/pt/default_embedding_mat.pt
Vocab saved to: ./data/default_vocab.pt

==> Loading Development Set from file:
Loading validation dataset...   row (5925/5926)
Converting Text to Sequences...
Using exist vocab...
Done!
Dataset saved to: ./data/pt/dev_dataset.pt

==> Loading Uniformed Development Set from file:
Loading validation dataset...   row (3279/3280)
Converting Text to Sequences...
Using exist vocab...
Done!
Dataset saved to: ./data/dev_uniformed_dataset.pt
==> Loading Training Set from file, using POS tokenizer:
Loading 'word2vec-google-news-300' as pretrained word embeddings...
Loading training dataset and building vocab...  row (21507/21508)
