In [1]:
import sys
from tqdm import tqdm
from collections import Counter
import functions as f

### Set hyperparameters

In [2]:
# Select language (used by f.load_vectors and in the vector / training-data file names)
language = "english"
# language = "persian"

# Select number of sentences for each corpus (part of the training-data file names)
num_sents = 250

# Select number of similar words to create new word pairs
# (passed to f.get_conversions_pretrained together with `threshold`)
top_n = 10
# Candidate values for a threshold sweep (kept for reference; the sweep loop is currently disabled):
# threshold = [0.95, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0]
# NOTE(review): presumably a minimum-similarity cutoff — confirm in functions.get_conversions_pretrained
threshold = 0.0

# Select seed for training data (1-5)
seed = 3

# Which corpus is compared against corpus A: its letter is inserted into the second training file name
BorC = "B"
# BorC = "C"

### Load corpora

In [3]:
# corpus_A, corpus_B = f.load_corpus(language)

### Load vectors

In [4]:
# Label of the pretrained embedding family; it only ends up in output file names.
# vector_type = "fastText"
vector_type = "GloVe"
# Pretrained vectors for the selected language (prints as a .magnitude path in the recorded output).
vectors = f.load_vectors(language)
# Per-run vector file whose name is derived from language, sentence count and seed.
bert_vectors = f"vectors/{language}_corpus=A_sents={num_sents}_seed={seed}.magnitude"
# Echo both sources so the run configuration is visible in the cell output.
print(vectors)
print(bert_vectors)

./glove.840B.300d.magnitude
vectors/english_corpus=A_sents=250_seed=3.magnitude


### Get word-word-relation triples for each corpus

In [5]:
# Training files for the two corpora being compared; they differ only in the corpus letter.
# NOTE(review): the "en" directory is hard-coded even though `language` is configurable — confirm
# this is intended before switching to another language.
corpus_A = f"train_data/en/{language}_train_corpus=A_sents={num_sents}_seed={seed}.txt"
corpus_B = f"train_data/en/{language}_train_corpus={BorC}_sents={num_sents}_seed={seed}.txt"

In [6]:
# Parse corpus A's training file.
# list_A is later indexed by (head, dependent) pair and yields that pair's relations;
# sentences_A is handed to the conversion helpers (the printed summary describes it as
# head-dependent-relation:sentence triples).
list_A, sentences_A = f.process_training_data(corpus_A)

250 sentences processed
4,677 tokens processed
Average sentence length:	18.708
4,438 head-dependent:relation pairs
4,447 head-dependent-relation:sentence triples


In [7]:
# Parse the comparison corpus's training file (corpus B or C, depending on BorC).
# list_B is later indexed by (head, dependent) pair and yields that pair's relations;
# sentences_B is handed to the conversion helpers (the printed summary describes it as
# head-dependent-relation:sentence triples).
list_B, sentences_B = f.process_training_data(corpus_B)

250 sentences processed
6,333 tokens processed
Average sentence length:	25.332
5,704 head-dependent:relation pairs
5,723 head-dependent-relation:sentence triples


In [8]:
# output_A = language+"_A_mismatches"+".tsv"
# output_B = language+"_B_mismatches"+".tsv"
# get list for corpus A data
# list_A, sentences_A = f.process_ud_data(corpus_A, num_sents_A)
# get list for corpus B data
# need to keep process_wsj_data for now for English data
# list_B, sentences_B = f.process_wsj_data(corpus_B, num_sents_B)
# list_B, sentences_B = f.process_ud_data(corpus_B, num_sents_B)

In [9]:
# for triple in sentences_A:
#     print(triple, sentences_A[triple])
# for sentence in sentences_A[triple]:
#     print(triple, sentence)

In [10]:
# for triple in sentences_B:
# #     print(triple, sentences_B[triple])
#     for sentence in sentences_B[triple]:
#         print(triple, sentence)

### Compare triples from each corpus to find mismatched relations

In [11]:
# For every (head, dependent) pair that occurs in both corpora, record the
# relation labels that only one of the two corpora uses for that pair:
#   mismatches_A[pair] -> relations used in corpus A but not in corpus B
#   mismatches_B[pair] -> relations used in corpus B but not in corpus A
# Pairs whose relations agree in both directions get no entry at all.
mismatches_A = {}
mismatches_B = {}
for pair_B, relations_B in list_B.items():
    # Only pairs present in both corpora can disagree
    if pair_B not in list_A:
        continue
    relations_A = list_A[pair_B]
    # Build each lookup set once per pair; constructing it inside the
    # comprehension's condition would rebuild it for every element.
    seen_in_A = set(relations_A)
    seen_in_B = set(relations_B)
    # TODO: decide which ones we actually care about...
    # Relations in B NOT in A, and relations in A NOT in B
    # (original order and any duplicates are kept)
    not_in_A = [x for x in relations_B if x not in seen_in_A]
    not_in_B = [x for x in relations_A if x not in seen_in_B]
    # Only record a pair when it actually has one-sided relations
    if not_in_A:
        mismatches_B[pair_B] = not_in_A
    if not_in_B:
        mismatches_A[pair_B] = not_in_B

In [12]:
# Report how many shared (head, dependent) pairs have at least one relation
# that appears in only one of the two corpora, in each direction.
print(f"{len(mismatches_A)} pairs with a relation in corpus A but not in corpus B")
print(f"{len(mismatches_B)} pairs with a relation in corpus B but not in corpus A")

14 pairs with a relation in corpus A but not in corpus B
13 pairs with a relation in corpus B but not in corpus A


### Most Frequent Label

#### Generate conversion dictionaries and spreadsheet for analysis

In [13]:
# "Simple" conversion table for corpus A's mismatched pairs (this notebook section
# labels the approach "Most Frequent Label"). Corpus A's own data comes first,
# followed by the other corpus's pair list.
conversion_A_simple = f.get_conversions_simple(
    mismatches_A, sentences_A, list_A, list_B
)

In [14]:
# "Simple" conversion table for corpus B's mismatched pairs (this notebook section
# labels the approach "Most Frequent Label"). Corpus B's own data comes first,
# followed by corpus A's pair list.
conversion_B_simple = f.get_conversions_simple(
    mismatches_B, sentences_B, list_B, list_A
)

#### Create human-readable spreadsheet for debugging

In [15]:
# human_readable_A_simple = "human_readable/"+corpus_A[11:-4]+"_simple_human_readable.tsv"
# human_readable_B_simple = "human_readable/"+corpus_B[11:-4]+"_simple_human_readable.tsv"

In [16]:
# f.generate_human_readable_output(human_readable_A_simple, mismatches_A, sentences_A, list_A, list_B)
# f.generate_human_readable_output(human_readable_B_simple, mismatches_B, sentences_B, list_B, list_A)

#### Create converted conllu files for corpus A and corpus B

In [17]:
# Output path for the converted corpus: the training path minus its 4-character
# ".txt" extension, plus a suffix naming the conversion method.
# converted_corpus_A_simple = corpus_A[:-4]+"_converted_simple.conllu"
converted_corpus_B_simple = f"{corpus_B[:-4]}_converted_simple.conllu"

In [18]:
# Apply the simple conversion table to corpus B; arguments are input path, output path,
# conversion table. Only the B side is converted here (the A side is left disabled).
# NOTE(review): presumably writes the converted corpus to the output path — confirm in functions.apply_conversions
# f.apply_conversions(corpus_A, converted_corpus_A_simple, conversion_A_simple)
f.apply_conversions(corpus_B, converted_corpus_B_simple, conversion_B_simple)

### Pretrained Word Vectors

#### Generate conversion dictionaries and spreadsheet for analysis

In [19]:
# thresh = 0.0

In [20]:
# for thresh in threshold:
#     conversion_A_pretrained = f.get_conversions_pretrained(mismatches_A, 
#                                                            sentences_A, 
#                                                            list_A, 
#                                                            list_B,
#                                                            vectors,
#                                                            top_n,
#                                                            thresh)
#     converted_corpus_A_pretrained = corpus_A[:-4]+"_converted_pretrained="+vector_type+"_thresh="+str(thresh)+".conllu"
#     f.apply_conversions(corpus_A, converted_corpus_A_pretrained, conversion_A_pretrained)

In [21]:
# Output path carries the embedding family name so runs with different vectors don't collide.
converted_corpus_B_pretrained = f"{corpus_B[:-4]}_converted_pretrained={vector_type}.conllu"
# Conversion table for corpus B's mismatched pairs, driven by the pretrained vectors
# together with the top_n / threshold settings from the configuration cell.
conversion_B_pretrained = f.get_conversions_pretrained(
    mismatches_B, sentences_B, list_B, list_A, vectors, top_n, threshold
)
# Input path, output path, conversion table.
f.apply_conversions(corpus_B, converted_corpus_B_pretrained, conversion_B_pretrained)

#### Create converted files for corpus A and corpus B

In [22]:
# converted_corpus_A_pretrained = corpus_A[:-4]+"_converted_pretrained="+vector_type+".conllu"
# converted_corpus_B_pretrained = corpus_B[:-4]+"_converted_pretrained="+vector_type+".conllu"

In [23]:
# f.apply_conversions(corpus_A, converted_corpus_A_pretrained, conversion_A_pretrained)
# f.apply_conversions(corpus_B, converted_corpus_B_pretrained, conversion_B_pretrained)

In [24]:
# Conversion table for corpus B's mismatched pairs, driven by the BERT vector file
# named in `bert_vectors` instead of the pretrained `vectors` source. Passing
# `vectors` here would make the "_converted_BERT" output a duplicate of the
# pretrained-vector run and leave `bert_vectors` unused.
# NOTE(review): assumes get_conversions_pretrained accepts the bert_vectors path the
# same way it accepts `vectors` — confirm in functions.get_conversions_pretrained.
conversion_B_BERT = f.get_conversions_pretrained(
    mismatches_B,
    sentences_B,
    list_B,
    list_A,
    bert_vectors,
    top_n,
    threshold,
)
# Output path: training path minus its ".txt" extension, tagged with the vector source.
converted_corpus_B_BERT = corpus_B[:-4]+"_converted_BERT.conllu"
# Input path, output path, conversion table.
f.apply_conversions(corpus_B, converted_corpus_B_BERT, conversion_B_BERT)