In [1]:
import torch
import pickle
import pandas as pd
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
import importlib

In [2]:
# Set device: use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model for NER
model_name = 'dslim/distilbert-NER'

# Custom keywords
# The pickle is looked up first under keyword_matching/ and then in the current
# working directory.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load a directory.pkl that comes from a trusted source.
try:
    with open('keyword_matching/directory.pkl', 'rb') as file:
        keywords = pickle.load(file)
except FileNotFoundError:
    # The previous `except FileNotFoundError or FileExistsError` evaluated the
    # `or` expression first and therefore only ever caught FileNotFoundError;
    # the clause now states exactly that.
    with open('directory.pkl', 'rb') as file:
        keywords = pickle.load(file)

# Hyperparameters

# Sizes
input_size = 512
output_size = 768

# Network / classifier settings
num_layers = 4     # may require tuning
hidden_size = 256  # may require tuning
num_classes = 97   # 96 different relations plus '0' for no relation

# Training settings
learning_rate = 1e-3  # may require tuning
batch_size = 32
num_epochs = 5

# Pair-embedding dimensions
PAIR_EMBEDDING_WIDTH = 1540
PAIR_EMBEDDING_LENGTH = 3000

In [None]:
'''
To do:
- sanity check ending position
- check that indices are correct
'''

In [3]:
# To do:
# - sanity check ending position

# Load every split of the 'docred' dataset into `data`.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally - keep it only for sources you trust.
data = load_dataset(
    'docred',
    trust_remote_code=True,
)

In [10]:
# One DataFrame per split, built in the order: annotated train, distant train,
# test, validation.
train_df, distant_df, test_df, val_df = (
    pd.DataFrame(data[split_key])
    for split_key in ('train_annotated', 'train_distant', 'test', 'validation')
)

In [11]:
def check_indicing(data, get_info_fn=None):
    """Find the rows whose rebuilt sentences do not keep the raw token count.

    For every row, the total number of tokens in ``instance['sents']`` is
    compared with the total number of pieces obtained by splitting each rebuilt
    sentence on a single space. A difference means token indices into the raw
    sentences no longer line up with the rebuilt sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'sents'`` column holding, per row, a list of sentences,
        each sentence being a list of token strings.
    get_info_fn : callable, optional
        Function taking one row and returning ``(sents, vertexSet, labels)``
        where ``sents`` is a list of strings. When omitted, the notebook-level
        ``custom.get_info`` is used, which requires ``custom`` to exist at call
        time.

    Returns
    -------
    list[int]
        Positional row indices (as used with ``data.iloc``) whose counts differ.
    """
    if get_info_fn is None:
        # Looked up lazily so that the global `custom` object is only needed
        # when the caller does not supply its own extractor.
        get_info_fn = custom.get_info

    mismatched = []

    for i in range(len(data)):
        instance = data.iloc[i]
        raw_sents = instance['sents']

        sents, _, _ = get_info_fn(instance)

        raw_token_count = sum(len(sent) for sent in raw_sents)
        rebuilt_token_count = sum(len(sent.split(' ')) for sent in sents)

        if raw_token_count != rebuilt_token_count:
            mismatched.append(i)

    return mismatched

# Run the token-count check on every split; an empty list means no mismatches.
for split_name, split_df in (
    ('train', train_df),
    ('distant', distant_df),
    ('test', test_df),
    ('val', val_df),
):
    print(f'{split_name}:', check_indicing(split_df))

train: []
distant: []
test: []
val: []


In [49]:
# Import the local module and reload it straight away, so that edits made to
# CustomDataset.py are picked up without restarting the kernel.
import CustomDataset
importlib.reload(CustomDataset)

# Build the dataset helper from the notebook-level configuration.
# NOTE(review): the recorded output of this cell shows "Preprocessing skipped.",
# presumably because of length=0 - confirm in CustomDataset.
custom = CustomDataset.CustomDocREDDataset(
    dataset='train_annotated',
    input_size=input_size,
    model_name=model_name,
    custom_keywords=keywords,
    device=device,
    length=0,
)

Starting preprocessing...
Preprocessing skipped.


In [32]:
def standardize_text(text):
    """Return ``text`` after applying a fixed list of substring substitutions.

    The substitutions are applied one after another, in the order listed, each
    on the result of the previous one, so an earlier entry can change what a
    later entry sees.
    """
    substitutions = (
        (' ', '_'),
        ('–', '-'),
        # quick fix: adding space here introduces a lot of issues
        ('\xa0 ', '_'),
        ('\xa0', '_'),
        ('“', '"'),
        ('”', '"'),
    )
    for needle, substitute in substitutions:
        text = text.replace(needle, substitute)
    return text

def get_info(instance):
    """Split one document record into (sentence strings, vertexSet, labels).

    Every token of ``instance['sents']`` is passed through ``standardize_text``.
    The tokens of a sentence are then concatenated, each one prefixed by a
    single space unless the token itself is exactly ``' '``, and the result is
    stripped of surrounding whitespace. ``instance['vertexSet']`` and
    ``instance['labels']`` are returned unchanged.
    """
    sents = []
    for raw_sent in instance['sents']:
        pieces = []
        for raw_word in raw_sent:
            token = standardize_text(raw_word)
            # A token that is a lone space is kept as-is instead of being
            # given an extra separating space.
            pieces.append(token if token == ' ' else ' ' + token)
        sents.append(''.join(pieces).strip())

    return sents, instance['vertexSet'], instance['labels']

In [38]:
# Unpack the current document with the get_info function defined in this notebook.
# `instance` must already be bound; in this file it is assigned by
# `instance = train_df.iloc[2404]` in a cell that appears further down.
sents, vertexSet, labels = get_info(instance)

In [48]:
# NOTE(review): the recorded run of this cell ended with
# "TypeError: list indices must be integers or slices, not str". The cause is
# not visible in this notebook - confirm which form of `sents` (raw token lists
# or joined strings) custom.extract_entities expects.
entities = custom.extract_entities(sents)

TypeError: list indices must be integers or slices, not str

In [42]:
# Attach positions to the extracted entities and show one record per line.
indexed = custom.get_entity_positions(sents, entities, 0)
for entity_record in indexed:
    print(entity_record)

{'s_id': 0, 'pos': [0, 3], 'word': 'The Out Crowd', 'entity': 'ORG'}
{'s_id': 0, 'pos': [5, 6], 'word': 'U.S.', 'entity': 'LOC'}
{'s_id': 0, 'pos': [12, 14], 'word': 'Matt Hollywood', 'entity': 'PER'}
{'s_id': 0, 'pos': [19, 20], 'word': 'The', 'entity': 'MISC'}
{'s_id': 0, 'pos': [20, 23], 'word': 'Brian Jonestown Massacre', 'entity': 'MISC'}
{'s_id': 1, 'pos': [8, 9], 'word': 'Portland', 'entity': 'LOC'}
{'s_id': 1, 'pos': [10, 11], 'word': 'Oregon', 'entity': 'LOC'}
{'s_id': 1, 'pos': [13, 15], 'word': 'Stuart Valentine', 'entity': 'PER'}
{'s_id': 1, 'pos': [17, 19], 'word': 'Elliott Barnes', 'entity': 'PER'}
{'s_id': 1, 'pos': [21, 23], 'word': 'Joe Patterson', 'entity': 'PER'}
{'s_id': 1, 'pos': [26, 28], 'word': 'Sarah Jane', 'entity': 'PER'}
{'s_id': 2, 'pos': [9, 11], 'word': 'Gregg Williams', 'entity': 'PER'}
{'s_id': 2, 'pos': [13, 15], 'word': 'Dandy Warhols', 'entity': 'ORG'}
{'s_id': 2, 'pos': [22, 25], 'word': 'Elephant Stone Records', 'entity': 'ORG'}
{'s_id': 3, 'pos': 

In [59]:
# Pull the three parallel label lists out of `labels`.
head, tail, relation_ids = (
    labels[key] for key in ('head', 'tail', 'relation_id')
)

print('head:', head)
print('tail:', tail)
print('relation:', relation_ids)

# List every vertexSet entry next to its index, which is what head/tail refer to
# in the recorded output.
print('vertexset:')
for vertex_idx, mentions in enumerate(vertexSet):
    print(f'{vertex_idx}: {mentions}')

print(len(tail))

head: [1, 1, 2, 6, 6, 10, 0, 0, 0, 0, 17, 26, 5, 29, 29]
tail: [6, 26, 3, 1, 1, 5, 2, 12, 4, 5, 18, 1, 1, 1, 5]
relation: ['P150', 'P361', 'P463', 'P131', 'P17', 'P740', 'P527', 'P162', 'P571', 'P740', 'P162', 'P527', 'P17', 'P17', 'P740']
vertexset:
0: [{'name': 'The Out Crowd', 'sent_id': 0, 'pos': [0, 3], 'type': 'ORG'}, {'name': 'The Out Crowd', 'sent_id': 5, 'pos': [0, 3], 'type': 'ORG'}]
1: [{'name': 'U.S.', 'sent_id': 0, 'pos': [5, 6], 'type': 'LOC'}]
2: [{'name': 'Matt Hollywood', 'sent_id': 0, 'pos': [12, 14], 'type': 'PER'}, {'name': 'Matt Hollywood', 'sent_id': 6, 'pos': [0, 2], 'type': 'PER'}, {'name': 'Matt Hollywood', 'sent_id': 7, 'pos': [7, 9], 'type': 'PER'}, {'name': 'Matt Hollywood', 'sent_id': 8, 'pos': [3, 5], 'type': 'PER'}]
3: [{'name': 'The Brian Jonestown Massacre', 'sent_id': 0, 'pos': [19, 23], 'type': 'ORG'}]
4: [{'name': '2001', 'sent_id': 1, 'pos': [6, 7], 'type': 'TIME'}]
5: [{'name': 'Portland', 'sent_id': 7, 'pos': [27, 28], 'type': 'LOC'}, {'name': 'Po

In [55]:
# Build the triplets, then print a header, their count, and each one.
triplets = custom.make_triplets(vertexSet, labels)
print('triplets:', len(triplets), sep='\n')
for triplet in triplets:
    print(triplet)

triplets:
35
{'head': {'s_id': 0, 'pos': [5, 6], 'word': 'U.S.'}, 'relation_id': 32, 'tail': {'s_id': 1, 'pos': [10, 11], 'word': 'Oregon'}}
{'head': {'s_id': 0, 'pos': [5, 6], 'word': 'U.S.'}, 'relation_id': 56, 'tail': {'s_id': 5, 'pos': [26, 28], 'word': 'North American'}}
{'head': {'s_id': 0, 'pos': [12, 14], 'word': 'Matt Hollywood'}, 'relation_id': 61, 'tail': {'s_id': 0, 'pos': [19, 23], 'word': 'The Brian Jonestown Massacre'}}
{'head': {'s_id': 6, 'pos': [0, 2], 'word': 'Matt Hollywood'}, 'relation_id': 61, 'tail': {'s_id': 0, 'pos': [19, 23], 'word': 'The Brian Jonestown Massacre'}}
{'head': {'s_id': 7, 'pos': [7, 9], 'word': 'Matt Hollywood'}, 'relation_id': 61, 'tail': {'s_id': 0, 'pos': [19, 23], 'word': 'The Brian Jonestown Massacre'}}
{'head': {'s_id': 8, 'pos': [3, 5], 'word': 'Matt Hollywood'}, 'relation_id': 61, 'tail': {'s_id': 0, 'pos': [19, 23], 'word': 'The Brian Jonestown Massacre'}}
{'head': {'s_id': 1, 'pos': [10, 11], 'word': 'Oregon'}, 'relation_id': 28, 'tail

In [60]:
# Build entity pairs from the positioned entities and show one pair per line.
pairs = custom.make_pairs(indexed)
for entity_pair in pairs:
    print(entity_pair)

({'s_id': 5, 'pos': [26, 28], 'entity': 'MISC'}, {'s_id': 2, 'pos': [22, 25], 'entity': 'ORG'})
({'s_id': 4, 'pos': [10, 12], 'entity': 'PER'}, {'s_id': 1, 'pos': [8, 9], 'entity': 'LOC'})
({'s_id': 1, 'pos': [21, 23], 'entity': 'PER'}, {'s_id': 8, 'pos': [7, 10], 'entity': 'MISC'})
({'s_id': 7, 'pos': [20, 23], 'entity': 'ORG'}, {'s_id': 1, 'pos': [8, 9], 'entity': 'LOC'})
({'s_id': 7, 'pos': [24, 26], 'entity': 'PER'}, {'s_id': 4, 'pos': [10, 12], 'entity': 'PER'})
({'s_id': 8, 'pos': [3, 5], 'entity': 'PER'}, {'s_id': 7, 'pos': [27, 28], 'entity': 'LOC'})
({'s_id': 1, 'pos': [21, 23], 'entity': 'PER'}, {'s_id': 7, 'pos': [7, 9], 'entity': 'PER'})
({'s_id': 1, 'pos': [26, 28], 'entity': 'PER'}, {'s_id': 7, 'pos': [32, 34], 'entity': 'PER'})
({'s_id': 2, 'pos': [9, 11], 'entity': 'PER'}, {'s_id': 5, 'pos': [9, 11], 'entity': 'ORG'})
({'s_id': 4, 'pos': [29, 32], 'entity': 'ORG'}, {'s_id': 7, 'pos': [29, 31], 'entity': 'ORG'})
({'s_id': 7, 'pos': [24, 26], 'entity': 'PER'}, {'s_id': 1,

In [61]:
# Joining with ' ' assumes every element of `sents` is a list of tokens (the raw
# instance['sents'] form). If `sents` holds the strings returned by get_info,
# this puts a space between every character instead.
for token_list in sents:
    print(' '.join(token_list))

The Out Crowd was a U.S. indie rock band featuring bass guitarist Matt Hollywood , a former member of The Brian Jonestown Massacre .
He formed the group in late 2001 in Portland , Oregon with drummer Stuart Valentine , guitarist Elliott Barnes , bassist Joe Patterson and tambourine player Sarah Jane .
Go on , Give a Damn was produced by Gregg Williams ( The Dandy Warhols ) and released in early 2003 on Elephant Stone Records .
The band toured on the American West Coast in 2003 .
Then I Saw The Holy City    was produced by Brian Coates , engineer / producer for The Dandy Warhols , and released in the fall of 2004 on The Kora Records .
The Out Crowd supported fellow neo - psychedelic bands Dead Meadow and The Warlocks nationally in 2004 as well as The Dandy Warhols on their 2005 North American tour .
Matt Hollywood has confirmed that the band broke up in 2006 .
Since the demise of the band , Matt Hollywood has been recording songs and playing shows for his new project The Rebel Drones , 

In [63]:
# Rebind sents, vertexSet and labels using the dataset object's get_info method
# (custom.get_info), not the get_info function defined in this notebook.
sents, vertexSet, labels = custom.get_info(instance)

In [64]:
# Embed the sentences together with the pairs (from custom.make_pairs) and the
# triplets (from custom.make_triplets); the call returns three values.
text_emb, pair_emb, triplet_emb = custom.embed_data(sents, pairs, triplets)

In [65]:
# Print every element of the text embedding in full.
for embedding_row in text_emb:
    print(embedding_row)

[ 1.44060403e-01  4.53592569e-01 -2.50122212e-02  2.37497896e-01
 -2.57223397e-02 -5.22852421e-01  1.54825613e-01  5.71127236e-02
  1.21932015e-01 -1.38418460e+00 -2.21699893e-01 -1.87859416e-01
 -1.74732924e-01  4.49737608e-02 -2.15912104e-01  1.00152023e-01
  2.09360421e-01 -3.21791023e-01 -5.67213178e-01 -5.86593263e-02
  1.44946113e-01  2.00143009e-01  6.38823926e-01  1.70706078e-01
 -2.56217569e-01  1.57894790e-01  1.50390297e-01 -1.14082396e-01
  1.20242983e-02  2.67061710e-01  9.29147154e-02  5.67486405e-01
 -3.75206381e-01  3.25118214e-01  2.84471542e-01  1.84179589e-01
 -2.96160161e-01 -7.22890556e-01 -2.27320671e-01 -9.38450813e-01
 -5.69827855e-01  3.49139631e-01 -2.56377846e-01 -2.99645662e-01
 -4.94283110e-01 -1.01093078e+00 -4.34901059e-01 -2.76903272e-01
 -7.42741525e-01 -3.99222434e-01  4.32994723e-01  6.04438961e-01
  3.77728343e-01  4.24616754e-01 -1.84801891e-01  5.40731788e-01
  1.98659703e-01 -3.91151994e-01  2.05459550e-01  2.24115878e-01
 -4.19934809e-01 -9.38657

In [35]:
# Select one training document by position.
instance = train_df.iloc[2404]

# Raw view: the original tokens of each sentence joined with single spaces.
raw = instance['sents']
for raw_tokens in raw:
    print(' '.join(raw_tokens))
print('\n')

# Processed view: the sentence strings produced by get_info, for comparison.
sents, vertexSet, labels = get_info(instance)
for sentence in sents:
    print(sentence)

The Out Crowd was a U.S. indie rock band featuring bass guitarist Matt Hollywood , a former member of The Brian Jonestown Massacre .
He formed the group in late 2001 in Portland , Oregon with drummer Stuart Valentine , guitarist Elliott Barnes , bassist Joe Patterson and tambourine player Sarah Jane .
Go on , Give a Damn was produced by Gregg Williams ( The Dandy Warhols ) and released in early 2003 on Elephant Stone Records .
The band toured on the American West Coast in 2003 .
Then I Saw The Holy City    was produced by Brian Coates , engineer / producer for The Dandy Warhols , and released in the fall of 2004 on The Kora Records .
The Out Crowd supported fellow neo - psychedelic bands Dead Meadow and The Warlocks nationally in 2004 as well as The Dandy Warhols on their 2005 North American tour .
Matt Hollywood has confirmed that the band broke up in 2006 .
Since the demise of the band , Matt Hollywood has been recording songs and playing shows for his new project The Rebel Drones , 

In [789]:
# Placeholder that stands in for a token consisting of a single space.
SPACE_TOKEN = '[SPACE]'

# `new` is not defined in this cell; it has to exist in the kernel already.
print(new)

# Turn every placeholder back into a real space, modifying `new` in place.
for position, token in enumerate(new):
    if token == SPACE_TOKEN:
        new[position] = ' '

print(new)

['In', '2015/16', 'she', 'co', '-', 'wrote', 'for', '[SPACE]', 'Bridget', 'Jones', "'", 'Diary', '(', 'musical', ')', '[SPACE]', 'with', 'Lily', 'Allen', 'and', 'Greg', 'Kurstin']
['In', '2015/16', 'she', 'co', '-', 'wrote', 'for', ' ', 'Bridget', 'Jones', "'", 'Diary', '(', 'musical', ')', ' ', 'with', 'Lily', 'Allen', 'and', 'Greg', 'Kurstin']


In [47]:
# Read the raw fields straight from the record. This rebinds `sents` to the raw
# token lists, replacing any joined strings produced earlier by get_info.
sents, labels, vertexSet = (
    instance[field] for field in ('sents', 'labels', 'vertexSet')
)

# Show every vertexSet entry on its own line.
for mentions in vertexSet:
    print(mentions)

head, tail = labels['head'], labels['tail']
print(head)


[{'name': 'The Out Crowd', 'sent_id': 0, 'pos': [0, 3], 'type': 'ORG'}, {'name': 'The Out Crowd', 'sent_id': 5, 'pos': [0, 3], 'type': 'ORG'}]
[{'name': 'U.S.', 'sent_id': 0, 'pos': [5, 6], 'type': 'LOC'}]
[{'name': 'Matt Hollywood', 'sent_id': 0, 'pos': [12, 14], 'type': 'PER'}, {'name': 'Matt Hollywood', 'sent_id': 6, 'pos': [0, 2], 'type': 'PER'}, {'name': 'Matt Hollywood', 'sent_id': 7, 'pos': [7, 9], 'type': 'PER'}, {'name': 'Matt Hollywood', 'sent_id': 8, 'pos': [3, 5], 'type': 'PER'}]
[{'name': 'The Brian Jonestown Massacre', 'sent_id': 0, 'pos': [19, 23], 'type': 'ORG'}]
[{'name': '2001', 'sent_id': 1, 'pos': [6, 7], 'type': 'TIME'}]
[{'name': 'Portland', 'sent_id': 7, 'pos': [27, 28], 'type': 'LOC'}, {'name': 'Portland', 'sent_id': 1, 'pos': [8, 9], 'type': 'LOC'}]
[{'name': 'Oregon', 'sent_id': 1, 'pos': [10, 11], 'type': 'LOC'}]
[{'name': 'Stuart Valentine', 'sent_id': 1, 'pos': [13, 15], 'type': 'PER'}, {'name': 'Stuart Valentine', 'sent_id': 7, 'pos': [32, 34], 'type': 'PE