In [9]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import sys
import os

# Root folder of the project on the mounted Google Drive.
path = "/content/drive/MyDrive/NLP_Project_New"

# Make modules stored in the project folder importable. The membership check
# keeps this cell idempotent: re-running it does not add duplicate entries
# to sys.path.
project_root = os.path.abspath(path)
if project_root not in sys.path:
  sys.path.append(project_root)

In [11]:
import os
import re
import random
import pandas as pd
import nlp_project_functions as functions

# Path of the data_preprocessed folder inside the project directory.
# NOTE(review): the name `dir` shadows Python's built-in dir() in this notebook's
# namespace; consider renaming it once every use of the variable has been checked.
dir = f"{path}/data/data_preprocessed"

In [12]:
# Load the training split. Later cells iterate over `texts` and `labels` in parallel,
# one entry per sentence (presumably a token list and a tag list each — confirm in
# nlp_project_functions.read_conll_data).
texts, labels = functions.read_conll_data(f"{path}/data/train_test_val/train.tsv")

In [13]:
def sent_has_names(ls: list) -> bool:
  """Tell whether a sentence's label sequence contains a name label.

  A label counts as a name label when it contains the character "B" or "I"
  (assumes BIO-style tags such as "B-PER" / "I-LOC" next to a plain "O" —
  TODO confirm against the tag set produced by read_conll_data).

  Args:
    ls: The labels of one sentence, as strings.

  Returns:
    True if at least one label contains "B" or "I"; False otherwise,
    including for an empty list.
  """
  ners = ("B", "I")
  # any() stops at the first matching label instead of first collecting
  # every matching label into a throw-away list.
  return any(marker in label for label in ls for marker in ners)

In [14]:
# Number of entries (sentences) in `texts`, i.e. in the training split loaded above.
len(texts)

23440

In [15]:
# Build progressively smaller subsets of `texts` / `labels` (1/2, 1/4, ..., 1/64
# of the sentences). Sentences are taken in corpus order: every sentence with
# names is accepted, while sentences without names are accepted only until
# their quota (3/4 of the target size) is used up.
for i in [2, 4, 8, 16, 32, 64]:
  subset = len(texts) // i  # target number of sentences in this subset
  no_ne_ratio = 3 * (subset // 4)  # max. sentences without names, so at least ~25% contain names

  nr = 0       # sentences accepted so far
  no_ners = 0  # accepted sentences that contain no names

  subset_texts = []
  subset_labels = []

  for text, label in zip(texts, labels):
    if nr == subset:
      break

    if not sent_has_names(label):
      if no_ners >= no_ne_ratio:
        continue  # quota for sentences without names is exhausted
      no_ners += 1

    # Flatten the sentence into the two output columns; the empty string
    # appended after each sentence becomes a separator row.
    subset_texts.extend(text)
    subset_texts.append("")
    subset_labels.extend(label)
    subset_labels.append("")
    nr += 1

  train_smaller = pd.DataFrame(list(zip(subset_texts, subset_labels)))

  # The export is commented out, so running this cell writes nothing to disk.
  #train_smaller.to_csv(f"{path}/data/train_test_val/train_smaller/{i}_part_train.tsv", sep="\t", index=False, header=False)




# General statistics about the dataset

## 1. NE-to-token ratio

In [16]:
# Statistics for the complete data set (all_data.tsv).
# Note: this rebinds the notebook-level `texts` / `labels`.
texts, labels = functions.read_conll_data(f"{path}/data/train_test_val/all_data.tsv")

# Flatten the per-sentence label lists into one list of token labels.
all_labels = []
for sentence_labels in labels:
  all_labels.extend(sentence_labels)

# Split the token labels by entity type; every label ending in PER / LOC is counted.
persons, locations = [], []
for tag in all_labels:
  if tag.endswith("PER"):
    persons.append(tag)
  if tag.endswith("LOC"):
    locations.append(tag)

ne = len(persons) + len(locations)
ne_to_all = (ne / len(all_labels)) * 100
all_ne_sentences = sum(1 for sent in labels if sent_has_names(sent))
ne_sentences_to_all = (all_ne_sentences / len(labels)) * 100
avg_sentence_length = len(all_labels) / len(labels)
avg_ne_per_ne_sent = ne / all_ne_sentences

# One report line per argument; sep="\n" gives the same output as separate print calls.
print(
  f"The complete corpus consists of {len(all_labels):,} tokens.",
  f"Of these, {len(persons):,} are names of individuals and {len(locations):,} are names of places. This makes for a total of {ne:,} named entities, or {ne_to_all:.2f}% of all tokens.",
  f"The complete corpus consists of {len(labels):,} sentences, of which {all_ne_sentences:,} (={ne_sentences_to_all:.2f}%) contain named entities.",
  f"The average sentence consists of {avg_sentence_length:.2f} words.",
  f"The sentences with named entities contain on average {avg_ne_per_ne_sent:.2f} named entities.",
  sep="\n",
)

The complete corpus consists of 798,912 tokens.
Of these, 12,364 are names of individuals and 3,382 are names of places. This makes for a total of 15,746 named entities, or 1.97% of all tokens.
The complete corpus consists of 30,873 sentences, of which 7,198 (=23.31%) contain named entities.
The average sentence consists of 25.88 words.
The sentences with named entities contain on average 2.19 named entities.


In [18]:
for i in ["train", "test"]:
  i_texts, i_labels = functions.read_conll_data(f"{path}/data/train_test_val/{i}.tsv")
  i_all_labels = [x for xs in i_labels for x in xs]
  i_persons = [s for s in i_all_labels if s.endswith("PER")]
  i_locations = [s for s in i_all_labels if s.endswith("LOC")]

  i_ne = len(i_persons) + len(i_locations)
  i_ne_to_all = (i_ne/len(i_all_labels)) * 100
  i_all_ne_sentences = len([sent for sent in i_labels if sent_has_names(sent)])
  i_ne_sentences_to_all = (i_all_ne_sentences/len(i_labels)) * 100
  i_avg_sentence_length = len(i_all_labels)/len(i_labels)
  i_avg_ne_per_ne_sent = i_ne/i_all_ne_sentences

  print(f"The {i} corpus consists of {len(i_all_labels):,} tokens.")
  print(f"Of these, {len(i_persons):,} are names of individuals and {len(i_locations):,} are names of places.\nThis makes for a total of {i_ne:,} named entities, or {i_ne_to_all:.2f}% of all {i} tokens.")
  print(f"The complete {i} corpus consists of {len(i_labels):,} sentences, of which {i_all_ne_sentences:,} (={i_ne_sentences_to_all:.2f}%) contain named entities.")
  print(f"The average sentence in the {i} corpus consists of {i_avg_sentence_length:.2f} words.")
  print(f"The sentences with named entities in the {i} corpus contain on average {i_avg_ne_per_ne_sent:.2f} named entities.")
  print("\n=======\n")


The train corpus consists of 638,813 tokens.
Of these, 9,699 are names of individuals and 2,760 are names of places.
This makes for a total of 12,459 named entities, or 1.95% of all train tokens.
The complete train corpus consists of 23,440 sentences, of which 5,637 (=24.05%) contain named entities.
The average sentence in the train corpus consists of 27.25 words.
The sentences with named entities in the train corpus contain on average 2.21 named entities.


The test corpus consists of 157,050 tokens.
Of these, 2,450 are names of individuals and 619 are names of places.
This makes for a total of 3,069 named entities, or 1.95% of all test tokens.
The complete test corpus consists of 5,859 sentences, of which 1,436 (=24.51%) contain named entities.
The average sentence in the test corpus consists of 26.80 words.
The sentences with named entities in the test corpus contain on average 2.14 named entities.




In [19]:
# Statistics for the training subset stored in train_lessdense_names.tsv.
# NOTE(review): the export cell at the end of this notebook writes a file with this
# exact path, so on a fresh runtime the file must already exist before this cell runs.
# NOTE(review): the printed sentences say "complete corpus" although the numbers
# describe this subset only.
# Note: this rebinds the notebook-level `texts` / `labels`.
texts, labels = functions.read_conll_data(f"{path}/data/train_test_val/train_lessdense_names.tsv")

# One flat list with the label of every token.
all_labels = [tag for sentence_tags in labels for tag in sentence_tags]

# Token labels by entity type; every label ending in PER / LOC is counted.
persons = [tag for tag in all_labels if tag.endswith("PER")]
locations = [tag for tag in all_labels if tag.endswith("LOC")]

ne = len(persons) + len(locations)
ne_to_all = (ne / len(all_labels)) * 100
all_ne_sentences = sum(1 for sent in labels if sent_has_names(sent))
ne_sentences_to_all = (all_ne_sentences / len(labels)) * 100
avg_sentence_length = len(all_labels) / len(labels)
avg_ne_per_ne_sent = ne / all_ne_sentences

# Assemble the report first, then emit it in a single print call.
report_lines = [
  f"The complete corpus consists of {len(all_labels):,} tokens.",
  f"Of these, {len(persons):,} are names of individuals and {len(locations):,} are names of places. This makes for a total of {ne:,} named entities, or {ne_to_all:.2f}% of all tokens.",
  f"The complete corpus consists of {len(labels):,} sentences, of which {all_ne_sentences:,} (={ne_sentences_to_all:.2f}%) contain named entities.",
  f"The average sentence consists of {avg_sentence_length:.2f} words.",
  f"The sentences with named entities contain on average {avg_ne_per_ne_sent:.2f} named entities.",
]
print("\n".join(report_lines))

The complete corpus consists of 232,464 tokens.
Of these, 9,699 are names of individuals and 2,760 are names of places. This makes for a total of 12,459 named entities, or 5.36% of all tokens.
The complete corpus consists of 6,197 sentences, of which 5,637 (=90.96%) contain named entities.
The average sentence consists of 37.51 words.
The sentences with named entities contain on average 2.21 named entities.


In [None]:
# create corpus with higher density of names

# Note: this rebinds the notebook-level `texts` / `labels` to the training split.
texts, labels = functions.read_conll_data(f"{path}/data/train_test_val/train.tsv")

# Every sentence that contains names is kept; sentences without names are kept
# only until this many have been collected.
no_ne_ratio = 560
nr = 0       # sentences kept so far
no_ners = 0  # kept sentences that contain no names

subset_texts = []
subset_labels = []

for text, label in zip(texts, labels):
  if not sent_has_names(label):
    if no_ners >= no_ne_ratio:
      continue  # quota for sentences without names is exhausted
    no_ners += 1

  # Flatten the sentence into the two output columns; the empty string
  # appended after each sentence becomes a separator row.
  subset_texts.extend(text)
  subset_texts.append("")
  subset_labels.extend(label)
  subset_labels.append("")
  nr += 1

train_dense = pd.DataFrame(list(zip(subset_texts, subset_labels)))

# Sanity check: sentences read, sentences kept, rows in the resulting frame
# (tokens plus one separator row per kept sentence).
print(len(texts))
print(nr)
print(len(train_dense))

In [None]:
# Write the two-column frame as a tab-separated file without header row and without index column.
# NOTE(review): an earlier statistics cell reads train_lessdense_names.tsv, so on a fresh
# runtime this file has to exist before that cell is executed.
train_dense.to_csv(f"{path}/data/train_test_val/train_lessdense_names.tsv", sep="\t", index=False, header=False)