<a href="https://colab.research.google.com/github/ua-datalab/QNLP/blob/main/data_cleanup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleanup Code

This notebook reads the Uspantekan data files (`.conllu`) and removes the non-ASCII characters from their file names. It then extracts, from each file, the lines that contain the Spanish translations of the text along with their line IDs, and adds a label (0, 1, ...) to indicate which file each line came from. Finally, all the lines are shuffled and split into train, test and dev files to be fed to the lambeq pipeline.

Currently, we are working on `Bailes_de_Uspantán.conllu` and `Bailes_de_Uspantan.conllu`.

In [None]:
# Standard library: used below to list the .conllu files in the working directory.
import os
# NOTE(review): the returned listing is discarded; because this is not the
# last expression of the cell, nothing is displayed either.
os.listdir()
# NOTE(review): "." is the current directory, so this leaves the working
# directory unchanged.
os.chdir(".")

# Third party: used below to shuffle the extracted sentences.
import numpy as np

Rename files to remove non-ASCII characters:

In [None]:
# ASCII converter to rename files with non-ASCII characters:
# The substitution deletes every character outside the ASCII range
# (\x00-\x7F) from the name of each file matched by `*`.
# NOTE(review): `-n` presumably puts `rename` in "no action" mode, i.e. it only
# prints the planned renames without touching the files — confirm against the
# installed `rename` variant and drop `-n` to actually rename.
!rename -n 's/[^\x00-\x7F]//g' *


rename(Bailes_de_Uspantán.conllu, Bailes_de_Uspantan.conllu)
rename(Educación_en_la_comunidad.conllu, Educacion_en_la_comunidad.conllu)


In [None]:
def file_open(ls: list):
  """Read every file named in ``ls`` and return their lines.

  Args:
    ls: Paths of the text files to read.

  Returns:
    A list with one entry per file, in the same order as ``ls``. Each entry
    is the list of lines of that file, with line endings kept.
  """
  data_list = []
  for path in ls:
    # `with` closes the handle even if reading fails. The encoding is set
    # explicitly because the data contains non-ASCII characters and the
    # platform default is not guaranteed to be UTF-8 (the encoding the
    # CoNLL-U format specifies).
    with open(path, "r", encoding="utf-8") as f:
      data_list.append(f.readlines())
  if len(data_list) >0:
    print("file import successful\n no. of files found:", len(data_list))
  return data_list

Import files:

In [None]:
# os.listdir() returns names in arbitrary order. The position of a file in
# data_list is used further down as its class label, so sort the names to
# give every file the same label on every run and machine.
conllu_files = sorted(name for name in os.listdir(".") if name.endswith('.conllu'))
data_list = file_open(conllu_files)

file import successful
 no. of files found: 2


In [None]:
# extract list of sentence IDs and text in spanish:
def sent_extractor(data:list, label:int):
  """Collect the sentence IDs and sentence texts found in ``data``.

  Lines containing ``sent_id`` contribute an ID and lines containing
  ``text `` contribute a sentence; in both cases the value is whatever
  follows the first ``=`` on the line. IDs and sentences are paired by
  their order of appearance.

  Args:
    data: Lines of one data file.
    label: Class label prepended to every sentence of this file.

  Returns:
    A dict mapping each sentence ID to ``"<label>  <sentence> ."``.
  """
  sent_ids = []
  text_spn = []
  for line in data:
      if "sent_id" in line:
          # maxsplit=1 keeps everything after the first "=", so a value
          # that itself contains "=" is not truncated.
          sent_ids.append(line.split("=", 1)[1].strip())
      elif "text " in line:
      # NOTE(review): the alternative key below was left by the author —
      # confirm which of the two fields is the one wanted here.
      # elif "text[spn]" in line:
          text_spn.append(line.split("=", 1)[1].strip(" ,\n"))
  print("number of ids and sentences in dataset:",
        len(sent_ids), len(text_spn))
  if len(sent_ids) != len(text_spn):
    # Pairing is positional, so a missing ID or sentence shifts every later
    # pair; report it instead of failing with a bare IndexError.
    print("warning: ids and sentences differ in number;",
          "only the first", min(len(sent_ids), len(text_spn)), "are paired")

  # create dictionary with sent_id-text pairs (zip stops at the shorter list):
  spanish_data_dict = {}
  for sent_id, text in zip(sent_ids, text_spn):
    spanish_data_dict[sent_id] = str(label)+ "  "+ text+ " ."

  return spanish_data_dict

In [None]:
# Build the dataset in the lambeq example format: every entry holds the
# classification label (the index of the source file in data_list),
# the sentence, and a closing period.
data = {}
for label, file_lines in enumerate(data_list):
  extracted = sent_extractor(file_lines, label)
  # dict.update replaces existing keys, so a sentence ID that occurs in more
  # than one file keeps only the entry of the last file.
  data.update(extracted)
  print("updated list of sentences: ", len(data))

# Labelled sentences only, without their IDs.
sents = [*data.values()]

number of ids and sentences in dataset: 76 76
updated list of sentences:  76
number of ids and sentences in dataset: 36 36
updated list of sentences:  112


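Next steps:

1. Randomize the order of the sentences in `sents`.
2. Compute 80% and 10% of the length of `sents`, rounded to integers.
3. Split the list into three sublists (train, test and dev).
4. Write each sublist to a file.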

In [None]:
# Randomize sentences
# and save files:
sents = list(data.values())
print(sents[:10])
# A fixed seed makes the shuffle, and therefore the train/test/dev split
# derived from it, identical on every run.
rng = np.random.default_rng(42)
rng.shuffle(sents)  # shuffles the list in place
print(sents[:10])

['0  Estetinyol jun kiitz chawechaq .', '0  eeeh ójor laj qatinmit .', "0  fuert alegre, alegre cuandotpe nimq'iij mayo .", "0  eeeh wi' jun, jun montón jb'anen tran taq pues, eeeh wi' nimq'iij .", "0  este, pero lamentablemente que loq'ori xan perder jun kiitz .", "0  jun kiitz eh tradiciones porque ójor iin xink'íych .", "0  eh talvez injunab' como de... aaaah ocho o nueve años, de siete a ocho, nueve años .", "0  wi' jun xjooj, jun xjooj, jun tradición digamos .", "0  este, lastima xna' desaparecer, ta' chiki'n, ta' chiki' tran taq, sach jwiich .", "0  y ma xaq ta iin inb'ínk, sino que k'ii ooj aj tinmit que, lastima que ta' chki'n, va ."]
["1  K'amtzáwch inb'ij Elssy Méndez, iin aj niri B'aa Kub'i .", "0  eeeh wi' jun, jun montón jb'anen tran taq pues, eeeh wi' nimq'iij .", "1  tpe taq neri laj qatinmit chi jk'ixik, tijb'an taq, tijtaq taq b'ik nimaq taq tinmit. .", "1  tb'e taq l chaak tk'ame' sii', tb'e taq rik'il jqaaj, ehhh chi jk'amik eh wákx, wi' jwákxaq .", "1  jli  tich'a'w

In [None]:
# Split sizes: 80% train, 10% test, and whatever remains goes to dev.
n_total = len(sents)
train_count = round(n_total * 0.8)
test_count = round(n_total * 0.1)
dev_count = n_total - (train_count + test_count)
print(train_count, test_count, dev_count)

# Cut the list at the two boundaries.
test_end = train_count + test_count
train, test, dev = sents[:train_count], sents[train_count:test_end], sents[test_end:]

90 11 11


In [None]:
#  Write data files:
def write_list_to_file(lst, prefix):
    """Write the items of ``lst`` to ``<prefix>.txt``, one item per line.

    Args:
        lst: Iterable of items; each one is converted with ``str``.
        prefix: Base name (or path without extension) of the output file.
    """
    # The file name is built from `prefix` alone: `lst` holds the data, and
    # concatenating a list into the name raises TypeError.
    fname = prefix + ".txt"
    # Explicit encoding: the sentences contain non-ASCII characters.
    with open(fname, 'w', encoding="utf-8") as f:
        for item in lst:
            f.write(str(item) + '\n')


In [None]:
# Each split gets its own base name so that the three output files can be
# told apart.
write_list_to_file(train, "uspantan_train")
write_list_to_file(test, "uspantan_test")
write_list_to_file(dev, "uspantan_dev")