<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Data-Processing-Adaptive-MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Processing of Medical Spanish-to-English datasets

This notebook is part of the repository [Adaptive-MT-LLM-Fine-tuning](https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning).

# Downloading Datasets

In [None]:
import os

# Root data folder on the mounted Google Drive; the Spanish corpora are kept
# in their own subdirectory beneath it.
data_path = "/content/drive/MyDrive/data/"
directory = os.path.join(data_path, "spanish")

# makedirs(exist_ok=True) also creates missing parent folders and is a no-op
# when the directory already exists, so the cell can be re-run safely.
os.makedirs(directory, exist_ok=True)
os.chdir(directory)

# Display the working directory so the reader can confirm where files will land.
os.getcwd()

'/content/drive/MyDrive/data/spanish'

In [None]:
# Language pair for this run: Spanish (source) to English (target).
# The codes are used to build the OPUS API query and as file-name suffixes
# when collecting, filtering and concatenating the corpora.
srclang = "es"
tgtlang = "en"


# NOTE(review): these three flags are not read by any cell visible in this
# notebook; presumably switches for including extra corpora — confirm or remove.
all_corpora = False
ccaligned = False
paracrawl = False

In [None]:
import requests

# Ask the OPUS API for the latest Moses-format corpora available for the
# configured language pair.
opus_url = "https://opus.nlpl.eu/opusapi/?source="+srclang+"&target="+tgtlang+"&preprocessing=moses&version=latest"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON-decoding or KeyError failure further down.
response = requests.get(opus_url, timeout=60)
response.raise_for_status()
response_json = response.json()

# List of corpus descriptors (dicts with keys such as "corpus", "url" and
# "alignment_pairs"); show the first one as a sanity check.
corpora = response_json["corpora"]
corpora[0]

{'alignment_pairs': 62191,
 'corpus': 'bible-uedin',
 'documents': '',
 'id': 23469,
 'latest': 'True',
 'preprocessing': 'moses',
 'size': 5291,
 'source': 'en',
 'source_tokens': 1550348,
 'target': 'es',
 'target_tokens': 1434719,
 'url': 'https://object.pouta.csc.fi/OPUS-bible-uedin/v1/moses/en-es.txt.zip',
 'version': 'v1'}

In [None]:
# Shortlist of OPUS corpus names treated as medical-domain data.
corpora_names = [
    "EMEA",
    "ELRC_2682",
    "ELRC_2922",
    "ELRC_2923",
    "ELRC_3382",
    "tico-19",
    "SciELO",
    "ELRC-3210-antibiotic",
    "ELRC-antibiotic",
    "ELRC-3077-wikipedia_health",
    "ELRC-wikipedia_health",
    "ELRC-3299-EUROPARL_covid",
    "ELRC-3470-EC_EUROPA_covid",
    "ELRC-3571-EUR_LEX_covid",
    "ELRC-3612-presscorner_covid",
    "ELRC-EUROPARL_covid",
    "ELRC-presscorner_covid",
    "ELRC-EUIPO_list",
]

# Download URL of every corpus returned by the API (not only the shortlist).
urls = [entry["url"] for entry in corpora]

In [None]:
# Total number of aligned segment pairs across the shortlisted corpora.
# Entries with a missing or zero "alignment_pairs" value are left out.
data_size = sum(
    entry["alignment_pairs"]
    for entry in corpora
    if entry["corpus"] in corpora_names and entry["alignment_pairs"]
)

print("Line count:", f"{data_size:,}")

Line count: 2,303,897


In [None]:
# Download the datasets

!pip3 install wget -U -q

from wget import download
import shutil
import os
from tqdm.notebook import tqdm


os.chdir(directory)
print("Downloading to:", os.getcwd(), end="")


for corpus in corpora:
    if corpus["corpus"] in corpora_names:
        print("•", corpus["corpus"])
        url = corpus["url"]
        filename = download(url)

        shutil.unpack_archive(filename)
        os.remove(filename)


Downloading to: /content/drive/MyDrive/data/spanish
• ELRC_2682

• ELRC_2922

• ELRC_2923

• ELRC-3077-wikipedia_health

• ELRC-3210-antibiotic

• ELRC-3299-EUROPARL_covid

• ELRC_3382

• ELRC-3470-EC_EUROPA_covid

• ELRC-3571-EUR_LEX_covid

• ELRC-3612-presscorner_covid

• ELRC-antibiotic

• ELRC-EUIPO_list

• ELRC-EUROPARL_covid

• ELRC-presscorner_covid

• ELRC-wikipedia_health

• EMEA

• SciELO

• tico-19


In [None]:
# Collect the extracted corpus files and pair each source file with its
# target counterpart by shared base name (e.g. "EMEA.en-es" for
# "EMEA.en-es.es" / "EMEA.en-es.en"). Pairing by name, rather than by list
# position, guarantees that zip(source_files, target_files) never combines
# files from two different corpora.
src_suffix = "." + srclang
tgt_suffix = "." + tgtlang

sources_by_stem = {}
targets_by_stem = {}

for filename in os.listdir("."):
    # Directories are neither auxiliary files to delete nor corpus files.
    if not os.path.isfile(filename):
        continue

    if filename.endswith((".ids", ".scores", ".xml", "LICENSE", "README")):
        # Auxiliary files shipped inside the archives; not needed for training.
        os.remove(filename)
    elif "filtered" in filename or filename.startswith("all."):
        # Outputs of the later filtering/concatenation cells (present when the
        # notebook is re-run in the same folder) must not be treated as corpora.
        continue
    elif filename.endswith(src_suffix):
        # Match on ".<lang>" so unrelated names that merely end in the same
        # letters are not picked up.
        sources_by_stem[filename[:-len(src_suffix)]] = filename
    elif filename.endswith(tgt_suffix):
        targets_by_stem[filename[:-len(tgt_suffix)]] = filename

# Keep only corpora that have both sides, ordered by source file name.
paired_stems = sorted(
    sources_by_stem.keys() & targets_by_stem.keys(),
    key=lambda stem: sources_by_stem[stem],
)
source_files = [sources_by_stem[stem] for stem in paired_stems]
target_files = [targets_by_stem[stem] for stem in paired_stems]

# Report any file that lacks a counterpart instead of silently mis-pairing it.
unpaired_files = sorted(
    [name for stem, name in sources_by_stem.items() if stem not in targets_by_stem]
    + [name for stem, name in targets_by_stem.items() if stem not in sources_by_stem]
)
if unpaired_files:
    print("Skipping files without a matching source/target counterpart:", *unpaired_files)

print(*source_files)
print(*target_files)

ELRC-3077-wikipedia_health.en-es.es ELRC-3210-antibiotic.en-es.es ELRC-3299-EUROPARL_covid.en-es.es ELRC-3470-EC_EUROPA_covid.en-es.es ELRC-3571-EUR_LEX_covid.en-es.es ELRC-3612-presscorner_covid.en-es.es ELRC-EUIPO_list.en-es.es ELRC-EUROPARL_covid.en-es.es ELRC-antibiotic.en-es.es ELRC-presscorner_covid.en-es.es ELRC-wikipedia_health.en-es.es ELRC_2682.en-es.es ELRC_2922.en-es.es ELRC_2923.en-es.es ELRC_3382.en-es.es EMEA.en-es.es SciELO.en-es.es tico-19.en-es.es
ELRC-3077-wikipedia_health.en-es.en ELRC-3210-antibiotic.en-es.en ELRC-3299-EUROPARL_covid.en-es.en ELRC-3470-EC_EUROPA_covid.en-es.en ELRC-3571-EUR_LEX_covid.en-es.en ELRC-3612-presscorner_covid.en-es.en ELRC-EUIPO_list.en-es.en ELRC-EUROPARL_covid.en-es.en ELRC-antibiotic.en-es.en ELRC-presscorner_covid.en-es.en ELRC-wikipedia_health.en-es.en ELRC_2682.en-es.en ELRC_2922.en-es.en ELRC_2923.en-es.en ELRC_3382.en-es.en EMEA.en-es.en SciELO.en-es.en tico-19.en-es.en


# Filtering Datasets

In [None]:
# Fetch the rule-based filtering script into the working directory; it
# provides the `prepare` function imported by the filtering cell.
!wget -qq https://github.com/ymoslem/MT-Preparation/raw/main/filtering/filter.py -O filter.py

In [None]:
from filter import prepare

print("Filtering files...")

# Run the rule-based filter on every source/target pair. Judging by the file
# names the next cell reads, the results are written next to the inputs with
# a "-filtered.<lang>" suffix.
for source_file, target_file in zip(source_files, target_files):
    print(f"\n••• {source_file} {target_file}")
    prepare(source_file, target_file, srclang, tgtlang)

Filtering files...

••• ELRC-3077-wikipedia_health.en-es.es ELRC-3077-wikipedia_health.en-es.en
Dataframe shape (rows, columns): (5115, 2)
--- Rows with Empty Cells Deleted	--> Rows: 5114
--- Duplicates Deleted			--> Rows: 5114
--- Source-Copied Rows Deleted		--> Rows: 5112
--- Too Long Source/Target Deleted	--> Rows: 5028
--- HTML Removed			--> Rows: 5028
--- Rows will remain in true-cased	--> Rows: 5028
--- Rows with Empty Cells Deleted	--> Rows: 5028
--- Rows Shuffled			--> Rows: 5028
--- Source Saved: ELRC-3077-wikipedia_health.en-es.es-filtered.es
--- Target Saved: ELRC-3077-wikipedia_health.en-es.en-filtered.en

••• ELRC-3210-antibiotic.en-es.es ELRC-3210-antibiotic.en-es.en
Dataframe shape (rows, columns): (937, 2)
--- Rows with Empty Cells Deleted	--> Rows: 936
--- Duplicates Deleted			--> Rows: 936
--- Source-Copied Rows Deleted		--> Rows: 936
--- Too Long Source/Target Deleted	--> Rows: 921
--- HTML Removed			--> Rows: 921
--- Rows will remain in true-cased	--> Rows: 921
--- 

In [None]:
# Fetch the semantic filtering script; it provides `filter` and `line_count`,
# which the semantic filtering cell imports.
!wget -qq https://github.com/ymoslem/MT-Preparation/raw/main/filtering/semantic_filter.py -O semantic_filter.py

In [None]:
# Install sentence-transformers, which supplies the SentenceTransformer model
# class used for semantic filtering.
!pip3 install sentence_transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone


# Semantic Filtering

In [None]:
# NOTE: importing `filter` shadows Python's built-in filter() for the rest of
# the notebook session.
from semantic_filter import filter, line_count
from sentence_transformers import SentenceTransformer


# Language codes used to choose between the two multilingual embedding models.
muse_langs = ['ar', 'de', 'en', 'es', 'fr', 'it', 'ko', 'nl', 'pt', 'pt', 'ru', 'tr', 'zh']
para_langs = ["ar", "bg", "ca", "cs", "da", "de", "en", "el", "es", "et", "fa", "fi", "fr", "gl", "gu", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "ka", "ko", "ku", "lt", "lv", "mk", "mn", "mr", "ms", "my", "nb", "nl", "pl", "pt", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "sv", "th", "tr", "uk", "ur", "vi", "zh"]

# Download and load the model
model_cache = "/content/drive/Shareddrives/adapt-yasmin/models"

# Use the first model when both languages are in its list, otherwise fall back
# to the broader one; stop if neither list covers the pair.
if srclang in muse_langs and tgtlang in muse_langs:
    model_name = "distiluse-base-multilingual-cased-v1"  # 15 languages
elif srclang in para_langs and tgtlang in para_langs:
    model_name = "paraphrase-multilingual-MiniLM-L12-v2"  # 50 languages
else:
    raise SystemExit("Language pair is not supported!")



model = SentenceTransformer(model_name, device="cuda", cache_folder=model_cache)
print("Model loaded:", model_name)

# Arguments passed through to `filter`; presumably the minimum similarity
# score for keeping a pair and the number of lines per batch — confirm
# against semantic_filter.py.
threshold = 0.45
chunk_size = 5000  # or more if memory allows

# Start a multiprocessing pool
pool = model.start_multi_process_pool()

# try/finally guarantees the worker pool is shut down even if filtering fails
# part-way through, so a failed run does not leave worker processes behind.
try:
    print("Semanticly filtering files...")

    for source_file, target_file in zip(source_files, target_files):
        # Work on the outputs of the rule-based filtering step.
        source_file = source_file + "-filtered." + srclang
        target_file = target_file + "-filtered." + tgtlang
        print("\n\n•••", source_file, target_file)

        file_line_count = line_count(source_file)
        print("Originsl line count:", file_line_count)

        # Filter
        filter(source_file,
               target_file,
               srclang,
               tgtlang,
               model,
               pool,
               threshold,
               chunk_size,
               file_line_count
               )

        # Report the size of the filtered output, not the pre-filter count,
        # so the effect of semantic filtering is actually visible.
        file_line_count_after_filtering = line_count(source_file + ".semantic." + srclang)
        print("New line count:", file_line_count_after_filtering)
        print(source_file + ".semantic." + srclang)
        print(target_file + ".semantic." + tgtlang)
finally:
    # Close the multiprocessing pool
    model.stop_multi_process_pool(pool)

Model loaded: distiluse-base-multilingual-cased-v1
Semanticly filtering files...


••• ELRC-3077-wikipedia_health.en-es.es-filtered.es ELRC-3077-wikipedia_health.en-es.en-filtered.en
Originsl line count: 5028
5000 | 5028 | New line count: 5028
ELRC-3077-wikipedia_health.en-es.es-filtered.es.semantic.es
ELRC-3077-wikipedia_health.en-es.en-filtered.en.semantic.en


••• ELRC-3210-antibiotic.en-es.es-filtered.es ELRC-3210-antibiotic.en-es.en-filtered.en
Originsl line count: 921
921 | New line count: 921
ELRC-3210-antibiotic.en-es.es-filtered.es.semantic.es
ELRC-3210-antibiotic.en-es.en-filtered.en.semantic.en


••• ELRC-3299-EUROPARL_covid.en-es.es-filtered.es ELRC-3299-EUROPARL_covid.en-es.en-filtered.en
Originsl line count: 644
644 | New line count: 644
ELRC-3299-EUROPARL_covid.en-es.es-filtered.es.semantic.es
ELRC-3299-EUROPARL_covid.en-es.en-filtered.en.semantic.en


••• ELRC-3470-EC_EUROPA_covid.en-es.es-filtered.es ELRC-3470-EC_EUROPA_covid.en-es.en-filtered.en
Originsl line count: 2

# Global Filtering

In [None]:
# Concatenate all semantically filtered files into one file per language and
# print the line counts, which must be equal for the data to stay parallel.
# NOTE(review): alignment relies on both globs expanding in the same corpus
# order, and the language codes are hard-coded rather than taken from
# srclang/tgtlang.
!cat *semantic.en > all.en
!wc -l all.en

!cat *semantic.es > all.es
!wc -l all.es

1037465 all.en
1037465 all.es


In [None]:
# List the concatenated files to confirm both languages were written.
!ls all.*

all.en	all.es


In [None]:
# Remove duplicates from the all.* files
# NOTE(review): no cell visible in this notebook downloads global_filter.py
# (unlike filter.py and semantic_filter.py), so it must already exist in the
# working directory for this import to succeed on a fresh run — confirm.
# This import also rebinds `prepare`, which was earlier imported from `filter`.
from global_filter import prepare

# Concatenated corpora produced by the `cat` cell.
source_file = "all." + srclang
target_file = "all." + tgtlang

print("Filtering the 'all.*' files...")

prepare(source_file, target_file, srclang, tgtlang)

print("Done!")

Filtering the 'all.*' files...
Dataframe shape (rows, columns): (1037465, 2)
--- Rows with Empty Cells Deleted	--> Rows: 1037465
--- Duplicates Deleted			--> Rows: 991108
--- Too-Long Source/Target Deleted	--> Rows: 922343
--- Rows with Empty Cells Deleted	--> Rows: 922343
--- Rows Shuffled			--> Rows: 922343
--- Source Saved: all.es-filtered.es
--- Target Saved: all.en-filtered.en
Done!


# Split Train, Dev and Test

In [None]:
# Fetch the splitting script; it provides `split_dataset`, imported by the
# splitting cell.
!wget -qq https://github.com/ymoslem/MT-Preparation/raw/main/train_dev_split/train_dev_test_split.py -O train_dev_test_split.py

In [None]:
from train_dev_test_split import split_dataset

# Globally filtered files written by the previous section.
source_file = "all.{0}-filtered.{0}".format(srclang)
target_file = "all.{0}-filtered.{0}".format(tgtlang)

# Number of segments to hold out for the dev and test sets; each held-out set
# is later divided into 10,000 "real" and 50,000 "fuzzy" lines.
segment_no_dev = 60000
segment_no_test = 60000

print("\n".join(["Splitting files:", source_file, target_file]))

split_dataset(segment_no_dev, segment_no_test, source_file, target_file)

Splitting files:
all.es-filtered.es
all.en-filtered.en
Dataframe shape: (922343, 2)
--- Empty Cells Deleted --> Rows: 922343
--- Wrote Files
Done!
Output files
all.es-filtered.es.train
all.en-filtered.en.train
all.es-filtered.es.dev
all.en-filtered.en.dev
all.es-filtered.es.test
all.en-filtered.en.test


In [None]:
# Check the line counts of the Spanish full/train/dev/test files.
!wc -l all.es-filtered.*

   922343 all.es-filtered.es
    60000 all.es-filtered.es.dev
    60000 all.es-filtered.es.test
   802343 all.es-filtered.es.train
  1844686 total


## Extract a unique "Context Dataset" to retrieve fuzzy matches

In [None]:
# Spanish side: divide each 60,000-line held-out set into a 10,000-line "real"
# evaluation set (first lines) and a 50,000-line "fuzzy" context set (last
# lines). The two parts do not overlap only if the file has 60,000 lines.
!head -n 10000 all.es-filtered.es.test > all.es-filtered.es.real.test
!tail -n 50000 all.es-filtered.es.test > all.es-filtered.es.fuzzy.test

!head -n 10000 all.es-filtered.es.dev > all.es-filtered.es.real.dev
!tail -n 50000 all.es-filtered.es.dev > all.es-filtered.es.fuzzy.dev

In [None]:
# English side: same 10,000 / 50,000 division as for Spanish, so the "real"
# and "fuzzy" files stay line-aligned across the two languages.
!head -n 10000 all.en-filtered.en.test > all.en-filtered.en.real.test
!tail -n 50000 all.en-filtered.en.test > all.en-filtered.en.fuzzy.test

!head -n 10000 all.en-filtered.en.dev > all.en-filtered.en.real.dev
!tail -n 50000 all.en-filtered.en.dev > all.en-filtered.en.fuzzy.dev

In [None]:
!rename 's/all.en-/all-/g' *
!rename 's/all.es-/all-/g' *

In [None]:
# Line counts of the renamed files, as a final sanity check.
!wc -l all-filtered.*

In [None]:
# Preview the first five lines of the Spanish test file.
!head -n 5 all-filtered.es.test