In [None]:
# (1) EXTRACT DATASETS
# Attach Google Drive to this Colab runtime under /content/gdrive; later cells
# copy their outputs to paths below that mount point.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

In [None]:
# Inspect dataset
%cd /content/
!pip install datasets
from datasets import load_dataset, inspect_dataset
inspect_dataset("ahazeemi/iwslt14-en-fr", "iwslt14")  # replace with the datasets you are using

In [None]:
# Load dataset and generate train-test-validation splits
# IWSLT '14 and WMT '14 data are stored in parquet files
# IWSLT '17 data is already pre-split
import glob
import os

# Example of loading IWSLT '14 FR-EN
dataset = load_dataset('/content/iwslt14', data_files={'train': '/content/iwslt14/data/train-00000-of-00001.parquet', 'test': '/content/iwslt14/data/test-00000-of-00001.parquet', 'validation':'/content/iwslt14/data/validation-00000-of-00001.parquet'})

# Example of loading WMT'14 FR-EN
%cd /content/wmt14/de-en/

directory = ''
pattern = 'train-00000-of-0000*.parquet'  # Specify your wildcard pattern here

train_files = glob.glob(os.path.join('', pattern))
train_files.sort()

print(train_files)

%cd /content/

train_dataset = load_dataset('wmt14/de-en/', data_files=train_files)
test_eval_dataset = load_dataset('wmt14/de-en/', data_files={'test': 'test-*.parquet', 'validation':'validation-*.parquet'})

# Example of loading IWSLT '17 FR-EN
dataset = load_dataset('/content/iwslt2017/iwslt2017.py', name='iwslt2017-fr-en')


In [None]:
# Concatenate datasets and then apply the IWSLT '14 splits for that language pair
# Only need to change splits for IWSLT '17 and WMT '14
# NOTE: the sections below are alternatives. Run as written, later assignments
# replace earlier ones: the IWSLT '17 concatenation is the one that gets split,
# and the re-split lists overwrite the IWSLT '14 ones.
from datasets import concatenate_datasets

# For IWSLT '14
train_data = dataset['train']['translation']
eval_data = dataset['validation']['translation']
test_data = dataset['test']['translation']

# For WMT '14
concatenated_dataset = concatenate_datasets([train_dataset['train'], test_eval_dataset['test'], test_eval_dataset['validation']])

# For IWSLT '17
concatenated_dataset = concatenate_datasets([dataset['train'], dataset['validation'], dataset['test']])

# Generate new splits (only run this for IWSLT '17 and WMT '14)
# Split sizes for IWSLT '14 (change hardcoded values depending on language pair)
train_len = 179435
test_len = 3666
valid_len = 903

# Slice boundaries: rows are assigned in the order train, validation, test.
valid_end = train_len + valid_len
test_end = valid_end + test_len

# Slicing past the end would silently yield short splits, so fail loudly instead.
assert len(concatenated_dataset) >= test_end, (
    f"dataset has {len(concatenated_dataset)} rows but the splits need {test_end}"
)

# Slice the rows first and read the 'translation' column from the slice, so only
# the rows that are actually needed are materialised in memory.
train_data = concatenated_dataset[:train_len]['translation']
eval_data = concatenated_dataset[train_len:valid_end]['translation']
test_data = concatenated_dataset[valid_end:test_end]['translation']

In [None]:
# Report the size of every split so it can be compared with IWSLT '14
for split in (train_data, eval_data, test_data):
    print(len(split))

# Show the first training example as a sanity check
first_example = train_data[0]
print(first_example)

In [None]:
# Output splits into the right format in local folder
def writeToTxt(data, key, output_file_path, count):
    """Write one language side of a split to a text file, one entry per line.

    Args:
        data: Sliceable sequence of rows; each row is indexed with `key` and
            must yield a string.
        key: Language code to extract from every row, e.g. 'de'.
        output_file_path: Destination file; overwritten if it already exists.
        count: Maximum number of rows to write, taken from the start of `data`.
    """
    import os  # local import so the function does not rely on another cell

    # open() does not create missing folders, so make sure the parent exists.
    parent_dir = os.path.dirname(output_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII text is written identically on every platform.
    with open(output_file_path, mode='w', encoding='utf-8') as file:
        file.writelines(row[key] + '\n' for row in data[:count])

import os

goalDir = '/content/res/'
src = 'fr'  # change this depending on language

# The output folder is not created anywhere else, and open(..., 'w') fails if
# it is missing.
os.makedirs(goalDir, exist_ok=True)

# Write the source and English side of every split. The splits are the
# `train_data` / `eval_data` / `test_data` lists built earlier; `count` is each
# split's own length, so the whole split is written and no size has to be
# kept in sync by hand.
for prefix, split_rows in (('train', train_data), ('dev', eval_data), ('test', test_data)):
    for lang in (src, 'en'):
        writeToTxt(split_rows, lang, goalDir + prefix + '.' + lang, len(split_rows))

In [None]:
# Check number of lines in the txt files
def printLines(filePath):
    """Print the number of lines in the text file at `filePath`."""
    # Count while streaming instead of loading every line into a list, and
    # decode as UTF-8 explicitly so the result does not depend on the locale.
    with open(filePath, 'r', encoding='utf-8') as f:
        print(sum(1 for _ in f))

import os

# Build the paths from `goalDir` and `src` (set in the cell that wrote the
# files) instead of hard-coding 'fr', and use full paths rather than changing
# the working directory, so later relative downloads do not end up inside the
# output folder.
for prefix in ('test', 'train', 'dev'):
    for lang in (src, 'en'):
        printLines(os.path.join(goalDir, prefix + '.' + lang))

In [None]:
# Save content in local folder to Google Drive
!cp -r /content/res/ /content/gdrive/MyDrive/NLP-MLS/Datasets/iwslt14-fr-en # change destination

In [None]:
# Check content of files in Google Drive
import os
os.chdir(goalDir)
!tail -5 test.fr
!tail -5 test.en

In [None]:
# (2) PREPROCESS DATASETS
# Set up environment
import numpy
if numpy.__version__ >= '1.24':
  !pip uninstall -y numpy
  !pip install "numpy<1.24"

def install_torch():
  """Install torch 1.12.0+cu113, torchvision 0.13.0+cu113 and torchaudio 0.12.0 with pip."""
  # --extra-index-url adds the PyTorch cu113 wheel index to pip's package sources.
  !pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113

install_torch()

import torch
try:
    if torch.__version__ != '1.12.0+cu113':
        !pip uninstall -y torch torchdata torchtext #make sure have correct torch version
        install_torch()
except ImportError:
    install_torch()

print(torch.__version__)

!nvidia-smi
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-ubuntu1604.pin
!mv cuda-ubuntu1604.pin /etc/apt/preferences.d/cuda-repository-pin-600

!wget https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda-repo-ubuntu1604-11-3-local_11.3.0-465.19.01-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-11-3-local_11.3.0-465.19.01-1_amd64.deb

!apt-key add /var/cuda-repo-ubuntu1604-11-3-local/7fa2af80.pub
!apt-get update
!apt-get -y install cuda-11-3
!apt autoremove

!nvcc --version

!update-alternatives --set cuda /usr/local/cuda-11.3

!nvidia-smi

!nvcc --version

import tensorflow as tf
tf.test.gpu_device_name()

In [None]:
# Clone paper's Github repo and install requirements
! apt-get install git

!ssh-keygen -t rsa -b 4096
!ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
!cat /root/.ssh/id_rsa.pub  # add SSH key to your Github

%cd /content/
!git clone git@github.com:chenllliang/MLS.git

%cd /content/MLS/fairseq
!pip install --editable ./
!pip install sacremoses

%cd /content/
!git clone git@github.com:moses-smt/mosesdecoder.git /content/MLS/Tools/mosesdecoder/
!git clone git@github.com:rsennrich/subword-nmt.git /content/MLS/Tools/subword-nmt/


In [None]:
# Code to unzip databin file
# Extracts the iwslt14-de-en-bin.zip archive; -d sets the folder the contents
# are extracted into (/content/MLS/databin/).
!unzip /content/MLS/databin/iwslt14-de-en-bin.zip -d /content/MLS/databin/

In [None]:
# Code to remove databin contents
!rm -r /content/MLS/databin/iwslt14-de-en-isolated-new/
!rm -r /content/MLS/databin/iwslt14-de-en-joined-new

In [None]:
# Run preprocessing script
%cd /content/MLS/scripts/
!bash preprocess.sh /content/gdrive/MyDrive/NLP-MLS/Datasets/iwslt14-fr-en-/res/ en fr  # change destination and languages
%cd /content/

In [None]:
# Save files from runtime to Google Drive (change destination)
!cp -r /content/res/ /content/gdrive/MyDrive/NLP-MLS/Datasets/iwslt14-fr/res/
!cp -r /content/MLS/databin/iwslt14-de-en-isolated-new/ /content/gdrive/MyDrive/NLP-MLS/Datasets/iwslt14-fr-en/iwslt14-fr-en-isolated-new/
!cp -r /content/MLS/databin/iwslt14-de-en-joined-new/ /content/gdrive/MyDrive/NLP-MLS/Datasets/iwslt14-fr-en/iwslt14-fr-en-joined-new/