<a href="https://colab.research.google.com/github/vikramkrishnan9885/MyColab/blob/master/NeuralMachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

from argparse import Namespace
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def progress_bar(some_iter):
    """Wrap ``some_iter`` in a tqdm progress bar when tqdm is available.

    Args:
        some_iter: Any iterable.

    Returns:
        ``tqdm.tqdm(some_iter)`` if the ``tqdm`` package can be imported,
        otherwise ``some_iter`` itself, unchanged.
    """
    try:
        import tqdm as tqdm_module
    except ModuleNotFoundError:
        # tqdm is optional: fall back to plain, silent iteration.
        return some_iter
    else:
        return tqdm_module.tqdm(some_iter)

def download_file_from_google_drive(id, destination):
    """Download a shared Google Drive file and write it to ``destination``.

    The file is requested once; if the response carries a cookie whose name
    starts with ``download_warning``, the request is repeated with that
    cookie's value passed as the ``confirm`` parameter. The final response
    body is streamed to disk in fixed-size chunks.

    Args:
        id: Google Drive file id. (The name shadows the built-in ``id``; it is
            kept so existing keyword-argument callers keep working.)
        destination: Local path to write to. It is opened in ``"wb"`` mode, so
            an existing file is overwritten.

    Raises:
        requests.HTTPError: If either request returns a 4xx/5xx status.
            Without this check an error page would be saved as the data file.
    """
    URL = "https://docs.google.com/uc?export=download"
    CHUNK_SIZE = 32768

    def get_confirm_token(response):
        """Return the value of the first ``download_warning*`` cookie, or None."""
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value

        return None

    def save_response_content(response, destination):
        """Stream the response body to ``destination`` chunk by chunk."""
        with open(destination, "wb") as f:
            for chunk in progress_bar(response.iter_content(CHUNK_SIZE)):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)

    print("Trying to fetch {}".format(destination))

    # The context manager closes the session (and its pooled connections)
    # even when a request or the file write fails.
    with requests.Session() as session:
        response = session.get(URL, params = { 'id' : id }, stream = True)
        try:
            response.raise_for_status()
            token = get_confirm_token(response)

            if token:
                # Release the first streamed response before issuing the
                # confirmed request.
                response.close()
                params = { 'id' : id, 'confirm' : token }
                response = session.get(URL, params = params, stream = True)
                response.raise_for_status()

            save_response_content(response, destination)
        finally:
            # Streamed responses hold their connection until closed.
            response.close()

In [None]:
# Fetch the raw sentence-pair file; it is read later via args.source_data_path.
file_id_0 = '1o2ac0EliUod63sYUdpow_Dh-OqS3hF5Z'  # Google Drive file id
destination_0 = 'eng-fra.txt'  # local path the download is written to
download_file_from_google_drive(file_id_0, destination_0)

Trying to fetch eng-fra.txt


292it [00:00, 2262.38it/s]


In [None]:
# Fetch a copy of the simplified dataset.
# NOTE(review): this path equals args.output_data_path, so the final to_csv
# cell overwrites the downloaded file - confirm that is intended.
file_id_1 = '1jLx6dZllBQ3LXZkCjZ4VciMQkZUInU10'  # Google Drive file id
destination_1 = 'simplest_eng_fra.csv'  # local path the download is written to
download_file_from_google_drive(file_id_1, destination_1)

Trying to fetch simplest_eng_fra.csv


30it [00:00, 2761.95it/s]


In [None]:
# Notebook-wide configuration: input/output paths, split fractions and RNG seed.
# Only perc_train and perc_val are read by the split cell; the test split
# receives whatever remains.
args = Namespace(**{
    "source_data_path": "eng-fra.txt",
    "output_data_path": "simplest_eng_fra.csv",
    "perc_train": 0.7,
    "perc_val": 0.15,
    "perc_test": 0.15,
    "seed": 1337,
})

In [None]:
# Read the raw source file, one string per line.
# The file contains accented French text, so decode it explicitly as UTF-8
# rather than relying on the platform's default encoding.
with open(args.source_data_path, encoding="utf-8") as fp:
    lines = fp.readlines()

In [None]:
# Normalize each raw line: drop newlines, lowercase, and split on tabs into
# its fields (unpacked as an English/French pair by the tokenizing cell).
sentence_pairs = []
for raw_line in lines:
    normalized_line = raw_line.replace("\n", "").lower()
    sentence_pairs.append(normalized_line.split("\t"))
lines = sentence_pairs

In [None]:
# Tokenize both sides of every sentence pair with the language-specific
# NLTK tokenizer; each record keeps the two token lists side by side.
data = [
    {
        "english_tokens": word_tokenize(english_sentence, language="english"),
        "french_tokens": word_tokenize(french_sentence, language="french"),
    }
    for english_sentence, french_sentence in lines
]

In [None]:
# Each subject pronoun with the full and contracted form of "to be".
# The pairs are matched against the first two English tokens of a sentence.
_subject_verb_forms = (
    ("i", ("am", "'m")),
    ("he", ("is", "'s")),
    ("she", ("is", "'s")),
    ("you", ("are", "'re")),
    ("we", ("are", "'re")),
    ("they", ("are", "'re")),
)
filter_phrases = tuple(
    (subject, verb_form)
    for subject, verb_forms in _subject_verb_forms
    for verb_form in verb_forms
)

In [None]:
# Bucket the records by their first two English tokens, keeping only those
# whose opening matches one of the filter phrases.
data_subset = dict((phrase, []) for phrase in filter_phrases)
for datum in data:
    key = tuple(datum['english_tokens'][:2])
    matching_bucket = data_subset.get(key)
    if matching_bucket is not None:
        matching_bucket.append(datum)

In [None]:
# Number of records kept per phrase, displayed together with the grand total.
counts = {phrase: len(examples) for phrase, examples in data_subset.items()}
counts, sum(counts.values())

({('he', "'s"): 787,
  ('he', 'is'): 1069,
  ('i', "'m"): 4760,
  ('i', 'am'): 805,
  ('she', "'s"): 316,
  ('she', 'is'): 504,
  ('they', "'re"): 470,
  ('they', 'are'): 194,
  ('we', "'re"): 1053,
  ('we', 'are'): 181,
  ('you', "'re"): 2474,
  ('you', 'are'): 449},
 13062)

In [None]:
# Assign every record to a train/val/test split, stratified by phrase.
#
# The cell is safe to re-run: each bucket is copied before shuffling and every
# output record is a new dict. Shuffling data_subset's lists in place would
# make a second run start from an already permuted list and yield a different
# split despite the re-seeding below.
np.random.seed(args.seed)

dataset_stage3 = []
# Iterate the phrases in sorted order so the sequence of shuffles, and thus
# the resulting split, does not depend on dict ordering.
for phrase, datum_list in sorted(data_subset.items()):
    shuffled = list(datum_list)
    np.random.shuffle(shuffled)
    n_train = int(len(shuffled) * args.perc_train)
    n_val = int(len(shuffled) * args.perc_val)

    # The first n_train records are 'train', the next n_val are 'val', and the
    # remainder is 'test' (args.perc_test is not consulted).
    for position, datum in enumerate(shuffled):
        if position < n_train:
            split = 'train'
        elif position < n_train + n_val:
            split = 'val'
        else:
            split = 'test'
        # Copy instead of mutating, so the dicts shared with `data` and
        # `data_subset` stay untouched by this and later cells.
        dataset_stage3.append({**datum, 'split': split})

# Every filtered record must land in exactly one split.
assert len(dataset_stage3) == sum(len(v) for v in data_subset.values())

In [None]:
# Replace each token list with its space-joined sentence. The token keys are
# popped and the text keys assigned on the same dict, so records change in place.
for datum in dataset_stage3:
    for token_key, text_key in (
        ('english_tokens', 'source_language'),
        ('french_tokens', 'target_language'),
    ):
        datum[text_key] = " ".join(datum.pop(token_key))

In [None]:
# Build the final table from the list of record dicts (one row per record).
nmt_df = pd.DataFrame(dataset_stage3)

In [None]:
# Preview the first rows as a quick sanity check of the columns and split labels.
nmt_df.head()

Unnamed: 0,split,source_language,target_language
0,train,he 's the cutest boy in town .,c'est le garçon le plus mignon en ville .
1,train,he 's a nonsmoker .,il est non-fumeur .
2,train,he 's smarter than me .,il est plus intelligent que moi .
3,train,he 's a lovely young man .,c'est un adorable jeune homme .
4,train,he 's three years older than me .,il a trois ans de plus que moi .


In [None]:
# Persist the dataset to the configured output path.
# NOTE(review): with pandas' default index=True the row index is presumably
# written as an unnamed first column - confirm downstream readers expect it.
nmt_df.to_csv(args.output_data_path)