# Translating fun!

This notebook has two initial goals:
- first, get a word (or words), with its definition and example sentences, to translate
- second, use Hugging Face models to translate the word and sentences

In [1]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup
import time

## Obtain words and sentences

In [2]:
def parse_wotd_url(
    url: str
) -> tuple[str, str, list[str], str]:
    """Fetch a word-of-the-day page and parse out the word, its definition,
    the example sentences and the link to the stand-alone dictionary entry.

    Parameters
    ----------
    url : str
        The url pointing to the word of the day location.

    Returns
    -------
    wotd : str
        Word of the day (on the given url).
    definition : str
        Definition of the wotd.
    examples : list
        Example or examples of the wotd.
    entry_link : str
        Url for the wotd entry in the dictionary. Empty string when the page
        has no "See the entry" paragraph; 'Not found' when that paragraph
        carries no link.

    Raises
    ------
    ValueError
        If the site does not answer with status code 200, or if the page does
        not contain the expected headings / definition paragraph.
    """

    # A timeout keeps a stalled server from hanging the notebook forever
    result = requests.get(url, timeout=30)

    if result.status_code != 200:
        raise ValueError(f'Website did not respond as expected. Status code: {result.status_code}')

    soup = BeautifulSoup(result.text, features="html.parser")

    # The first <h2> holds the word itself, the second one opens the
    # definition/examples section (later <h2> tags are previous words of the day)
    headings = soup.find_all('h2', limit=2)
    if len(headings) < 2:
        raise ValueError('Unexpected page layout: expected at least two <h2> headings.')
    wotd = headings[0].text
    meaning_tag = headings[1]

    # Walk the <p> siblings after the section heading: the first non-empty one
    # is the definition, the following ones are examples, and the
    # "See the entry" paragraph closes the section.
    definition = ''
    examples = []
    entry_link = ''
    for paragraph in meaning_tag.find_next_siblings('p'):
        text = paragraph.text

        if definition == '':
            definition = text
        elif "See the entry" in text:
            anchor = paragraph.find(href=True)
            entry_link = anchor.attrs.get('href', 'Not found') if anchor is not None else 'Not found'
            break
        else:
            examples.append(text.replace('//', '').strip())

    if definition == '':
        raise ValueError('Unexpected page layout: no definition paragraph found.')

    return wotd, definition, examples, entry_link

In [23]:
# Parse the current word-of-the-day page and echo every parsed field
wotd, definition, examples, entry_link = parse_wotd_url(
    'https://www.merriam-webster.com/word-of-the-day/'
)

print('word of the day', wotd)
print('definition:', definition)
if examples:
    # one example sentence per line
    print(*examples, sep='\n')
print('entry link:', entry_link)

word of the day flotsam
definition: Flotsam refers to the floating pieces that remain after a shipwreck, or more broadly to any floating debris or wreckage. It is also used figuratively to refer to miscellaneous or unimportant material, often in the phrase "flotsam and jetsam." 
Driftwood and other flotsam washed onto the beach.
Their apartment was adorned with the flotsam and jetsam of thrift stores and yard sales.
entry link: https://www.merriam-webster.com/dictionary/flotsam


In [4]:
# One wide record per word: fixed columns first, then one column per example sentence
row_data = dict(
    Language='Eng',
    Wotd=wotd,
    Definition=definition,
    EntryLink=entry_link,
)
row_data.update(
    (f'Example_{position}', sentence)
    for position, sentence in enumerate(examples, start=1)
)

test_df = pd.DataFrame([row_data])
test_df

Unnamed: 0,Language,Wotd,Definition,EntryLink,Example_1,Example_2
0,Eng,flotsam,Flotsam refers to the floating pieces that rem...,https://www.merriam-webster.com/dictionary/flo...,Driftwood and other flotsam washed onto the be...,Their apartment was adorned with the flotsam a...


## Translation with pipeline

In [5]:
# Use a pipeline as a high-level helper
from transformers import pipeline

In [6]:
# Both translators use the same M2M100 checkpoint and differ only in the target language
m2m_checkpoint = "facebook/m2m100_418M"

eng_to_ita, eng_to_slo = [
    pipeline(
        "translation",
        model=m2m_checkpoint,
        tokenizer=m2m_checkpoint,
        src_lang="en",
        tgt_lang=target_code,
    )
    for target_code in ("it", "sl")  # Italian code, Slovenian code
]


Device set to use cpu
Device set to use cpu


In [7]:
# The word and the definition are single texts printed the same way,
# each followed by a blank spacer line
for banner, source_text in (
    ('===== word =====', wotd),
    ('===== definition =====', definition),
):
    print(banner)
    print(source_text)
    print('ita:', eng_to_ita(source_text)[0]['translation_text'])
    print('slo:', eng_to_slo(source_text)[0]['translation_text'])
    print('')

# Examples: original sentence followed by both translations, no spacer line
print('===== examples =====')
for example in examples:
    print(example)
    print('ita:', eng_to_ita(example)[0]['translation_text'])
    print('slo:', eng_to_slo(example)[0]['translation_text'])

===== word =====
flotsam
ita: La flotta
slo: Flotacija

===== definition =====
Flotsam refers to the floating pieces that remain after a shipwreck, or more broadly to any floating debris or wreckage. It is also used figuratively to refer to miscellaneous or unimportant material, often in the phrase "flotsam and jetsam." 
ita: Flotsam si riferisce ai pezzi fluttuanti che rimangono dopo un naufragio, o più ampiamente a qualsiasi spazzatura o rottura fluttuante. è anche usato figurativamente per riferirsi a materiale miscelano o inimportante, spesso nella frase "flotsam e jetsam".
slo: Flotsam se nanaša na plavajoče dele, ki ostanejo po pomorskem napadu, ali širše na kakršne koli plavajoče odpadke ali razpoke. Uporablja se tudi figurativno, da se nanaša na nepomembno ali nepomembno material, pogosto v izrazu "flotsam in jetsam".

===== examples =====
Driftwood and other flotsam washed onto the beach.
ita: Driftwood e altri flotsam lavati sulla spiaggia.
slo: Driftwood in drugi plovili so 

In [8]:
# examples[1].replace('// ', '')

## Random 10 wotd and translation

In [9]:
# The archive could be walked back to 2022-01-01, if not further (judging by the
# page structure on June 20, 2025). Instead, it is more fun to take a window of
# 1001 daily values: today plus the 1000 days before it.
end_date = pd.to_datetime('today').normalize()
start_date = end_date - pd.Timedelta(days=1000)
print(end_date, start_date, sep='\n')

# date_range includes both endpoints, hence 1001 entries
dates_1001 = pd.date_range(start=start_date, end=end_date, freq='D')
print(len(dates_1001))

2025-06-25 00:00:00
2022-09-29 00:00:00
1001


In [None]:
# to generate 10 random ones ...
# for date in pd.Series(dates_1001).sample(10):
#     print(date.date())
#     wotd, definition, examples, entry_link = parse_wotd_url(f'https://www.merriam-webster.com/word-of-the-day/{date.date()}')

#     print(wotd)
#     for i, example in enumerate(examples):
#         print(example)   
#     print() 
#     time.sleep(2)

2022-11-23
vamoose
With the sheriff and his posse hot on their tails, the bank robbers knew they had better vamoose.

2023-04-23
gravamen
The gravamen of Walter’s letter to the editor was that the newspaper frequently reports on the school system's failures but rarely covers its successes and improvements.

2025-06-15
progeny
Many Americans are the progeny of immigrants.
The champion thoroughbred passed on his speed, endurance, and calm temperament to his progeny, many of whom became successful racehorses themselves.
This landmark study is the progeny of many earlier efforts to explore the phenomenon.

2025-03-02
transpire
The monument will ensure that posterity will not soon forget the historic events that transpired on that day.

2024-10-31
hallowed
The church stands on hallowed ground.
Community service is one of the organization’s most hallowed traditions.

2024-07-31
impeccable
Although the restaurant was a bit expensive, we found its memorable cuisine, luxurious decor, and impecc

In [11]:
# examples_slo = [res['translation_text'] for res in eng_to_slo([wotd, definition] + examples)]
# examples_slo

In [None]:
# model_name = f"facebook/nllb-200-distilled-600M"
# test_pipe = pipeline("translation", model=model_name, src_lang="eng_Latn", tgt_lang="ita_Latn")  # Italian code; use tgt_lang="slv_Latn" for Slovenian

Device set to use cpu


## Translate using Tokenizers

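This is better than a `pipeline` when translating into multiple languages: one model and one tokenizer are loaded once and reused for every target language.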

In [14]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# A single NLLB checkpoint serves every target language; the target is chosen
# at generation time through the forced BOS language token
model_name = "facebook/nllb-200-distilled-600M"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
def translate(
    text: str,
    tgt_langs: tuple[str, ...] = ("slv_Latn", "ita_Latn"),
) -> tuple[str, ...]:
    """Translate ``text`` into each target language with the module-level
    ``model`` and ``tokenizer``.

    Parameters
    ----------
    text : str
        The text to translate.
    tgt_langs : tuple of str, optional
        Language-code tokens known to the tokenizer, one per wanted
        translation. Defaults to Slovenian and Italian, in that order.

    Returns
    -------
    tuple of str
        One translation per entry of ``tgt_langs``, in the same order. With
        the default this is ``(slo_translation, ita_translation)``.

    Raises
    ------
    ValueError
        If a language code is not a token known to the tokenizer.

    Notes
    -----
    NOTE(review): in the recorded notebook output, the two-sentence definition
    came back with only its first sentence translated. Consider splitting long
    texts into sentences before calling this function - TODO confirm.
    """
    # Tokenise once; the encoded input is the same for every target language
    inputs = tokenizer(text, return_tensors="pt")

    translations = []
    for lang_code in tgt_langs:
        bos = tokenizer.convert_tokens_to_ids(lang_code)
        # An unknown code would map to the unk token and force a meaningless
        # first token, so fail loudly instead
        if bos == tokenizer.unk_token_id:
            raise ValueError(f'Unknown target language code: {lang_code!r}')
        # The forced first token tells the model which language to generate
        outs = model.generate(**inputs, forced_bos_token_id=bos, max_length=512)
        translations.append(tokenizer.batch_decode(outs, skip_special_tokens=True)[0])

    return tuple(translations)


In [28]:
# Every section prints a banner, then for each text: the original, both
# translations and a blank spacer line
for banner, source_texts in (
    ('===== word =====', [wotd]),
    ('===== definition =====', [definition]),
    ('===== examples =====', examples),
):
    print(banner)
    for source_text in source_texts:
        print(source_text)
        slo_translation, ita_translation = translate(source_text)
        print('slo:', slo_translation)
        print('ita:', ita_translation)
        print('')

===== word =====
flotsam
slo: vročino
ita: di fiocco

===== definition =====
Flotsam refers to the floating pieces that remain after a shipwreck, or more broadly to any floating debris or wreckage. It is also used figuratively to refer to miscellaneous or unimportant material, often in the phrase "flotsam and jetsam." 
slo: Flotsam se nanaša na plujoče dele, ki ostanejo po brodolomju, ali širše na vse plujoče ostanke ali romove.
ita: Il flotsam si riferisce ai pezzi galleggianti rimasti dopo un naufragio, o più in generale a qualsiasi detriti galleggianti o relitto.

===== examples =====
Driftwood and other flotsam washed onto the beach.
slo: Drva in druga drva se je odplavila na plažo.
ita: Legno di deriva e altri flotsam spargono sulla spiaggia.
Their apartment was adorned with the flotsam and jetsam of thrift stores and yard sales.
slo: Njihov stanovanje je bilo okrašeno z plinom in plinom trgovin in trgovin.
ita: Il loro appartamento era ornato di negozi di scorte e vendite di giar

In [None]:
# Same layout as the streamlit design: word first, then definition, then every example
translated = list(map(translate, [wotd, definition, *examples]))

In [34]:
# Each entry of `translated` is a (slovenian, italian) pair; split it into two parallel lists
slovenian = [slo_text for slo_text, _ita_text in translated]
italian = [ita_text for _slo_text, ita_text in translated]

In [35]:
# Slovenian translations, ordered as: word, definition, then each example
slovenian

['vročino',
 'Flotsam se nanaša na plujoče dele, ki ostanejo po brodolomju, ali širše na vse plujoče ostanke ali romove.',
 'Drva in druga drva se je odplavila na plažo.',
 'Njihov stanovanje je bilo okrašeno z plinom in plinom trgovin in trgovin.']

In [36]:
# Italian translations, ordered as: word, definition, then each example
italian

['di fiocco',
 'Il flotsam si riferisce ai pezzi galleggianti rimasti dopo un naufragio, o più in generale a qualsiasi detriti galleggianti o relitto.',
 'Legno di deriva e altri flotsam spargono sulla spiaggia.',
 'Il loro appartamento era ornato di negozi di scorte e vendite di giardini.']