In [1]:
import pandas as pd
import os
import re
import torch

from io import StringIO
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
from tqdm import tqdm
from transformers import pipeline
from transformers import MarianMTModel, MarianTokenizer

# Set the CUDA visible devices for GPU utilization
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
# Read data
file_path = 'data/eng-spa.txt'
try:
    with open(file_path, 'r') as file:
        data = file.read()
except FileNotFoundError:
    print("The specified file was not found.")
except Exception as e:
    print("An error occurred:", e)

In [3]:
# Define column names and create a DataFrame from the read data
columns = ['English', 'Spanish']
df = []
for line in data.split('\n'):
    if line:
        english, spanish, *_ = line.split('\t')
        df.append([english, spanish])

df = pd.DataFrame(df, columns=columns)

In [4]:
# Preprocess the DataFrame by removing punctuation and converting to lowercase
df['English'] = df['English'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Spanish'] = df['Spanish'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['English'] = df['English'].str.lower()
df['Spanish'] = df['Spanish'].str.lower()

# Shuffle and select a subset of the DataFrame
shuffled_df = df.sample(frac=1).reset_index(drop=True)
df = shuffled_df[:24000]


In [5]:
# Check device availability (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load pre-trained MarianMT model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-es-en'  # Spanish to English model
model = MarianMTModel.from_pretrained(model_name).to(device)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Translate Spanish sentences to English using the model
translated_sentences = []
for sentence in tqdm(df['Spanish']):
    inputs = tokenizer.encode(sentence, return_tensors="pt").to(device)
    translated = model.generate(inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    translated_sentences.append(translated_text)

df['Translated_English'] = translated_sentences


Using device: cuda


100%|██████████| 24000/24000 [38:50<00:00, 10.30it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Translated_English'] = translated_sentences


In [6]:
# Further preprocessing on the translated English sentences
df['Translated_English'] = df['Translated_English'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Translated_English'] = df['Translated_English'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Translated_English'] = df['Translated_English'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Translated_English'] = df['Translated_English'].str.lower()


In [7]:
# Evaluate BLEU score
eng_ref = df['English'].tolist()
eng_hyp = df['Translated_English'].tolist()
bleu = BLEU()
result = bleu.corpus_score(eng_ref, [eng_hyp])

In [8]:
# Print BLEU score result
result

BLEU = 58.01 78.0/62.8/52.3/44.2 (BP = 1.000 ratio = 1.033 hyp_len = 149464 ref_len = 144690)

In [9]:
# Calculate ROGUE scores
total_r1 = 0
total_r2 = 0
total_rl = 0
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

for pred, target in zip(eng_hyp, eng_ref):
    scores = scorer.score(pred, target)
    total_r1 += scores['rouge1'].fmeasure
    total_r2 += scores['rouge2'].fmeasure
    total_rl += scores['rougeL'].fmeasure
    
print("ROGUE-1", 100*total_r1/len(eng_hyp))
print("ROGUE-2", 100*total_r2/len(eng_hyp))
print("ROGUE-L", 100*total_rl/len(eng_hyp))

ROGUE-1 80.45469755485243
ROGUE-2 63.77772441079022
ROGUE-L 79.09322621199321


In [10]:
input_sents = df["Spanish"].to_list()
target_sents = df["English"].to_list()
pred_sents = df["Translated_English"].to_list()


In [11]:
print("Input Sentence:", input_sents[0])
print("Target Sentence:", target_sents[0])
print("Predicted Sentence:", pred_sents[0])


Input Sentence: te gustaría enseñarme a jugar al ajedrez
Target Sentence: would you like me to teach you how to play chess
Predicted Sentence: youd like to teach me how to play chess


In [12]:
print("Input Sentence:", input_sents[1])
print("Target Sentence:", target_sents[1])
print("Predicted Sentence:", pred_sents[1])


Input Sentence: no se permite fumar en este restaurante
Target Sentence: smoking isnt allowed in this restaurant
Predicted Sentence: smoking is not allowed in this restaurant


In [13]:
print("Input Sentence:", input_sents[5])
print("Target Sentence:", target_sents[5])
print("Predicted Sentence:", pred_sents[5])


Input Sentence: no me gusta la comida picante
Target Sentence: i dont like spicy food
Predicted Sentence: i dont like spicy food


In [14]:
print("Input Sentence:", input_sents[-6])
print("Target Sentence:", target_sents[-6])
print("Predicted Sentence:", pred_sents[-6])


Input Sentence: ayer la pasé verdaderamente bien
Target Sentence: i had a really good time yesterday
Predicted Sentence: i had a really good time yesterday
