In [2]:
import re
import torch
import pandas as pd
import numpy as np
from transformers import MarianMTModel, MarianTokenizer

In [3]:
# Cargar el modelo y el tokenizador para traducción de inglés a español
model_name_en_es = "Helsinki-NLP/opus-mt-en-es"
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
# Verificar si hay dispositivos GPU disponibles
# if torch.cuda.device_count() > 1:
#     # Si hay más de un dispositivo GPU, habilitar Data Parallelism
    # model_name_en_es = torch.nn.DataParallel(model_name_en_es)
model_en_es = MarianMTModel.from_pretrained(model_name_en_es).to(device)
tokenizer_en_es = MarianTokenizer.from_pretrained(model_name_en_es)
# model_en_es.resize_token_embeddings(len(tokenizer_en_es))

In [4]:
# Función para traducir una lista de oraciones
def translate_sentences(sentences, model, tokenizer):
    input_ids = (tokenizer.batch_encode_plus(sentences, return_tensors="pt", padding=True).to(device))["input_ids"]
    translated = model.generate(input_ids)
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return translated_text

def clean_translation(text):
    cleaned_text = text.replace("..", "")
    return cleaned_text

def remove_repetitive_no(text):
    pattern = r'- No(?:, no)+\.'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

def translate_en_to_es(text):
    text = clean_translation(text)
    sentences = text.split(".")
    translated_sentences = translate_sentences(sentences, model_en_es, tokenizer_en_es)
    translated_text = ". ".join(translated_sentences)
    cleaned_text = remove_repetitive_no(translated_text)
    cleaned_text = cleaned_text.rstrip()
    return cleaned_text

In [5]:
summary_bbc_news = pd.read_csv('summary_bbc_news.csv')
summary_bbc_news

Unnamed: 0,Title,Articles,Summaries,Categories
0,Ad sales boost Time Warner profit,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business
1,Dollar gains on Greenspan speech,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business
2,Yukos unit buyer faces loan claim,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business
3,High fuel prices hit BA's profits,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business
4,Pernod takeover talk lifts Domecq,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business
...,...,...,...,...
2220,BT program to beat dialler scams,BT program to beat dialler scams\n\nBT is intr...,BT is introducing two initiatives to help beat...,tech
2221,Spam e-mails tempt net shoppers,Spam e-mails tempt net shoppers\n\nComputer us...,A third of them read unsolicited junk e-mail a...,tech
2222,Be careful how you code,Be careful how you code\n\nA new European dire...,This goes to the heart of the European project...,tech
2223,US cyber security chief resigns,US cyber security chief resigns\n\nThe man mak...,Amit Yoran was director of the National Cyber ...,tech


In [6]:
df = summary_bbc_news
df

Unnamed: 0,Title,Articles,Summaries,Categories
0,Ad sales boost Time Warner profit,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business
1,Dollar gains on Greenspan speech,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business
2,Yukos unit buyer faces loan claim,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business
3,High fuel prices hit BA's profits,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business
4,Pernod takeover talk lifts Domecq,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business
...,...,...,...,...
2220,BT program to beat dialler scams,BT program to beat dialler scams\n\nBT is intr...,BT is introducing two initiatives to help beat...,tech
2221,Spam e-mails tempt net shoppers,Spam e-mails tempt net shoppers\n\nComputer us...,A third of them read unsolicited junk e-mail a...,tech
2222,Be careful how you code,Be careful how you code\n\nA new European dire...,This goes to the heart of the European project...,tech
2223,US cyber security chief resigns,US cyber security chief resigns\n\nThe man mak...,Amit Yoran was director of the National Cyber ...,tech


In [7]:
# Dividir el DataFrame en bloques de 100 filas
category_mapping = {'business': 'negocios', 'entertainment': 'entretenimiento', 'politics': 'política', 'sport': 'deporte', 'tech': 'tecnología'}

blocks = np.array_split(df, len(df) // 20)
ini=43
# Procesar cada bloque y guardar en archivos
for i, block in enumerate(blocks[ini:]):
    try:
        df_espanol = pd.DataFrame(columns=['Título', 'Articulos', 'Resumen', 'Categoría'])
        df_espanol['Categoría'] = block['Categories'].map(category_mapping)
        
        for index, row in block.iterrows():
            translated_title = translate_en_to_es(row['Title'])
            translated_articles = translate_en_to_es(row['Articles'])
            translated_summaries = translate_en_to_es(row['Summaries'])
            df_espanol.loc[index] = [translated_title, translated_articles, translated_summaries, row['Categories']]
        
        # Guardar el bloque en un archivo CSV
        file_name = f"bbc_news_es_bloque{((i+ini+1)*20)-1}.csv"
        df_espanol.to_csv(file_name, index=False)
        
    except Exception as e:
        print(f"Error al procesar el bloque {((i+ini+1)*20)-1}: {str(e)}")
        break

# Consolidar todos los bloques en un solo DataFrame
df_consolidado = pd.concat(blocks)

# Guardar el archivo consolidado
df_consolidado.to_csv("bbc_news_es_consolidado.csv", index=False)



Error al procesar el bloque 1199: CUDA out of memory. Tried to allocate 242.00 MiB (GPU 1; 7.80 GiB total capacity; 5.78 GiB already allocated; 220.69 MiB free; 6.63 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [22]:
category_mapping = {'business': 'negocios', 'entertainment': 'entretenimiento', 'politics': 'política', 'sport': 'deporte', 'tech': 'tecnología'}

blocks = np.array_split(df, len(df) // 20)
ini=42
# Procesar cada bloque y guardar en archivos
for i, block in enumerate(blocks[ini:]):
    try:
        df_espanol = pd.DataFrame(columns=['Título', 'Articulos', 'Resumen', 'Categoría'])
        df_espanol['Categoría'] = block['Categories'].map(category_mapping)
        
        for index, row in block.iterrows():
            translated_title = translate_en_to_es(row['Title'])
            translated_articles = translate_en_to_es(row['Articles'])
            translated_summaries = translate_en_to_es(row['Summaries'])
            df_espanol.loc[index] = [translated_title, translated_articles, translated_summaries, row['Categories']]
        
        # Guardar el bloque en un archivo CSV
        file_name = f"bbc_news_es_bloque{((i+ini+1)*20)-1}.csv"
        df_espanol.to_csv(file_name, index=False)
        
    except Exception as e:
        print(f"Error al procesar el bloque {((i+ini+1)*20)-1}: {str(e)}")
        break

# Consolidar todos los bloques en un solo DataFrame
df_consolidado = pd.concat(blocks)

# Guardar el archivo consolidado
df_consolidado.to_csv("bbc_news_es_consolidado.csv", index=False)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73


In [15]:
import os
import pandas as pd
# category_mapping = {'business': 'negocios', 'entertainment': 'entretenimiento', 'politics': 'política', 'sport': 'deporte', 'tech': 'tecnología'}
df_espanol = pd.DataFrame(columns=['Título', 'Articulos', 'Resumen', 'Categoría'])

# blocks = np.array_split(df, len(df) // 20)
list_sp = os.listdir()
for i in list_sp:
    if i[:len("bbc_news_es_bloque")] == "bbc_news_es_bloque":
        df_espanol= pd.concat([df_espanol,pd.read_csv(i)])
df_espanol
df_espanol.to_csv('bbc_news_es.csv', index=False)

In [11]:
df_espanol = pd.read_csv('bbc_news_es_bloque859.csv', index_col=0)
# df_espanol
df_espanol.to_csv('bbc_news_es_bloque859.csv', index=False)

In [12]:
for index, row in (blocks[42][16:].drop(labels=[862], axis=0)).iterrows():
    translated_title = translate_en_to_es(row['Title'])
    translated_articles = translate_en_to_es(row['Articles'])
    translated_summaries = translate_en_to_es(row['Summaries'])
    df_espanol.loc[index] = [translated_title, translated_articles, translated_summaries, row['Categories']]



In [13]:
# df_espanol.at[761,df_espanol.keys()[2]]
df_espanol
# =["entretenimiento"]
# df_espanol.drop(labels=[762], axis=0)

Unnamed: 0,Título,Articulos,Resumen,Categoría
845,Películas sobre el triunfo de la guerra en Sun...,Películas sobre el triunfo de la guerra en Sun...,Por qué luchamos con el premio del gran jurado...,entertainment
846,Honor para el actor DiCaprio,"La ""carrera excepcional"" del actor DiCaprio Ac...","""Estaba muy feliz de que (DiCaprio) viniera y ...",entertainment
847,Howl ayuda a impulsar los cines japoneses,Howl ayuda a impulsar los cines de Japón La ta...,"El Último Samurai, protagonizado por Tom Cruis...",entertainment
848,Keanu Reeves dio la estrella de Hollywood,Keanu Reeves le dio a la estrella de Hollywood...,"""Cuando tenía 15 años le pregunté a mi madre s...",entertainment
849,De Niro completa golpe de caja,De Niro completa el golpe de taquilla Robert D...,"En el gráfico de taquillas del Reino Unido, Me...",entertainment
850,Día-Lewis para el honor de Berlín,Day-Lewis se estrena para el actor de honor de...,El actor Daniel Day-Lewis recibirá un premio p...,entertainment
851,Compositor estadounidense recrea partitura de ...,Compositor estadounidense recrea partitura de ...,Un musicólogo estadounidense ha recreado una p...,entertainment
852,Se cierra el programa de caridad de Applegate,Applegate's Charity show cierra el musical est...,"Applegate, quien protagonizó la comedia de tel...",entertainment
853,Estudio Ray Charles se convierte en museo,Estudio Ray Charles se convierte en museo Un m...,Un museo dedicado a la carrera del legendario ...,entertainment
854,Britney ataca 'falsos tabloides',Britney ataca 'falsos tabloides' La estrella p...,"La estrella pop Britney Spears ha atacado ""fal...",entertainment


In [14]:
file_name = f"bbc_news_es_bloque{((42+1)*20)-1}.csv"
# df_espanol.to_csv(file_name, index=True)

df_espanol.drop(labels=[862], axis=0).replace({"Categoría":"entertainment"},"entretenimiento").to_csv(file_name)

In [11]:
# list((blocks[37])["Articles"][19:20])
blocks[42][16:]
# blocks[43][16:].drop(labels=[862], axis=0)

Unnamed: 0,Title,Articles,Summaries,Categories
861,Vera Drake's Bafta triumph hope,Vera Drake's Bafta triumph hope\n\nAt the Baft...,"""If Mike Leigh is going to win awards for anyt...",entertainment
862,Roundabout continues nostalgia trip,Roundabout continues nostalgia trip\n\nThe new...,One puppet show that I personally would love t...,entertainment
863,Stars shine on Bafta red carpet,Stars shine on Bafta red carpet\n\nHollywood s...,"Keanu Reeves, who presented the best actress a...",entertainment
864,Bafta to hand out movie honours,Bafta to hand out movie honours\n\nMovie stars...,Staunton is one of the favourites to land the ...,entertainment


In [21]:
file_name = f"bbc_news_es_bloque{((37+1)*20)-1}.csv"
df_espanol.to_csv(file_name, index=True)