# Preprocess Newspapers

In [1]:
# Load the raw Japanese newspaper table
import pandas as pd

# The file carries a .csv extension but is parsed as tab-separated, UTF-8 text.
df = pd.read_csv(
    '../data/3_newspapers_ja/japanese_news.csv',
    encoding='utf-8',
    sep='\t',
)
# Preview the first rows only
df.head()

  df = pd.read_csv('../data/3_newspapers_ja/japanese_news.csv', sep='\t', encoding='utf-8')


Unnamed: 0,source,date,title,author,text
0,kobe-np.co.jp,2005-07-01,,,会見した北口寛人市長は「刑事訴訟で被告となっている職員にはそれぞれ主張があるが、組織全体とし...
1,kobe-np.co.jp,2005-07-01,,,明石・歩道橋事故をめぐる民事訴訟で、神戸地裁から計五億六千八百万円の賠償を命じられた兵庫県（...
2,kobe-np.co.jp,2007-04-07,,,会見後、遺族代理人の渡部吉泰弁護士は「裁判長が『問うべき者を問わないのは正義に反する』とはっ...
3,kobe-np.co.jp,2007-04-07,,,遺族会は、雑踏警備本部長を務めた元明石署長ら二人の起訴を求め、活動を続けている。
4,kobe-np.co.jp,2007-04-07,,,五人の遺族が閉廷後に会見。二女の優衣菜ちゃん＝当時（８つ）＝を亡くした三木清さん（３８）＝姫...


In [3]:
# Translate Japanese articles to English
from dotenv import load_dotenv
import aiohttp
import asyncio
import time
import os

# Wall-clock start, used for the execution-time report at the end of this cell
start_time = time.time()

# load .env file
load_dotenv()

# Set up DeepL API key
# The key is taken from the environment only. It must never be written into
# the notebook: a literal key used to be assigned here, which both leaked the
# secret and silently overrode the value loaded from .env. Treat any key that
# was committed that way as compromised and rotate it.
DEEPL_API_KEY = os.getenv('DEEPL_API_KEY_2')
if not DEEPL_API_KEY:
    raise ValueError("API key not found in .env file.")

# DeepL translate endpoint
API_URL = 'https://api.deepl.com/v2/translate'

# Function to translate a batch of texts asynchronously
async def async_translate_batch(texts, target_lang='EN-US', session=None, attempt=1):
    """Translate a batch of texts asynchronously using DeepL API.

    Args:
        texts: List of source texts; sent as repeated ``text`` form fields.
        target_lang: DeepL target language code.
        session: ``aiohttp.ClientSession`` (or compatible object) to reuse.
            When omitted, a temporary session is opened for this call.
        attempt: 1-based attempt counter used by the retry logic. Attempts
            1-5 are retried with exponential backoff; a failure on a later
            attempt gives up.

    Returns:
        A list with exactly ``len(texts)`` items: the translations in input
        order, or ``'Translation Error'`` placeholders when every attempt
        failed. The fixed length keeps results aligned with DataFrame rows.
    """
    if not texts:
        return []

    # Without a session the old code failed on `None.post` and burned through
    # the whole retry budget; open a short-lived session instead.
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await async_translate_batch(texts, target_lang, own_session, attempt)

    # Prepare API request: the key travels in the Authorization header rather
    # than in the form body.
    headers = {'Authorization': f'DeepL-Auth-Key {DEEPL_API_KEY}'}
    params = {
        'text': texts,
        'target_lang': target_lang
    }

    try:
        async with session.post(API_URL, data=params, headers=headers) as response:
            # Surface HTTP errors (rate limit, quota, auth) instead of parsing
            # an error body as if it were a translation result.
            response.raise_for_status()
            result = await response.json()

        translations = [t['text'] for t in result.get('translations', [])]
        # A short or empty answer would shift every following row when the
        # caller concatenates batches, so treat it as a failure.
        if len(translations) != len(texts):
            raise ValueError(
                f'expected {len(texts)} translations, got {len(translations)}'
            )
        return translations

    except Exception as e:
        if attempt > 5:  # Retry up to 5 times
            print(f'Translation failed for batch {texts}: {e}')
            return ['Translation Error'] * len(texts)

        wait_time = 2 ** attempt  # Exponential backoff
        print(f'Retrying in {wait_time} seconds due to API error: {e}')
        await asyncio.sleep(wait_time)
        return await async_translate_batch(texts, target_lang, session, attempt + 1)

# Function to translate an entire column asynchronously with batching
async def async_translate_column(column_texts, batch_size=10):
    """Translate all entries of a column, one fixed-size batch after another.

    A single aiohttp session is shared by every batch. Batches are sent
    sequentially, with a half-second pause after each one, and their results
    are concatenated in input order.
    """
    translated = []
    total = len(column_texts)
    async with aiohttp.ClientSession() as session:
        for start in range(0, total, batch_size):
            chunk = column_texts[start:start + batch_size]
            chunk_result = await async_translate_batch(chunk, session=session)
            translated.extend(chunk_result)
            # Pause between batches to reduce API load
            await asyncio.sleep(0.5)
    return translated

# Main function to translate all necessary columns
async def main():
    """Translate the `title` and `text` columns of `df` concurrently.

    Each column gets its own task (batches of 50); once both finish, the
    translated lists replace the original columns in the global DataFrame.
    """
    title_task = asyncio.create_task(
        async_translate_column(df['title'].tolist(), batch_size=50)
    )
    text_task = asyncio.create_task(
        async_translate_column(df['text'].tolist(), batch_size=50)
    )
    translated_titles, translated_texts = await asyncio.gather(title_task, text_task)

    # Write the translations back over the source columns
    df['title'] = translated_titles
    df['text'] = translated_texts

# Run the asynchronous function (for Jupyter Notebook compatibility)
# Top-level `await` is used instead of asyncio.run(); this relies on the
# notebook kernel already running an event loop and is not valid in a plain script.
await main()

# Display the translated DataFrame
# (main() overwrote the `title` and `text` columns of `df` in place)
display(df)

# Report elapsed wall-clock time since `start_time`, which was set at the top
# of this cell, so the figure covers setup as well as the translation itself.
end_time = time.time()
print(f"Execution time: {end_time - start_time:.4f} seconds")

CancelledError: 

In [60]:
# Remove erroneous values
# Every rule is a boolean mask over the `text` column, built with pandas
# string methods only: the earlier version called `re.search` although `re`
# is not imported in this notebook, and wrote a temporary column into `df`.
# `na=False` makes missing text fail the "keep" rules, so NaN rows are dropped.
text_col = df['text']

## Date-like values: text without a single ASCII letter is dropped
has_ascii_letter = text_col.str.contains(r'[a-zA-Z]', regex=True, na=False)

## Delete lines containing full-width characters
## (CJK punctuation/kana/ideographs and full-width forms, i.e. untranslated text)
has_fullwidth = text_col.str.contains(r'[\u3000-\u9FFF\uFF01-\uFF60]', regex=True, na=False)

## Remove specific values: the translator's failure placeholder and the
## literal string 'nan'
is_placeholder = text_col.isin(['Translation Error', 'nan'])

## Only lines beginning with a letter (A-Z, a-z) are retained.
## Rows starting with a digit are dropped as well: the pattern has never
## accepted 0-9, although the previous comment said so.
starts_with_letter = text_col.str.match(r'^[A-Za-z]', na=False)

df = df[has_ascii_letter & ~has_fullwidth & ~is_placeholder & starts_with_letter]

df

Unnamed: 0,source,date,title,author,text
0,kobe-np.co.jp,2005-07-01,,,"At the press conference, Mayor Hiroto Kitaguch..."
1,kobe-np.co.jp,2005-07-01,,,"On March 30, Akashi City officially announced ..."
2,kobe-np.co.jp,2007-04-07,,,"After the press conference, attorney Yoshiyasu..."
3,kobe-np.co.jp,2007-04-07,,,The bereaved family association is continuing ...
4,kobe-np.co.jp,2007-04-07,,,The families of the five victims held a press ...
...,...,...,...,...,...
312899,kobe-np.co.jp,2011-11-21,,,The final qualifying round will be held until ...
312950,mainichi.jp,2021-09-29,"Death of ""Golgo 13"" manga artist, Takao Saito,...",,Manga artist Takao Saito (real name Takao Sait...
312952,mainichi.jp,2021-10-18,Naoki Prize-winning author Fumio Yamamoto dies...,,Naoki Prize-winning author Fumio Yamamoto (rea...
312953,mainichi.jp,2021-10-26,"Manga Artist Sanpei Shiratsuchi Dies at 89: ""K...",,"On August 8, manga artist Sanpei Shirato (real..."


In [61]:
import os

# save newspaper data
# Output directory (relative path); created on first run, reused afterwards
save_dir = '../data/4_newspapers_en'
os.makedirs(save_dir, exist_ok=True)

# Both output files live side by side in the save directory
csv_file_path, pickle_file_path = (
    os.path.join(save_dir, file_name)
    for file_name in ('newspaper_en.csv', 'newspaper_en.pkl')
)

# CSV copy: written with the utf-8-sig encoding and without the index
df.to_csv(csv_file_path, encoding='utf-8-sig', index=False)
# Pickle copy of the same DataFrame
df.to_pickle(pickle_file_path)