# Preprocess EWS

## Read files and combine them into one dataframe

In [4]:
from IPython.display import display
import glob
import numpy as np
import os
import pandas as pd

# Define the target folder and filename pattern
folder_path = '../data/1_EWS_ja'
file_pattern = '*_watcher4.csv'

# Get a list of matching CSV file paths.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the row order of the combined data reproducible.
# The extraction step reads year and month from the first six characters of
# each filename, so sorted order is also chronological within this folder.
csv_files = sorted(glob.glob(os.path.join(folder_path, file_pattern)))

# Function to extract necessary data from each file
def extract_EWS_data(df, filename):
    """Extract the survey response rows from one raw EWS sheet.

    Rows are scanned top to bottom. Column 0 is searched for a region name
    and a topic keyword; whenever one is found it becomes the "current"
    region/topic and is attached to every response row that follows. A row
    counts as a response when column 2 is a string containing one of the
    judgment symbols. Column 1 is not read.

    Args:
        df: Raw sheet as loaded from CSV. Columns are addressed by position:
            0 = region/topic heading, 2 = judgment symbol, 3 = sector or
            occupation, 4 = reason for judgment, 5 = details. Frames with
            fewer than six columns are accepted; the missing columns are
            treated as NaN.
        filename: Base name of the source file. Characters 0-3 are used as
            the year and characters 4-5 as the month.

    Returns:
        A DataFrame with one row per response and the columns 'Year',
        'Month', 'Region', 'Topic', 'Economic Judgment', 'Sector/Occupation',
        'Reason for Judgment' and 'Details'. Text values are stripped of
        surrounding whitespace; non-string cells become NaN. Region/Topic
        are None for responses that appear before any heading was seen.
    """
    # Lookup tables are loop-invariant, so they are built once per call
    region_keys = ('北海道', '東北', '北関東', '南関東', '甲信越', '東海', '北陸', '近畿', '中国', '四国', '九州', '沖縄')
    topic_mapping = {'家計': '家計動向関連', '企業': '企業動向関連', '雇用': '雇用関連'}
    judge_keys = ('◎', '○', '□', '▲', '×')

    # Extract year and month from filename
    year = filename[:4]    # First 4 characters
    month = filename[4:6]  # 5th and 6th characters

    def _stripped_or_nan(value):
        # Keep text (without surrounding whitespace); anything else is missing
        return value.strip() if isinstance(value, str) else np.nan

    extracted = {
        'Year': [],
        'Month': [],
        'Region': [],
        'Topic': [],
        'Economic Judgment': [],
        'Sector/Occupation': [],
        'Reason for Judgment': [],
        'Details': [],
    }

    # Region and topic carried forward from the most recent heading row
    current_region = None
    current_topic = None

    # Pad narrow frames so positional access up to column 5 never fails
    padding = (np.nan,) * max(0, 6 - df.shape[1])

    # itertuples yields plain tuples and avoids building a Series per row
    for values in df.itertuples(index=False, name=None):
        values = values + padding

        # Skip if all elements are NaN
        if all(pd.isna(value) for value in values):
            continue

        # Update region and topic from the heading column
        heading = values[0]
        if isinstance(heading, str):
            # First region key (in list order) contained in the heading, if any
            current_region = next((key for key in region_keys if key in heading), current_region)

            # First topic keyword (in mapping order) contained in the heading,
            # stored under its full topic name
            for keyword, full_name in topic_mapping.items():
                if keyword in heading:
                    current_topic = full_name
                    break

        # Extract data if the judgment column has valid data
        judgment = values[2]
        if isinstance(judgment, str) and any(key in judgment for key in judge_keys):
            extracted['Year'].append(year)
            extracted['Month'].append(month)
            extracted['Region'].append(current_region)
            extracted['Topic'].append(current_topic)
            extracted['Economic Judgment'].append(judgment.strip())
            extracted['Sector/Occupation'].append(_stripped_or_nan(values[3]))
            extracted['Reason for Judgment'].append(_stripped_or_nan(values[4]))
            extracted['Details'].append(_stripped_or_nan(values[5]))

    # Create cleaned dataframe
    return pd.DataFrame(extracted)

# Load every matching CSV and keep only the extracted survey rows
dfs = []
for file in csv_files:
    # Candidate encodings, tried in this order; the first one that decodes wins
    encodings = ('utf-8', 'cp932', 'shift_jis')
    for position, encoding in enumerate(encodings):
        try:
            df = pd.read_csv(file, encoding=encoding)
        except UnicodeDecodeError:
            # Out of candidates: let the decode error from the last one surface
            if position == len(encodings) - 1:
                raise
        else:
            break

    # Year/month are parsed from the base name, so strip the directory part
    dfs.append(extract_EWS_data(df, os.path.basename(file)))

# Stack the per-file frames into a single DataFrame with a fresh index
if not dfs:
    print('No matching CSV files found.')
else:
    combined_df = pd.concat(dfs, ignore_index=True)
    display(combined_df)

Unnamed: 0,Year,Month,Region,Topic,Economic Judgment,Sector/Occupation,Reason for Judgment,Details
0,2000,01,北海道,家計動向関連,◎,－,－,
1,2000,01,北海道,家計動向関連,○,商店街代表者等,,・価格が低下したことにより購買意欲が上向いており、売上点数が増加している。
2,2000,01,北海道,家計動向関連,○,レストラン（高級）スタッフ,,・２～３か月前と比べると、売上が伸びている。12月に比べると、２倍の伸びを示している。
3,2000,01,北海道,家計動向関連,□,百貨店の売場主任・担当者,,・３か月前と同様に、客は必要な物でも慎重な購買を行っている。今月スタートの冬物値下げで商品価...
4,2000,01,北海道,家計動向関連,□,スーパーの店長・店員,,・今月は雪が多いせいもあり、良いとはいえない。客単価も悪く、店の状態をみる限り、景気は良いと...
...,...,...,...,...,...,...,...,...
340739,2025,02,沖縄,雇用関連,□,人材派遣会社（総務担当）,求職者数の動き,・求職の新規登録者が増加せず、相変わらず人手不足が続いている。
340740,2025,02,沖縄,雇用関連,□,求人情報誌製作会社（営業）,求人数の動き,・全体の求人数は、前月比で５％程度の微減である。３か月前と比較すると増減がなく横ばいである。...
340741,2025,02,沖縄,雇用関連,□,学校［大学］（就職支援担当）,それ以外,・物価高が続く限り、消費者は消費意欲を抑えるとみている。
340742,2025,02,沖縄,雇用関連,▲,－,－,－


## Translate Japanese into English

- By custom dictionary

In [5]:
# Japanese -> English lookup tables for the categorical columns

# Survey regions
region_dict = {
    '北海道': 'Hokkaido',
    '東北': 'Tohoku',
    '北関東': 'Northern Kanto',
    '南関東': 'Southern Kanto',
    '甲信越': 'Koshinetsu',
    '東海': 'Tokai',
    '北陸': 'Hokuriku',
    '近畿': 'Kansai',
    '中国': 'Chugoku',
    '四国': 'Shikoku',
    '九州': 'Kyushu',
    '沖縄': 'Okinawa',
}

# Survey topics
topic_dict = {
    '家計動向関連': 'Household Activity',
    '企業動向関連': 'Corporate Activity',
    '雇用関連': 'Employment',
}

# Judgment symbols
judge_dict = {
    '◎': 'Excellent',
    '○': 'Good',
    '□': 'Unchanged',
    '▲': 'Slightly Bad',
    '×': 'Bad',
}

# Reasons for judgment, grouped as (English label, raw spellings found in the
# data). Several spelling variants collapse onto the same English label.
_reason_groups = [
    # sales prices
    ('Trends in Sales Price', [
        '単価の動き',
        '受注単価や販売単価の動き',
        '受注価格や販売',
        '受注価格や販売価格',
        '受注価格や販売価格の動き',
    ]),
    # sales volume
    ('Trends in Sales Volume', [
        '販売量の動き',
        '受注量や販売量',
        '受注量や販売量の動き',
    ]),
    # customers
    ('Customer Behavior', [
        '客の様子',
        'お客様の様子',
        'お客様の動き',
    ]),
    ('Trends in Customer Visits', ['来客数の動き']),
    # client
    ('Client Situation', [
        '取引先の様子',
        '取引先の動き',
    ]),
    # competitor
    ('Competitor Situation', [
        '競争相手の様子',
        '競争相手の動き',
    ]),
    # surrounding companies
    ('Surrounding Companies Situation', ['周辺企業の様子']),
    # employment
    ('Trends in Job Openings', [
        '求人数の動き',
        '求人数の動き求人数の動き',
    ]),
    ('Trends in Job Seekers', ['求職者数の動き']),
    ('Trends in Employment Numbers', ['採用者数の動き']),
    ('Trends in Employment Forms', ['雇用形態の様子']),
    # miscellaneous
    ('Others', [
        'それ以外',
        'その他',
    ]),
    ('No significant responses available', ['＊']),
    ('No response available', ['－']),
    # error value: a stray text fragment in the reason column, mapped to NaN
    (np.nan, ['・製造業、サービス業等を問わ']),
]

# Flatten the groups into the raw-spelling -> English-label mapping
reason_dict = {
    raw_text: english_label
    for english_label, raw_texts in _reason_groups
    for raw_text in raw_texts
}

# Placeholder symbols that appear in the free-text columns
placeholder_dict = {'＊': 'No significant responses available', '－': 'No response available'}

# Which lookup table applies to which column (applied in this order)
column_mappings = {
    'Region': region_dict,
    'Topic': topic_dict,
    'Economic Judgment': judge_dict,
    'Reason for Judgment': reason_dict,
    'Sector/Occupation': placeholder_dict,
    'Details': placeholder_dict,
}

# Replace whole-cell matches column by column
for column_name, mapping in column_mappings.items():
    combined_df[column_name] = combined_df[column_name].replace(mapping)

# Show the translated categorical columns
display(combined_df)

Unnamed: 0,Year,Month,Region,Topic,Economic Judgment,Sector/Occupation,Reason for Judgment,Details
0,2000,01,Hokkaido,Household Activity,Excellent,No response available,No response available,
1,2000,01,Hokkaido,Household Activity,Good,商店街代表者等,,・価格が低下したことにより購買意欲が上向いており、売上点数が増加している。
2,2000,01,Hokkaido,Household Activity,Good,レストラン（高級）スタッフ,,・２～３か月前と比べると、売上が伸びている。12月に比べると、２倍の伸びを示している。
3,2000,01,Hokkaido,Household Activity,Unchanged,百貨店の売場主任・担当者,,・３か月前と同様に、客は必要な物でも慎重な購買を行っている。今月スタートの冬物値下げで商品価...
4,2000,01,Hokkaido,Household Activity,Unchanged,スーパーの店長・店員,,・今月は雪が多いせいもあり、良いとはいえない。客単価も悪く、店の状態をみる限り、景気は良いと...
...,...,...,...,...,...,...,...,...
340739,2025,02,Okinawa,Employment,Unchanged,人材派遣会社（総務担当）,Trends in Job Seekers,・求職の新規登録者が増加せず、相変わらず人手不足が続いている。
340740,2025,02,Okinawa,Employment,Unchanged,求人情報誌製作会社（営業）,Trends in Job Openings,・全体の求人数は、前月比で５％程度の微減である。３か月前と比較すると増減がなく横ばいである。...
340741,2025,02,Okinawa,Employment,Unchanged,学校［大学］（就職支援担当）,Others,・物価高が続く限り、消費者は消費意欲を抑えるとみている。
340742,2025,02,Okinawa,Employment,Slightly Bad,No response available,No response available,No response available


- By generative AI (DeepL API)

In [7]:
from dotenv import load_dotenv
import aiohttp
import asyncio
import time

# Start the timer used for the execution-time report at the end of the cell
start_time = time.time()

# Read the variables defined in the .env file
load_dotenv()

# DeepL credentials: fail early when the key is missing or empty
DEEPL_API_KEY = os.environ.get('DEEPL_API_KEY_1')
if DEEPL_API_KEY is None or DEEPL_API_KEY == '':
    raise ValueError("API key not found in .env file.")

# DeepL translate endpoint
API_URL = 'https://api.deepl.com/v2/translate'

# Function to translate a batch of texts asynchronously
async def async_translate_batch(texts, target_lang='EN-US', session=None, attempt=1):
    """Translate a batch of texts asynchronously using the DeepL API.

    Args:
        texts: List of strings to translate in a single request.
        target_lang: DeepL target language code.
        session: Open ``aiohttp.ClientSession`` used to send the request.
            When ``None``, a temporary session is created for this call.
        attempt: Number of the current attempt (1-based). A failure is
            retried with exponential backoff while ``attempt <= 5``.

    Returns:
        A list with exactly one entry per input text, in input order: the
        translated strings, or the placeholder ``'Translation Error'`` for
        every text when the request could not be completed. An empty input
        returns an empty list without contacting the API.
    """
    if not texts:
        return []

    # Without a session there is nothing to post with; open one just for this
    # call instead of failing on every retry.
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await async_translate_batch(texts, target_lang, own_session, attempt)

    # Prepare API request. The key is sent in the Authorization header, which
    # is the authentication method documented by DeepL.
    headers = {'Authorization': f'DeepL-Auth-Key {DEEPL_API_KEY}'}
    params = {
        'text': texts,
        'target_lang': target_lang
    }

    while True:
        try:
            async with session.post(API_URL, data=params, headers=headers) as response:
                status = response.status
                # Rate limiting and server-side errors are worth retrying
                if status == 429 or status >= 500:
                    raise RuntimeError(f'DeepL API returned HTTP {status}')
                # Any other non-success status will not improve on retry
                if status != 200:
                    print(f'Translation failed for batch {texts}: DeepL API returned HTTP {status}')
                    return ['Translation Error'] * len(texts)
                result = await response.json()

            translations = [t['text'] for t in result.get('translations', [])]
            # A short or empty result would shift every later row of the
            # column, so treat it as a failed request.
            if len(translations) != len(texts):
                raise RuntimeError(
                    f'expected {len(texts)} translations, got {len(translations)}'
                )
            return translations

        except Exception as e:
            # Broad on purpose: this is the boundary that turns any request
            # failure into a retry or a visible placeholder.
            if attempt > 5:  # Retry up to 5 times
                print(f'Translation failed for batch {texts}: {e}')
                return ['Translation Error'] * len(texts)

            wait_time = 2 ** attempt  # Exponential backoff
            print(f'Retrying in {wait_time} seconds due to API error: {e}')
            await asyncio.sleep(wait_time)
            attempt += 1

# Function to translate an entire column asynchronously with batching
async def async_translate_column(column_texts, batch_size=10):
    """Translate an entire column in batches asynchronously.

    Only string values are sent to the API, and each distinct string is sent
    once; repeated values reuse the same translation. Non-string values
    (for example NaN for missing cells) are returned unchanged.

    Args:
        column_texts: Sequence of cell values for one column.
        batch_size: Maximum number of texts per API request.

    Returns:
        A list with the same length and order as ``column_texts``. String
        cells hold their translation, or ``'Translation Error'`` when the
        batch containing them did not return one translation per text.
    """
    column_texts = list(column_texts)

    # Distinct translatable strings in first-seen order
    unique_texts = list(dict.fromkeys(
        text for text in column_texts if isinstance(text, str)
    ))

    translations = {}
    async with aiohttp.ClientSession() as session:
        for start in range(0, len(unique_texts), batch_size):
            batch = unique_texts[start:start + batch_size]
            translated_batch = await async_translate_batch(batch, session=session)

            # Keep rows aligned even if the batch came back short or empty
            if len(translated_batch) != len(batch):
                print(f'Translation failed for batch {batch}: '
                      f'expected {len(batch)} translations, got {len(translated_batch)}')
                translated_batch = ['Translation Error'] * len(batch)

            translations.update(zip(batch, translated_batch))
            await asyncio.sleep(0.5)  # Reduce API load

    # Map every cell back to its translation; leave non-strings as they were
    return [
        translations[text] if isinstance(text, str) else text
        for text in column_texts
    ]

# Main function to translate all necessary columns
async def main():
    """Translate the free-text columns of ``combined_df`` and store the results.

    One translation task is started per column so that both columns are
    processed concurrently; each translated list then replaces its column.
    """
    target_columns = ['Sector/Occupation', 'Details']

    # One task per column, each using batches of 50 texts
    pending = [
        asyncio.create_task(
            async_translate_column(combined_df[column_name].tolist(), batch_size=50)
        )
        for column_name in target_columns
    ]
    translated_columns = await asyncio.gather(*pending)

    # gather() returns results in task order, i.e. in target_columns order
    for column_name, translated_values in zip(target_columns, translated_columns):
        combined_df[column_name] = translated_values

# Run the asynchronous function (for Jupyter Notebook compatibility)
await main()

# Display the translated DataFrame
display(combined_df)

end_time = time.time()
print(f"Execution time: {end_time - start_time:.4f} seconds")

CancelledError: 

In [None]:
# Strip any run of leading bullet marks ('・', '∙') and periods from each
# detail text
details_column = combined_df['Details']
combined_df['Details'] = details_column.str.lstrip('・∙.')

In [None]:
# Output locations for the translated EWS data (relative paths)
save_dir = '../data/2_EWS_en'
csv_file_path = os.path.join(save_dir, 'EWS_data_en.csv')
pickle_file_path = os.path.join(save_dir, 'EWS_data_en.pkl')

# Create the output folder if it does not exist yet
os.makedirs(save_dir, exist_ok=True)

# Write the DataFrame as CSV (UTF-8 with BOM, without the index column)
combined_df.to_csv(csv_file_path, encoding='utf-8-sig', index=False)

# Write the same DataFrame as a pickle file
combined_df.to_pickle(pickle_file_path)