In [83]:
import json
# import pykakasi
import pprint
# import cutlet
import requests
# from backoff import on_exception, expo
from requests.exceptions import HTTPError
import time
from fuzzywuzzy import fuzz

In [84]:
# Load the station groups to translate. The context manager closes the file
# handle as soon as parsing finishes, and the explicit encoding keeps the
# Japanese text readable on platforms whose default encoding is not UTF-8.
with open('processed_stations_shrinked.json', encoding='utf-8') as stationsF:
    stations = json.load(stationsF)

In [85]:
# Load the prefecture reference data. The context manager closes the file
# handle as soon as parsing finishes, and the explicit encoding is needed
# because the romaji names contain macrons (e.g. "Hyōgo").
with open('open-data-jp-prefectures-master/prefectures.json', encoding='utf-8') as prefectureF:
    prefectures = json.load(prefectureF)

In [86]:
# Build a lookup from prefecture ISO code to a shortened romaji name.
# Multi-word romaji names lose their final word (presumably the
# prefecture-type suffix; confirm against prefectures.json), while
# single-word names such as "Hokkaido" are kept whole.
prefecture_dict = {}

for entry in prefectures:
    code = entry["iso"]
    words = entry["prefecture_romaji"].split()
    prefecture_dict[code] = words[0] if len(words) <= 1 else " ".join(words[:-1])

# Show the finished mapping
print(prefecture_dict)

{'01': 'Hokkaido', '02': 'Aomori', '03': 'Iwate', '04': 'Miyagi', '05': 'Akita', '06': 'Yamagata', '07': 'Fukushima', '08': 'Ibaraki', '09': 'Tochigi', '10': 'Gunma', '11': 'Saitama', '12': 'Chiba', '13': 'Tokyo', '14': 'Kanagawa', '15': 'Niigata', '16': 'Toyama', '17': 'Ishikawa', '18': 'Fukui', '19': 'Yamanashi', '20': 'Nagano', '21': 'Gifu', '22': 'Shizuoka', '23': 'Aichi', '24': 'Mie', '25': 'Shiga', '26': 'Kyoto', '27': 'Osaka', '28': 'Hyōgo', '29': 'Nara', '30': 'Wakayama', '31': 'Tottori', '32': 'Shimane', '33': 'Okayama', '34': 'Hiroshima', '35': 'Yamaguchi', '36': 'Tokushima', '37': 'Kagawa', '38': 'Ehime', '39': 'Kōchi', '40': 'Fukuoka', '41': 'Saga', '42': 'Nagasaki', '43': 'Kumamoto', '44': 'Ōita', '45': 'Miyazaki', '46': 'Kagoshima', '47': 'Okinawa'}


In [41]:
# "駅" (station) suffix; fetch_wikidata appends it to a place name that does
# not already end with it before searching Wikidata.
eki = "駅"

In [18]:
# Sample station name fed to the pykakasi / cutlet romanisation cells at the
# end of the notebook. The commented lines are alternative samples.
text = '猪名寺'
# text = '梅田站'
# text = '中山寺駅'
# text = '新大阪'

In [None]:
# @on_exception(expo, HTTPError, max_tries=5, factor=2)
def fetch_wikidata(place_name: str, use_eki=True, timeout=10):
    """Search Wikidata (Japanese labels) for entities matching a station name.

    Args:
        place_name: Station or place name in Japanese.
        use_eki: When True (the default), the module-level ``eki`` suffix is
            appended to ``place_name`` unless the name already ends with it.
            When False, ``place_name`` is searched exactly as given.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang a long batch run indefinitely.

    Returns:
        The decoded JSON response of the ``wbsearchentities`` call, or None
        when the request fails or the body is not valid JSON (the error is
        printed in that case).
    """
    url = 'https://www.wikidata.org/w/api.php'
    if use_eki and not place_name.endswith(eki):
        station_name = place_name + eki
    else:
        station_name = place_name
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'search': station_name,
        'language': 'ja'
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()  # Ensure we catch HTTP errors
        return response.json()
    # ValueError covers JSON decode failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching data: {e}")
        return None

def _city_from_description(description, prefecture_name):
    """Pick the serving city out of a comma-separated Wikidata description.

    Returns the city as a string, or None when the description contains no
    comma-separated location to parse.
    """
    if prefecture_name == "Tokyo":
        # Tokyo stations are reported with the prefecture itself as the city.
        return prefecture_name

    location_parts = [part.strip() for part in description.split(',')]
    if len(location_parts) < 2:
        # No comma-separated location: there is nothing reliable to extract.
        return None

    if len(location_parts) == 4 and "district" not in location_parts[1]:
        # With four segments the city is the whole second segment, unless
        # that segment names a district. Joining the words yields a string
        # rather than the list a bare split() would return.
        return " ".join(location_parts[1].split()) or None

    # In every other layout the city is the last word of the first segment.
    first_segment_words = location_parts[0].split()
    return first_segment_words[-1] if first_segment_words else None


def get_romaji_name(place_name, prefecture_id):
    """Look up the English (romaji) name and serving city of a station.

    Args:
        place_name: Station name in Japanese, passed to ``fetch_wikidata``.
        prefecture_id: Key into ``prefecture_dict`` (e.g. "28").

    Returns:
        A ``(romaji_name, city_name)`` tuple taken from the first Wikidata
        search hit that has an English label and a description fuzzily
        matching the prefecture name. ``city_name`` is None when the
        description cannot be split into a location. ``(None, None)`` is
        returned when the prefecture id is unknown, the lookup fails, or no
        hit qualifies.
    """
    prefecture_name = prefecture_dict.get(prefecture_id, None)
    if prefecture_name is None:
        print(f"Invalid prefecture_id: {prefecture_id}")
        return None, None

    # Fetch data from Wikidata
    data = fetch_wikidata(place_name)
    if data is None or 'search' not in data:
        print(f"{place_name} cannot be found on Wikidata")
        return None, None

    for item in data['search']:
        display = item.get('display') or {}

        # Only hits carrying an English label are usable.
        label = display.get('label')
        if not label or label.get('language') != 'en':
            continue

        description = (display.get('description') or {}).get('value')
        if description is None:
            continue

        # Skip hits whose description does not mention this prefecture;
        # 75 is the minimum fuzzy partial-match score accepted.
        if fuzz.partial_ratio(prefecture_name, description) < 75:
            continue

        # Extract the English name and remove "Station" if present
        romaji_name = label['value'].replace("Station", "").strip()
        return romaji_name, _city_from_description(description, prefecture_name)

    print(f'No instance of {place_name} station is found')
    return None, None

In [106]:
# Sample lookup: 尼崎 in prefecture "28" (Hyōgo in prefecture_dict).
get_romaji_name("尼崎", "28")

('Amagasaki', 'Amagasaki')

In [92]:
import copy

In [107]:
# Accumulator for translated station groups; the translation loop appends to
# it and the JSON dump cells further down write it to disk.
translated_stations = []

In [94]:
def translate_station(data):
    """Return a copy of a station group with romaji names and city added.

    Args:
        data: Station-group dict. Reads ``name_kanji``, ``prefecture`` and
            ``stations`` (each entry having its own ``name_kanji``), plus the
            optional ``alternative_names`` list.

    Returns:
        A shallow copy of ``data`` with ``name_romaji`` and ``city`` set from
        ``get_romaji_name`` and ``stations`` replaced by copies that each
        carry a ``name_romaji``. The input dict and its station dicts are not
        modified.
    """
    translated_data = data.copy()
    group_kanji = data["name_kanji"]
    prefecture_id = data["prefecture"]

    group_romaji, city = get_romaji_name(group_kanji, prefecture_id)
    translated_data["name_romaji"] = group_romaji
    translated_data["city"] = city

    # Romaji for every declared alternative name, looked up once per name.
    alternative_names = {
        name: get_romaji_name(name, prefecture_id)[0]
        for name in (data.get("alternative_names") or [])
    }

    member_stations = []
    for station in data["stations"]:
        # Copy station data to avoid modifying the original input
        translated_station = station.copy()
        station_kanji = translated_station["name_kanji"]

        if station_kanji == group_kanji:
            # Same name as the group: reuse the group's romaji.
            translated_station["name_romaji"] = group_romaji
        else:
            # A member may use a name missing from alternative_names; look it
            # up on demand (and cache it) instead of raising KeyError.
            if station_kanji not in alternative_names:
                alternative_names[station_kanji] = get_romaji_name(station_kanji, prefecture_id)[0]
            translated_station["name_romaji"] = alternative_names[station_kanji]

        member_stations.append(translated_station)

    # Attach the translated member stations to the translated group
    translated_data["stations"] = member_stations

    return translated_data

In [95]:
from tqdm import tqdm

In [None]:
# Offset into `stations` at which this translation run begins (the same
# position the translation loop's range starts from).
start = 1986

In [None]:
# Translate every station group from the `start` offset onwards. Using the
# variable rather than a repeated literal means a resumed run only needs
# `start` changed. `i` is kept as the loop index so that, after an
# interruption, it shows which position to resume from.
for i in tqdm(range(start, len(stations))):
    translated_stations.append(translate_station(stations[i]))

  1%|▏         | 16/1230 [00:07<07:16,  2.78it/s]

No instance of 三宮・花時計前 station is found


  2%|▏         | 19/1230 [00:10<14:01,  1.44it/s]

No instance of ハーバーランド station is found


  5%|▍         | 57/1230 [00:25<08:21,  2.34it/s]

No instance of 天王寺駅前 station is found


  5%|▍         | 59/1230 [00:27<14:46,  1.32it/s]

No instance of 新今宮駅前 station is found


 31%|███▏      | 386/1230 [02:31<05:06,  2.75it/s]

In [73]:
# Write everything translated so far to a fresh JSON file, keeping the
# Japanese characters unescaped so the file is human-readable.
with open("translated_stations_small.json", mode="w", encoding="utf-8") as out_file:
    json.dump(translated_stations, out_file, indent=4, ensure_ascii=False)

In [98]:
# Add this batch to translated_stations_test.json. Appending a second
# json.dump to an existing file would leave two concatenated JSON documents,
# which no JSON parser accepts. Instead, read what is already saved, extend
# it, and rewrite the file as one valid array.
test_output_path = "translated_stations_test.json"
try:
    with open(test_output_path, encoding="utf-8") as existing_file:
        previously_saved = json.load(existing_file)
except FileNotFoundError:
    # First run: nothing saved yet.
    previously_saved = []

if not isinstance(previously_saved, list):
    raise ValueError(f"{test_output_path} does not contain a JSON array")

# NOTE(review): the 1986 slice bound is kept from the original cell; confirm
# it is the intended cut-off for this batch.
with open(test_output_path, "w", encoding="utf-8") as of:
    json.dump(previously_saved + translated_stations[:1986], of, ensure_ascii=False, indent=4)

### Manually work through each of these:

e.g. https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json&language=en&search=%E9%A7%92%E5%B2%B3

- 駒ケ岳 (Komagatake)
- 幾寅 (Doesn't matter)
- 落合 (Doesn't matter)
- 月ケ岡 (Permanently closed)
- 豊ケ岡 (Permanently closed)
- 千代ケ岡 (Chiyogaoka)
- 溝の口 (Mizonokuchi)
- 新川崎 (Shin-Kawasaki)
- 成田空港（第１旅客ターミナル） (Narita)
- 空港第２ビル（第２旅客ターミナル） (Narita)
- 鹿島サッカースタジアム（臨） (Kashima Soccer Stadium)
- 電鉄富山駅・エスタ前 (Toyama)
- 富山駅北 (Toyama)
- 天王寺駅前 (Doesn't matter)
- 新今宮駅前 (Doesn't matter)
- 熊本駅前 (Doesn't matter)
- 昭和町通り (Nishi-Urakami)

### TODO:
1. Manually replace "-ku" (ward) values with the actual city name
2. Decide between "Osaka" and "Ōsaka" and use one spelling consistently

### Inaccurate translations provided by libraries

In [None]:
# Romanise the sample `text` with pykakasi and print each converted segment.
# NOTE: `import pykakasi` is commented out in the imports cell, so this cell
# only runs once that import is restored.
kks = pykakasi.kakasi()
result = kks.convert(text)
line_template = "{}: kana '{}', hiragana '{}', romaji: '{}'"
for item in result:
    print(line_template.format(item['orig'], item['kana'], item['hira'], item['hepburn']))

寺: kana 'テラ', hiragana 'てら', romaji: 'tera'


In [25]:
# Romanise the sample `text` with cutlet: first with a default-constructed
# Cutlet, then once for each name in `dicts` passed as Cutlet's first argument.
# NOTE: `import cutlet` is commented out in the imports cell, so this cell
# only runs once that import is restored.
dicts = ['hepburn', 'kunrei', 'nippon', 'nihon']

katsu = cutlet.Cutlet()
print(katsu.romaji(text))

for system_name in dicts:
    katsu = cutlet.Cutlet(system_name)
    romanised = katsu.romaji(text)
    print(romanised)

Tera
Tera
Tera
Tera
Tera
