In [1]:
import json
# import pykakasi
import pprint
# import cutlet
import requests
# from backoff import on_exception, expo
from requests.exceptions import HTTPError
import time
from fuzzywuzzy import fuzz



In [2]:
# Load the station data. The context manager closes the handle once parsing is
# done, and the explicit encoding keeps the Japanese text from depending on the
# platform's default codec.
with open('processed_stations_shrinked.json', encoding='utf-8') as stationsF:
    stations = json.load(stationsF)

In [3]:
# Load the prefecture records. The context manager closes the handle once
# parsing is done, and the explicit encoding keeps non-ASCII names (e.g. "Hyōgo")
# from depending on the platform's default codec.
with open('open-data-jp-prefectures-master/prefectures.json', encoding='utf-8') as prefectureF:
    prefectures = json.load(prefectureF)

In [4]:
# Map each prefecture's ISO code to its romaji name. Multi-word names lose their
# last word; single-word names such as "Hokkaido" are kept whole.
prefecture_dict = {
    entry["iso"]: " ".join(words[:-1]) if len(words) > 1 else words[0]
    for entry in prefectures
    # Single-element list: binds the split words once per entry.
    for words in [entry["prefecture_romaji"].split()]
}

# Output the result
print(prefecture_dict)

{'01': 'Hokkaido', '02': 'Aomori', '03': 'Iwate', '04': 'Miyagi', '05': 'Akita', '06': 'Yamagata', '07': 'Fukushima', '08': 'Ibaraki', '09': 'Tochigi', '10': 'Gunma', '11': 'Saitama', '12': 'Chiba', '13': 'Tokyo', '14': 'Kanagawa', '15': 'Niigata', '16': 'Toyama', '17': 'Ishikawa', '18': 'Fukui', '19': 'Yamanashi', '20': 'Nagano', '21': 'Gifu', '22': 'Shizuoka', '23': 'Aichi', '24': 'Mie', '25': 'Shiga', '26': 'Kyoto', '27': 'Osaka', '28': 'Hyōgo', '29': 'Nara', '30': 'Wakayama', '31': 'Tottori', '32': 'Shimane', '33': 'Okayama', '34': 'Hiroshima', '35': 'Yamaguchi', '36': 'Tokushima', '37': 'Kagawa', '38': 'Ehime', '39': 'Kōchi', '40': 'Fukuoka', '41': 'Saga', '42': 'Nagasaki', '43': 'Kumamoto', '44': 'Ōita', '45': 'Miyazaki', '46': 'Kagoshima', '47': 'Okinawa'}


In [5]:
# Kanji suffix for "station"; fetch_wikidata appends it to search terms that do not already end with it.
eki = "駅"

In [18]:
# Sample name fed to the pykakasi / cutlet comparison cells at the end of the notebook.
# Uncomment one of the alternatives below to try a different input.
text = '猪名寺'
# text = '梅田站'
# text = '中山寺駅'
# text = '新大阪'

In [13]:
# @on_exception(expo, HTTPError, max_tries=5, factor=2)
def fetch_wikidata(place_name: str, use_eki=True):
    """Search Wikidata for entities matching a Japanese station name.

    Args:
        place_name: Name to search for, with or without the trailing "駅".
        use_eki: When True (the default) the "駅" suffix is appended if it is
            missing; when False the name is searched exactly as given.

    Returns:
        The decoded JSON body of the ``wbsearchentities`` request, or None when
        the request fails, times out, or the body cannot be decoded as JSON.
    """
    url = 'https://www.wikidata.org/w/api.php'
    if use_eki and not place_name.endswith(eki):
        station_name = place_name + eki
    else:
        station_name = place_name
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'search': station_name,
        'language': 'ja'
    }
    try:
        # Without a timeout a single stalled connection blocks the whole batch run.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()  # Ensure we catch HTTP errors
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"Error fetching data: {e}")
        return None

def _city_from_description(description):
    """Pick the city name out of a comma-separated Wikidata description.

    Presumably descriptions look like "railway station in <City>, <Prefecture>, Japan"
    (TODO confirm against real responses). The rules are:
      * fewer than 2 segments: no city can be derived, return None;
      * exactly 4 segments whose second segment mentions neither "district"
        nor "prefecture": the city is the whole second segment;
      * every other case: the city is the last word of the first segment.

    Always returns a string or None, never a list.
    """
    location_parts = description.split(',')
    if len(location_parts) < 2:
        return None

    if len(location_parts) == 4:
        second = location_parts[1]
        if "district" not in second.lower() and "prefecture" not in second.lower():
            # Collapse whitespace and keep the full segment as the city name.
            return " ".join(second.split()) or None

    first_words = location_parts[0].split()
    return first_words[-1] if first_words else None


def get_romaji_name(place_name, prefecture_id):
    """Look up the English (romaji) name and serving city of a station on Wikidata.

    Args:
        place_name: Station name in Japanese, passed to ``fetch_wikidata``.
        prefecture_id: Key into ``prefecture_dict``; a search hit is accepted only
            when its description fuzzily matches this prefecture's romaji name.

    Returns:
        ``(romaji_name, city_name)`` for the first accepted search hit.
        ``city_name`` is the prefecture name itself for Tokyo, and None when the
        description has no recognisable city part. ``(None, None)`` is returned
        when the prefecture id is unknown, the search fails, or no hit qualifies.
    """
    prefecture_name = prefecture_dict.get(prefecture_id, None)
    if prefecture_name is None:
        print(f"Invalid prefecture_id: {prefecture_id}")
        return None, None

    # Fetch data from Wikidata
    data = fetch_wikidata(place_name)
    if data is None or 'search' not in data:
        print(f"{place_name} cannot be found on Wikidata")
        return None, None

    for item in data['search']:
        display = item.get('display') or {}

        # Only hits that carry an English label are usable.
        label = display.get('label') or {}
        if label.get('language') != 'en' or 'value' not in label:
            continue

        # A description is required to confirm the prefecture.
        description = (display.get('description') or {}).get('value')
        if description is None:
            continue

        # Fuzzy match (score out of 100) between the prefecture name and the description.
        if fuzz.partial_ratio(prefecture_name, description) < 75:
            continue

        # Extract the English name and remove "Station" if present
        romaji_name = label['value'].replace("Station", "").strip()

        if prefecture_name == "Tokyo":
            # For Tokyo the prefecture name is used as the city.
            return romaji_name, prefecture_name

        return romaji_name, _city_from_description(description)

    print(f'No instance of {place_name} station is found')
    return None, None

In [18]:
# Spot-check the lookup with a few (station name, prefecture ISO code) pairs.
for sample_name, sample_iso in (("塚口", "28"), ("木幡", "26"), ("芦屋", "28"), ("阿田和", "24")):
    print(get_romaji_name(sample_name, sample_iso))

('Tsukaguchi', 'Amagasaki')
('Kohata', 'Uji')
('Ashiya', 'Ashiya')
('Atawa', 'Mihama')


In [19]:
import copy

In [32]:
# Accumulator for the batch run: the translation loop appends one translate_station() result per entry.
# Re-run this cell before re-running the loop, otherwise results are appended twice.
translated_stations = []

In [33]:
def handle_special_case(data):
    """Return a hand-maintained (romaji name, city) pair for a station.

    The station is identified by ``data["name_kanji"]``; ``data["prefecture"]``
    is only used in the diagnostic message. Stations without an entry are
    reported on stdout and yield ``(None, None)``.
    """
    s_name, p_name = data["name_kanji"], data["prefecture"]

    # Manually curated overrides, keyed by the kanji station name.
    narita_airport = ("Narita Airport", "Narita")
    overrides = {
        "駒ケ岳": ("Komagatake", "Mori"),
        "千代ケ岡": ("Chiyogaoka", "Asahikawa"),
        "溝の口": ("Musashi-Mizonokuchi", "Kawasaki"),
        "新川崎": ("Shin-Kawasaki", "Kawasaki"),
        "電鉄富山駅・エスタ前": ("Toyama", "Toyama"),
        "昭和町通り": ("Nishi-Urakami", "Nagasaki"),
        "成田空港（第１旅客ターミナル）": narita_airport,
        "空港第２ビル（第２旅客ターミナル）": narita_airport,
        "鹿島サッカースタジアム（臨）": ("Kashima Soccer Stadium", "Kashima"),
    }

    if s_name in overrides:
        return overrides[s_name]

    print(f"{s_name, p_name} is not covered in special case")
    return None, None

In [34]:
def translate_station(data):
    """Return a copy of a station group with romaji names and the city filled in.

    Args:
        data: Dict with "name_kanji", "prefecture" and a "stations" list of dicts
            that each carry "name_kanji"; "alternative_names" is optional.

    Returns:
        A shallow copy of ``data`` with "name_romaji" and "city" added, and
        "stations" replaced by copies that each carry their own "name_romaji".
        Values are None where neither Wikidata nor the special-case table
        produced a result. The input dict and its station dicts are not modified.
    """
    translated_data = data.copy()
    stat, city = get_romaji_name(data["name_kanji"], data["prefecture"])
    if stat is None:
        # Wikidata had nothing usable; fall back to the hand-maintained table.
        stat, city = handle_special_case(data)
    translated_data["name_romaji"] = stat
    translated_data["city"] = city

    # Romaji for every declared alternative name of the group.
    alternative_names = {
        name: get_romaji_name(name, data["prefecture"])[0] for name in data.get("alternative_names", [])
    }

    # Named differently from the notebook-level `translated_stations` accumulator
    # so the two are not confused.
    group_stations = []
    for station in data["stations"]:
        # Copy station data to avoid modifying the original input
        translated_station = station.copy()
        kanji = translated_station["name_kanji"]
        if kanji == data["name_kanji"]:
            translated_station["name_romaji"] = translated_data["name_romaji"]
        else:
            if kanji not in alternative_names:
                # The station uses a name that was not declared as an alternative:
                # look it up on demand (and cache it) instead of raising KeyError.
                alternative_names[kanji] = get_romaji_name(kanji, data["prefecture"])[0]
            translated_station["name_romaji"] = alternative_names[kanji]
        group_stations.append(translated_station)

    translated_data["stations"] = group_stations

    return translated_data

In [30]:
from tqdm import tqdm

In [35]:
# Translate every entry in order; tqdm reports progress over the whole list.
for station_group in tqdm(stations):
    translated_stations.append(translate_station(station_group))

  0%|          | 12/3216 [00:04<19:01,  2.81it/s]

No instance of 駒ケ岳 station is found


  5%|▍         | 151/3216 [00:55<22:36,  2.26it/s]

No instance of 幾寅 station is found
('幾寅', '01') is not covered in special case


  5%|▍         | 152/3216 [00:56<21:34,  2.37it/s]

No instance of 落合 station is found
('落合', '01') is not covered in special case


  8%|▊         | 266/3216 [01:37<16:33,  2.97it/s]

No instance of 月ケ岡 station is found
('月ケ岡', '01') is not covered in special case


  8%|▊         | 269/3216 [01:38<16:27,  2.99it/s]

No instance of 豊ケ岡 station is found
('豊ケ岡', '01') is not covered in special case


  9%|▉         | 296/3216 [01:48<16:18,  2.99it/s]

No instance of 千代ケ岡 station is found


 32%|███▏      | 1039/3216 [06:12<13:59,  2.59it/s]

No instance of 溝の口 station is found


 34%|███▍      | 1108/3216 [06:43<12:58,  2.71it/s]

No instance of 新川崎 station is found


 42%|████▏     | 1336/3216 [08:08<13:48,  2.27it/s]

No instance of 成田空港（第１旅客ターミナル） station is found


 42%|████▏     | 1339/3216 [08:09<12:04,  2.59it/s]

No instance of 空港第２ビル（第２旅客ターミナル） station is found


 42%|████▏     | 1344/3216 [08:11<11:07,  2.81it/s]

No instance of 鹿島サッカースタジアム（臨） station is found


 48%|████▊     | 1538/3216 [09:22<10:04,  2.78it/s]

No instance of 電鉄富山駅・エスタ前 station is found


 48%|████▊     | 1539/3216 [09:24<22:28,  1.24it/s]

No instance of 富山駅北 station is found


 62%|██████▏   | 2002/3216 [12:18<07:13,  2.80it/s]

No instance of 三宮・花時計前 station is found


 62%|██████▏   | 2005/3216 [12:21<12:55,  1.56it/s]

No instance of ハーバーランド station is found


 64%|██████▎   | 2043/3216 [12:36<08:17,  2.36it/s]

No instance of 天王寺駅前 station is found


 64%|██████▎   | 2045/3216 [12:38<14:44,  1.32it/s]

No instance of 新今宮駅前 station is found


 82%|████████▏ | 2646/3216 [16:20<04:33,  2.09it/s]

No instance of 高知駅前 station is found


 89%|████████▊ | 2848/3216 [17:35<03:07,  1.96it/s]

No instance of 熊本駅前 station is found


 90%|█████████ | 2897/3216 [17:52<01:48,  2.94it/s]

No instance of 昭和町通り station is found


100%|██████████| 3216/3216 [19:46<00:00,  2.71it/s]


In [73]:
# Save the translated stations as indented UTF-8 JSON, keeping non-ASCII characters unescaped.
with open("translated_stations_small.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(translated_stations, ensure_ascii=False, indent=4))

In [36]:
# Save the translated stations as indented UTF-8 JSON, keeping non-ASCII characters unescaped.
with open("translated_stations_test.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(translated_stations, ensure_ascii=False, indent=4))

### Manually resolve each of these stations:

For example, query the Wikidata search API directly: https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json&language=en&search=%E9%A7%92%E5%B2%B3

- 駒ケ岳 (Komagatake)
- 幾寅 (Doesn't matter)
- 落合 (Doesn't matter)
- 月ケ岡 (Permanently closed)
- 豊ケ岡 (Permanently closed)
- 千代ケ岡 (Chiyogaoka)
- 溝の口 (Mizonokuchi)
- 新川崎 (Shin-Kawasaki)
- 成田空港（第１旅客ターミナル） (Narita)
- 空港第２ビル（第２旅客ターミナル） (Narita)
- 鹿島サッカースタジアム（臨） (Kashima Soccer Stadium)
- 電鉄富山駅・エスタ前 (Toyama)
- 富山駅北 (Toyama)
- 天王寺駅前 (Doesn't matter)
- 新今宮駅前 (Doesn't matter)
- 熊本駅前 (Doesn't matter)
- 昭和町通り (Nishi-Urakami)

### Inaccurate translations provided by libraries

In [None]:
# pykakasi is imported here because its import in the first cell is commented out;
# without it this cell raises NameError on a fresh kernel.
import pykakasi

# Convert the sample text and print each item's readings.
kks = pykakasi.kakasi()
result = kks.convert(text)
for item in result:
    print(f"{item['orig']}: kana '{item['kana']}', hiragana '{item['hira']}', romaji: '{item['hepburn']}'")

寺: kana 'テラ', hiragana 'てら', romaji: 'tera'


In [25]:
# cutlet is imported here because its import in the first cell is commented out;
# without it this cell raises NameError on a fresh kernel.
import cutlet

# Romanize the sample text with the default Cutlet, then once per named system.
katsu = cutlet.Cutlet()
print(katsu.romaji(text))
dicts = ['hepburn', 'kunrei', 'nippon', 'nihon']
for d in dicts:
    katsu = cutlet.Cutlet(d)
    print(katsu.romaji(text))

Tera
Tera
Tera
Tera
Tera
