In [205]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
from urllib.parse import urljoin
import re
import json
import multiprocessing
import traceback

In [216]:
# HTTP headers sent with every crawl request; the User-Agent names this tool.
crawl_headers = {
    "User-Agent": "Katapult/0.1 (github.com/zhuowei/english-to-katakana)",
}

In [74]:
# Yes I know MediaWiki has a Query API
def pull_all_from_category(nexturl, timeout=30):
    """Collect the link text of every entry listed in a paginated category page.

    Starting at ``nexturl``, reads the links inside the first
    ``<div class="mw-category">`` of each page and keeps following the
    "next page" link until a page has none. Each visited URL is printed.

    Args:
        nexturl: URL of the first category page to fetch.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list holding the ``.string`` of every ``<a>`` found, in page order.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.Timeout: If a page does not respond within ``timeout``.
    """
    out = []
    while True:
        print(nexturl)
        # Without a timeout a single stalled connection would hang the crawl forever.
        req = requests.get(nexturl, headers=crawl_headers, timeout=timeout)
        req.raise_for_status()
        bs = BeautifulSoup(req.content, 'html.parser')
        alllinks = bs.find("div", "mw-category").find_all("a")
        out.extend(a.string for a in alllinks)
        next_page = bs.find("a", string="next page")
        if not next_page:
            break
        # The pagination href is relative, so resolve it against the current page.
        nexturl = urljoin(nexturl, next_page["href"])
    return out

In [75]:
# Crawl the whole category (live HTTP requests; each page URL is printed as it is fetched).
all_words = pull_all_from_category(
    "https://en.wiktionary.org/wiki/Category:Japanese_terms_borrowed_from_English"
)

https://en.wiktionary.org/wiki/Category:Japanese_terms_borrowed_from_English
https://en.wiktionary.org/w/index.php?title=Category:Japanese_terms_borrowed_from_English&pagefrom=%E3%81%84%E3%82%93%E3%81%9F%E3%81%B3%E3%82%85%E3%81%82%E3%81%82%0A%E3%82%A4%E3%83%B3%E3%82%BF%E3%83%93%E3%83%A5%E3%82%A2%E3%83%BC#mw-pages
https://en.wiktionary.org/w/index.php?title=Category:Japanese_terms_borrowed_from_English&pagefrom=%E3%81%8B%E3%81%AA%E3%81%A7%E3%81%83%E3%81%82%E3%82%93%E3%81%B5%E3%81%A3%E3%81%A8%E3%81%BC%E3%81%8A%E3%82%8B%0A%E3%82%AB%E3%83%8A%E3%83%87%E3%82%A3%E3%82%A2%E3%83%B3%E3%83%95%E3%83%83%E3%83%88%E3%83%9C%E3%83%BC%E3%83%AB#mw-pages
https://en.wiktionary.org/w/index.php?title=Category:Japanese_terms_borrowed_from_English&pagefrom=%E3%81%91%E3%81%A8%E3%82%93%0A%E3%82%B1%E3%83%88%E3%83%B3#mw-pages
https://en.wiktionary.org/w/index.php?title=Category:Japanese_terms_borrowed_from_English&pagefrom=%E3%81%97%E3%82%83%E3%81%90%E3%82%8A%E3%82%93%E3%81%90%27%0A%E3%82%B8%E3%83%A3%E3%82%B0%E3%8

In [184]:
# Matches any single character outside the accepted set. The negated class
# allows the range ァ-ン plus ー, ・, ヴ and ゠; everything else is a hit.
non_katakana_regex = re.compile(
    "[^ァ-ンー・ヴ゠]"
)

In [174]:
def filter_func(w):
    """Return True when no character of ``w`` matches ``non_katakana_regex``.

    Used to keep only words made entirely out of katakana; any word that
    contains some other character is rejected. An empty string has no
    offending character and is therefore accepted.
    """
    # Identity test for "no match": `is None` cannot be affected by a custom __eq__.
    return non_katakana_regex.search(w) is None

In [175]:
# Sanity check: a word mixing katakana with kanji should be rejected (False).
filter_func("キット狐")

False

In [176]:
# Sanity check: a word written only in katakana should be accepted (True).
filter_func("キャッシュディスペンサー")

True

In [181]:
# ヴ is listed explicitly in the character class; a None result (nothing displayed) means no offending character.
non_katakana_regex.search("ヴァニタス")

In [199]:
# Words that pass the katakana-only filter, each converted with str().
all_words_filtered = [str(word) for word in all_words if filter_func(word)]
# Words the filter rejected, kept unchanged for inspection.
all_words_removed = [word for word in all_words if not filter_func(word)]

In [200]:
# Display the words rejected by the filter so they can be reviewed by eye.
all_words_removed

['3Dプリンタ',
 '3P',
 'by',
 'CA',
 'CPU',
 'DV',
 'G',
 'GUI',
 'HDTV',
 'ISBN',
 'kango',
 'LCD',
 'LDK',
 'LGBT',
 'NATO',
 'OK',
 'OS',
 'Photoshop',
 'RAM',
 'ROM',
 'RT',
 'S',
 'siat-to͘h',
 'SOS',
 'UFO',
 'Uターン',
 'VR',
 'Web',
 'Windows',
 'RNA',
 'RPG',
 'IMF',
 'IQ',
 'ICBM',
 'IPアドレス',
 'ASEAN',
 'ESP',
 'eメール',
 'ヴァージン諸島',
 'web',
 'AIDS',
 'AI',
 'SI',
 'HIV',
 'FBI',
 '加奈陀',
 '広東',
 'キット狐',
 'CAD',
 '玖馬',
 'ググる',
 '倶楽部',
 'Xマス',
 '珈琲',
 'ココ椰子',
 'CIA',
 'GNP',
 'CD',
 'GDP',
 'CD-ROM',
 'しゃべる',
 '茉莉',
 'スイフト狐',
 '蘇格蘭',
 'スプラトリー群島',
 '西班牙',
 '仙',
 '台北',
 'WHO',
 'WTO',
 'DNA',
 'TNT',
 'Tシャツ',
 'DJ',
 'DTP',
 'DVD',
 'DVD-ROM',
 'ドット積',
 '蕃茄',
 '新西蘭',
 '浬',
 'Pascal',
 'PTA',
 'PR',
 'PTSD',
 'Vサイン',
 '北京ダック',
 '香港',
 '馬来',
 '墨西哥',
 'UNICEF',
 'UNESCO',
 '洋灯',
 '倫敦',
 '華盛頓',
 'ワシントンD.C.',
 'ギャル文字',
 '冗句',
 '垢',
 '瓦',
 '瓦斯',
 '頁']

In [188]:
# fetch the English words corresponding to the Japanese words.
# ends_with_english = re.compile(r"#English$")
# BeautifulSoup applies a compiled regex filter with .search(), so the pattern
# has to be anchored with ^ to really mean "id starts with Etymology"
# (e.g. "Etymology", "Etymology_1") rather than "id contains Etymology".
starts_with_etymology = re.compile(r"^Etymology")

def hunt1(etymelems):
    """Return the text of the first English-tagged element in the first paragraph.

    Args:
        etymelems: Iterable of BeautifulSoup nodes to scan.

    Returns:
        The text of the first descendant carrying ``lang="en"`` inside the
        first ``<p>`` tag of ``etymelems``, or ``None`` when there is no
        ``<p>`` tag or it holds no such element.
    """
    # A bare next() raises StopIteration when no <p> exists, which would skip
    # the None handling below; the explicit default makes that case return None.
    etym = next((a for a in etymelems if isinstance(a, Tag) and a.name == "p"), None)
    if etym is None:
        return None
    h = etym.find(lang="en")
    if h is None:
        return None
    return h.get_text()
def english_for_japanese(w, timeout=30):
    """Look up the English term given in the etymology of a Wiktionary entry.

    Fetches ``https://en.wiktionary.org/wiki/<w>``, finds the element with
    id "Japanese", and walks the siblings that follow its parent. For each
    sibling containing an element whose id matches ``starts_with_etymology``,
    ``hunt1`` is run on the nodes after that element's parent; the first
    non-empty result is returned.

    Args:
        w: Page title, appended to the wiki base URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The English term found by ``hunt1``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the page does not respond within ``timeout``.
        Exception: If the page has no "Japanese" element, or no etymology
            yields an English term.
    """
    url = "https://en.wiktionary.org/wiki/" + w
    # Without a timeout a stalled connection would block this lookup forever.
    req = requests.get(url, headers=crawl_headers, timeout=timeout)
    req.raise_for_status()
    bs = BeautifulSoup(req.content, 'html.parser')
    japanese_heading = bs.find(id="Japanese")
    if japanese_heading is None:
        # Report a missing section like any other failed lookup instead of
        # an opaque AttributeError on None.
        raise Exception("what?!" + w)
    for sibling in japanese_heading.parent.next_siblings:
        if not isinstance(sibling, Tag):
            continue
        # Search each sibling once and reuse the result.
        etymelem = sibling.find(id=starts_with_etymology)
        if etymelem is None:
            continue
        out = hunt1(etymelem.parent.next_siblings)
        if out:
            return out

    raise Exception("what?!" + w)

In [189]:
# Spot check against a live page; the recorded result is 'internet'.
english_for_japanese("インターネット")

'internet'

In [190]:
# Spot check against a live page; the recorded result is 'ar'.
english_for_japanese("アール")

'ar'

In [210]:
def fetch_eng_and_jp(a):
    """Pair a Japanese word with its English source term, tolerating failures.

    Args:
        a: The Japanese word; it is converted with ``str()`` before the lookup.

    Returns:
        A ``(japanese, english)`` tuple of strings, or ``None`` when the
        lookup raised. In that case the traceback is printed.
    """
    a = str(a)
    try:
        return (a, str(english_for_japanese(a)))
    except Exception:
        # Best-effort: log the failure and skip this word. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and SystemExit
        # still stop the run.
        traceback.print_exc()
        return None

In [211]:
# Known failure case: the lookup raises, so the traceback is printed and None is returned.
fetch_eng_and_jp("ホキ")

Traceback (most recent call last):
  File "<ipython-input-210-9bcbf25b30b3>", line 4, in fetch_eng_and_jp
    return (a, str(english_for_japanese(a)))
  File "<ipython-input-188-b81fb101895d>", line 27, in english_for_japanese
    raise Exception("what?!" + w)
Exception: what?!ホキ


In [212]:
# Run the lookups on 20 worker processes. Results keep the input order, and a
# word whose lookup failed contributes None.
with multiprocessing.Pool(processes=20) as pool:
    english_japanese_map = pool.map(fetch_eng_and_jp, all_words_filtered)

Traceback (most recent call last):
  File "<ipython-input-210-9bcbf25b30b3>", line 4, in fetch_eng_and_jp
    return (a, str(english_for_japanese(a)))
  File "<ipython-input-188-b81fb101895d>", line 27, in english_for_japanese
    raise Exception("what?!" + w)
Exception: what?!ホキ
Traceback (most recent call last):
  File "<ipython-input-210-9bcbf25b30b3>", line 4, in fetch_eng_and_jp
    return (a, str(english_for_japanese(a)))
  File "<ipython-input-188-b81fb101895d>", line 27, in english_for_japanese
    raise Exception("what?!" + w)
Exception: what?!ホンコン
Traceback (most recent call last):
  File "<ipython-input-210-9bcbf25b30b3>", line 4, in fetch_eng_and_jp
    return (a, str(english_for_japanese(a)))
  File "<ipython-input-188-b81fb101895d>", line 27, in english_for_japanese
    raise Exception("what?!" + w)
Exception: what?!コンリー
Traceback (most recent call last):
  File "<ipython-input-210-9bcbf25b30b3>", line 4, in fetch_eng_and_jp
    return (a, str(english_for_japanese(a)))
  

In [215]:
# ensure_ascii=False writes the katakana verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform's default encoding.
with open("en_to_jp.json", "w", encoding="utf-8") as outfile:
    json.dump(english_japanese_map, outfile, ensure_ascii=False, indent=0)