In [12]:
import os

from lxml import etree

In [13]:
# XML parser configured with load_dtd=True.
# NOTE(review): presumably needed so the entity references used by JMdict can be
# resolved from its DTD while parsing — confirm against the lxml parser docs.
parser = etree.XMLParser(load_dtd=True)

In [14]:

# Parse the JMdict_e file (expected one directory above the working directory)
# with the DTD-aware parser created above.
tree = etree.parse("../JMdict_e", parser=parser)


In [15]:
# Root element of the parsed document; later cells iterate and index it to reach
# the individual dictionary entries.
root = tree.getroot()

In [16]:
# Display the root element to confirm the file was parsed.
root

<Element JMdict at 0x7f0542f8f880>

In [17]:
class Word:
    """One JMdict ``<entry>`` element flattened into plain Python data.

    The element is parsed eagerly in ``__init__``:

    * ``entry_el`` - the original entry element.
    * ``kanjis``   - text of every ``k_ele/keb`` child, in document order.
    * ``readings`` - text of every ``r_ele/reb`` child, in document order.
    * ``senses``   - one dict per ``<sense>`` child (see ``parse_senses``).
    """

    # Child tags of <sense> collected by parse_senses, in output key order.
    _SENSE_FIELDS = ("stagk", "stagr", "gloss", "pos", "xref", "s_inf", "misc")

    # The annotation is a string because lxml's ``etree.Element`` is a factory
    # function rather than the element class.
    def __init__(self, entry_el: "etree._Element"):
        self.entry_el = entry_el
        self.kanjis = self.parse_kanji_el(entry_el)
        self.readings = self.parse_reading_el(entry_el)
        self.senses = self.parse_senses(entry_el.findall("sense"))

    @staticmethod
    def parse_kanji_el(entry: "etree._Element"):
        """Return the ``keb`` text of each ``k_ele`` child of ``entry``.

        ``k_ele`` elements without a ``keb`` child are skipped.
        """
        kanjis = []
        for kanji in entry.findall("k_ele"):
            _keb = kanji.find("keb")
            if _keb is not None:
                kanjis.append(_keb.text)
        return kanjis

    @staticmethod
    def parse_senses(entries):
        """Convert an iterable of ``<sense>`` elements into a list of dicts.

        Each dict maps a tag from ``_SENSE_FIELDS`` to the list returned by the
        XPath query ``<tag>/text()``. Tags with no matches are omitted, so every
        value in the result is a non-empty list.
        """
        senses = []
        for sense in entries:
            collected = {
                tag: sense.xpath(f"{tag}/text()") for tag in Word._SENSE_FIELDS
            }
            # Drop tags that matched nothing.
            senses.append({tag: texts for tag, texts in collected.items() if texts})
        return senses

    @staticmethod
    def parse_reading_el(entry):
        """Return the ``reb`` text of each ``r_ele`` child of ``entry``.

        ``r_ele`` elements without a ``reb`` child are skipped.
        """
        readings = []
        for reading in entry.findall("r_ele"):
            _reb = reading.find("reb")
            if _reb is not None:
                readings.append(_reb.text)
        return readings

    def to_dict(self):
        """Return the entry as a plain dict.

        Keys: ``entry_id`` (text of ``ent_seq``, or ``None`` when the element is
        missing), ``kanjis``, ``readings`` and ``senses``.
        """
        ent_seq = self.entry_el.find("ent_seq")
        return {
            "entry_id": ent_seq.text if ent_seq is not None else None,
            "kanjis": self.kanjis,
            "readings": self.readings,
            "senses": self.senses,
        }

In [18]:
# Sanity check: convert the first entry in the file and inspect the resulting dict.
Word(root[0]).to_dict()

{'entry_id': '1000000',
 'kanjis': [],
 'readings': ['ヽ'],
 'senses': [{'gloss': ['repetition mark in katakana'],
   'pos': ['unclassified'],
   'xref': ['一の字点']}]}

In [19]:
# Inspect a richer entry with several senses. The positional index 54592 is tied
# to this particular JMdict_e snapshot (it resolves to entry 1577980 in the output
# below) and may point at a different entry in another version of the file.
Word(root[54592]).to_dict()

{'entry_id': '1577980',
 'kanjis': ['居る'],
 'readings': ['いる'],
 'senses': [{'gloss': ['to be (of animate objects)', 'to exist'],
   'pos': ['Ichidan verb', 'intransitive verb'],
   'xref': ['在る・1'],
   'misc': ['word usually written using kana alone']},
  {'gloss': ['to stay'],
   'pos': ['Ichidan verb', 'intransitive verb'],
   'misc': ['word usually written using kana alone']},
  {'gloss': ['to be ...-ing', 'to have been ...-ing'],
   'pos': ['Ichidan verb', 'auxiliary verb'],
   's_inf': ['after the -te form of a verb; indicates continuing action or state'],
   'misc': ['word usually written using kana alone']}]}

## Insert the dictionary data into MongoDB

In [19]:
from pymongo import MongoClient

In [20]:
# The connection string is read from JDICT_MONGO_URI when that environment
# variable is set, so the notebook can run against another MongoDB instance
# without editing this cell; the original host remains the default.
client = MongoClient(os.environ.get("JDICT_MONGO_URI", "mongodb://biggy.lo:27017/"))

In [21]:
# Handle to the "jdict" database; the cells below use its `words` collection.
db = client["jdict"]

In [23]:
from tqdm import tqdm

In [28]:
# Wrap every entry under the root in a Word, with a progress bar over the entries.
data = list(map(Word, tqdm(root)))

100%|██████████| 192919/192919 [00:14<00:00, 13559.93it/s]


In [29]:
def chunk(l, n):
    """Yield successive slices of ``l``, each holding at most ``n`` items.

    Args:
        l: A sliceable sequence with a length (list, tuple, str, ...).
        n: Maximum number of items per chunk; must be a positive integer.

    Yields:
        Slices ``l[i:i + n]`` in order; the last one may be shorter than ``n``.
        Nothing is yielded for an empty sequence.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    # Without this guard a negative n would silently yield nothing and n == 0
    # would fail with an unrelated-looking range() error.
    if n < 1:
        raise ValueError(f"chunk size n must be a positive integer, got {n!r}")
    for start in range(0, len(l), n):
        yield l[start:start + n]

In [31]:
# Peek at the first chunk of 10 to check what chunk() yields for the Word list.
next(chunk(data, 10))

[<__main__.Word at 0x168820fd0>,
 <__main__.Word at 0x168820ac0>,
 <__main__.Word at 0x1688210f0>,
 <__main__.Word at 0x16883e1a0>,
 <__main__.Word at 0x1687c4ac0>,
 <__main__.Word at 0x1687c5540>,
 <__main__.Word at 0x168821120>,
 <__main__.Word at 0x168821180>,
 <__main__.Word at 0x1685bf520>,
 <__main__.Word at 0x169c10460>]

In [32]:
# Bulk-insert the words into the `words` collection, 1000 documents per call.
for _chk in tqdm(chunk(data, 1000)):
    db.words.insert_many(list(map(Word.to_dict, _chk)))

193it [00:11, 16.67it/s]


In [33]:
# Text search for "love" in the `words` collection, using English language rules.
# NOTE(review): MongoDB answers $text queries only when the collection has a text
# index, and no visible cell creates one — presumably it was created outside this
# notebook; confirm, or add the index creation so a fresh run works.
_word = db.words.find({"$text": {"$search": "love", "$language": "english"}})

In [35]:
# Drain the cursor into a list so the matching documents are displayed.
list(_word)

[{'_id': ObjectId('62eb16d2fb563c2ca23ff515'),
  'entry_id': '2519780',
  'kanjis': ['愛し愛される'],
  'readings': ['あいしあいされる'],
  'senses': [{'gloss': ['to love and be loved back',
     'to love and be loved in return',
     'to give and receive love'],
    'pos': ['Ichidan verb']}]},
 {'_id': ObjectId('62eb16d3fb563c2ca2404257'),
  'entry_id': '2730570',
  'kanjis': ['愛迷'],
  'readings': ['あいめい'],
  'senses': [{'gloss': ['straying from love',
     'falling out of love',
     'lost love'],
    'pos': ['noun (common) (futsuumeishi)']}]},
 {'_id': ObjectId('62eb16cbfb563c2ca23e0db5'),
  'entry_id': '1177750',
  'kanjis': ['艶種'],
  'readings': ['つやだね'],
  'senses': [{'gloss': ['love affair', 'love rumour', 'love rumor'],
    'pos': ['noun (common) (futsuumeishi)']}]},
 {'_id': ObjectId('62eb16d3fb563c2ca2402ad2'),
  'entry_id': '2666270',
  'kanjis': ['家族愛'],
  'readings': ['かぞくあい'],
  'senses': [{'gloss': ["love for (one's) family",
     'family love',
     'familial love'],
    'pos': ['nou

## Parse the dict into a big JSONL file

In [9]:
import json
from tqdm import tqdm

In [10]:
# Rebuild the list of Word objects from the parsed entries (with a progress bar).
data = list(map(Word, tqdm(root)))

100%|██████████| 192919/192919 [00:14<00:00, 13756.29it/s]


In [11]:
# Check that the list holds Word instances.
data[0]

<__main__.Word at 0x110cae4a0>

In [12]:
# Write one JSON document per line (JSONL).
# ensure_ascii=False keeps the Japanese text as real UTF-8 characters instead of
# \uXXXX escapes, which keeps the file smaller and human-readable; any JSON parser
# reads both forms identically. newline="\n" keeps line endings the same on every
# platform.
with open("jdict.jsonl", "w", encoding="utf-8", newline="\n") as f:
    for w in tqdm(data):
        f.write(json.dumps(w.to_dict(), ensure_ascii=False))
        f.write("\n")


100%|██████████| 192919/192919 [00:01<00:00, 105219.15it/s]


## This could also be a table in PG

- By leveraging the JSONB columns, we can insert the whole file as a table in PG
- Schema:

```sql
CREATE TABLE words (
    id serial PRIMARY KEY,
    jdict_entry_id integer,
    kanji varchar,
    reading varchar,
    senses JSONB,
    metadata JSONB
);
```

- But first we have to normalize the data. We can only have at most one `kanji` and one `reading`.
- Postgres has a `COPY` feature, but it works with CSV. I don't even want to imagine how painful it is to get the JSONB field parsed from the CSV 😱.
 

In [20]:
from sqlalchemy import Column, Integer, String, ForeignKey, DateTime, Boolean
from sqlalchemy.dialects import postgresql as pg

from sqlalchemy.ext.declarative import declarative_base

# Declarative base class that the ORM model below inherits from.
# NOTE(review): newer SQLAlchemy releases expose declarative_base from
# sqlalchemy.orm; confirm the import path against the installed version.
Base = declarative_base()


class Entry(Base):
    """
    A word entry, stored as one row of the ``entries`` table.

    Each row carries a single ``kanji`` and a single ``reading`` string, while
    the list of senses and any extra metadata are kept in JSONB columns.
    """

    __tablename__ = "entries"

    # Surrogate primary key of the row.
    entry_id = Column(Integer, primary_key=True)
    # Identifier of the source dictionary entry.
    jdict_entry_id = Column(Integer)

    kanji = Column(String)
    reading = Column(String)

    senses = Column(pg.JSONB)
    meta = Column(pg.JSONB)

    @staticmethod
    def new_entry(jdict_entry_id, reading, senses, kanji=None, meta=None):
        """
        Create a new, not yet persisted entry.

        Args:
            jdict_entry_id: Identifier of the source dictionary entry.
            reading: Reading stored for the word.
            senses: JSON-serialisable senses payload.
            kanji: Optional kanji spelling.
            meta: Optional JSON-serialisable metadata.

        Returns:
            An ``Entry`` instance; the caller adds it to a session.
        """
        return Entry(
            jdict_entry_id=jdict_entry_id,
            reading=reading,
            senses=senses,
            kanji=kanji,
            meta=meta,
        )

    def __repr__(self):
        # The closing parenthesis before ">" balances the "<Entry(" opener.
        return f"<Entry(entry_id={self.entry_id} reading={self.reading} kanji={self.kanji})>"

    def to_dict(self):
        """
        Return a dictionary representation of this object, one key per column.
        """
        return {
            "entry_id": self.entry_id,
            "jdict_entry_id": self.jdict_entry_id,
            "kanji": self.kanji,
            "reading": self.reading,
            "senses": self.senses,
            "meta": self.meta,
        }


In [21]:
from tqdm import tqdm

In [22]:
from sqlalchemy import create_engine

In [23]:
# The database URL is read from JDICT_PG_URL when that environment variable is
# set. SECURITY: the fallback below embeds a password in the notebook; keep it for
# a throwaway local database only and supply JDICT_PG_URL for anything else.
engine = create_engine(
    os.environ.get(
        "JDICT_PG_URL",
        "postgresql+psycopg2://postgres:minhdang@localhost:35432/postgres",
    )
)

In [24]:
from sqlalchemy.orm import sessionmaker

# Session factory bound to the engine; calling Session() opens a new ORM session.
Session = sessionmaker(bind=engine)


In [28]:
# Load every dictionary entry into Postgres as one Entry row, in one transaction.
sess = Session()
try:
    for entry in tqdm(root):
        w = Word(entry)
        _d = w.to_dict()

        # A row holds a single kanji and a single reading, so keep the FIRST
        # form listed in the entry. Indexing (rather than pop()) also leaves the
        # Word's lists untouched.
        # NOTE(review): assumes the first k_ele / r_ele is the primary form of
        # the word — confirm against the JMdict documentation.
        _kanji = w.kanjis[0] if w.kanjis else None
        _reading = w.readings[0] if w.readings else None

        e = Entry.new_entry(
            # The column is an Integer while the parsed id is text.
            jdict_entry_id=None if _d["entry_id"] is None else int(_d["entry_id"]),
            reading=_reading,
            senses=_d["senses"],
            kanji=_kanji,
        )
        sess.add(e)
    sess.commit()
except Exception:
    # Leave the database untouched if anything fails part-way through.
    sess.rollback()
    raise
finally:
    sess.close()
    

100%|██████████| 192919/192919 [00:24<00:00, 7773.52it/s]
