In [74]:
import re
import nltk
import time
import html
import requests
import pandas as pd
import sqlite3
import genanki

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from bs4 import BeautifulSoup

In [13]:
# Download the Spring Framework "core" reference page that is analysed below.
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be tokenized further down.
resp = requests.get(
    "https://docs.spring.io/spring-framework/docs/current/reference/html/core.html",
    timeout=30,
)
resp.raise_for_status()

In [14]:
# Parse the downloaded markup with the 'html.parser' backend, then flatten
# the document to its plain text.
soup = BeautifulSoup(markup=resp.text, features='html.parser')
text = soup.get_text()

In [15]:
# Split the page text into word/punctuation tokens.
tokens = nltk.word_tokenize(text)

In [16]:
# Inspect the distinct tokens produced by the tokenizer.
set(tokens)

{'entail',
 'Templated',
 'most',
 'pointcut=',
 'Companies',
 'advice-ref=',
 'Society',
 'although',
 'AppConfig',
 'compile',
 'MutablePropertySources',
 'ResourceArrayPropertyEditor',
 'reworked',
 'Primitives',
 'file-based',
 'accountDao',
 'setValue',
 'java.util.logging',
 'ComponentNamespaceHandler',
 'dataSource',
 'Integrating',
 'exceptions.properties',
 '/dependency',
 'update',
 "'sa",
 'during',
 'considered',
 'poolTargetSource',
 'errors',
 'ReloadableResourceBundleMessageSource',
 'facilities',
 'org.aspectj.lang.annotation.After',
 '1.9.2',
 'import/',
 'weaving-based',
 'beanInstance',
 'instrumenting',
 'non-null',
 'DateFormatter',
 'See',
 'Turn',
 'JSR-310',
 'Z',
 '—',
 'key=',
 'publisher',
 'object',
 'Example',
 'Alliance',
 'Introductions',
 'little',
 'org.springframework.core.codec',
 'onApplicationEvent',
 'PathResource',
 'suited',
 'experience',
 'raw',
 'case-insensitive',
 'designator',
 'Netty',
 'builds',
 'org.springframework.context.support.Class

In [17]:
# Word list from the NLTK `words` corpus, held as a set for the membership
# filter applied to the tokens in the next step.
dict_en = set(words.words())

In [18]:
# Lower-case each token once and keep only those found in the word list.
ftokens = [lowered for lowered in map(str.lower, tokens) if lowered in dict_en]

In [19]:
# English stop words, held as a set for the filtering step that follows.
stopwords_en = set(stopwords.words('english'))

In [20]:
# Drop every stop word from the dictionary-filtered tokens.
tokens_without_stopwords = [tok for tok in ftokens if tok not in stopwords_en]

In [21]:
# Number of distinct words left after both filters.
len(set(tokens_without_stopwords))

2120

In [22]:
# Wrap the filtered tokens in an nltk.Text; the frequency count below is
# built from this object.
ntext = nltk.Text(tokens_without_stopwords)

In [23]:
# Count how often each remaining word occurs.
freq = nltk.FreqDist(ntext)

In [24]:
# Display the complete word -> count mapping.
dict(freq)

{'core': 23,
 'version': 11,
 'table': 34,
 'contents': 9,
 'container': 299,
 'introduction': 34,
 'spring': 859,
 'overview': 7,
 'configuration': 525,
 'groovy': 11,
 'bean': 1468,
 'definition': 243,
 'naming': 17,
 'outside': 17,
 'constructor': 148,
 'static': 98,
 'factory': 126,
 'method': 609,
 'instance': 165,
 'type': 324,
 'dependency': 104,
 'injection': 106,
 'resolution': 35,
 'process': 44,
 'detail': 13,
 'straight': 7,
 'inner': 23,
 'null': 81,
 'empty': 16,
 'string': 420,
 'compound': 4,
 'property': 379,
 'excluding': 2,
 'arbitrary': 16,
 'replacement': 12,
 'singleton': 78,
 'scope': 126,
 'prototype': 61,
 'request': 38,
 'session': 35,
 'application': 201,
 'initial': 8,
 'web': 42,
 'custom': 158,
 'nature': 6,
 'destruction': 25,
 'default': 159,
 'destroy': 21,
 'combining': 11,
 'shutdown': 18,
 'shutting': 4,
 'gracefully': 3,
 'aware': 19,
 'inheritance': 9,
 'extension': 26,
 'example': 667,
 'hello': 5,
 'world': 16,
 'class': 857,
 'name': 360,
 'subs

In [25]:
# Number of distinct words in the frequency table.
len(freq)

2120

In [33]:
# Open the local SQLite dictionary database that the note-building loop
# queries (path is relative to the notebook's working directory).
con = sqlite3.connect("./dict/stardict.db")

In [34]:
# Cursor used for the per-word lookups against the `stardict` table.
cur = con.cursor()

In [79]:
# Note model for the vocabulary cards: four fields (Word, Phonetic,
# Translation, Definition), a single card template, and the CSS that styles it.
my_model = genanki.Model(
  # Hard-coded numeric identifier passed as the first positional argument
  # (presumably the model ID genanki expects — confirm against genanki docs).
  1607392319,
  'Simple Model',
  # Field order here must match the order of `fields` given to each Note.
  fields=[
    {'name': 'Word'},
    {'name': 'Phonetic'},
    {'name': 'Translation'},
    {'name': 'Definition'},
  ],
  templates=[
    {
      'name': 'Card 1',
      # 'qfmt' template: renders only the {{Word}} field.
      'qfmt': '''
    <div class="back">
        <div class="word">{{Word}}</div>
    </div>
      ''',
      # 'afmt' template: renders the word with its phonetic transcription,
      # followed by the translation and the definition.
      'afmt': '''
    <div class="back">
        <div class="word">
            {{Word}} <span class="phonetic">{{Phonetic}}</span>
        </div>
        <div class="translation">
            {{Translation}}
        </div>
        <div class="definition">
            {{Definition}}
        </div>
    </div>
      ''',
    },
  ],
  # Stylesheet for the classes used in the two templates above, plus the <p>
  # margin for the paragraph markup placed into Translation/Definition.
  css='''
        .back {
            height: 100vh;
            width: 100%;
            margin: auto;
            font-family: "Arial", serif;
            display: block;
            font-size: 22px;
            border-radius: 8px;
        }

        .back .word {
            font-size: 2em;
            text-align: center;
            padding: 30px;
        }

        .back .phonetic {
            font-size: 0.5em;
            font-style: italic;
        }

        .back .translation {
            font-family: "Fira Code", serif;
            padding: 5px 30px 5px 30px;
        }

        .back .definition {
            padding: 5px 30px 5px 30px;
        }

        p {
            margin: 10px;
        }
      '''
)

In [80]:
# Deck that collects the generated notes; created with a hard-coded numeric
# identifier and the display name 'Spring Core'.
my_deck2 = genanki.Deck(2059400112, 'Spring Core')

In [81]:
def format_p(s):
    """Render multi-line plain text as a sequence of HTML paragraphs.

    Every non-blank line of ``s`` is stripped of surrounding whitespace,
    HTML-escaped, and wrapped in ``<p>...</p>`` followed by a newline.

    Args:
        s: Plain text whose lines are separated by newline characters.

    Returns:
        The concatenated ``<p>`` elements as one string, or ``""`` when
        ``s`` contains no non-blank lines.
    """
    stripped_lines = (line.strip() for line in s.split('\n'))
    # Blank lines are skipped so empty input does not yield a stray
    # "<p></p>", and the text is escaped because it is plain dictionary
    # text being placed into card HTML.
    return "".join(
        "<p>{}</p>\n".format(html.escape(line, quote=False))
        for line in stripped_lines
        if line
    )


# Build one note for every counted word that has a row in the `stardict` table.
for word in freq:
    # Bind the word as a query parameter instead of formatting it into the
    # SQL text, so a token containing a quote cannot break or alter the query.
    row = cur.execute("SELECT * FROM stardict WHERE word = ?", (word,)).fetchone()
    if not row:
        continue

    # Only four positional columns of the row are used:
    # 1 = word, 3 = phonetic, 4 = definition, 5 = translation.
    # Missing (falsy) values are replaced by empty strings.
    headword = row[1]
    phonetic = row[3] or ""
    definition = row[4] or ""
    translation = row[5] or ""

    # Field order matches the model: Word, Phonetic, Translation, Definition.
    my_note = genanki.Note(
        model=my_model,
        fields=[headword, phonetic, format_p(translation), format_p(definition)],
    )
    my_deck2.add_note(my_note)

In [82]:
# Package the deck and write it to 'output.apkg' in the working directory.
genanki.Package(my_deck2).write_to_file('output.apkg')