In [1]:
import nagisa
import requests
import time
import pandas as pd
from janome.tokenizer import Tokenizer
from bs4 import BeautifulSoup
from collections import Counter

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.


In [2]:
URL = "https://www.aozora.gr.jp/cards/000148/files/795_43522.html"
book_source = requests.get(URL)

In [21]:
soup = BeautifulSoup(book_source.content, "html.parser")
book_text = soup.find("div",{"class":"main_text"}).get_text()
book_title = soup.find("h1").get_text()
book_author = soup.find("h2").get_text()
file_name = f'{book_author} - {book_title}.csv'

In [4]:
pronouns = nagisa.extract(book_text, extract_postags='代名詞').words
nouns = nagisa.extract(book_text, extract_postags='名詞').words
verbs = nagisa.extract(book_text, extract_postags='動詞').words
i_adjectives = nagisa.extract(book_text, extract_postags='形容詞').words
na_adjectives = nagisa.extract(book_text, extract_postags='形状詞').words

In [None]:
print(pronouns)

In [5]:
def convert_to_base_form(wordlist):
    t = Tokenizer()
    base_forms = []

    for word in wordlist:
        tokens = t.tokenize(word)
        for token in tokens:
            base_forms.append(token.base_form)

    return base_forms


In [6]:
verbs = convert_to_base_form(verbs)
i_adjectives = convert_to_base_form(i_adjectives)
na_adjectives = convert_to_base_form(na_adjectives)

In [7]:
verbs = [word for word in verbs if len(word) > 1]
i_adjectives = [word for word in i_adjectives if len(word) > 1]
na_adjectives = [word for word in na_adjectives if len(word) > 1]

In [10]:
wordlist = pronouns + nouns + verbs + i_adjectives + na_adjectives
wordlist = Counter(wordlist)
wordlist = pd.DataFrame(wordlist.items(), columns = ['Word', 'Count'])
print(wordlist)

    Word  Count
0      余     20
1     子規     18
2      画     24
3      一     20
4     亡友      2
..   ...    ...
289  真面目      1
290   単純      1
291  几帳面      1
292   多大      1
293   雄大      1

[294 rows x 2 columns]


In [None]:
requests.get("https://jisho.org/api/v1/search/words?keyword=渋紙").json()

In [None]:
requests.get("https://jisho.org/api/v1/search/words?keyword=無い").json()['data'][0]['jlpt']

In [17]:
def make_api_request(word):
    url = f"https://jisho.org/api/v1/search/words?keyword={word}"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, requests.exceptions.Timeout):
        print(f"Timeout for word: {word}. Retrying in 5 seconds...")
        time.sleep(5)
        return make_api_request(word)

In [12]:
def get_reading(wordlist):
    results = []
    for word in wordlist:
        response = make_api_request(word)
        try:
            result = response['data'][0]['japanese'][0]['reading']
        except (IndexError, KeyError):
            result = "unknown"
        results.append(result)
    return results

In [13]:
def get_jlpt_level(wordlist):
    results = []
    for word in wordlist:
        response = make_api_request(word)
        try:
            if response['data'][0]['jlpt']:
                result = response['data'][0]['jlpt'][0]
            else:
                result = "Unknown"
        except (IndexError, KeyError):
            result = "Unknown"
        results.append(result)
    return results

In [15]:
wordlist_reading = get_reading(wordlist['Word'])
print(wordlist_reading)

Timeout for word: 思い玉え. Retrying in 5 seconds...
Timeout for word: 思い玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 見玉え. Retrying in 5 seconds...
Timeout for word: 彼. Retrying in 5 seconds...
Timeout for word: 彼. Retrying in 5 seconds...
Timeout for word: 手間. Retrying in 5 seconds...
Timeout for word: 性情. Retrying in 5 seconds...
Timeout for word: 性情. Retrying in 5 seconds...
Timeout for word: 性情. Retrying in 5 seconds...
Timeout for word: 性情. Retrying in 5 seconds...
Timeout for word: 性情. Retrying in 5 seconds...
Timeout for word: 初心. Retrying in 5 seconds...
Timeout for word: 初心. Retrying in 5 seconds...
Ti

In [16]:
wordlist['Reading'] = wordlist_reading
print(wordlist)

    Word  Count Reading
0      余     20       よ
1     子規     18   ほととぎす
2      画     24      かく
3      一     20      いち
4     亡友      2    ぼうゆう
..   ...    ...     ...
289  真面目      1     まじめ
290   単純      1   たんじゅん
291  几帳面      1  きちょうめん
292   多大      1     ただい
293   雄大      1    ゆうだい

[294 rows x 3 columns]


In [18]:
wordlist_level = get_jlpt_level(wordlist['Word'])
wordlist['Level'] = wordlist_level
print(wordlist)

Timeout for word: 子規. Retrying in 5 seconds...
Timeout for word: 子規. Retrying in 5 seconds...
Timeout for word: 中間. Retrying in 5 seconds...
Timeout for word: 文筆. Retrying in 5 seconds...
Timeout for word: 風. Retrying in 5 seconds...
Timeout for word: 風. Retrying in 5 seconds...
Timeout for word: そこ. Retrying in 5 seconds...
Timeout for word: そこ. Retrying in 5 seconds...
Timeout for word: そこ. Retrying in 5 seconds...
Timeout for word: そこ. Retrying in 5 seconds...
Timeout for word: そこ. Retrying in 5 seconds...
Timeout for word: そこ. Retrying in 5 seconds...
Timeout for word: そこ. Retrying in 5 seconds...
Timeout for word: そこ. Retrying in 5 seconds...
Timeout for word: 重厚. Retrying in 5 seconds...
Timeout for word: 始末. Retrying in 5 seconds...
Timeout for word: 特長. Retrying in 5 seconds...
Timeout for word: 思う. Retrying in 5 seconds...
Timeout for word: 浸る. Retrying in 5 seconds...
Timeout for word: 浸る. Retrying in 5 seconds...
Timeout for word: 浸る. Retrying in 5 seconds...
Timeout for wor

In [22]:
wordlist.to_csv(file_name, index = False)