In [1]:
import nagisa
import requests
import time
import pandas as pd
from janome.tokenizer import Tokenizer
from bs4 import BeautifulSoup
from collections import Counter

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.


In [4]:
URL = "https://www.aozora.gr.jp/cards/000879/files/128_15261.html"

In [5]:
book_source = requests.get(URL)
soup = BeautifulSoup(book_source.content, "html.parser")
book_text = soup.find("div",{"class":"main_text"}).get_text()
book_title = soup.find("h1").get_text()
book_author = soup.find("h2").get_text()
file_name = f'{book_author} - {book_title}.csv'
print(f'Book loaded: {book_title} by {book_author}')

Book loaded: 羅生門 by 芥川龍之介


In [63]:
def extract_wordlist(text):
    t = Tokenizer()
    wordlist = []

    for token in t.tokenize(text):
        if "名詞" in token.part_of_speech or (token.part_of_speech.startswith("動詞") and not token.part_of_speech.startswith("助動詞")) or "形容詞" in token.part_of_speech or "代名詞" in token.part_of_speech:
            wordlist.append(token.base_form)
    return wordlist

In [67]:
wordlist = extract_wordlist(book_text)
wordlist = Counter(wordlist)
wordlist = pd.DataFrame(wordlist.items(), columns = ['Word', 'Count']).sort_values(by = 'Count', ascending = False)
print(wordlist)

    Word  Count
31    する     78
11    ゐる     59
5     下人     44
2      事     31
42     云     30
..   ...    ...
301    惧      1
300   人目      1
299    患      1
298   雨風      1
688   九月      1

[689 rows x 2 columns]


In [69]:
requests.get("https://jisho.org/api/v1/search/words?keyword=下人").json()

{'meta': {'status': 200},
 'data': [{'slug': '下人',
   'is_common': False,
   'tags': [],
   'jlpt': [],
   'japanese': [{'word': '下人', 'reading': 'げにん'}],
   'senses': [{'english_definitions': ['low-rank person', 'menial'],
     'parts_of_speech': ['Noun'],
     'links': [],
     'tags': [],
     'restrictions': [],
     'see_also': [],
     'antonyms': [],
     'source': [],
     'info': []}],
   'attribution': {'jmdict': True, 'jmnedict': False, 'dbpedia': False}}]}

In [72]:
def make_api_request(word):
    url = f"https://jisho.org/api/v1/search/words?keyword={word}"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, requests.exceptions.Timeout):
        print(f"Timeout for word: {word}. Retrying in 5 seconds...")
        time.sleep(5)
        return make_api_request(word)

In [70]:
def get_word_info(wordlist):
    readings = []
    levels = []

    for word in wordlist:
        response = make_api_request(word)

        try:
            reading = response['data'][0]['japanese'][0]['reading']
        except (IndexError, KeyError):
            reading = 'unknown'
        
        try:
            level = response['data'][0]['jlpt'][0]
        except (IndexError, KeyError):
            level = 'unknown'

        readings.append(reading)
        levels.append(level)

    return readings, levels

In [73]:
testlist = ['聞く', '焼く', '本気']
readings, levels = get_word_info(testlist)
print(readings)
print(levels)

['きく', 'やく', 'ほんき']
['jlpt-n5', 'jlpt-n4', 'jlpt-n1']


In [None]:
wordlist.to_csv(file_name, index = False)