In [1]:
import nagisa
import requests
import time
import pandas as pd
from janome.tokenizer import Tokenizer
from bs4 import BeautifulSoup
from collections import Counter

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.


In [4]:
# Page to analyse. The loading cell reads the body from <div class="main_text">
# and the title/author from the first <h1>/<h2> of this page.
URL = "https://www.aozora.gr.jp/cards/000879/files/128_15261.html"

In [5]:
# Download the page and pull out the text, title and author.
# requests has no default timeout, so set one to avoid hanging the kernel, and
# fail fast on HTTP errors instead of parsing an error page (which would only
# surface later as an AttributeError on a missing element).
book_source = requests.get(URL, timeout=30)
book_source.raise_for_status()

# Pass the raw bytes so BeautifulSoup performs its own encoding detection.
soup = BeautifulSoup(book_source.content, "html.parser")
book_text = soup.find("div", {"class": "main_text"}).get_text()
book_title = soup.find("h1").get_text()
book_author = soup.find("h2").get_text()

# Name of the CSV written by the final cell.
file_name = f'{book_author} - {book_title}.csv'
print(f'Book loaded: {book_title} by {book_author}')

Book loaded: 羅生門 by 芥川龍之介


In [63]:
def extract_wordlist(text, tokenizer=None):
    """Return the base forms of the content words found in ``text``.

    A token is kept when the top-level category of its part of speech (the
    first comma-separated field of ``token.part_of_speech``) is noun (名詞),
    verb (動詞) or adjective (形容詞). Pronouns (名詞,代名詞) and na-adjective
    stems (名詞,形容動詞語幹) are sub-categories of 名詞 and are therefore
    included as well.

    Only the top-level field is compared: a substring test over the whole
    string would also accept prefixes such as 接頭詞,名詞接続 or
    接頭詞,形容詞接続, which are not words in their own right.

    Args:
        text: The text to tokenize.
        tokenizer: Optional object with a ``tokenize(text)`` method yielding
            tokens that expose ``part_of_speech`` and ``base_form``. A new
            janome ``Tokenizer`` is created when omitted; pass one in to
            reuse it across calls.

    Returns:
        A list of base forms in order of appearance, duplicates included.
    """
    content_pos = {"名詞", "動詞", "形容詞"}

    if tokenizer is None:
        tokenizer = Tokenizer()

    return [
        token.base_form
        for token in tokenizer.tokenize(text)
        if token.part_of_speech.split(",")[0] in content_pos
    ]

In [None]:
# Sanity check: show the base forms extracted from the downloaded book text.
# The result is not stored; this cell is only for inspection.
extract_wordlist(book_text)

In [None]:
# Count how often each extracted base form occurs in the book and put the
# result in a two-column table. extract_wordlist already returns nouns
# (including pronouns and na-adjective stems), verbs and adjectives together,
# so the counts are built directly from its result.
word_counts = Counter(extract_wordlist(book_text))
wordlist = pd.DataFrame(word_counts.items(), columns=['Word', 'Count'])
print(wordlist)

In [None]:
# Exploratory probe: fetch and show the full JSON payload returned by the
# jisho.org word-search endpoint for one sample keyword.
requests.get("https://jisho.org/api/v1/search/words?keyword=渋紙").json()

In [None]:
# Exploratory probe: show the 'jlpt' field of the first search result for a
# sample keyword (the same path that get_jlpt_level reads).
requests.get("https://jisho.org/api/v1/search/words?keyword=無い").json()['data'][0]['jlpt']

In [None]:
def make_api_request(word, max_retries=5, retry_delay=5):
    """Query the jisho.org word-search API for ``word`` and return the JSON body.

    Failed requests (connection errors, timeouts, HTTP error statuses or a
    body that is not valid JSON) are retried in a loop rather than by
    recursion, and only a bounded number of times, so a permanent failure
    cannot retry forever or exhaust the recursion limit.

    Args:
        word: Keyword to search for. It is passed via ``params`` so that
            requests percent-encodes it.
        max_retries: Number of retries after the first attempt fails.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The decoded JSON response.

    Raises:
        requests.exceptions.RequestException: If every attempt failed with a
            request-level error (the last error is re-raised).
        ValueError: If the last attempt returned a body that is not JSON.
    """
    url = "https://jisho.org/api/v1/search/words"
    attempt = 0
    while True:
        try:
            response = requests.get(url, params={"keyword": word}, timeout=10)
            response.raise_for_status()
            return response.json()
        # ValueError covers JSON decoding errors on requests versions where
        # they are not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as exc:
            if attempt >= max_retries:
                raise
            attempt += 1
            print(f"Request failed for word: {word} ({exc}). Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

In [None]:
def get_reading(wordlist):
    """Look up the reading of every word via make_api_request.

    For each word, the reading is taken from the first 'japanese' entry of
    the first search result. Words whose response lacks that path (no
    results, or no 'reading' key) are reported as "unknown".

    Args:
        wordlist: Iterable of words to look up.

    Returns:
        A list of readings, one per input word, in input order.
    """
    def _first_reading(payload):
        # Walk data[0].japanese[0].reading; any missing step means no reading.
        try:
            return payload['data'][0]['japanese'][0]['reading']
        except (IndexError, KeyError):
            return "unknown"

    return [_first_reading(make_api_request(entry)) for entry in wordlist]

In [None]:
def get_jlpt_level(wordlist):
    """Look up the JLPT tag of every word via make_api_request.

    For each word, the first element of the 'jlpt' list of the first search
    result is used. Words with no results, no 'jlpt' key or an empty 'jlpt'
    list are reported as "Unknown".

    Args:
        wordlist: Iterable of words to look up.

    Returns:
        A list of JLPT tags, one per input word, in input order.
    """
    levels = []
    for entry in wordlist:
        payload = make_api_request(entry)
        # Start from the fallback and overwrite it only when a tag exists.
        level = "Unknown"
        try:
            tags = payload['data'][0]['jlpt']
            if tags:
                level = tags[0]
        except (IndexError, KeyError):
            pass
        levels.append(level)
    return levels

In [None]:
# Fetch a reading for every entry of the 'Word' column. get_reading issues
# one make_api_request call per word.
wordlist_reading = get_reading(wordlist['Word'])
print(wordlist_reading)

In [None]:
# Attach the fetched readings as a new 'Reading' column. Note this mutates
# the existing wordlist table in place.
wordlist['Reading'] = wordlist_reading
print(wordlist)

In [None]:
# Fetch a JLPT tag for every word (again one make_api_request call per word)
# and attach the result as a new 'Level' column.
wordlist_level = get_jlpt_level(wordlist['Word'])
wordlist['Level'] = wordlist_level
print(wordlist)

In [None]:
# Write the table to the CSV file named after the book's author and title,
# without the DataFrame index column.
wordlist.to_csv(file_name, index = False)