In [1]:
import nagisa
import requests
import time
from janome.tokenizer import Tokenizer
from bs4 import BeautifulSoup

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.


In [2]:
# Page on aozora.gr.jp that holds the text analysed in this notebook.
URL = "https://www.aozora.gr.jp/cards/000148/files/795_43522.html"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be parsed as if it were the book.
book_source = requests.get(URL, timeout=30)
book_source.raise_for_status()

In [3]:
# Parse the downloaded page and keep only the text inside <div class="main_text">.
soup = BeautifulSoup(book_source.content, "html.parser")
main_text_div = soup.find("div", {"class": "main_text"})
if main_text_div is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None.get_text().
    raise ValueError(f'No <div class="main_text"> element found in {URL}')
book_text = main_text_div.get_text()

In [4]:
# Extract the words of each part of speech from the book text.
# The tags are passed as one-element lists, which is the form nagisa documents.
# NOTE(review): with a bare string the membership test is presumably a
# substring check, so '代名詞' would also match the tag '名詞' and leak nouns
# into `pronouns` - confirm against the installed nagisa version.
pronouns = nagisa.extract(book_text, extract_postags=['代名詞']).words
nouns = nagisa.extract(book_text, extract_postags=['名詞']).words
verbs = nagisa.extract(book_text, extract_postags=['動詞']).words
i_adjectives = nagisa.extract(book_text, extract_postags=['形容詞']).words
na_adjectives = nagisa.extract(book_text, extract_postags=['形状詞']).words

In [None]:
# Ad-hoc inspection: dump the list of words extracted with the pronoun tag.
print(pronouns)

In [5]:
def convert_to_base_form(wordlist):
    """Return the base form of every token found in the given words.

    Each word is tokenized on its own with a janome ``Tokenizer`` and the
    ``base_form`` of every resulting token is collected, in order.  A word
    that the tokenizer splits into several tokens therefore contributes
    several entries, so the result may be longer than ``wordlist``.

    Args:
        wordlist: Iterable of strings to tokenize.

    Returns:
        A flat list of base-form strings.
    """
    tokenizer = Tokenizer()
    return [
        morpheme.base_form
        for surface in wordlist
        for morpheme in tokenizer.tokenize(surface)
    ]


In [6]:
# Replace each inflecting word list with the base forms of its words.
# map() is lazy, so the three conversions run in order while unpacking.
verbs, i_adjectives, na_adjectives = map(
    convert_to_base_form, (verbs, i_adjectives, na_adjectives)
)

In [7]:
# Drop entries that are a single character long from all three lists.
verbs, i_adjectives, na_adjectives = (
    [entry for entry in group if len(entry) >= 2]
    for group in (verbs, i_adjectives, na_adjectives)
)

In [None]:
# Ad-hoc probe: show the full Jisho search response for one word.
# The keyword goes through `params` so it is URL-encoded, and the timeout
# keeps the cell from hanging on a stalled connection.
requests.get(
    "https://jisho.org/api/v1/search/words",
    params={"keyword": "渋紙"},
    timeout=10,
).json()

In [None]:
# Ad-hoc probe: show only the `jlpt` field of the first search hit.
# The keyword goes through `params` so it is URL-encoded, and the timeout
# keeps the cell from hanging on a stalled connection.
requests.get(
    "https://jisho.org/api/v1/search/words",
    params={"keyword": "無い"},
    timeout=10,
).json()['data'][0]['jlpt']

In [8]:
def make_api_request(word, max_retries=5, retry_delay=5):
    """Query the Jisho word-search API for ``word`` and return the parsed JSON.

    Failed requests are retried a bounded number of times in a loop; the
    previous version recursed without limit, so a persistent failure (for
    example a 404 or no network) ended in a ``RecursionError``.

    Args:
        word: Search keyword. It is sent through ``params`` so that requests
            URL-encodes it (characters such as ``&`` or ``#`` are safe).
        max_retries: How many times to retry after the first failed attempt.
            Negative values are treated as 0.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: The last request error, once
            every attempt has failed.
        ValueError: If the final attempt returned a body that is not JSON.
    """
    url = "https://jisho.org/api/v1/search/words"
    attempts = max(0, max_retries) + 1
    last_error = None

    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, params={"keyword": word}, timeout=10)
            response.raise_for_status()
            return response.json()
        # Timeout is a subclass of RequestException, so it is covered here.
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException.
        except (requests.exceptions.RequestException, ValueError) as error:
            last_error = error
            if attempt < attempts:
                print(
                    f"Request failed for word: {word} ({error}). "
                    f"Retrying in {retry_delay} seconds..."
                )
                time.sleep(retry_delay)

    raise last_error

In [9]:
def get_reading(wordlist):
    """Look up a reading for each word through ``make_api_request``.

    Results are memoised for the duration of the call, so a word that occurs
    several times in ``wordlist`` triggers only one API request.

    Args:
        wordlist: Iterable of words (strings) to look up.

    Returns:
        A list with one entry per input word, in input order: the ``reading``
        of the first ``japanese`` entry of the first search hit, or
        ``"unknown"`` when the response does not contain that field.
    """
    readings_by_word = {}
    results = []
    for word in wordlist:
        if word not in readings_by_word:
            response = make_api_request(word)
            try:
                reading = response['data'][0]['japanese'][0]['reading']
            except (IndexError, KeyError):
                # No hit at all, or the first hit carries no reading.
                reading = "unknown"
            readings_by_word[word] = reading
        results.append(readings_by_word[word])
    return results

In [10]:
def get_jlpt_level(wordlist):
    """Look up a JLPT level tag for each word through ``make_api_request``.

    Results are memoised for the duration of the call, so a word that occurs
    several times in ``wordlist`` triggers only one API request.

    Args:
        wordlist: Iterable of words (strings) to look up.

    Returns:
        A list with one entry per input word, in input order: the first item
        of the ``jlpt`` field of the first search hit, or ``"Unknown"`` when
        that field is empty or missing.
    """
    levels_by_word = {}
    results = []
    for word in wordlist:
        if word not in levels_by_word:
            response = make_api_request(word)
            try:
                jlpt_tags = response['data'][0]['jlpt']
                level = jlpt_tags[0] if jlpt_tags else "Unknown"
            except (IndexError, KeyError):
                # No hit at all, or the first hit has no `jlpt` field.
                level = "Unknown"
            levels_by_word[word] = level
        results.append(levels_by_word[word])
    return results

In [11]:
# Print the i-adjectives, then the readings looked up for them, so the two
# lists can be compared position by position.
print(i_adjectives)
# get_reading returns one entry per word; words without a reading in the
# response are reported as "unknown".
i_adjectives_reading = get_reading(i_adjectives)
print(i_adjectives_reading)

['長い', '多い', '湿っぽい', '無い', '短い', '旨い', '淋しい', '白い', '寒い', '冷たい', '少い', 'ない', '拙い', '堅い', '旨い', 'ない', 'ない', 'ない', 'ない', 'ない', 'ない', '淋しい', '淋しい']
['ながい', 'おおい', 'しめっぽい', 'ない', 'みじかい', 'うまい', 'さびしい', 'しろい', 'さむい', 'つめたい', 'すくない', 'ない', 'つたない', 'かたい', 'うまい', 'ない', 'ない', 'ない', 'ない', 'ない', 'ない', 'さびしい', 'さびしい']


In [12]:
# Print the i-adjectives, then the JLPT tags looked up for them, so the two
# lists can be compared position by position.
print(i_adjectives)
# get_jlpt_level returns one entry per word; words whose first search hit has
# no JLPT tag are reported as "Unknown".
i_adjectives_level = get_jlpt_level(i_adjectives)
print(i_adjectives_level)

['長い', '多い', '湿っぽい', '無い', '短い', '旨い', '淋しい', '白い', '寒い', '冷たい', '少い', 'ない', '拙い', '堅い', '旨い', 'ない', 'ない', 'ない', 'ない', 'ない', 'ない', '淋しい', '淋しい']
['jlpt-n2', 'jlpt-n5', 'Unknown', 'Unknown', 'jlpt-n5', 'jlpt-n3', 'jlpt-n4', 'jlpt-n5', 'jlpt-n5', 'jlpt-n5', 'jlpt-n5', 'Unknown', 'Unknown', 'jlpt-n4', 'jlpt-n3', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'jlpt-n4', 'jlpt-n4']
