In [1]:
import nagisa
import requests
import time
import pandas as pd
from janome.tokenizer import Tokenizer
from bs4 import BeautifulSoup
from collections import Counter

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.


In [3]:
URL = "https://www.aozora.gr.jp/cards/000148/files/795_43522.html"
book_source = requests.get(URL)

In [4]:
soup = BeautifulSoup(book_source.content, "html.parser")
book_text = soup.find("div",{"class":"main_text"}).get_text()

In [5]:
pronouns = nagisa.extract(book_text, extract_postags='代名詞').words
nouns = nagisa.extract(book_text, extract_postags='名詞').words
verbs = nagisa.extract(book_text, extract_postags='動詞').words
i_adjectives = nagisa.extract(book_text, extract_postags='形容詞').words
na_adjectives = nagisa.extract(book_text, extract_postags='形状詞').words

In [6]:
print(pronouns)

['余', '子規', '画', '一', '亡友', '記念', '間', 'それ', '袋', '中', '年数', '時', '袋', '所在', '事', '近頃', '転宅', '際', 'どこ', '散逸', '今', 'うち', '表具', '懸物', '気', '渋紙', '袋', '塵', '中', '画', '元', 'まま', '四', '折', '画', 'ほか', '子規', '手紙', '幾', '余', '中', '子規', '余', '寄こ', '最後', 'もの', 'それ', '年月', 'もの', '中間', '例', '画', '三', '一纏め', '表装', '画', '一', '花瓶', '東菊', '図柄', '単簡', '者', '傍', '是', '所', '思い玉え', '病気', '所為', '思い玉え', '嘘', '肱', '見玉え', '註釈', 'ところ', '自分', '子規', '画', '時', '余', '東京', '彼', '画', '東菊', '置きけり', '火', '国', '住みける', '一', '首', '歌', '熊本', '壁', '感じ', '色', '花', '茎', '葉', '硝子', '瓶', '三', '花', '一', '蕾', '二', '葉', '数', '勘定', 'すべて', '九', 'それ', '周囲', '表装', '絹地', '藍', '心持', '子規', '草花', 'ため', '努力', '三', '花', '五', '六', '時間', '手間', 'どこ', 'どこ', 'これ', '骨折', 'ただ', '病中', '根気', '仕事', '決心', '雑作', '俳句', '歌', '彼', '性情', '矛盾', '画', '事', '初心', '彼', '当時', '絵画', '写生', '必要', '不折', 'それ', '一', '一', '上', '実行', '彼', '俳句', '上', '悟入', '方法', '方面', '適用', '事', '適用', '腕', '東菊', '代表', '子規', '画', '才', '章', '彼', '文筆', '絵の具', '皿', '同時', '穂先', '運行', '余', 

In [7]:
def convert_to_base_form(wordlist):
    t = Tokenizer()
    base_forms = []

    for word in wordlist:
        tokens = t.tokenize(word)
        for token in tokens:
            base_forms.append(token.base_form)

    return base_forms


In [8]:
verbs = convert_to_base_form(verbs)
i_adjectives = convert_to_base_form(i_adjectives)
na_adjectives = convert_to_base_form(na_adjectives)

In [9]:
verbs = [word for word in verbs if len(word) > 1]
i_adjectives = [word for word in i_adjectives if len(word) > 1]
na_adjectives = [word for word in na_adjectives if len(word) > 1]

In [10]:
pronouns = Counter(pronouns)
print(pronouns)

Counter({'画': 12, '余': 10, '一': 10, '彼': 10, '子規': 9, '拙': 7, 'それ': 4, '中': 4, '時': 4, '事': 4, 'もの': 4, '東菊': 4, '袋': 3, 'どこ': 3, '三': 3, '花': 3, 'ため': 3, '表装': 2, '所': 2, '思い玉え': 2, 'ところ': 2, '歌': 2, '葉': 2, '時間': 2, '根気': 2, '俳句': 2, '上': 2, '実行': 2, '適用': 2, '正岡': 2, '働き': 2, '瞬間': 2, '字': 2, '亡友': 1, '記念': 1, '間': 1, '年数': 1, '所在': 1, '近頃': 1, '転宅': 1, '際': 1, '散逸': 1, '今': 1, 'うち': 1, '表具': 1, '懸物': 1, '気': 1, '渋紙': 1, '塵': 1, '元': 1, 'まま': 1, '四': 1, '折': 1, 'ほか': 1, '手紙': 1, '幾': 1, '寄こ': 1, '最後': 1, '年月': 1, '中間': 1, '例': 1, '一纏め': 1, '花瓶': 1, '図柄': 1, '単簡': 1, '者': 1, '傍': 1, '是': 1, '病気': 1, '所為': 1, '嘘': 1, '肱': 1, '見玉え': 1, '註釈': 1, '自分': 1, '東京': 1, '置きけり': 1, '火': 1, '国': 1, '住みける': 1, '首': 1, '熊本': 1, '壁': 1, '感じ': 1, '色': 1, '茎': 1, '硝子': 1, '瓶': 1, '蕾': 1, '二': 1, '数': 1, '勘定': 1, 'すべて': 1, '九': 1, '周囲': 1, '絹地': 1, '藍': 1, '心持': 1, '草花': 1, '努力': 1, '五': 1, '六': 1, '手間': 1, 'これ': 1, '骨折': 1, 'ただ': 1, '病中': 1, '仕事': 1, '決心': 1, '雑作': 1, '性情': 1, '矛盾': 1, '初心': 1, '当時':

In [None]:
requests.get("https://jisho.org/api/v1/search/words?keyword=渋紙").json()

In [None]:
requests.get("https://jisho.org/api/v1/search/words?keyword=無い").json()['data'][0]['jlpt']

In [None]:
def make_api_request(word):
    url = f"https://jisho.org/api/v1/search/words?keyword={word}"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        time.sleep(1)
        return response.json()
    except (requests.exceptions.RequestException, requests.exceptions.Timeout):
        print(f"Timeout for word: {word}. Retrying in 5 seconds...")
        time.sleep(5)
        return make_api_request(word)

In [None]:
def get_reading(wordlist):
    results = []
    for word in wordlist:
        response = make_api_request(word)
        try:
            result = response['data'][0]['japanese'][0]['reading']
        except (IndexError, KeyError):
            result = "unknown"
        results.append(result)
    return results

In [None]:
def get_jlpt_level(wordlist):
    results = []
    for word in wordlist:
        response = make_api_request(word)
        try:
            if response['data'][0]['jlpt']:
                result = response['data'][0]['jlpt'][0]
            else:
                result = "Unknown"
        except (IndexError, KeyError):
            result = "Unknown"
        results.append(result)
    return results

In [None]:
print(i_adjectives)
i_adjectives_reading = get_reading(i_adjectives)
print(i_adjectives_reading)

In [None]:
print(i_adjectives)
i_adjectives_level = get_jlpt_level(i_adjectives)
print(i_adjectives_level)

In [None]:
print(nouns)
nouns_level = get_jlpt_level(nouns)
print(nouns_level)