In [1]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
import re
from collections import Counter
from pathlib import Path
import json
import os
from tqdm.notebook import tqdm

In [2]:
import fugashi

# Sample sentence used to try the tokenizer out.
# In English: "Fugashi" is a Japanese snack primarily made of gluten.
text = "麩菓子は、麩を主材料とした日本の菓子。"

# The Tagger object holds state about the dictionary.
tagger = fugashi.Tagger()

# Calling the tagger on a string yields items; keep each item's surface form.
words = []
for node in tagger(text):
    words.append(node.surface)

# One line, tokens separated by single spaces.
print(' '.join(words))

麩 菓子 は 、 麩 を 主材 料 と し た 日本 の 菓子 。


In [3]:
def tag_visible(element):
    """Tell whether a text node should be kept as visible text.

    Returns False when the node's parent tag name is one of the excluded
    names below, or when the node itself is a ``Comment``; True otherwise.
    """
    excluded_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # Parent check comes first, exactly as before; the Comment check only
    # runs for nodes whose parent is not excluded.
    if element.parent.name in excluded_parents:
        return False
    return not isinstance(element, Comment)

def create_title_dir(results_path,title_page_href):
    """Create (if missing) the output directory for one title and return its path.

    The directory name is the last non-empty '/'-separated segment of
    ``title_page_href``.

    Args:
        results_path: Base directory (str) that holds one sub-directory per title.
        title_page_href: URL of the title page.

    Returns:
        ``results_path + '/' + <segment>`` as a str.
    """
    # Strip trailing slashes first: for ".../naruto/" the last segment would
    # otherwise be empty and every title would share results_path itself.
    title_dir_name = title_page_href.rstrip('/').split('/')[-1]
    title_dir_path = results_path + '/' + title_dir_name

    Path(title_dir_path).mkdir(parents=True, exist_ok=True)

    return title_dir_path

def lyrics_links(title_page_soup):
    """Collect the absolute song-page URLs found on a title page.

    Looks up the ``div`` with ``id="songlist"`` and returns the ``href`` of
    every ``a`` element inside it whose value starts with ``http``.

    Args:
        title_page_soup: Parsed title page (anything offering ``find``).

    Returns:
        List of href strings, in the order ``find_all`` returns the anchors.
        Empty when the page has no ``songlist`` div.
    """
    div_songs = title_page_soup.find('div',{'id':'songlist'})
    if div_songs is None:
        # No song list on this page: nothing to collect.
        return []

    llinks = []
    for anchor in div_songs.find_all('a'):
        song_page_href = anchor.get('href')
        # .get() returns None for anchors without an href; skip those
        # instead of failing on None.startswith.
        if song_page_href and song_page_href.startswith('http'):
            llinks.append(song_page_href)

    return llinks

def scrape_lyrics(song_page_soup):
    """Extract the cleaned lyrics text from a parsed song page.

    Only text inside the ``div`` with ``id="sideKanji"`` is used. Text nodes
    rejected by ``tag_visible`` are dropped, the remaining ones are stripped
    and joined with single spaces, and every ``[...]`` span is removed.

    Args:
        song_page_soup: Parsed song page (anything offering ``find``).

    Returns:
        The cleaned string, or ``None`` when the page has no ``sideKanji`` div.
    """
    div_en_text = song_page_soup.find('div',{'id':'sideKanji'})
    if div_en_text is None:
        return None

    # Scan only the lyrics block; the rest of the document is never needed.
    texts = div_en_text.find_all(text=True)

    visible_texts = filter(tag_visible, texts)
    vis_text = u" ".join(t.strip() for t in visible_texts)
    # Non-greedy match so each bracketed span is removed on its own.
    vis_text = re.sub(r"\[(.*?)\]",'',vis_text)

    return vis_text

def normalize_freq(cnt):
    """Turn raw counts into relative frequencies, in place.

    Every value in ``cnt`` is divided by the sum of all values. The mapping
    passed in is modified and that same object is returned.
    """
    total = sum(cnt.values(), 0.0)
    # Snapshot the items so values can be reassigned while looping.
    for key, count in list(cnt.items()):
        cnt[key] = count / total
    return cnt

def sort_dict_by_value(dictionary, reversed_order):
    """Return the mapping's entries as sorted ``(value, key)`` tuples.

    Tuples compare by value first and by key second. ``reversed_order`` is
    passed to ``sorted`` as its ``reverse`` flag.
    """
    swapped_pairs = ((value, key) for key, value in dictionary.items())
    return sorted(swapped_pairs, reverse=reversed_order)

In [6]:
# Base folder for the scraped data: one sub-folder per title is created under it.
results_path = 'D:/CODE/PET/lyrics_vocabulary_search/data/dicts_jpn'

# Create the folder together with any missing parents; nothing happens if it already exists.
os.makedirs(results_path, exist_ok=True)

In [7]:
# Index page from which the title links are collected.
url = 'https://www.animesonglyrics.com/topanime'

# Bound the wait and fail loudly on an HTTP error instead of parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed (and warns about it), so results can differ between machines.
soup = BeautifulSoup(page.text, 'html.parser')

div_all_100 = soup.find('div',{'class':'panel-body'})
if div_all_100 is None:
    # A clear message beats an AttributeError on None a line later.
    raise RuntimeError("no <div class='panel-body'> found on " + url)

title_page_links = div_all_100.find_all('a')

In [8]:
# How many <a> elements were collected from the panel.
len(title_page_links)

201

In [9]:
import sqlite3
 
# Open (or create) the SQLite file that stores one row per (word, song).
conn = sqlite3.connect("D:/CODE/PET/lyrics_vocabulary_search/data/data.db")
cursor = conn.cursor()

# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises
# sqlite3.OperationalError once the table is already in the file.
# Rows from earlier runs are kept; drop the table to start from scratch.
cursor.execute("""CREATE TABLE IF NOT EXISTS freqs
                  (word text, song text, title text,
                   rel_freq real)
               """)

<sqlite3.Cursor at 0x20b2cff8cc0>

In [10]:
# Uncomment and run to drop the freqs table (e.g. before re-scraping from scratch).
#cursor.execute("""DROP TABLE freqs
#               """)

In [11]:
# Look at what the freqs table currently holds.
sql = "SELECT * FROM freqs"
rows = cursor.execute(sql).fetchall()  # fetchone() would give a single row instead
print(rows)

[]


In [12]:
# Parameterized INSERT for the freqs table: the four "?" placeholders are
# filled from a 4-tuple passed to cursor.execute().
sqlite_insert_with_param = (
    "INSERT INTO freqs\n"
    "                  VALUES (?, ?, ?, ?)"
)


In [13]:
# Scrape every title: for each song, tokenize the lyrics, store the relative
# word frequencies in the freqs table and dump them to a per-song JSON file.

# Built once: the tagger does not depend on the song being processed.
tagger = fugashi.Tagger()

for title_link in tqdm(title_page_links[0:210]):
    # Only anchors carrying a `name` attribute are processed.
    if title_link.get('name') is None:
        continue

    title_page_href = title_link.get('href')  # URL of the title page (it links to the song pages)

    # Folder for this title; the per-song JSON files are saved into it.
    title_dir_path = create_title_dir(results_path, title_page_href)

    title_name = title_page_href.split('/')[-1]

    try:
        # Parse the title page and pull out the links to the lyrics pages.
        title_page = requests.get(title_page_href, timeout=30)
        title_page_soup = BeautifulSoup(title_page.text, 'html.parser')

        song_page_hrefs = lyrics_links(title_page_soup)

        for page_href in tqdm(song_page_hrefs):
            # Parse the page that holds the lyrics.
            song_page = requests.get(page_href, timeout=30)
            song_page_soup = BeautifulSoup(song_page.text, 'html.parser')

            text = scrape_lyrics(song_page_soup)  # cleaned lyrics text, or None
            if text is None:
                continue

            words = [word.surface for word in tagger(text)]

            # Counter maps word -> occurrences; normalize_freq turns the
            # counts into relative frequencies.
            cnt = normalize_freq(Counter(words))

            song_name = page_href.split('/')[-1]

            for word, freq in cnt.items():
                data_tuple = (word, song_name, title_name, freq)
                cursor.execute(sqlite_insert_with_param, data_tuple)

            json_path = title_dir_path + '/' + song_name + '.json'

            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(cnt, f)
    except Exception as exc:
        # Still best-effort per title, but say what was skipped and why.
        # KeyboardInterrupt is no longer swallowed, so the run can be stopped.
        print('could not parse', title_page_href, repr(exc))

    # Persist this title's rows; sqlite3 does not commit on close().
    conn.commit()

# Commit before closing, otherwise pending INSERTs are discarded.
conn.commit()
conn.close()

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [15]:
# Print every key of `cnt` next to its value, one pair per line.
for token, rel_freq in cnt.items():
    print(token, rel_freq)

Kanji 0.002702702702702703
Good 0.005405405405405406
- 0.005405405405405406
bye 0.005405405405405406
sadness 0.005405405405405406
! 0.021621621621621623
Girl 0.010810810810810811
meets 0.024324324324324326
, 0.016216216216216217
boy 0.010810810810810811
girl 0.010810810810810811
そして 0.002702702702702703
I 0.005405405405405406
miss 0.002702702702702703
you 0.005405405405405406
super 0.005405405405405406
dream 0.002702702702702703
1 0.002702702702702703
回 0.016216216216216217
め 0.010810810810810811
　 0.032432432432432434
まぐれ 0.002702702702702703
で 0.021621621621621623
2 0.002702702702702703
って 0.005405405405405406
どう 0.002702702702702703
だろう 0.002702702702702703
偶然 0.002702702702702703
じゃ 0.010810810810810811
ない 0.021621621621621623
よ 0.005405405405405406
ね 0.002702702702702703
? 0.005405405405405406
3 0.002702702702702703
目 0.002702702702702703
が 0.013513513513513514
合う 0.002702702702702703
の 0.02702702702702703
は 0.01891891891891892
変わっ 0.002702702702702703
た 0.02702702702702703
ひと 0.0

In [16]:
# Words of interest: the ratings cell sums the stored value of each of these per song.
magic_word_list = ['人類','閥']

In [17]:
# Names of all entries directly under results_path (the ratings cell treats each one as a title folder).
titles_dirs = os.listdir(results_path)

In [18]:
# Score every saved song: the sum of the values its JSON dict stores for the
# words in magic_word_list (0 for words the song does not contain).
ratings = dict()

for d in titles_dirs:
    d_path = results_path + '/' + d

    # A stray file next to the title folders would make os.listdir() raise.
    if not os.path.isdir(d_path):
        continue

    json_names = os.listdir(d_path)

    for j in json_names:
        # Only the per-song JSON dumps are of interest.
        if not j.endswith('.json'):
            continue

        json_path = d_path + '/' + j
        print(json_path)

        # Key: "<title folder>_<file name up to '.json'>".
        dkey = '_'.join((d, j)).split('.json')[0]

        with open(json_path, encoding='utf-8') as f:
            my_dict = json.load(f)

        ratings[dkey] = sum(my_dict.get(word, 0) for word in magic_word_list)

D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/callin.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/colors.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/dice.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/hitomi-no-tsubasa.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/kaidoku-funou.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/mozaiku-kakera.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/picaresque.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/sakura-burst.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/will-ill.json
D:/CODE/PET/lyrics_word_search/data/dicts_jpn/code-geass-lelouch-of-the-rebellion/yuukyou-seishunka.json
D:/CODE/PET/l