In [1]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
import re
from collections import Counter
from pathlib import Path
import json
import os
from tqdm.notebook import tqdm

In [2]:
import fugashi

# This is our sample text.
# "Fugashi" is a Japanese snack primarily made of gluten.
text = "麩菓子は、麩を主材料とした日本の菓子。"

# The Tagger object holds state about the dictionary. 
tagger = fugashi.Tagger()

words = [word.surface for word in tagger(text)]
print(*words)

麩 菓子 は 、 麩 を 主材 料 と し た 日本 の 菓子 。


In [3]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def create_title_dir(results_path,title_page_href):
    title_dir_name = title_page_href.split('/')[-1]
    title_dir_path = results_path + '/' + title_dir_name

    Path(title_dir_path).mkdir(parents=True, exist_ok=True)
    
    return title_dir_path

def lyrics_links(title_page_soup):
    
    llinks = []
    
    div_songs = title_page_soup.find('div',{'id':'songlist'})

    song_page_links = div_songs.find_all('a')
    len(song_page_links)

    for l in song_page_links:
        song_page_href = l.get('href')
        if song_page_href.startswith('http'):
            llinks.append(song_page_href)
            
    return llinks

def scrape_lyrics(song_page_soup):
    texts = song_page_soup.findAll(text=True)

    div_en_text = song_page_soup.find('div',{'id':'sideKanji'})

    if div_en_text is not None:
        texts = div_en_text.find_all(text=True)

        visible_texts = filter(tag_visible, texts)  
        vis_text = u" ".join(t.strip() for t in visible_texts)
        vis_text = re.sub(r"\[(.*?)\]",'',vis_text)

    else:
        vis_text = None
        
    return vis_text

def normalize_freq(cnt):

    total = sum(cnt.values(), 0.0)
    for key in cnt:
        cnt[key] /= total
        
    return cnt

def sort_dict_by_value(dictionary, reversed_order):
    list_d = list(dictionary.items())   
    list_swap = []                      
    for key, value in list_d:            
        list_swap.append((value, key))  
    list_sorted = sorted(list_swap, reverse=reversed_order) 
    return list_sorted   

In [4]:
results_path = 'D:/CODE/PET/lyrics_vocabulary_search/data/dicts_jpn'

Path(results_path).mkdir(parents=True, exist_ok=True)

In [5]:
url = 'https://www.animesonglyrics.com/topanime'
page = requests.get(url)

soup = BeautifulSoup(page.text)

div_all_100 = soup.find('div',{'class':'panel-body'})

title_page_links = div_all_100.find_all('a')

In [6]:
len(title_page_links)

201

In [7]:
import sqlite3
 
conn = sqlite3.connect("D:/CODE/PET/lyrics_vocabulary_search/data/data.db")
cursor = conn.cursor()
 
cursor.execute("""CREATE TABLE freqs
                  (word text, song text, title text,
                   rel_freq real)
               """)

OperationalError: table freqs already exists

In [8]:
#cursor.execute("""DROP TABLE freqs
#               """)

In [9]:
sql = "SELECT * FROM freqs"
cursor.execute(sql)
print(cursor.fetchall()) # or use fetchone()

[]


In [10]:
sqlite_insert_with_param = """INSERT INTO freqs
                  VALUES (?, ?, ?, ?)"""


In [None]:
for title_link in tqdm(title_page_links[0:210]):
    if title_link.get('name') is not None:
        title_page_href = title_link.get('href') # ссылка на страницу тайтла (на ней ссылки на песни)
        
        title_dir_path = create_title_dir(results_path, title_page_href) # создаем папку для тайтла, туда будем сохранять json'ы отедльных песен
        
        title_name = title_page_href.split('/')[-1]
        
        try:
            # делаем суп из страницы и вытаскиваем ссылки на тексты
            title_page = requests.get(title_page_href)
            title_page_soup = BeautifulSoup(title_page.text)

            song_page_hrefs = lyrics_links(title_page_soup)

            for page_href in tqdm(song_page_hrefs):

                #print(page_href)

                # делаем суп из страницы с текстом
                song_page = requests.get(page_href)
                song_page_soup = BeautifulSoup(song_page.text)

                text = scrape_lyrics(song_page_soup) # вытаскиваем очищенный текст

                if text is not None:

                    #words = re.findall(r'\w+', text.lower()) # вытаскиваем список слов

                    tagger = fugashi.Tagger()
                    words = [word.surface for word in tagger(text)]

                    cnt = Counter(words) # делаем словарь, ключ - слово, значение - число употреблений

                    cnt = normalize_freq(cnt)


                    song_name = page_href.split('/')[-1]

                    for word, freq in cnt.items():
                        data_tuple = (word, song_name, title_name, freq)
                        cursor.execute(sqlite_insert_with_param, data_tuple)


                    json_path = title_dir_path + '/' + song_name + '.json'

                    with open(json_path, 'w') as f:
                       json.dump(cnt, f)
        except:
            #print('could not parse')
            pass

conn.close()

  0%|          | 0/201 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

In [None]:
for i, j in cnt.items(): 
    print(i,j)

In [None]:
magic_word_list = ['人類','閥']

In [None]:
titles_dirs = os.listdir(results_path)

In [None]:
ratings = dict()

for d in titles_dirs:
    d_path = results_path + '/' + d

    json_names = os.listdir(d_path)
    
    for j in json_names:
        json_path = d_path + '/' + j
        print(json_path)
        
        dkey = '_'.join(json_path.split('/')[-2:]).split('.json')[0]
        ratings[dkey] = 0
        
        with open(json_path) as f:
            my_dict = json.load(f)

            for word in magic_word_list:
                ratings[dkey] += my_dict.get(word,0)