In [2]:
import pandas as pd
import subprocess
import os
import logging
import re
import json
from typing import Dict

# Load the Vietnamese grammar tables read by the cleaning helpers in this cell
# (keys used here: 'mapping dấu', 'vần đơn', 'vần trơn', 'vần cản').
# The file holds non-ASCII text, so the encoding is pinned to UTF-8 rather than
# left to the platform default, which is not UTF-8 on every system.
with open('data/am_tieng_viet.json', encoding='utf-8') as file:
    vietnamese_grammar = json.load(file)

def init_logger():
    """Configure the root logger for INFO-level, timestamped one-line records.

    Delegates to ``logging.basicConfig``; per the standard library, that call
    does nothing if the root logger already has handlers configured.
    """
    record_format = '%(asctime)s %(levelname)-8s %(message)s'
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    logging.basicConfig(
        format=record_format,
        level=logging.INFO,
        datefmt=timestamp_format,
    )

def clean_mark(text):
    """Return ``text`` with each key of the 'mapping dấu' table replaced by its value.

    Entries that map a string to itself are skipped. Replacements run one after
    another in the table's iteration order, each on the result of the previous
    one, so an earlier replacement can feed a later one.
    """
    mark_mapping = vietnamese_grammar['mapping dấu']
    normalized = text
    for source, target in mark_mapping.items():
        if source == target:
            continue
        normalized = normalized.replace(source, target)
    return normalized

def contains_only_vietnamese_letters_and_spaces(text, 
        suffixes = vietnamese_grammar['vần đơn'] + vietnamese_grammar['vần trơn'] + vietnamese_grammar['vần cản'],
        pattern = re.compile(r'^[A-Za-zÀ-ỹ\s]+$')
    ):
    """Decide whether ``text`` passes the character, ending and letter checks.

    The text is first passed through ``clean_mark``. It is accepted only when
    all three hold:

    * ``pattern`` matches the normalised text (by default: ASCII letters, code
      points in the range À-ỹ, and whitespace);
    * the normalised text ends with at least one entry of ``suffixes``;
    * it contains none of the lower-case letters 'f', 'w', 'z'.

    Args:
        text: String to validate.
        suffixes: Accepted endings. The default concatenates three lists from
            ``vietnamese_grammar`` and is evaluated once, at definition time.
        pattern: Compiled regular expression applied with ``match``.

    Returns:
        ``True`` if every check passes, otherwise ``False``.
    """
    normalized = clean_mark(text)
    if pattern.match(normalized) is None:
        return False
    # str.endswith accepts a tuple and succeeds if any entry matches.
    if not normalized.endswith(tuple(suffixes)):
        return False
    # Only lower-case f/w/z are rejected; upper-case ones are not checked here.
    return {'f', 'w', 'z'}.isdisjoint(normalized)

def clean_upper_case(words:Dict[str, int]):
    """Merge capitalised spellings into their lower-case counterpart.

    A key is a merge candidate when it differs from its lower-cased form and
    that lower-cased form is also a key. Then:

    * a key without a space is always merged;
    * a key containing a space is merged only when its count is less than half
      (integer division) of the lower-case key's count.

    Merging adds the key's count to the lower-case key and removes the key.

    Args:
        words: Mapping of word or phrase to count. It is not modified.

    Returns:
        A new dict holding the merged counts, in the original key order.
    """
    # Work on a copy so the caller's dict is left untouched.
    new_words = dict(words)
    word_keys = list(new_words.keys())
    # Pre-set so the final log line also works for an empty mapping.
    w_idx = 0
    for w_idx, word in enumerate(word_keys):
        # Report progress before any `continue`, otherwise it is rarely reached.
        if w_idx % 100000 == 0:
            logging.info(f'  - processed {w_idx} words')
        word_lower = word.lower()
        # Look up in the dict (O(1)) rather than in the key list (O(n)).
        if word == word_lower or word_lower not in new_words:
            continue
        if " " not in word or new_words[word] < new_words[word_lower] // 2:
            new_words[word_lower] += new_words[word]
            del new_words[word]
    logging.info(f'  - processed {w_idx} words')
    return new_words

def clean_words_dict(words:Dict[str, int]):
    """Drop keys that fail validation, then merge capitalised variants.

    Args:
        words: Mapping of word or phrase to count. It is not modified.

    Returns:
        The result of ``clean_upper_case`` applied to a new dict containing only
        the keys accepted by ``contains_only_vietnamese_letters_and_spaces``.
    """
    # Build a filtered copy instead of deleting from the caller's dict.
    valid_words = {
        word: count
        for word, count in words.items()
        if contains_only_vietnamese_letters_and_spaces(word)
    }
    return clean_upper_case(valid_words)

In [3]:
# Read every segmented lyric file listed in the metadata CSV and collect
#   words     - count of each accepted word or phrase
#   sentences - lists of consecutive accepted words
init_logger()
base_path = 'data/song_from_lyricvn_com/'
loaded_pd_songs = pd.read_csv(base_path + 'songs_meta_data.csv')
words = {}
sentences = []
logging.info('Start extracting sentences')
# Each metadata row is unpacked as (url, file_path, segment_file_path).
for idx, (url, file_path, segment_file_path) in enumerate(loaded_pd_songs.values):
    abs_file_path = base_path + file_path  # not read in this cell; kept as a notebook global
    abs_segment_file_path = base_path + segment_file_path
    # The lyric text is non-ASCII, so decode as UTF-8 instead of the platform default.
    with open(abs_segment_file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Blank lines split the file into groups; each group yields one or more sentences.
    for line in text.split('\n\n'):
        sentence = []
        for tmp_text in line.replace('_',' ').replace('\t',' ').split('\n'):
            if not tmp_text:
                continue
            # The first two characters of each row are dropped
            # (presumably a tag or index column; TODO confirm against the segment file format).
            cleaned_word = tmp_text[2:].strip()
            # Keep the original casing only for multi-word entries in which every
            # word starts with an upper-case letter; lower-case everything else.
            if " " not in cleaned_word or not all(word[0].isupper() for word in cleaned_word.split()):
                cleaned_word = cleaned_word.lower()
            if not contains_only_vietnamese_letters_and_spaces(cleaned_word):
                # A rejected token ends the current sentence.
                if sentence:
                    sentences.append(sentence)
                sentence = []
                continue

            words[cleaned_word] = words.get(cleaned_word, 0) + 1
            sentence.append(cleaned_word)
        if sentence:
            sentences.append(sentence)
    if idx %1000 == 0:
        logging.info(f'  - processed {idx} segmented file')
logging.info(f'  - processed {idx} segmented file')


2023-11-03 18:19:47 INFO     Start extracting sentences
2023-11-03 18:19:47 INFO       - processed 0 segmented file
2023-11-03 18:19:51 INFO       - processed 1000 segmented file
2023-11-03 18:19:54 INFO       - processed 2000 segmented file
2023-11-03 18:19:59 INFO       - processed 3000 segmented file
2023-11-03 18:20:03 INFO       - processed 4000 segmented file
2023-11-03 18:20:07 INFO       - processed 5000 segmented file
2023-11-03 18:20:11 INFO       - processed 6000 segmented file
2023-11-03 18:20:15 INFO       - processed 7000 segmented file
2023-11-03 18:20:19 INFO       - processed 8000 segmented file
2023-11-03 18:20:23 INFO       - processed 9000 segmented file
2023-11-03 18:20:27 INFO       - processed 10000 segmented file
2023-11-03 18:20:30 INFO       - processed 11000 segmented file
2023-11-03 18:20:34 INFO       - processed 12000 segmented file
2023-11-03 18:20:38 INFO       - processed 13000 segmented file
2023-11-03 18:20:42 INFO       - processed 14000 segmented fi

In [3]:
# Number of sentences (word lists) currently held in `sentences`.
len(sentences)

1025964

In [4]:
# Count every 1-, 2- and 3-word window of each sentence, keyed by the
# space-joined phrase.
n_gram_phrases = {}
for st_idx, sentence in enumerate(sentences):
    for n_gram in (1, 2, 3):
        # A sentence shorter than n_gram yields an empty range, so no windows.
        for idx in range(len(sentence) + 1 - n_gram):
            n_gram_phrase = ' '.join(sentence[idx:idx + n_gram])
            n_gram_phrases[n_gram_phrase] = n_gram_phrases.get(n_gram_phrase, 0) + 1
    if st_idx %100000 == 0:
        logging.info(f'  - processed {st_idx} sentences')
logging.info(f'  - processed {st_idx} sentences')
# Case merging of the phrase counts is currently disabled:
# n_gram_phrases = clean_upper_case(n_gram_phrases)
# logging.info('  - Cleaned upper case')

2023-11-03 18:20:51 INFO       - processed 0 sentences
2023-11-03 18:20:51 INFO       - processed 100000 sentences
2023-11-03 18:20:51 INFO       - processed 200000 sentences
2023-11-03 18:20:52 INFO       - processed 300000 sentences
2023-11-03 18:20:52 INFO       - processed 400000 sentences
2023-11-03 18:20:52 INFO       - processed 500000 sentences
2023-11-03 18:20:53 INFO       - processed 600000 sentences
2023-11-03 18:20:53 INFO       - processed 700000 sentences
2023-11-03 18:20:53 INFO       - processed 800000 sentences
2023-11-03 18:20:54 INFO       - processed 900000 sentences
2023-11-03 18:20:54 INFO       - processed 1000000 sentences
2023-11-03 18:20:54 INFO       - processed 1025963 sentences


In [5]:
# NOTE(review): this displays the entire phrase-count dict; consider showing a slice instead.
n_gram_phrases

{'tên': 18914,
 'bài': 19092,
 'hát': 22208,
 'tên bài': 16208,
 'bài hát': 17655,
 'tên bài hát': 16206,
 'Cứ Ngủ Say': 1,
 'ca sĩ': 17500,
 'Khởi My': 67,
 'Chế Đình Cường': 3,
 'sáng tác': 16422,
 'album': 16229,
 'sao': 20302,
 'Đom Đóm': 23,
 'sao Đom Đóm': 18,
 'ngày': 39914,
 'ra mắt': 16239,
 'ngày ra mắt': 16204,
 'thể loại': 16220,
 'Việt Nam': 17142,
 'Nhạc Trẻ': 7234,
 'Bài Hát': 1129,
 'Ngủ Say': 4,
 'Ca Sĩ': 785,
 'lắng nghe': 718,
 'từng': 15180,
 'giọt': 3072,
 'nắng': 6067,
 'lên': 9305,
 'đầy': 3179,
 'từng giọt': 352,
 'giọt nắng': 156,
 'nắng lên': 281,
 'lên đầy': 13,
 'từng giọt nắng': 24,
 'giọt nắng lên': 4,
 'nắng lên đầy': 3,
 'mùa': 6140,
 'thu': 2685,
 'đến': 20986,
 'sớm': 1241,
 'nay': 8741,
 'mùa thu': 973,
 'thu đến': 76,
 'đến sớm': 14,
 'sớm nay': 19,
 'mùa thu đến': 25,
 'thu đến sớm': 2,
 'đến sớm nay': 2,
 'có lẽ': 1817,
 'lúc': 8501,
 'này': 15458,
 'có lẽ lúc': 12,
 'lúc này': 596,
 'có lẽ lúc này': 3,
 'anh': 111534,
 'yêu': 33944,
 'vẫn': 22819,

In [11]:
# Rank phrases by count, highest first. Despite its name, `sorted_dict` is a
# list of (phrase, count) tuples, not a dict.
sorted_dict = sorted(n_gram_phrases.items(), key=lambda x: x[1],reverse=True)

# Show the 100 highest-count entries.
print(sorted_dict[:100])

[('em', 14786), ('anh', 11076), ('người', 4523), ('không', 4472), ('có', 4036), ('là', 4018), ('ta', 3963), ('ngày', 3941), ('đã', 3659), ('đi', 3579), ('nhau', 3387), ('yêu', 3289), ('một', 3241), ('còn', 3030), ('về', 3007), ('cho', 2867), ('trong', 2850), ('ai', 2727), ('mình', 2708), ('như', 2685), ('những', 2676), ('để', 2504), ('rồi', 2478), ('bên', 2443), ('khi', 2360), ('sẽ', 2330), ('con', 2269), ('vẫn', 2249), ('lại', 2192), ('hát', 2156), ('sao', 2085), ('xa', 2037), ('vì', 2028), ('chẳng', 2027), ('chỉ', 2016), ('đến', 2011), ('tôi', 2009), ('và', 1929), ('mà', 1916), ('đâu', 1906), ('bài', 1888), ('nhớ', 1843), ('bài hát', 1842), ('tình', 1840), ('tên', 1825), ('biết', 1815), ('đêm', 1813), ('ca sĩ', 1796), ('Việt Nam', 1760), ('lòng', 1715), ('qua', 1644), ('ơi', 1618), ('sáng tác', 1597), ('ra mắt', 1582), ('album', 1579), ('thể loại', 1579), ('tên bài', 1577), ('tên bài hát', 1577), ('ngày ra mắt', 1576), ('thôi', 1544), ('này', 1517), ('được', 1495), ('nơi', 1494), ('c

In [None]:
# NOTE(review): only a cell's last expression is displayed, so this length is
# computed and then discarded.
len(n_gram_phrases)
# First 20 (phrase, count) pairs in insertion order.
list(n_gram_phrases.items())[:20]

[('tên', 18914),
 ('bài', 19092),
 ('hát', 22208),
 ('tên bài', 16208),
 ('bài hát', 17655),
 ('tên bài hát', 16206),
 ('Cứ Ngủ Say', 1),
 ('ca sĩ', 17500),
 ('Khởi My', 67),
 ('Chế Đình Cường', 3),
 ('sáng tác', 16422),
 ('album', 16229),
 ('sao', 20302),
 ('Đom Đóm', 23),
 ('sao Đom Đóm', 18),
 ('ngày', 39914),
 ('ra mắt', 16239),
 ('ngày ra mắt', 16204),
 ('thể loại', 16220),
 ('Việt Nam', 17142)]

In [5]:
# Filter the lyric word counts to accepted words and merge capitalised variants.
cleaned_words = clean_words_dict(words)


In [6]:
# Add PhoBERT vocabulary entries that are missing from the lyric-derived counts.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
count = 0
pho_vocab = tokenizer.get_vocab()
words_from_pho = {}
# Each vocabulary value v is overwritten with the pseudo-count 55000 // (v + 1);
# entries whose pseudo-count falls below 1 are left out of words_from_pho.
# NOTE(review): presumably a smaller v means a more frequent token; confirm.
for token, token_value in list(pho_vocab.items()):
    pseudo_count = 55000 // (token_value + 1)
    pho_vocab[token] = pseudo_count
    if pseudo_count >= 1:
        # Underscores become spaces so the keys look like the lyric phrases.
        words_from_pho[token.replace('_',' ')] = pseudo_count
words_from_pho_clean = clean_words_dict(words_from_pho)

# Counts already present in cleaned_words take precedence over pseudo-counts.
for word, pseudo_count in words_from_pho_clean.items():
    cleaned_words.setdefault(word, pseudo_count)
cleaned_words = clean_upper_case(cleaned_words)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Print every (word, count) pair, highest count first.
# NOTE(review): this prints the whole vocabulary; consider slicing the sorted list.
for i in sorted(cleaned_words.items(),key=lambda x: x[1], reverse=True):
    print(i)

('em', 149135)
('anh', 111534)
('người', 47795)
('không', 46865)
('là', 44255)
('có', 41560)
('ta', 41151)
('ngày', 39914)
('đi', 38070)
('đã', 37285)
('một', 34279)
('yêu', 33944)
('nhau', 33227)
('về', 31620)
('còn', 30681)
('cho', 30435)
('trong', 28838)
('ai', 28291)
('mình', 27798)
('những', 27290)
('rồi', 26262)
('như', 26167)
('để', 25481)
('sẽ', 24429)
('bên', 24112)
('khi', 23243)
('vẫn', 22819)
('con', 22751)
('hát', 22208)
('lại', 21948)
('tôi', 21290)
('chỉ', 21139)
('đến', 20986)
('chẳng', 20946)
('mà', 20856)
('vì', 20811)
('sao', 20302)
('và', 19941)
('xa', 19550)
('đâu', 19537)
('biết', 19404)
('bài', 19092)
('tình', 19001)
('tên', 18914)
('ca sĩ', 18285)
('nhớ', 17840)
('đêm', 17213)
('Việt Nam', 17142)
('lòng', 17032)
('thôi', 16805)
('ơi', 16482)
('sáng tác', 16454)
('qua', 16400)
('ra mắt', 16241)
('album', 16229)
('thể loại', 16220)
('nói', 15893)
('được', 15829)
('cũng', 15601)
('đời', 15525)
('nơi', 15467)
('này', 15458)
('từng', 15180)
('đôi', 15180)
('phải', 14

In [8]:
# Number of entries in `cleaned_words`.
len(cleaned_words)

50415

In [16]:
class demo:
    """Small object with three attributes and one method that prints a greeting."""

    def __init__(self, a,b,c):
        # Store the three constructor arguments unchanged.
        self.a, self.b, self.c = a, b, c

    def dost(self):
        """Print ``hello`` to standard output."""
        print('hello')

# Sample instance used by the following cells.
test = demo(1,2,3)

In [17]:
# Display the instance; `demo` defines no __repr__, so the default object repr is shown.
test

<__main__.demo at 0x292f67950>

In [18]:
# NOTE(review): pandas is already imported in the first cell; this repeat is redundant.
import pandas as pd
# Serialise the `demo` instance to 'test.pkl' (path relative to the working directory).
pd.to_pickle(test,'test.pkl')

In [19]:
# Load the object back from 'test.pkl'.
# NOTE(review): unpickling can execute arbitrary code; only read pickle files from a trusted source.
test2 = pd.read_pickle('test.pkl')