<a href="https://colab.research.google.com/github/vytsb/Gilusis_Mokymas/blob/main/lab2_poetry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import os, re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
!pip install pronouncing -q
import pronouncing
import keras
keras.utils.set_random_seed(812)
!pip install markovify -q
import markovify
!pip install num2words -q
from num2words import num2words
import kagglehub


# Dainų tekstų / poezijos generavimas

Šio laboratorinio darbo metu, sukursime dviejų modelių sistemą dainų tekstų generavimui:
1. RNN (kelių sluoksnių LSTM) modelis prognozuos sekančios eilutės savybes;
2. Markovo grandinių modelis ([markovify](https://pypi.org/project/markovify/)) generuos žodžių sekas.

## 1. Duomenų atsiuntimas ir apžiūra

In [53]:
datapath = kagglehub.dataset_download("tgdivy/poetry-foundation-poems")
data_files = os.listdir(datapath)
print(f'Downloaded {len(data_files)} files:', data_files)
data = pd.read_csv(os.path.join(datapath, data_files[0]))

poet_directory = os.path.join(datapath, "poets")
os.makedirs(poet_directory, exist_ok=True)

for poet, poems in data.groupby('Poet'):
    poet_file = os.path.join(poet_directory, f"{poet.replace(' ', '_').lower()}.txt")
    with open(poet_file, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(poems['Poem'].dropna().tolist()))

data_files = os.listdir(poet_directory)
print(f"Generated poet files: {data_files}")

Downloaded 2 files: ['PoetryFoundationData.csv', 'poets']
Generated poet files: ['christopher_robley.txt', 'barbara_ellen_sorensen.txt', 'elizabeth_moody.txt', 'jessica_jopp.txt', 'richard_m._berlin.txt', 'carolina_ebeid.txt', 'annie_finch.txt', 'maureen_thorson.txt', 'alli_warren.txt', 'marianne_boruch.txt', 'derek_beaulieu.txt', 'samuel_johnson.txt', 'charlotte_brontë.txt', 'hannah_brooks-motl.txt', 'elizabeth_akers_allen.txt', 'basil_bunting.txt', 'susan_stewart.txt', 'lewis_warsh.txt', 'dobby_gibson.txt', 'henri_cole.txt', 'uladzimir_niakliaeu.txt', 'chrystos.txt', 'brenda_hillman.txt', 'bianca_stone.txt', 'margaret_kaufman.txt', 'katherine_hauth.txt', 'marvin_bell.txt', 'bob_kaufman.txt', 'allen_tate.txt', 'anne_sexton.txt', 'jennifer_elise_foerster.txt', 'a._e._housman.txt', 'lord_edward,_lord_herbert_of_cherbury.txt', 'rosmarie_waldrop.txt', 'león_salvatierra.txt', 'katia_kapovich.txt', 'laura_dimmit.txt', 'francesca_abbate.txt', 'witter_bynner.txt', 'maurice_kilwein_guevara.txt

In [54]:
characters_counts = {}
for data_file in data_files:
  with open(os.path.join(poet_directory, data_file), 'r') as f:
    characters_counts[data_file] = len(f.read())
sorted_counts = sorted(characters_counts.items(), key=lambda x: x[1], reverse=True)

total = sum([v for _, v in sorted_counts])
print(f'Total characters: {total}')
for k, v in sorted_counts:
    print(f'- {k}: {v} characters ({v / total * 100:.2f} %)')

Total characters: 20499112
- john_milton.txt: 558047 characters (2.72 %)
- alfred,_lord_tennyson.txt: 176827 characters (0.86 %)
- walt_whitman.txt: 157572 characters (0.77 %)
- alexander_pope.txt: 147434 characters (0.72 %)
- algernon_charles_swinburne.txt: 142429 characters (0.69 %)
- robert_browning.txt: 137232 characters (0.67 %)
- percy_bysshe_shelley.txt: 132404 characters (0.65 %)
- william_shakespeare.txt: 124131 characters (0.61 %)
- william_wordsworth.txt: 119930 characters (0.59 %)
- matthew_arnold.txt: 119819 characters (0.58 %)
- edmund_spenser.txt: 109395 characters (0.53 %)
- john_koethe.txt: 85816 characters (0.42 %)
- samuel_taylor_coleridge.txt: 78337 characters (0.38 %)
- robert_pinsky.txt: 77037 characters (0.38 %)
- henry_wadsworth_longfellow.txt: 74248 characters (0.36 %)
- john_dryden.txt: 73028 characters (0.36 %)
- john_ashbery.txt: 72162 characters (0.35 %)
- philip_whalen.txt: 71707 characters (0.35 %)
- christian_wiman.txt: 69477 characters (0.34 %)
- frank_

Galime naudoti standartinius normalizavimo metodus, pavyzdžiui, sumažinti visas raides ir pašalinti skyrybos ženklus, tačiau turėtume palikti trumpinius, nes jie turi skirtingą nuo pilnų versijų ritmą ir kitaip rimuojasi.

In [55]:
def normalize_word_line(line: str) -> list[str]:
  row = [x.lower() for x in re.findall(r"\w+'?\w*", line)]
  new_row = []
  for word in row:
    numbers = re.findall(r"\d+", word)
    for n in numbers:
      word = word.replace(n, num2words(int(n)))
    new_row.append(word)
  return new_row

##3. Ritmo ištraukimas iš duomenų

Paskaičiuojame skiemenų kiekius:

In [56]:
def n_syllables(word_line: list[str]):
    vowels = 'aeiouy'
    syllable_count = 0

    for word in word_line:
        for i, char in enumerate(word):
            if char in vowels:
                if (i == 0) or (word[i-1] not in vowels):
                    syllable_count += 1
        word_vowels_count = sum([x in vowels for x in word])
        if word_vowels_count == 0:  # abreviaturos
            syllable_count = len(word)  # pvz. "NLP" skaitosi kaip "en-el-pi"
        elif word_vowels_count > 1 and (word[-1] == 'e') and (word[-2] not in vowels):  # paskutinė "e" dažnai nesakoma anglų k., bet "ie", "ee" ištariama
            syllable_count -= 1

    return syllable_count

Tačiau mes nenorime naudoti bele kokius žodžius, o norime prioretizuoti žodžius iš jau esamų atlikėjo dainų. Todėl reikia sukonstruoti rimuojančių žodžių žodyną.

In [57]:
def get_rhyme(line: list[str]) -> str:
    last_word = re.sub('\W+', '', line[-1])
    all_rhymes = pronouncing.rhymes(last_word)
    if all_rhymes:
        rhyming_ends = [x[-2:] for x in all_rhymes]
        most_common_rhyme = max(set(rhyming_ends), key=rhyming_ends.count)
    else:
        most_common_rhyme = last_word[-2:]
    return most_common_rhyme


def get_rhyme_list(normalized_lyrics: list[list[str]]):
  rhyme_set = set()
  for row in normalized_lyrics:
    most_common_rhyme = get_rhyme(row)
    rhyme_set.add(most_common_rhyme)

  sorted_rhyme_set = sorted(list(rhyme_set), key=lambda x: x[-1])
  return sorted_rhyme_set

## 4. Duomenų rinkinio paruošimas

Modelis negalės tiesiogiai operuoti skiemenimis, todėl turime naudoti skaičius:

In [58]:
def get_rhyme_float(line: list[str], rhyme_list: list[str]) -> float | None:
  rhyme = get_rhyme(line)
  if rhyme in rhyme_list:
    return rhyme_list.index(rhyme) / len(rhyme_list)
  else:
    return None

In [59]:
def get_random_lines(markov_model, n_rows: int) -> list[list[str]]:
  lines = []
  last_words = []

  while len(lines) < n_rows:
    line = markov_model.make_sentence(max_overlap_ratio=.49, tries=100)
    # nenorime gauti tuščios eilutės ar jau turimos eilutės
    if (line is not None) and (line not in lines):
      last_word = normalize_word_line(line)[-1]
      # nenorime kad dažnai pasikartotų tas pats žodis eilutės gale
      if last_words.count(last_word) < 3:
        lines.append(normalize_word_line(line))
        last_words.append(last_word)

  return lines


def get_line_features(line: list[str], rhyme_list: list[str]) -> tuple:
  return (line, n_syllables(line), get_rhyme_float(line, rhyme_list))

Konstruojame duomenų rinkinį:

In [60]:
def build_dataset(lines: list[list[str]], rhyme_list: list[str]):
	features = [get_line_features(x, rhyme_list) for x in lines]
	x_data, y_data = [], []

  # turėsime standartinę struktūrą kai eilutės rimuojasi po 4 grupėje
	# pirmos dvi eilutės bus pradinės savybės, antros dvi eilutės - prognozuojamos
	for i in range(len(features) - 3):
		# duomenyse liks tik eilučių savybes, todėl visur [1:]
		line1, line2 = features[i    ][1:], features[i + 1][1:]
		line3, line4 = features[i + 2][1:], features[i + 3][1:]
		x_data.append(np.array([line1, line2]))
		y_data.append(np.array([line3, line4]))
	return np.array(x_data), np.array(y_data)

## 5. RNN modelio inicializavimas

Mūsų modelis turės prognozuoti naujų 4 eilučių savybes gaunant senas 4 eilutes.

In [61]:
def create_lstm(depth: int):

  keras.backend.clear_session()  # pašaliname tarpinių modelių likučius
  keras.utils.set_random_seed(812)

  model = keras.Sequential(name='LSTM-based_lyrics_generator')
  model.add(keras.layers.Input((2, 2)))
  model.add(keras.layers.LSTM(4, return_sequences=True))
  model.add(keras.layers.Dropout(0.2))  # Pridėtas Dropout sluoksnis

  for i in range(depth):
    model.add(keras.layers.LSTM(8, return_sequences=True))
    model.add(keras.layers.Dropout(0.2))  # Pridėtas Dropout sluoksnis

  model.add(keras.layers.LSTM(2, return_sequences=True))

  model.compile(
      optimizer=keras.optimizers.RMSprop(learning_rate=0.001),
      loss='mse')

  return model


Turime **dviejų modelių sistemą**, todėl būtina parašyti funkcijas kad modeliai galėtų tarpusavyje bendrauti.

In [62]:
def compose(starting_input: np.ndarray, rnn_model, n_line_groups: int):
	final_vectors = []
	starting_vectors = rnn_model.predict(starting_input).flatten().reshape(1, 2, 2)
	final_vectors.append(starting_vectors)
	for i in range(n_line_groups):
		prev_vectors = final_vectors[-1]
		final_vectors.append(rnn_model.predict(prev_vectors).flatten().reshape(1, 2, 2))
	return final_vectors


def last_word_compare(prev_lines: list[list[str]], new_line: list[str], penalty: float = 0.2) -> float:
	sum_penalty = 0.0
	for line in prev_lines:
		if line[-1] == new_line[-1]:
			sum_penalty += penalty
	return sum_penalty


def calculate_score(features, n_syllables, rhyme, penalty: float, rhyme_list, maxsyllables):
	desired_n_syllables = features[0] * maxsyllables
	desired_rhyme = features[1] * len(rhyme_list)
	syllable_score = - abs(float(desired_n_syllables) - float(n_syllables))
	rhyme_score = abs(float(desired_rhyme) - float(rhyme))
	score = 1.0 + syllable_score + rhyme_score - penalty
	return score


def vectors_into_song(vectors, generated_lyrics, rhyme_list, maxsyllables: int):
	song = []
	generated_features = [get_line_features(x, rhyme_list) for x in generated_lyrics]

	vector_halves = []
	for vector in vectors:
		vector_halves.extend(vector[0].tolist())

	for vector in vector_halves:
		scorelist = []

		for (line, n_syllables, rhyme) in generated_features:
			if len(song) != 0:
				penalty = last_word_compare(song, line)
			else:
				penalty = 0

			total_score = calculate_score(vector, n_syllables, rhyme, penalty, rhyme_list, maxsyllables)
			scorelist.append([line, total_score])

		# randame eilutę su aukščiausiu įvertinimu
		best_line_index = np.argmax([float(x[1]) for x in scorelist])
		best_line = scorelist[best_line_index][0]
		song.append(best_line)

		# pašaliname šią eilutę iš likusių eilučių sąrašo
		generated_features = [x for x in generated_features if x[0] != best_line]

	return [' '.join(x) for x in song]

In [63]:
artist_file = 'john_koethe.txt'
with open(os.path.join(poet_directory, artist_file), 'r') as f:
    raw_lyrics = f.read()

markov_model = markovify.NewlineText(raw_lyrics)

lyrics = [normalize_word_line(x) for x in raw_lyrics.splitlines()]
lyrics = [x for x in lyrics if x]
rhymes = get_rhyme_list(lyrics)
print(f'Collected {len(rhymes)} rhymes')

x_data, y_data = build_dataset(lyrics, rhymes)

model = create_lstm(depth=4)
print(model.summary())

model.fit(
    x_data, y_data,
    batch_size=2,
    epochs=10
)

start_i = np.random.choice(range(len(x_data)))
start = np.array([x_data[start_i]])
vectors = compose(start, model, 4)
some_lyrics = get_random_lines(markov_model, 200)
vectors_into_song(vectors, some_lyrics, rhymes, maxsyllables=12)

Collected 122 rhymes


None
Epoch 1/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 17ms/step - loss: 107.9078
Epoch 2/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 101.1107
Epoch 3/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 100.9568
Epoch 4/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 100.9473
Epoch 5/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 100.9447
Epoch 6/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - loss: 100.9438
Epoch 7/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 100.9441
Epoch 8/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - loss: 100.9444
Epoch 9/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 100.9436
Epoch 10/10
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

['strewn around home a few years after i d grown up and took him home i look',
 'and leave the world dissolves into the unknown',
 'or the sound the heart is a real one',
 'a separate part of what i like this room',
 'one feels the wind in the sunlight less',
 'with the future used to look at all the',
 'as thick as a landscape where all roads lead',
 'i don t want them now i don t think so age is like the sound of cuban',
 'one feels the wind in the sky dissolved',
 'but that beneath my life has that makes the days']