In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import numpy as np
import pandas as pd

import tqdm
import io
import string
import re
import nltk
import emoji

import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from wordcloud import WordCloud
from textblob import TextBlob
from nltk.util import ngrams
from nltk import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses
from tensorflow.keras.utils import to_categorical, plot_model, Sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from IPython.display import Markdown, clear_output

def bold(string):
    """Render ``string`` as bold Markdown in the notebook output area."""
    wrapped = "**" + string + "**"
    display(Markdown(wrapped))

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load Data

In [None]:
# Read the song-lyrics CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/turkish-song-lyrics/turkish_song_lyrics.csv")
df.head()

In [None]:
def df_stats(data):
    """Print a quick overview of ``data``.

    Sections are emitted in this order: shape, column dtypes, missing values
    per column, number of duplicated rows, and the memory-usage figure taken
    from the text produced by ``data.info()``. Section titles are rendered
    through ``bold``.
    """
    def section(title):
        # Title centred in a 50-character banner padded with '#'.
        bold(f" {title} ".center(50, "#"))

    section("SHAPE")
    print(f"ROWS: {data.shape[0]}")
    print(f"COLS: {data.shape[1]}")

    section("TYPES")
    print(data.dtypes)

    section("MISSING VALUES")
    print(data.isnull().sum())

    section("DUPLICATED VALUES")
    print(f"NUMBER OF DUPLICATED VALUES: {data.duplicated().sum()}")

    section("MEMORY USAGE")
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    # The second-to-last "\n"-separated chunk of the info() text is read as
    # "<label>: <value>"; only the value part is printed.
    memory_line = info_buffer.getvalue().split("\n")[-2]
    memory_value = memory_line.split(":")[1].strip()
    print(f"Memory Usage: {memory_value}")

In [None]:
# Print shape, dtypes, missing values, duplicates and memory usage of the raw frame.
df_stats(df)

In [None]:
# Remove exact duplicate rows, then renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)

# Preprocess

In [None]:
# Flatten every song into its individual lines; each line becomes one sample.
lyrics = []

# Iterate over the column itself rather than df.iterrows(): no per-row Series is
# built, and tqdm can read the length of the column to show real progress.
for song in tqdm.tqdm(df["lyrics"]):
    # pandas reads a missing lyric as NaN (a float), which has no .split();
    # skip anything that is not text instead of crashing mid-loop.
    if not isinstance(song, str):
        continue
    lyrics.extend(song.split("\n"))

In [None]:
# One row per lyric line, stored in the "Lyrics" column.
lyrics_df = pd.DataFrame({"Lyrics": lyrics})
lyrics_df.head()

In [None]:
# Characters deleted from every line: ASCII punctuation plus the curly quotes
# that string.punctuation does not contain.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + "’‘”“")


def clean_lyric(text):
    """Normalise one lyric line before tokenisation.

    Steps, in order: Turkish-aware lowercasing, removal of URLs, HTML tags and
    tokens containing "@", removal of digits, removal of punctuation/quotes,
    removal of emoji.

    Parameters
    ----------
    text : str
        Raw lyric line.

    Returns
    -------
    str
        The cleaned line.
    """
    # str.lower() is locale-independent: it turns "I" into "i" and "İ" into "i"
    # followed by a combining dot (U+0307). Map the two Turkish capitals by hand
    # first so "IŞIK" becomes "ışık" and "İSTANBUL" becomes "istanbul".
    text = text.replace("İ", "i").replace("I", "ı").lower()

    # These patterns rely on ".", "/", "<", ">" and "@" still being present, so
    # they must run BEFORE punctuation is stripped (otherwise they never match).
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\S*@\S*\s?', '', text)

    text = re.sub(r'[0-9]+', '', text)
    text = text.translate(_PUNCT_TABLE)
    text = emoji.replace_emoji(text)
    return text


# Always derive the cleaned column from the untouched "Lyrics" column so that
# re-running this cell gives the same result.
lyrics_df['Lyrics Cleaned'] = lyrics_df['Lyrics'].map(clean_lyric)

# Optional steps, currently disabled: stop-word removal and rare-word removal.
#stop_words = [x.strip() for x in open('/kaggle/input/zemberekwords/stop-words.tr.txt','r', encoding="UTF8").read().split('\n')]
#lyrics_df['Lyrics Cleaned'] = lyrics_df['Lyrics Cleaned'].apply(lambda text: ' '.join([word for word in text.split() if word.lower() not in stop_words]))
#freq = pd.Series(' '.join(lyrics_df['Lyrics Cleaned']).split()).value_counts()
#less_freq = list(freq[freq < 10].index)
#lyrics_df['Lyrics Cleaned'] = lyrics_df['Lyrics Cleaned'].apply(lambda x: " ".join(x for x in x.split() if x not in less_freq))

In [None]:
# Spot-check five random rows to compare raw and cleaned text side by side.
lyrics_df.sample(5)

# EDA

In [None]:
def print_wordcloud(df, label):
    """Draw a word cloud built from the text column ``label`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text.
    label : str
        Name of the text column; also used in the figure title.
    """
    # Join rows with a space. Concatenating them directly glues the last word of
    # one row to the first word of the next and invents words that never occur;
    # repeated "+=" on a growing string is also quadratic.
    tokens = " ".join(df[label].dropna().astype(str))

    wordcloud = WordCloud(background_color="white", width=1200, height=800).generate(tokens)

    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(f"{label} - Word Cloud")
    plt.show()

In [None]:
# Word cloud of the raw lyric lines.
print_wordcloud(lyrics_df, "Lyrics")

In [None]:
# Word cloud of the cleaned lyric lines, for comparison with the raw one.
print_wordcloud(lyrics_df, "Lyrics Cleaned")

In [None]:
def count_ngrams(corpus, ngram, n):
    """Return the ``n`` most frequent n-grams of size ``ngram`` in ``corpus``.

    The result is a list of ``(text, count)`` tuples ordered by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(ngram, ngram))
    vectorizer.fit(corpus)

    # Summing the document-term matrix over axis 0 gives one total per vocabulary column.
    totals = vectorizer.transform(corpus).sum(axis=0)

    pairs = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [None]:
def plot_ngrams(ngram_df, ngram_name):
    """Show a bar chart of n-gram counts.

    ``ngram_df`` supplies the "Text" column for the x positions and the "Count"
    column for the bar heights; ``ngram_name`` is used as both the x-axis label
    and the figure title.
    """
    plt.figure(figsize=(12, 6))
    axes = plt.gca()
    axes.bar(data=ngram_df, x="Text", height="Count")
    plt.xticks(rotation=90)
    axes.set_xlabel(ngram_name)
    axes.set_ylabel("Count")
    axes.set_title(ngram_name)
    plt.show()

In [None]:
# 30 most frequent single words in the cleaned lyrics, as a (Text, Count) table.
unigrams = count_ngrams(lyrics_df["Lyrics Cleaned"], 1, 30)
top_unigram = pd.DataFrame(unigrams, columns=['Text', "Count"])
top_unigram.head()

In [None]:
# Bar chart of the top unigrams.
plot_ngrams(top_unigram, "Unigrams")

In [None]:
# 30 most frequent two-word sequences in the cleaned lyrics.
bigrams = count_ngrams(lyrics_df["Lyrics Cleaned"], 2, 30)
top_bigram = pd.DataFrame(bigrams, columns=['Text', "Count"])
top_bigram.head()

In [None]:
# Bar chart of the top bigrams.
plot_ngrams(top_bigram, "Bigrams")

In [None]:
# 30 most frequent three-word sequences in the cleaned lyrics.
trigrams = count_ngrams(lyrics_df["Lyrics Cleaned"], 3, 30)
top_trigram = pd.DataFrame(trigrams, columns=['Text', "Count"])
top_trigram.head()

In [None]:
# Bar chart of the top trigrams.
plot_ngrams(top_trigram, "Trigrams")

# Tokenization and Sequence Preparation

In [None]:
def _add_sequence_markers(text):
    """Wrap ``text`` as "startseq <text> endseq" unless it is already wrapped."""
    text = str(text)
    # Guard against running this cell twice: without it every re-run would add
    # another pair of markers ("startseq startseq ... endseq endseq").
    if text.startswith("startseq ") and text.endswith(" endseq"):
        return text
    return "startseq " + text + " endseq"


# Mark the beginning and end of every line so the model can learn when to stop.
lyrics_df["Lyrics Cleaned"] = lyrics_df["Lyrics Cleaned"].apply(_add_sequence_markers)

In [None]:
# Length every input sequence is padded to; also the model's input width.
max_length = 25
# Column that preprocess_batch reads the training text from.
X_col = 'Lyrics Cleaned'
# Number of lyric lines taken from the frame per generator step.
batch_size = 16
# Passed to the tokenizer as num_words and used as the size of the embedding
# input, the one-hot targets and the softmax output.
vocab_size = 10000

In [None]:
# Build the word index from every cleaned line; unknown words map to "<OOV>".
# NOTE(review): the tokenizer is fitted before the train/test split further down,
# so the vocabulary is built from both splits — confirm this is intended.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(lyrics_df["Lyrics Cleaned"].tolist())

In [None]:
# Hold out 10% of the lines for validation; random_state fixes the split.
train_df, test_df = train_test_split(lyrics_df, test_size=0.1, random_state=42)

In [None]:
def preprocess_batch(batch, tokenizer, vocab_size, max_length, text_col=None):
    """Turn a batch of text rows into next-word training samples.

    Every line is tokenised and, for each position ``i`` (1 <= i < min(len, max_length)),
    one sample is produced: the first ``i`` tokens as input and token ``i`` as target.

    Parameters
    ----------
    batch : pandas.DataFrame
        Rows to convert.
    tokenizer
        Fitted tokenizer exposing ``texts_to_sequences``.
    vocab_size : int
        Width of the one-hot target vectors.
    max_length : int
        Length the input prefixes are padded to.
    text_col : str, optional
        Column holding the text. Defaults to the notebook-level ``X_col`` so
        existing four-argument calls keep working.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` of shape (samples, max_length) and one-hot ``y`` of shape
        (samples, vocab_size).
    """
    column = X_col if text_col is None else text_col
    sequences = tokenizer.texts_to_sequences(batch[column].tolist())

    prefixes, targets = [], []
    for seq in sequences:
        limit = min(len(seq), max_length)
        for i in range(1, limit):
            prefixes.append(seq[:i])
            targets.append(seq[i])

    # A batch with no usable line must still return 2-D arrays of the right width.
    if not prefixes:
        return (np.zeros((0, max_length), dtype="int32"),
                np.zeros((0, vocab_size), dtype="float32"))

    # Pad and one-hot encode once per batch instead of once per sample.
    X = pad_sequences(prefixes, maxlen=max_length)
    y = to_categorical(targets, num_classes=vocab_size)
    return X, y

In [None]:
def batch_generator(df, tokenizer, vocab_size, max_length, batch_size):
    """Yield ``(X, y)`` batches forever.

    Each pass reshuffles the rows, cuts them into slices of ``batch_size`` rows
    and converts every slice with ``preprocess_batch``.
    """
    total_rows = len(df)
    while True:
        # Reshuffle at the start of every pass over the data.
        df = df.sample(frac=1).reset_index(drop=True)
        for start in range(0, total_rows, batch_size):
            stop = start + batch_size
            rows = df.iloc[start:stop]
            yield preprocess_batch(rows, tokenizer, vocab_size, max_length)

In [None]:
# Endless batch generators for the training and the held-out split.
train_gen = batch_generator(train_df, tokenizer, vocab_size, max_length, batch_size)
test_gen = batch_generator(test_df, tokenizer, vocab_size, max_length, batch_size)

# Model

In [None]:
# Next-word model: padded token ids -> 64-d embedding -> bidirectional LSTM
# (100 units per direction) -> softmax over vocab_size classes.
input_layer = layers.Input(shape=(max_length,))
# NOTE(review): the embedding is created without mask_zero, so the zeros added by
# pad_sequences are fed to the LSTM as ordinary tokens — confirm this is intended.
embedding_layer = layers.Embedding(vocab_size, 64)(input_layer)
bilstm_layer = layers.Bidirectional(layers.LSTM(100))(embedding_layer)
output_layer = layers.Dense(vocab_size, activation='softmax')(bilstm_layer)

model = models.Model(inputs=[input_layer], outputs=output_layer)
# categorical_crossentropy matches the one-hot targets built by preprocess_batch.
model.compile(loss='categorical_crossentropy', 
              optimizer=optimizers.Adam(learning_rate=0.001))

In [None]:
# Layer-by-layer overview with output shapes and parameter counts.
model.summary()

In [None]:
# Diagram of the model graph with layer names and tensor shapes.
plot_model(model, show_layer_names=True, show_shapes=True)

# Train

In [None]:
# Stop once val_loss has not improved for 3 consecutive epochs and roll the model
# back to the weights of its best epoch. Without restore_best_weights the model
# would keep the weights from the last of those non-improving epochs.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, restore_best_weights=True)

In [None]:
# Ceiling division: batch_generator yields ceil(n / batch_size) batches per pass
# (the last one may be smaller). Floor division would drop that batch, so an
# epoch would never line up with one pass and each validation run would score a
# slightly different subset; it would also give 0 steps for a tiny frame.
steps_per_epoch = (len(train_df) + batch_size - 1) // batch_size
validation_steps = (len(test_df) + batch_size - 1) // batch_size

In [None]:
# Train for up to 20 epochs from the generators; the early-stopping callback may
# end training sooner. The returned history holds the per-epoch losses.
history = model.fit(train_gen, 
                    steps_per_epoch=steps_per_epoch, 
                    epochs=20, 
                    validation_data=test_gen, 
                    validation_steps=validation_steps, 
                    callbacks=[es])

# Results

In [None]:
# Training and validation loss per epoch on a single set of axes.
fig, ax = plt.subplots()
for metric in ("loss", "val_loss"):
    ax.plot(history.history[metric])
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend(["train", "valid"])
ax.set_title("Loss Curve")
plt.show()

# Greedy Search Decoding

In [None]:
def greedy_search_predictions(text, model, tokenizer, max_length):
    """Extend ``text`` one word at a time, always taking the most likely word.

    Generation runs for at most ``max_length`` steps and stops early when the
    predicted index has no word in ``tokenizer.index_word`` or the word is
    'endseq'. The returned string omits the "startseq", "endseq" and "<OOV>"
    tokens.
    """
    special_tokens = ("startseq", "endseq", "<OOV>")
    generated = "startseq " + text

    for _step in range(max_length):
        # Re-encode everything produced so far and pad it to the model's input width.
        token_ids = tokenizer.texts_to_sequences([generated])[0]
        model_input = pad_sequences([token_ids], max_length)
        probabilities = model.predict(model_input, verbose=0)
        best_index = np.argmax(probabilities, axis=1)[0]

        next_word = tokenizer.index_word.get(best_index, None)
        if not next_word or next_word == 'endseq':
            break

        generated = generated + " " + next_word

    kept = []
    for token in generated.split():
        if token not in special_tokens:
            kept.append(token)
    return " ".join(kept)

# Beam Search Decoding

In [None]:
def beam_search_predictions(text, beam_index = 3):
    """Extend ``text`` with beam search and return the best-scoring continuation.

    Uses the notebook-level ``model``, ``tokenizer`` and ``max_length``.

    Parameters
    ----------
    text : str
        Seed text the generated line starts with.
    beam_index : int, default 3
        Beam width: how many partial sequences are kept after every step.

    Returns
    -------
    str
        The highest-scoring sequence as text, without the "startseq", "endseq"
        and "<OOV>" tokens.

    Raises
    ------
    ValueError
        If ``beam_index`` is smaller than 1.
    """
    if beam_index < 1:
        raise ValueError("beam_index must be at least 1")

    in_text = "startseq " + text
    start_seq = tokenizer.texts_to_sequences([in_text])[0]
    end_id = tokenizer.word_index.get("endseq")

    # Each beam is [token_ids, cumulative_log_probability, finished].
    beams = [[start_seq, 0.0, False]]

    # One token is added per step until the sequences reach max_length tokens.
    for _ in range(max(0, max_length - len(start_seq))):
        candidates = []
        for seq, score, finished in beams:
            if finished:
                # A beam that already produced 'endseq' is carried over unchanged
                # instead of being extended with words after the end marker.
                candidates.append([seq, score, True])
                continue

            padded_seq = pad_sequences([seq], maxlen=max_length)
            preds = model.predict(padded_seq, verbose=0)[0]
            # Sequence likelihood is a product of probabilities, so scores are
            # accumulated as a sum of logs (clipped to avoid log(0)).
            log_preds = np.log(np.clip(preds, 1e-12, None))

            for w in np.argsort(preds)[-beam_index:]:
                w = int(w)
                # Index 0 (padding) and any other id without a word also end a beam.
                done = (w == end_id) or (w not in tokenizer.index_word)
                candidates.append([seq + [w], score + float(log_preds[w]), done])

        beams = sorted(candidates, key=lambda c: c[1])[-beam_index:]
        if all(beam[2] for beam in beams):
            break

    final_seq = beams[-1][0]
    skip_ids = {tokenizer.word_index.get(tok) for tok in ("startseq", "endseq", "<OOV>")}
    # Ids with no entry in index_word (e.g. padding index 0) are skipped rather
    # than raising KeyError.
    final_caption = ' '.join(
        tokenizer.index_word[i]
        for i in final_seq
        if i not in skip_ids and i in tokenizer.index_word
    )

    return final_caption

# Test

In [None]:
# Seed phrases used to compare the two decoding strategies.
sentences = ["neler olacak sonra", 
             "aman aman aman", 
             "ey ey ey"]

# Print the greedy and the beam-search continuation of every seed phrase.
for sentence in sentences:
    print("\nGreedy Search:", greedy_search_predictions(sentence, model, tokenizer, max_length))
    print("Beam Search:", beam_search_predictions(sentence))