In [18]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset

In [20]:
# Load the pre-processed lyrics table. Later cells read its
# 'processed_lyrics', 'singer' and 'lyrics' columns.
lyrics_csv_path = 'processed_turkish_lyrics.csv'
df = pd.read_csv(lyrics_csv_path)

In [22]:
# Tokenizer and 2-label sequence-classification model, both loaded from the
# same checkpoint name.
# NOTE(review): the load warning captured in this cell's output reports that
# 'classifier.bias' and 'classifier.weight' are newly initialized, i.e. the
# classification head is untrained. Scores produced by this model should not
# be read as real sentiment until a fine-tuned checkpoint is used -- confirm.
checkpoint_name = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)
# Switch to evaluation mode; as the cell's last expression this also makes
# the notebook display the module repr.
model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [24]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Move the model's parameters to the chosen device (last expression, so the
# notebook shows the module repr).
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [26]:
class LyricsDataset(Dataset):
    """Minimal map-style dataset that exposes a collection of texts by position.

    The input is copied into a plain list on construction, so ``dataset[i]``
    is always the i-th item in iteration order. Without the copy, a pandas
    Series with a non-default index would be indexed by label (raising
    ``KeyError`` for missing labels), and one-shot iterables such as
    generators would not support ``len()``.
    """

    def __init__(self, texts):
        """Store ``texts`` (any iterable) as a list.

        Args:
            texts: Iterable of items to serve; consumed once and copied.
        """
        self.texts = list(texts)

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text at position ``idx``."""
        return self.texts[idx]

In [28]:
def predict_sentiment_batch(texts, batch_size=32):
    """Score texts with the module-level ``model`` and return class-1 probabilities.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device``.
    Texts are wrapped in a ``LyricsDataset`` and fed through a ``DataLoader``
    without shuffling, so results come back in input order.

    Args:
        texts: Collection of strings to score.
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        A list with one numpy scalar per input text: the softmax probability
        of class index 1.

    NOTE(review): the original comment calls class 1 "positive sentiment".
    The model-loading cell's output warns that the classifier head is newly
    initialized, so that interpretation only holds once the head has been
    trained -- confirm.
    """
    loader = DataLoader(LyricsDataset(texts), batch_size=batch_size)
    class_one_probs = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for text_batch in loader:
            # Pad to the longest text in the batch and truncate at 512 tokens.
            encoded = tokenizer(text_batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
            logits = model(**on_device).logits
            batch_probs = torch.softmax(logits, dim=-1)
            # Keep only the class-1 column, moved back to host memory.
            class_one = batch_probs[:, 1].cpu().numpy()
            class_one_probs.extend(class_one)

    return class_one_probs

In [30]:
# Score every processed lyric and store the result as a new column on df.
lyric_texts = df['processed_lyrics'].tolist()
df['sentiment_score'] = predict_sentiment_batch(lyric_texts)

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import json

# TF-IDF features over the processed lyrics, with the vocabulary capped at
# 1000 terms. The sparse matrix is kept for topic modelling; a dense
# DataFrame (one column per term) is built for per-singer averaging.
tfidf = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['processed_lyrics'])
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [36]:
# Topic modelling: fit a 5-component LDA (fixed random_state for
# reproducibility) on the TF-IDF matrix and keep the per-document topic
# weights in columns named Topic_1 .. Topic_N.
# NOTE(review): LDA is fit on TF-IDF weights here; scikit-learn's examples
# fit it on raw term counts -- confirm this is intended.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_output = lda.fit_transform(tfidf_matrix)
topic_names = [f"Topic_{k}" for k in range(1, lda.n_components + 1)]
topic_df = pd.DataFrame(data=lda_output, columns=topic_names)

In [38]:
# Place the original columns, the TF-IDF term columns and the topic-weight
# columns side by side. Rows are aligned on the index of each frame.
df_analyzed = pd.concat(objs=[df, tfidf_df, topic_df], axis="columns")

In [40]:
# Calculate the mean sentiment score and mean topic weights for each singer.
# The aggregation spec is built from topic_names so it always matches the
# topic columns produced by the LDA cell, instead of hard-coding
# Topic_1..Topic_5 (which would break if n_components changed).
singer_mean_spec = {'sentiment_score': 'mean'}
singer_mean_spec.update({topic: 'mean' for topic in topic_names})
singer_analysis = df_analyzed.groupby('singer').agg(singer_mean_spec).reset_index()

In [42]:
# For each singer, average every TF-IDF term column over that singer's rows,
# take the 10 terms with the highest mean weight, and store them as a
# comma-separated string in singer_analysis['top_words'].
tfidf_columns = tfidf_df.columns
for singer in df_analyzed['singer'].unique():
    singer_rows = df_analyzed.loc[df_analyzed['singer'] == singer, tfidf_columns]
    strongest_terms = singer_rows.mean().nlargest(10)
    is_this_singer = singer_analysis['singer'] == singer
    singer_analysis.loc[is_this_singer, 'top_words'] = ', '.join(strongest_terms.index)

In [44]:
# Attach the per-singer aggregates to every song row. Columns present in
# both frames keep their name on the left (per-song) side and get an '_avg'
# suffix on the right (per-singer) side.
# NOTE(review): this cell overwrites df_analyzed, so re-running it on its own
# merges the already-merged frame again.
df_analyzed = pd.merge(
    df_analyzed,
    singer_analysis,
    on='singer',
    suffixes=('', '_avg'),
)

In [46]:
# Create a function to generate prompts and completions for LLaMA fine-tuning
def generate_llama_training_data(row, topics=None):
    """Build one prompt/completion record for LLaMA fine-tuning from a song row.

    The prompt lists the singer, the singer's average sentiment score, the
    singer's average weight for each topic, and the singer's characteristic
    words; the completion is the song's lyrics.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing 'singer',
            'sentiment_score_avg', '<topic>_avg' for every topic,
            'top_words' and 'lyrics'.
        topics: Optional iterable of topic column base names. Defaults to the
            notebook-level ``topic_names`` so ``df.apply(..., axis=1)`` keeps
            working unchanged.

    Returns:
        dict with keys 'prompt' and 'completion'.
    """
    if topics is None:
        topics = topic_names

    # Built outside the prompt f-string: nesting the same quote character
    # inside an f-string is a SyntaxError before Python 3.12.
    theme_scores = ', '.join(
        f"{topic}: {row[topic + '_avg']:.2f}" for topic in topics
    )

    prompt = (f"Şarkıcı: {row['singer']}\n"
              f"Duygu Skoru: {row['sentiment_score_avg']:.2f}\n"
              f"Tema Skorları: {theme_scores}\n"
              f"Karakteristik Kelimeler: {row['top_words']}\n\n"
              f"Şarkı Sözü:")
    return {'prompt': prompt, 'completion': row['lyrics']}

In [48]:
# Build one prompt/completion dict per row of the merged frame and collect
# them into a plain Python list.
training_rows = df_analyzed.apply(generate_llama_training_data, axis="columns")
llama_training_data = training_rows.tolist()

In [50]:
# Write the training records as JSON Lines: one JSON object per line, UTF-8
# encoded, with non-ASCII characters written as-is rather than escaped.
with open('llama_fine_tuning_data.jsonl', 'w', encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(item, ensure_ascii=False) + '\n'
        for item in llama_training_data
    )

In [52]:
# Persist the merged per-song analysis frame to CSV without the index column.
df_analyzed.to_csv(path_or_buf='lyrics_full_analysis_for_llama.csv', index=False)