In [2]:
import pandas as pd
import numpy as np
import json
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Turkish stopwords
with open('turkish_stopwords.json', 'r', encoding='utf-8') as f:
    stop_words = set(json.load(f))

In [4]:
def preprocess_text(text):
    """Preprocess the text"""
    text = text.lower()
    text = re.sub(r'[^a-zçğıöşü0-9\s]', '', text)  # Keep Turkish characters and spaces
    words = text.split()
    return ' '.join(word for word in words if word not in stop_words)

In [6]:
# Load the dataset
df = pd.read_csv('turkish_song_lyrics.csv')

# Preprocess lyrics
df['processed_lyrics'] = df['lyrics'].apply(preprocess_text)

In [8]:
# Apply TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=1000)  # Get the most important 1000 words
tfidf_matrix = tfidf.fit_transform(df['processed_lyrics'])

# Calculate average TF-IDF vector for each singer
singer_vectors = {}
for singer in df['singer'].unique():
    singer_mask = df['singer'] == singer
    singer_vectors[singer] = tfidf_matrix[singer_mask].mean(axis=0).A.flatten()

# Convert singer vectors to DataFrame
singer_df = pd.DataFrame.from_dict(singer_vectors, orient='index')
singer_df.columns = tfidf.get_feature_names_out()

In [10]:
# Find the most characteristic words for each singer
def get_top_words(row, n=10):
    return [(word, score) for word, score in sorted(zip(row.index, row), key=lambda x: x[1], reverse=True)[:n]]

singer_top_words = singer_df.apply(get_top_words, axis=1)

# Combine results in a new DataFrame
result_df = pd.DataFrame({
    'singer': df['singer'].unique(),
    'top_words': singer_top_words
})

In [12]:
# Calculate the usage rate of characteristic words for each song
def calculate_characteristic_word_usage(lyrics, top_words):
    words = set(lyrics.split())
    top_word_set = set(word for word, _ in top_words)
    return len(words.intersection(top_word_set)) / len(words) if words else 0

df['characteristic_word_usage'] = df.apply(lambda row: calculate_characteristic_word_usage(
    row['processed_lyrics'], 
    result_df[result_df['singer'] == row['singer']]['top_words'].iloc[0]
), axis=1)

In [14]:
# Save the results
output_df = df[['singer', 'lyrics', 'processed_lyrics', 'characteristic_word_usage']]
output_df = output_df.merge(result_df[['singer', 'top_words']], on='singer', how='left')
output_df = output_df.drop_duplicates(subset=['singer', 'lyrics'])

output_df.to_csv('processed_turkish_lyrics.csv', index=False)

In [16]:
# Print summary statistics
print(output_df['characteristic_word_usage'].describe())
print("\nSample singers and their most characteristic words:")
print(output_df[['singer', 'top_words']].groupby('singer').first().head())

count    4952.000000
mean        0.063765
std         0.036377
min         0.000000
25%         0.036340
50%         0.058824
75%         0.085714
max         0.294118
Name: characteristic_word_usage, dtype: float64

Sample singers and their most characteristic words:
                                                           top_words
singer                                                              
Ahmet Kaya         [(bir, 0.0954012355187354), (beni, 0.044687668...
Barış Manço        [(bir, 0.07542210759407574), (the, 0.041491916...
Büyük Ev Ablukada  [(bi, 0.10516404985444336), (gibi, 0.089592342...
Can Bonomo         [(bir, 0.07463970292919377), (sen, 0.069707164...
Candan Erçetin     [(je, 0.06657468480379185), (de, 0.05194479129...
