In [None]:
!pip3 install langdetect
from langdetect import DetectorFactory, detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, dendrogram
from matplotlib import pyplot as plt
import json
import re 
import numpy as np
import pandas as pd

In [2]:
# Load the two Kaggle-derived JSON files used throughout the notebook.
# artist_lyrics is later passed to the feature-extraction helpers and
# artist_genre is used for genre lookups by artist key.
with open("artist_lyrics_kaggle.json", mode="r", encoding='utf-8') as json_file:
  artist_lyrics = json.loads(json_file.read())

with open("artist_genre_kaggle.json", mode="r", encoding='utf-8') as json_file:
  artist_genre = json.loads(json_file.read())

In [3]:
def extract_artists_lyrics_of_file (json_file):
  """Return a copy of the mapping with every value reduced to ASCII-letter words.

  Each value is scanned for runs of the characters A-Z / a-z; the runs are
  joined with single spaces, so digits, punctuation and any other characters
  act as separators and are dropped.

  Args:
    json_file: dict whose values are strings (used in this notebook with the
      loaded artist -> lyrics mapping).

  Returns:
    A new dict with the same keys, in the same order, mapped to the cleaned text.
  """
  letters_only = re.compile('[A-Za-z]+')
  return {
    key: ' '.join(letters_only.findall(raw_text))
    for key, raw_text in json_file.items()
  }

def extract_genre_by_artist (json_file, artist):
  """Look up the entry stored for ``artist`` in the given mapping.

  Args:
    json_file: mapping indexed by artist key.
    artist: key to look up.

  Returns:
    The value stored under ``artist``.

  Raises:
    KeyError: if ``artist`` is not a key of a dict ``json_file``.
  """
  genre = json_file[artist]
  return genre

def load_stopwords_file(txt_file):
  """Read a stop-word list from a text file, one entry per line.

  Args:
    txt_file: path of the text file to read (opened in text mode with the
      platform default encoding, as before).

  Returns:
    list[str]: the file's lines without their line terminators; an empty
    list for an empty file.

  Raises:
    OSError: if the file cannot be opened.
  """
  # The context manager guarantees the handle is closed even if read() fails;
  # previously the file object was left open until garbage collection.
  with open(txt_file) as stopwords_file:
    stopwords_file_content = stopwords_file.read()
  return stopwords_file_content.splitlines()

def detect_language (content):
  """Return whatever langdetect's ``detect`` reports for ``content``.

  The result is expected to be a language code; see the ISO 639-1 list at
  https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes for their meaning.
  """
  language_code = detect(content)
  return language_code

def evaluate_tfidf_matrix (stop_words, max_features, lyrics):
  """Fit a TF-IDF vectorizer on ``lyrics`` and return the matrix and vocabulary.

  Args:
    stop_words: forwarded to ``TfidfVectorizer(stop_words=...)``.
    max_features: forwarded to ``TfidfVectorizer(max_features=...)``.
    lyrics: iterable of documents (strings) to vectorize.

  Returns:
    tuple: ``(lyrics_tfidf_matrix, feature_names)`` where the first item is
    the result of ``fit_transform`` and the second is a list with the
    vectorizer's feature names.
  """
  vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
  lyrics_tfidf_matrix = vectorizer.fit_transform(lyrics)
  # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
  # Prefer the replacement and fall back for older installations; list(...)
  # keeps the return type a plain list as the old method produced.
  if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
  else:
    feature_names = vectorizer.get_feature_names()
  return lyrics_tfidf_matrix, feature_names

def extract_features_by_artist (json_content_artist_lyrics,stop_words,max_features):
  """Pair every artist key with the dense TF-IDF vector of its cleaned lyrics.

  Args:
    json_content_artist_lyrics: artist -> lyrics mapping; cleaned with
      extract_artists_lyrics_of_file before vectorizing.
    stop_words: passed through to evaluate_tfidf_matrix.
    max_features: passed through to evaluate_tfidf_matrix.

  Returns:
    A ``zip`` iterator of ``(artist, feature_vector)`` pairs. Being an
    iterator, it can be consumed only once.
  """
  cleaned_lyrics = extract_artists_lyrics_of_file(json_content_artist_lyrics)
  artists, lyrics = zip(*cleaned_lyrics.items())
  tfidf_matrix, _ = evaluate_tfidf_matrix(stop_words, max_features, lyrics)
  dense_rows = tfidf_matrix.toarray()
  return zip(artists, dense_rows)

def extract_features_by_genre (json_content_artist_lyrics,json_content_artist_genre,stop_words,max_features):
  """Pair every genre with the dense TF-IDF vector of its combined lyrics.

  The cleaned lyrics of all artists that share a genre are concatenated
  (single space between artists, in iteration order) into one document per
  genre before vectorizing.

  Args:
    json_content_artist_lyrics: artist -> lyrics mapping.
    json_content_artist_genre: artist -> genre mapping; must contain every
      artist present in the lyrics mapping.
    stop_words: passed through to evaluate_tfidf_matrix.
    max_features: passed through to evaluate_tfidf_matrix.

  Returns:
    A ``zip`` iterator of ``(genre, feature_vector)`` pairs, consumable once.
  """
  chunks_by_genre = {}
  cleaned_lyrics = extract_artists_lyrics_of_file(json_content_artist_lyrics)
  for artist, artist_text in cleaned_lyrics.items():
    genre = extract_genre_by_artist(json_content_artist_genre, artist)
    chunks_by_genre.setdefault(genre, []).append(artist_text)
  # One space-joined document per genre, genres kept in first-seen order.
  document_by_genre = {genre: " ".join(chunks) for genre, chunks in chunks_by_genre.items()}
  genres, lyrics = zip(*document_by_genre.items())
  tfidf_matrix, _ = evaluate_tfidf_matrix(stop_words, max_features, lyrics)
  return zip(genres, tfidf_matrix.toarray())

def apply_hierarchical_clustering (json_content_artist_lyrics,stop_words,max_features):
  """Cluster artists by their TF-IDF lyric vectors and draw the dendrogram.

  Args:
    json_content_artist_lyrics: artist -> lyrics mapping.
    stop_words: passed through to evaluate_tfidf_matrix.
    max_features: passed through to evaluate_tfidf_matrix.

  Returns:
    None. The result is shown via plot_dendrogram.
  """
  cleaned_lyrics = extract_artists_lyrics_of_file(json_content_artist_lyrics)
  artists, lyrics = zip(*cleaned_lyrics.items())
  tfidf_matrix, _ = evaluate_tfidf_matrix(stop_words, max_features, lyrics)
  dense_vectors = tfidf_matrix.toarray()
  # Ward linkage over the dense per-artist vectors.
  plot_dendrogram(linkage(dense_vectors, method='ward'), artists)

def plot_dendrogram(clustered, artists):
  """Draw a left-oriented dendrogram for a linkage result and show it.

  Args:
    clustered: linkage matrix as returned by ``scipy.cluster.hierarchy.linkage``.
    artists: sequence of leaf labels, one per clustered observation.

  Returns:
    None. The figure is displayed with ``plt.show()``.
  """
  plt.figure(figsize=(10, 25))
  plt.title('Hierarchical Clustering Dendrogram')
  plt.xlabel('Distance')  
  plt.ylabel('Artists')
  dendrogram(clustered,leaf_font_size=8.,labels = artists,orientation = 'left')
  # tight_layout only accounts for artists that already exist, so it must run
  # after the dendrogram (and its leaf labels) has been drawn; calling it
  # beforehand, as the code used to, left the labels out of the layout.
  plt.tight_layout()
  plt.show()
  
def apply_pca (json_content_artist_lyrics,stop_words,max_features,json_content_artist_genre=None):
  """Project per-artist TF-IDF vectors onto two principal components and plot them.

  Args:
    json_content_artist_lyrics: artist -> lyrics mapping.
    stop_words: passed through to evaluate_tfidf_matrix.
    max_features: passed through to evaluate_tfidf_matrix.
    json_content_artist_genre: optional artist -> genre mapping used to label
      the points. When omitted, the notebook-level ``artist_genre`` mapping is
      used, which is what this function always did before the parameter
      existed.

  Returns:
    None. The scatter plot is produced by scatter_pca.
  """
  if json_content_artist_genre is None:
    # Backward-compatible default: fall back to the module-level mapping
    # instead of silently depending on it in every call.
    json_content_artist_genre = artist_genre
  artists, lyrics = zip(*extract_artists_lyrics_of_file(json_content_artist_lyrics).items())
  genre_targets_df = pd.DataFrame(
    [extract_genre_by_artist(json_content_artist_genre, artist) for artist in artists],
    columns=['target'],
  )
  lyrics_tfidf_matrix, _ = evaluate_tfidf_matrix(stop_words,max_features,lyrics)
  pca = PCA(n_components=2)
  principal_components = pca.fit_transform(lyrics_tfidf_matrix.toarray())
  principal_components_df = pd.DataFrame(data = principal_components,columns = ['PC1', 'PC2'])
  # Rows of both frames follow the order of `artists`, so a column-wise concat lines them up.
  principal_components_targets_df = pd.concat([principal_components_df, genre_targets_df], axis = 1)
  scatter_pca(principal_components_targets_df)
  
def scatter_pca(dataframe):
  """Scatter-plot PC1 against PC2, one colour per genre.

  Args:
    dataframe: frame with the columns 'PC1', 'PC2' and 'target'. Only rows
      whose 'target' equals one of the genres listed below are drawn.

  Returns:
    None. Draws on a newly created 8x8 matplotlib figure.
  """
  # Genre -> marker colour; dict order also fixes the legend order.
  genre_palette = {
    'Metal': "#c3618c",
    'Hip-Hop': "#b45ac2",
    'Electronic': "#c8ac42",
    'R&B': "#7178ca",
    'Country': "#cb7140",
    'Folk': "#4bafd0",
    'Pop': "#d0454e",
    'Indie': "#52a674",
    'Rock': "#877f3a",
    'Jazz': "#5ac2bd",
  }
  ax = plt.figure(figsize = (8,8)).add_subplot(1,1,1)
  ax.set_title('Principle Component Analysis', fontsize = 20)
  ax.set_xlabel('PC1', fontsize = 15)
  ax.set_ylabel('PC2', fontsize = 15)
  for genre, hex_color in genre_palette.items():
    genre_rows = dataframe[dataframe['target'] == genre]
    ax.scatter(genre_rows['PC1'], genre_rows['PC2'], c = hex_color, s = 50)
  ax.legend(list(genre_palette))
  ax.grid()

In [None]:
# Per-artist TF-IDF vectors with the vectorizer's max_features set to 20.
artist_features_collection = extract_features_by_artist(
  json_content_artist_lyrics=artist_lyrics,
  stop_words=load_stopwords_file("stopwords.txt"),
  max_features=20,
)
for artist_features in artist_features_collection:
  print(artist_features)

In [None]:
# Per-genre TF-IDF vectors with the vectorizer's max_features set to 20.
genre_features_collection = extract_features_by_genre(
  json_content_artist_lyrics=artist_lyrics,
  json_content_artist_genre=artist_genre,
  stop_words=load_stopwords_file("stopwords.txt"),
  max_features=20,
)
for genre_features in genre_features_collection:
  print(genre_features)

In [None]:
# Dendrogram of artists clustered on their TF-IDF lyric vectors (max_features=20).
apply_hierarchical_clustering(
  json_content_artist_lyrics=artist_lyrics,
  stop_words=load_stopwords_file("stopwords.txt"),
  max_features=20,
)

In [None]:
# Two-component PCA scatter of the per-artist TF-IDF vectors (max_features=20).
apply_pca(
  json_content_artist_lyrics=artist_lyrics,
  stop_words=load_stopwords_file("stopwords.txt"),
  max_features=20,
)