## Imports

In [154]:
import numpy as np
import pandas as pd

## Data processing

In [155]:
dataset_filepath = "../../public/src/dataset.csv"
geo_data_filepath = "../../public/src/geo_data.csv"

df = pd.read_csv(dataset_filepath)
df_geo_data = pd.read_csv(geo_data_filepath)

df.head()

Unnamed: 0,X_id,name,title,rank,publicationDate,language_detect,location.country
0,5714dec325ac0d8aee3804e7,A,Turn It Up,261631.0,1998-06-22,english,United Kingdom
1,5714dec325ac0d8aee3804e8,A,Foghorn,297455.0,1998-06-22,english,United Kingdom
2,5714dec325ac0d8aee3804e9,A,Cheeky Monkey,268232.0,1998-06-22,english,United Kingdom
3,5714dec325ac0d8aee3804ea,A,No. 1,308436.0,1998-06-22,english,United Kingdom
4,5714dec325ac0d8aee3804eb,A,Bad Idea,273805.0,1998-06-22,english,United Kingdom


In [156]:
df_geo_data.head()


Unnamed: 0.1,Unnamed: 0,x,y
0,Aruba,-69.982675,12.520889
1,Afghanistan,66.004731,33.835232
2,Angola,17.537361,-12.293361
3,Anguilla,-63.064985,18.223967
4,Albania,20.04983,41.142451


In [157]:
# converting publicationDate into a date format
df['publicationDate'] = pd.to_datetime(df['publicationDate'])

# extracting the year
df['year'] = df['publicationDate'].dt.year

df.head()

Unnamed: 0,X_id,name,title,rank,publicationDate,language_detect,location.country,year
0,5714dec325ac0d8aee3804e7,A,Turn It Up,261631.0,1998-06-22,english,United Kingdom,1998
1,5714dec325ac0d8aee3804e8,A,Foghorn,297455.0,1998-06-22,english,United Kingdom,1998
2,5714dec325ac0d8aee3804e9,A,Cheeky Monkey,268232.0,1998-06-22,english,United Kingdom,1998
3,5714dec325ac0d8aee3804ea,A,No. 1,308436.0,1998-06-22,english,United Kingdom,1998
4,5714dec325ac0d8aee3804eb,A,Bad Idea,273805.0,1998-06-22,english,United Kingdom,1998


In [158]:
# Counting the number of songs produced by a country in a year
df_songs_by_countries = df.groupby(['year', 'location.country']).size().reset_index(name='song_count')

# Getting the min year where every country is present
min_year = df_songs_by_countries.groupby('location.country')['year'].min().max()
print("The minimum year where every country is present is:", min_year)

# Adding the position of each country
df_geo_data.rename(columns={'Unnamed: 0': 'country_name'}, inplace=True)
df_songs_by_countries.rename(columns={'location.country': 'country_name'}, inplace=True)
df_songs_by_countries = df_songs_by_countries.merge(df_geo_data, on='country_name', how='left')

df_songs_by_countries.head(10)

The minimum year where every country is present is: 2015


Unnamed: 0,year,country_name,song_count,x,y
0,1910,United States,12,-112.461671,45.679552
1,1951,United States,1,-112.461671,45.679552
2,1958,United States,1,-112.461671,45.679552
3,1960,United States,2,-112.461671,45.679552
4,1963,United States,1,-112.461671,45.679552
5,1970,United States,6,-112.461671,45.679552
6,1972,United States,12,-112.461671,45.679552
7,1976,United States,1,-112.461671,45.679552
8,1977,United States,1,-112.461671,45.679552
9,1980,United Kingdom,1,-2.865634,54.123872


In [159]:
new_dataset_filepath = "../../public/src/bubble_dataset.csv"
df_songs_by_countries.to_csv(new_dataset_filepath, index=False)

In [160]:
def get_top_3_songs(group):
    top_songs = group.sort_values(by='rank', ascending=True).head(3)
    return top_songs[['name', 'title']]

grouped = df[df['year'] == 1910].groupby(['year', 'location.country'])

df_top_songs = grouped.apply(get_top_3_songs)

print(df_top_songs.shape)
df_top_songs.head()


(3, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name,title
year,location.country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1910,United States,20856,Stacey Kent,I Won't Dance
1910,United States,20855,Stacey Kent,They Can't Take That Away From Me
1910,United States,20854,Stacey Kent,Let Yourself Go


In [161]:
def get_top_3_songs(group):
    top_songs = group.sort_values(by='rank', ascending=True).head(3)
    print(top_songs)
    return top_songs[['name', 'title']]

grouped = df[df['year'] == 1910].groupby(['year', 'location.country'])

grouped.apply(get_top_3_songs)

                           X_id         name  \
20856  5714dee525ac0d8aee5201b3  Stacey Kent   
20855  5714dee525ac0d8aee5201b2  Stacey Kent   
20854  5714dee525ac0d8aee5201b1  Stacey Kent   

                                   title      rank publicationDate  \
20856                      I Won't Dance  280840.0      1910-01-01   
20855  They Can't Take That Away From Me  291256.0      1910-01-01   
20854                    Let Yourself Go  294589.0      1910-01-01   

      language_detect location.country  year  
20856         english    United States  1910  
20855         english    United States  1910  
20854         english    United States  1910  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name,title
year,location.country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1910,United States,20856,Stacey Kent,I Won't Dance
1910,United States,20855,Stacey Kent,They Can't Take That Away From Me
1910,United States,20854,Stacey Kent,Let Yourself Go


In [162]:
# Adding the top 3 most famous musics of a year for a country


grouped = df.groupby(['year', 'location.country'])

df_top_songs = grouped.apply(get_top_songs).reset_index()
df_top_songs.rename(columns={'location.country': 'country_name'}, inplace=True)


# Rename the columns to music_1, music_2, and music_3
df_top_songs.rename(columns={'name': 'artist_1', 'title': 'music_1'}, inplace=True)
df_top_songs.rename(columns={'name': 'artist_2', 'title': 'music_2'}, inplace=True)
df_top_songs.rename(columns={'name': 'artist_3', 'title': 'music_3'}, inplace=True)

print(df_top_songs.head())


df_songs_by_countries.reset_index(drop=True, inplace=True)
df_songs_by_countries = df_songs_by_countries.merge(df_top_songs, on=['year', 'country_name'], how='left')

df_songs_by_countries.head()

   year   country_name  level_2     artist_1  \
0  1910  United States    20856  Stacey Kent   
1  1910  United States    20855  Stacey Kent   
2  1910  United States    20854  Stacey Kent   
3  1951  United States    20375  Pete Seeger   
4  1958  United States    20377  Pete Seeger   

                             music_1  
0                      I Won't Dance  
1  They Can't Take That Away From Me  
2                    Let Yourself Go  
3                Go Tell Aunt Rhodie  
4                        Down-a-Down  


Unnamed: 0,year,country_name,song_count,x,y,level_2,artist_1,music_1
0,1910,United States,12,-112.461671,45.679552,20856,Stacey Kent,I Won't Dance
1,1910,United States,12,-112.461671,45.679552,20855,Stacey Kent,They Can't Take That Away From Me
2,1910,United States,12,-112.461671,45.679552,20854,Stacey Kent,Let Yourself Go
3,1951,United States,1,-112.461671,45.679552,20375,Pete Seeger,Go Tell Aunt Rhodie
4,1958,United States,1,-112.461671,45.679552,20377,Pete Seeger,Down-a-Down


In [163]:
df_top_songs.head()

Unnamed: 0,year,country_name,level_2,artist_1,music_1
0,1910,United States,20856,Stacey Kent,I Won't Dance
1,1910,United States,20855,Stacey Kent,They Can't Take That Away From Me
2,1910,United States,20854,Stacey Kent,Let Yourself Go
3,1951,United States,20375,Pete Seeger,Go Tell Aunt Rhodie
4,1958,United States,20377,Pete Seeger,Down-a-Down


In [164]:
df_songs_by_countries.head()

Unnamed: 0,year,country_name,song_count,x,y,level_2,artist_1,music_1
0,1910,United States,12,-112.461671,45.679552,20856,Stacey Kent,I Won't Dance
1,1910,United States,12,-112.461671,45.679552,20855,Stacey Kent,They Can't Take That Away From Me
2,1910,United States,12,-112.461671,45.679552,20854,Stacey Kent,Let Yourself Go
3,1951,United States,1,-112.461671,45.679552,20375,Pete Seeger,Go Tell Aunt Rhodie
4,1958,United States,1,-112.461671,45.679552,20377,Pete Seeger,Down-a-Down
