## Imports

In [78]:
import numpy as np
import pandas as pd

## Data processing

In [105]:
# Location of the songs CSV, given relative to the notebook's working directory.
dataset_filepath = "../../public/src/dataset.csv"

# Load the CSV into the working frame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=dataset_filepath)
df.head(n=5)

Unnamed: 0,X_id,name,title,rank,publicationDate,language_detect,location.country
0,5714dec325ac0d8aee3804e7,A,Turn It Up,261631.0,1998-06-22,english,United Kingdom
1,5714dec325ac0d8aee3804e8,A,Foghorn,297455.0,1998-06-22,english,United Kingdom
2,5714dec325ac0d8aee3804e9,A,Cheeky Monkey,268232.0,1998-06-22,english,United Kingdom
3,5714dec325ac0d8aee3804ea,A,No. 1,308436.0,1998-06-22,english,United Kingdom
4,5714dec325ac0d8aee3804eb,A,Bad Idea,273805.0,1998-06-22,english,United Kingdom


In [106]:
# Country reference table; the file is semicolon-delimited.
countries_path = "../../public/src/countries_codes.csv"

# Columns removed from the country table right after loading.
columns_to_drop = [
    "Geo Shape",
    "geo_point_2d",
    "IS ILOMEMBER",
    "IS RECEIVING QUEST",
    "LABEL FR",
    "LABEL SP",
    "ONU CODE",
    "ISO2 CODE",
    "OFFICIAL LANG CODE",
]

# Read and prune in a single chain: the frame is rebuilt from disk on every run.
df_countries = pd.read_csv(countries_path, sep=";").drop(columns=columns_to_drop)

df_countries.head()

Unnamed: 0,ISO3 CODE,LABEL EN
0,YEM,Yemen
1,MY2,Malaysia: Sabah
2,IMN,Isle of Man
3,TCA,Turks and Caicos Islands
4,LBY,Libyan Arab Jamahiriya


In [107]:
# Renaming countries to fit both datasets.
# Keys are labels as spelled in the country table; values are the spellings
# they are rewritten to.
country_mapping = {
    'Russian Federation': 'Russia',
    'Saint Vincent and the Grenadines': 'Saint Vincent and The Grenadines',
    'Moldova, Republic of': 'Moldova',
    'Korea, Republic of': 'South Korea',
}

# Give the songs frame a 'country_name' column (renamed from 'location.country').
df = df.rename(columns={'location.country': 'country_name'})

# Same column name on the country table, then rewrite the labels listed in
# country_mapping; labels that are not keys of the mapping are left untouched.
df_countries = (
    df_countries
    .rename(columns={'LABEL EN': 'country_name'})
    .assign(country_name=lambda table: table['country_name'].replace(country_mapping))
)

In [110]:
# Adding the iso3 code of each country
# Expose the ISO3 code under the column name 'id'. When the column has already
# been renamed by an earlier run, rename() ignores the missing key.
df_countries = df_countries.rename(columns={'ISO3 CODE': 'id'})

# Drop any 'id' column left by a previous run of this cell before merging;
# without this, re-running the cell would yield 'id_x'/'id_y' instead of 'id'
# and break the later lookups on df["id"].
# how='left' keeps every row of df; rows whose country_name has no match in
# df_countries end up with a missing 'id'.
# NOTE(review): assumes country_name is unique in df_countries — a repeated
# label would duplicate rows of df here; confirm against the country table.
df = (
    df
    .drop(columns='id', errors='ignore')
    .merge(df_countries, on='country_name', how='left')
)

df.head()

Unnamed: 0,X_id,name,title,rank,publicationDate,language_detect,country_name,id
0,5714dec325ac0d8aee3804e7,A,Turn It Up,261631.0,1998-06-22,english,United Kingdom,GBR
1,5714dec325ac0d8aee3804e8,A,Foghorn,297455.0,1998-06-22,english,United Kingdom,GBR
2,5714dec325ac0d8aee3804e9,A,Cheeky Monkey,268232.0,1998-06-22,english,United Kingdom,GBR
3,5714dec325ac0d8aee3804ea,A,No. 1,308436.0,1998-06-22,english,United Kingdom,GBR
4,5714dec325ac0d8aee3804eb,A,Bad Idea,273805.0,1998-06-22,english,United Kingdom,GBR


In [111]:
# Rows of df whose 'id' is missing, i.e. whose country_name found no match
# in the country table during the left merge.
df_null_countries = df.loc[df["id"].isna()]

# Show how many such rows exist, then preview them.
print(df_null_countries.shape)
df_null_countries.head()

(0, 8)


Unnamed: 0,X_id,name,title,rank,publicationDate,language_detect,country_name,id


In [112]:
# Parse publicationDate into datetimes, then derive the calendar year from it.
# assign() evaluates its keyword arguments in order, so the 'year' lambda
# already sees the converted publicationDate column.
df = df.assign(
    publicationDate=lambda songs: pd.to_datetime(songs['publicationDate']),
    year=lambda songs: songs['publicationDate'].dt.year,
)

df.head()

Unnamed: 0,X_id,name,title,rank,publicationDate,language_detect,country_name,id,year
0,5714dec325ac0d8aee3804e7,A,Turn It Up,261631.0,1998-06-22,english,United Kingdom,GBR,1998
1,5714dec325ac0d8aee3804e8,A,Foghorn,297455.0,1998-06-22,english,United Kingdom,GBR,1998
2,5714dec325ac0d8aee3804e9,A,Cheeky Monkey,268232.0,1998-06-22,english,United Kingdom,GBR,1998
3,5714dec325ac0d8aee3804ea,A,No. 1,308436.0,1998-06-22,english,United Kingdom,GBR,1998
4,5714dec325ac0d8aee3804eb,A,Bad Idea,273805.0,1998-06-22,english,United Kingdom,GBR,1998


In [113]:
# One row per (year, country_name, id) group, with the number of rows of df
# in that group stored as 'song_count'.
df_songs_by_countries = (
    df
    .groupby(['year', 'country_name', 'id'])
    .size()
    .reset_index(name='song_count')
)

# For each country take the first year it appears in, then keep the latest of
# those first years: by that year every country has appeared at least once.
min_year = (
    df_songs_by_countries
    .groupby('country_name')['year']
    .min()
    .max()
)
print("The minimum year where every country is present is:", min_year)

df_songs_by_countries.head(10)

The minimum year where every country is present is: 2015


Unnamed: 0,year,country_name,id,song_count
0,1910,United States,USA,12
1,1951,United States,USA,1
2,1958,United States,USA,1
3,1960,United States,USA,2
4,1963,United States,USA,1
5,1970,United States,USA,6
6,1972,United States,USA,12
7,1976,United States,USA,1
8,1977,United States,USA,1
9,1980,United Kingdom,GBR,1


In [114]:
# Write the per-year, per-country song counts to CSV.
# index=False leaves the row index out of the file.
new_dataset_filepath = "../../public/src/choropleth_dataset.csv"
df_songs_by_countries.to_csv(path_or_buf=new_dataset_filepath, index=False)

## Getting the songs