In [1]:
# known import statements
import pandas as pd
import sqlite3 as sql # note that we are renaming to sql
import os

# new import statement
import numpy as np

# Lecture 35 Pandas 3: Data Transformation
* Data transformation is the process of changing the format, structure, or values of data. 
* Often needed during data cleaning and sometimes during data analysis

# Today's Learning Objectives: 

* Setting column as index for pandas `DataFrame`
* Identify, drop, or fill missing values (`np.NaN`) using Pandas `isna`, `dropna`, and `fillna`
* Applying transformations to `DataFrame`:
  * Use `apply` on pandas `Series` to apply a transformation function
  * Use `replace` to replace all target values in Pandas `Series` and `DataFrame` rows / columns
* Filter, aggregate, group, and summarize information in a `DataFrame` with `groupby`
* Convert .groupby examples to SQL
* Solving the same question using SQL and pandas `DataFrame` manipulations:
  * filtering, grouping, and aggregation / summarization

# The dataset: Spotify songs
Adapted from https://www.kaggle.com/datasets/mrmorj/dataset-of-songs-in-spotify.

If you are interested in digging deeper in this dataset, here's a [blog post](https://medium.com/@boplantinga/what-do-spotifys-audio-features-tell-us-about-this-year-s-eurovision-song-contest-66ad188e112a) that explain each column in details.  

### WARMUP 1: Establish a connection to the spotify.db database

In [2]:
# open up the spotify database
db_pathname = "spotify.db"
assert os.path.exists(db_pathname)
conn = sql.connect(db_pathname)

In [3]:
def qry(sql):
    return pd.read_sql(sql, conn)

### WARMUP 2: Identify the table name(s) inside the database

In [4]:
df = qry("SELECT * from sqlite_master")
df

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,spotify,spotify,1527,"CREATE TABLE spotify(\nid TEXT PRIMARY KEY,\nt..."
1,index,sqlite_autoindex_spotify_1,spotify,1528,


### WARMUP 3: Use pandas lookup expression to extract the "sql" column and display the full query using .iloc lookup

In [5]:
print(df["sql"].iloc[0])

CREATE TABLE spotify(
id TEXT PRIMARY KEY,
title BLOB,
song_name BLOB, 
genre TEXT,
duration_ms INTEGER, 
key INTEGER, 
mode INTEGER, 
time_signature INTEGER, 
tempo REAL,
acousticness REAL, 
danceability REAL, 
energy REAL, 
instrumentalness REAL, 
liveness REAL, 
loudness REAL, 
speechiness REAL, 
valence REAL)


### WARMUP 4: Store the data inside `spotify` table inside a variable called `df`

In [6]:
df = qry("SELECT * FROM spotify")
df

Unnamed: 0,id,title,song_name,genre,duration_ms,key,mode,time_signature,tempo,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence
0,7pgJBLVz5VmnL7uGHmRj6p,,Pathology,Dark Trap,224427,8,1,4,115.080,0.401000,0.719,0.493,0.000000,0.1180,-7.230,0.0794,0.1240
1,0vSWgAlfpye0WCGeNmuNhy,,Symbiote,Dark Trap,98821,5,1,4,218.050,0.013800,0.850,0.893,0.000004,0.3720,-4.783,0.0623,0.0391
2,7EL7ifncK2PWFYThJjzR25,,BRAINFOOD,Dark Trap,101172,8,1,4,189.938,0.187000,0.864,0.365,0.000000,0.1160,-10.219,0.0655,0.0478
3,1umsRbM7L4ju7rn9aU8Ju6,,Sacrifice,Dark Trap,96062,10,0,4,139.990,0.145000,0.767,0.576,0.000003,0.0968,-9.683,0.2560,0.1870
4,4SKqOHKYU5pgHr5UiVKiQN,,Backpack,Dark Trap,135079,5,1,4,128.014,0.007700,0.765,0.726,0.000000,0.6190,-5.580,0.1910,0.2700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35872,46bXU7Sgj7104ZoXxzz9tM,Euphoric Hardstyle,,hardstyle,269208,4,1,4,150.013,0.031500,0.528,0.693,0.000345,0.1210,-5.148,0.0304,0.3940
35873,0he2ViGMUO3ajKTxLOfWVT,Greatest Hardstyle Playlist,,hardstyle,210112,0,0,4,149.928,0.022500,0.517,0.768,0.000018,0.2050,-7.922,0.0479,0.3830
35874,72DAt9Lbpy9EUS29OzQLob,Best of Hardstyle 2020,,hardstyle,234823,8,1,4,154.935,0.026000,0.361,0.821,0.000242,0.3850,-3.102,0.0505,0.1240
35875,6HXgExFVuE1c3cq9QjFCcU,Euphoric Hardstyle,,hardstyle,323200,6,0,4,150.042,0.000551,0.477,0.921,0.029600,0.0575,-4.777,0.0392,0.4880


### Setting a column as row indices for the `DataFrame`

- Syntax: `df.set_index("<COLUMN>")`
- Returns a new DataFrame object instance reference.
- WARNING: executing this twice will result in `KeyError` being thrown. Once you set a column as row index, it will no longer be a column within the `DataFrame`. If you tried this, go back and execute the above cell and update `df` once more and then execute the below cell exactly once.

In [7]:
# Set the id column as row indices
df = df.set_index("id")
df

Unnamed: 0_level_0,title,song_name,genre,duration_ms,key,mode,time_signature,tempo,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7pgJBLVz5VmnL7uGHmRj6p,,Pathology,Dark Trap,224427,8,1,4,115.080,0.401000,0.719,0.493,0.000000,0.1180,-7.230,0.0794,0.1240
0vSWgAlfpye0WCGeNmuNhy,,Symbiote,Dark Trap,98821,5,1,4,218.050,0.013800,0.850,0.893,0.000004,0.3720,-4.783,0.0623,0.0391
7EL7ifncK2PWFYThJjzR25,,BRAINFOOD,Dark Trap,101172,8,1,4,189.938,0.187000,0.864,0.365,0.000000,0.1160,-10.219,0.0655,0.0478
1umsRbM7L4ju7rn9aU8Ju6,,Sacrifice,Dark Trap,96062,10,0,4,139.990,0.145000,0.767,0.576,0.000003,0.0968,-9.683,0.2560,0.1870
4SKqOHKYU5pgHr5UiVKiQN,,Backpack,Dark Trap,135079,5,1,4,128.014,0.007700,0.765,0.726,0.000000,0.6190,-5.580,0.1910,0.2700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46bXU7Sgj7104ZoXxzz9tM,Euphoric Hardstyle,,hardstyle,269208,4,1,4,150.013,0.031500,0.528,0.693,0.000345,0.1210,-5.148,0.0304,0.3940
0he2ViGMUO3ajKTxLOfWVT,Greatest Hardstyle Playlist,,hardstyle,210112,0,0,4,149.928,0.022500,0.517,0.768,0.000018,0.2050,-7.922,0.0479,0.3830
72DAt9Lbpy9EUS29OzQLob,Best of Hardstyle 2020,,hardstyle,234823,8,1,4,154.935,0.026000,0.361,0.821,0.000242,0.3850,-3.102,0.0505,0.1240
6HXgExFVuE1c3cq9QjFCcU,Euphoric Hardstyle,,hardstyle,323200,6,0,4,150.042,0.000551,0.477,0.921,0.029600,0.0575,-4.777,0.0392,0.4880


### Not a Number

- `np.NaN` is the floating point representation of Not a Number
- You do not need to know / learn the details about the `numpy` package 

### Replacing / modifying values within the `DataFrame`

Syntax: `df.replace(<TARGET>, <REPLACE>)`
- Your target can be `str`, `int`, `float`, `None` (there are other possiblities, but those are too advanced for this course)
- Returns a new DataFrame object instance reference.

Let's now replace the missing values (empty strings) with `np.NAN`

In [8]:
df = df.replace("", np.NaN)
df.head(10) # title is the album name

Unnamed: 0_level_0,title,song_name,genre,duration_ms,key,mode,time_signature,tempo,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7pgJBLVz5VmnL7uGHmRj6p,,Pathology,Dark Trap,224427,8,1,4,115.08,0.401,0.719,0.493,0.0,0.118,-7.23,0.0794,0.124
0vSWgAlfpye0WCGeNmuNhy,,Symbiote,Dark Trap,98821,5,1,4,218.05,0.0138,0.85,0.893,4e-06,0.372,-4.783,0.0623,0.0391
7EL7ifncK2PWFYThJjzR25,,BRAINFOOD,Dark Trap,101172,8,1,4,189.938,0.187,0.864,0.365,0.0,0.116,-10.219,0.0655,0.0478
1umsRbM7L4ju7rn9aU8Ju6,,Sacrifice,Dark Trap,96062,10,0,4,139.99,0.145,0.767,0.576,3e-06,0.0968,-9.683,0.256,0.187
4SKqOHKYU5pgHr5UiVKiQN,,Backpack,Dark Trap,135079,5,1,4,128.014,0.0077,0.765,0.726,0.0,0.619,-5.58,0.191,0.27
3uE1swbcRp5BrO64UNy6Ex,,TakingOutTheTrash,Dark Trap,192833,11,1,4,120.004,0.172,0.814,0.575,0.000291,0.109,-9.635,0.0635,0.288
3KJrwOuqiEwHq6QTreZT61,,Io sono qui,Dark Trap,180880,10,0,4,128.066,0.0987,0.812,0.813,0.00015,0.0758,-5.583,0.0984,0.348
4QhUXx4ON40TIBrZIlnIke,,Murder,Dark Trap,186261,0,1,4,114.956,0.0343,0.602,0.578,0.0,0.164,-5.61,0.0283,0.156
09320vyX4qHd4GjHIpy5w0,,High 'N Mighty,Dark Trap,124676,7,1,5,111.958,0.112,0.876,0.768,1.2e-05,0.283,-6.606,0.201,0.72
6xEnbXM1us9fDJy2LC0lru,,Bang Ya Fucking Head,Dark Trap,154929,1,1,4,125.013,0.0525,0.69,0.76,0.0,0.134,-5.431,0.0895,0.0797


### Checking for missing values

Syntax: `Series.isna()`
- Returns a boolean Series

Let's check if any of the "song_name"(s) are missing

In [9]:
df["song_name"].isna()

id
7pgJBLVz5VmnL7uGHmRj6p    False
0vSWgAlfpye0WCGeNmuNhy    False
7EL7ifncK2PWFYThJjzR25    False
1umsRbM7L4ju7rn9aU8Ju6    False
4SKqOHKYU5pgHr5UiVKiQN    False
                          ...  
46bXU7Sgj7104ZoXxzz9tM     True
0he2ViGMUO3ajKTxLOfWVT     True
72DAt9Lbpy9EUS29OzQLob     True
6HXgExFVuE1c3cq9QjFCcU     True
6MAAMZImxcvYhRnxDLTufD     True
Name: song_name, Length: 35877, dtype: bool

### Review: `Pandas.Series.value_counts()`
- Returns a new `Series` with unique values from the original `Series` as keys and the count of those unique values as values. 
- Return value `Series` is ordered using descending order of counts

In [10]:
# count the number of missing values for song name
df["song_name"].isna().value_counts()

False    18342
True     17535
Name: song_name, dtype: int64

### Missing value manipulation
Syntax: `df.fillna(<REPLACE>)`
- Returns a new DataFrame object instance reference.

In [11]:
# use .fillna to replace missing values
df["song_name"].fillna("No Song Name")

# to replace the original DataFrame's column, you need to explicitly update that object instance
df["song_name"] = df["song_name"].fillna("No Song Name")
df

Unnamed: 0_level_0,title,song_name,genre,duration_ms,key,mode,time_signature,tempo,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7pgJBLVz5VmnL7uGHmRj6p,,Pathology,Dark Trap,224427,8,1,4,115.080,0.401000,0.719,0.493,0.000000,0.1180,-7.230,0.0794,0.1240
0vSWgAlfpye0WCGeNmuNhy,,Symbiote,Dark Trap,98821,5,1,4,218.050,0.013800,0.850,0.893,0.000004,0.3720,-4.783,0.0623,0.0391
7EL7ifncK2PWFYThJjzR25,,BRAINFOOD,Dark Trap,101172,8,1,4,189.938,0.187000,0.864,0.365,0.000000,0.1160,-10.219,0.0655,0.0478
1umsRbM7L4ju7rn9aU8Ju6,,Sacrifice,Dark Trap,96062,10,0,4,139.990,0.145000,0.767,0.576,0.000003,0.0968,-9.683,0.2560,0.1870
4SKqOHKYU5pgHr5UiVKiQN,,Backpack,Dark Trap,135079,5,1,4,128.014,0.007700,0.765,0.726,0.000000,0.6190,-5.580,0.1910,0.2700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46bXU7Sgj7104ZoXxzz9tM,Euphoric Hardstyle,No Song Name,hardstyle,269208,4,1,4,150.013,0.031500,0.528,0.693,0.000345,0.1210,-5.148,0.0304,0.3940
0he2ViGMUO3ajKTxLOfWVT,Greatest Hardstyle Playlist,No Song Name,hardstyle,210112,0,0,4,149.928,0.022500,0.517,0.768,0.000018,0.2050,-7.922,0.0479,0.3830
72DAt9Lbpy9EUS29OzQLob,Best of Hardstyle 2020,No Song Name,hardstyle,234823,8,1,4,154.935,0.026000,0.361,0.821,0.000242,0.3850,-3.102,0.0505,0.1240
6HXgExFVuE1c3cq9QjFCcU,Euphoric Hardstyle,No Song Name,hardstyle,323200,6,0,4,150.042,0.000551,0.477,0.921,0.029600,0.0575,-4.777,0.0392,0.4880


### Dropping missing values
Syntax: `df.dropna()`
- Returns a new DataFrame object instance reference.

In [12]:
# .dropna will drop all rows that contain NaN in them
df.dropna()

Unnamed: 0_level_0,title,song_name,genre,duration_ms,key,mode,time_signature,tempo,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5LzAV6KfjN8VhWCedeygfY,Dirtybird Players,No Song Name,techhouse,197499,7,1,4,127.997,0.000957,0.806,0.950,0.920000,0.1130,-6.782,0.0811,0.580
3TsCb6ueD678XBJDiRrvhr,tech house,No Song Name,techhouse,206000,10,1,4,124.994,0.062300,0.729,0.978,0.908000,0.0353,-6.645,0.0420,0.778
6Y0Fy2buEis7bEOlG0QET1,Tech House Bangerz,No Song Name,techhouse,199839,4,0,4,124.006,0.019100,0.724,0.792,0.812000,0.1080,-8.555,0.0405,0.346
4EJI2XGViSQp6WscLKgYDD,tech house,No Song Name,techhouse,173861,8,1,4,125.031,0.053000,0.700,0.898,0.418000,0.5740,-6.099,0.2570,0.791
4x6VzOQTLIrkkCWcDPh5Y0,blanc | Tech House,No Song Name,techhouse,394960,8,0,4,127.029,0.000301,0.803,0.919,0.926000,0.1020,-8.667,0.0702,0.754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46bXU7Sgj7104ZoXxzz9tM,Euphoric Hardstyle,No Song Name,hardstyle,269208,4,1,4,150.013,0.031500,0.528,0.693,0.000345,0.1210,-5.148,0.0304,0.394
0he2ViGMUO3ajKTxLOfWVT,Greatest Hardstyle Playlist,No Song Name,hardstyle,210112,0,0,4,149.928,0.022500,0.517,0.768,0.000018,0.2050,-7.922,0.0479,0.383
72DAt9Lbpy9EUS29OzQLob,Best of Hardstyle 2020,No Song Name,hardstyle,234823,8,1,4,154.935,0.026000,0.361,0.821,0.000242,0.3850,-3.102,0.0505,0.124
6HXgExFVuE1c3cq9QjFCcU,Euphoric Hardstyle,No Song Name,hardstyle,323200,6,0,4,150.042,0.000551,0.477,0.921,0.029600,0.0575,-4.777,0.0392,0.488


### Review: `Pandas.Series.apply(...)`
Syntax: `Series.apply(<FUNCTION OBJECT REFERENCE>)`
- applies input function to every element of the Series.
- Returns a new `Series` object instance reference.

Let's apply transformation function to `mode` column `Series`:
- mode = 1 means major modality (sounds happy)
- mode = 0 means minor modality (sounds sad)

In [13]:
def replace_mode(m): 
    if m == 1: 
        return "major"
    else: 
        return "minor"

In [14]:
df["mode"].apply(replace_mode)

id
7pgJBLVz5VmnL7uGHmRj6p    major
0vSWgAlfpye0WCGeNmuNhy    major
7EL7ifncK2PWFYThJjzR25    major
1umsRbM7L4ju7rn9aU8Ju6    minor
4SKqOHKYU5pgHr5UiVKiQN    major
                          ...  
46bXU7Sgj7104ZoXxzz9tM    major
0he2ViGMUO3ajKTxLOfWVT    minor
72DAt9Lbpy9EUS29OzQLob    major
6HXgExFVuE1c3cq9QjFCcU    minor
6MAAMZImxcvYhRnxDLTufD    major
Name: mode, Length: 35877, dtype: object

### `lambda`

Let's write a `lambda` function instead of the `replace_mode` function

In [15]:
df["mode"].apply(lambda m: "major" if m == 1 else "minor")

id
7pgJBLVz5VmnL7uGHmRj6p    major
0vSWgAlfpye0WCGeNmuNhy    major
7EL7ifncK2PWFYThJjzR25    major
1umsRbM7L4ju7rn9aU8Ju6    minor
4SKqOHKYU5pgHr5UiVKiQN    major
                          ...  
46bXU7Sgj7104ZoXxzz9tM    major
0he2ViGMUO3ajKTxLOfWVT    minor
72DAt9Lbpy9EUS29OzQLob    major
6HXgExFVuE1c3cq9QjFCcU    minor
6MAAMZImxcvYhRnxDLTufD    major
Name: mode, Length: 35877, dtype: object

Typically transformed columns are added as new columns within the DataFrame.
Let's add a new `modified_mode` column.

In [16]:
df["modified_mode"] = df["mode"].apply(lambda m: "major" if m == 1 else "minor")
df

Unnamed: 0_level_0,title,song_name,genre,duration_ms,key,mode,time_signature,tempo,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,modified_mode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
7pgJBLVz5VmnL7uGHmRj6p,,Pathology,Dark Trap,224427,8,1,4,115.080,0.401000,0.719,0.493,0.000000,0.1180,-7.230,0.0794,0.1240,major
0vSWgAlfpye0WCGeNmuNhy,,Symbiote,Dark Trap,98821,5,1,4,218.050,0.013800,0.850,0.893,0.000004,0.3720,-4.783,0.0623,0.0391,major
7EL7ifncK2PWFYThJjzR25,,BRAINFOOD,Dark Trap,101172,8,1,4,189.938,0.187000,0.864,0.365,0.000000,0.1160,-10.219,0.0655,0.0478,major
1umsRbM7L4ju7rn9aU8Ju6,,Sacrifice,Dark Trap,96062,10,0,4,139.990,0.145000,0.767,0.576,0.000003,0.0968,-9.683,0.2560,0.1870,minor
4SKqOHKYU5pgHr5UiVKiQN,,Backpack,Dark Trap,135079,5,1,4,128.014,0.007700,0.765,0.726,0.000000,0.6190,-5.580,0.1910,0.2700,major
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46bXU7Sgj7104ZoXxzz9tM,Euphoric Hardstyle,No Song Name,hardstyle,269208,4,1,4,150.013,0.031500,0.528,0.693,0.000345,0.1210,-5.148,0.0304,0.3940,major
0he2ViGMUO3ajKTxLOfWVT,Greatest Hardstyle Playlist,No Song Name,hardstyle,210112,0,0,4,149.928,0.022500,0.517,0.768,0.000018,0.2050,-7.922,0.0479,0.3830,minor
72DAt9Lbpy9EUS29OzQLob,Best of Hardstyle 2020,No Song Name,hardstyle,234823,8,1,4,154.935,0.026000,0.361,0.821,0.000242,0.3850,-3.102,0.0505,0.1240,major
6HXgExFVuE1c3cq9QjFCcU,Euphoric Hardstyle,No Song Name,hardstyle,323200,6,0,4,150.042,0.000551,0.477,0.921,0.029600,0.0575,-4.777,0.0392,0.4880,minor


#### Let's go back to the original table from the SQL database

In [17]:
df = qry("SELECT * FROM spotify")
df

Unnamed: 0,id,title,song_name,genre,duration_ms,key,mode,time_signature,tempo,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence
0,7pgJBLVz5VmnL7uGHmRj6p,,Pathology,Dark Trap,224427,8,1,4,115.080,0.401000,0.719,0.493,0.000000,0.1180,-7.230,0.0794,0.1240
1,0vSWgAlfpye0WCGeNmuNhy,,Symbiote,Dark Trap,98821,5,1,4,218.050,0.013800,0.850,0.893,0.000004,0.3720,-4.783,0.0623,0.0391
2,7EL7ifncK2PWFYThJjzR25,,BRAINFOOD,Dark Trap,101172,8,1,4,189.938,0.187000,0.864,0.365,0.000000,0.1160,-10.219,0.0655,0.0478
3,1umsRbM7L4ju7rn9aU8Ju6,,Sacrifice,Dark Trap,96062,10,0,4,139.990,0.145000,0.767,0.576,0.000003,0.0968,-9.683,0.2560,0.1870
4,4SKqOHKYU5pgHr5UiVKiQN,,Backpack,Dark Trap,135079,5,1,4,128.014,0.007700,0.765,0.726,0.000000,0.6190,-5.580,0.1910,0.2700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35872,46bXU7Sgj7104ZoXxzz9tM,Euphoric Hardstyle,,hardstyle,269208,4,1,4,150.013,0.031500,0.528,0.693,0.000345,0.1210,-5.148,0.0304,0.3940
35873,0he2ViGMUO3ajKTxLOfWVT,Greatest Hardstyle Playlist,,hardstyle,210112,0,0,4,149.928,0.022500,0.517,0.768,0.000018,0.2050,-7.922,0.0479,0.3830
35874,72DAt9Lbpy9EUS29OzQLob,Best of Hardstyle 2020,,hardstyle,234823,8,1,4,154.935,0.026000,0.361,0.821,0.000242,0.3850,-3.102,0.0505,0.1240
35875,6HXgExFVuE1c3cq9QjFCcU,Euphoric Hardstyle,,hardstyle,323200,6,0,4,150.042,0.000551,0.477,0.921,0.029600,0.0575,-4.777,0.0392,0.4880


Extract just the "genre" and "duration_ms" columns from `df`.

In [18]:
df[["genre", "duration_ms"]]

Unnamed: 0,genre,duration_ms
0,Dark Trap,224427
1,Dark Trap,98821
2,Dark Trap,101172
3,Dark Trap,96062
4,Dark Trap,135079
...,...,...
35872,hardstyle,269208
35873,hardstyle,210112
35874,hardstyle,234823
35875,hardstyle,323200


### `Pandas.DataFrame.groupby(...)`

Syntax: `DataFrame.groupby(<COLUMN>)`
- Returns a `groupby` object instance reference
- Need to apply aggregation methods to use the return value of `groupby`

In [19]:
df[["genre", "duration_ms"]].groupby("genre")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe4e6a87670>

### What is the average duration for each genre ordered based on decreasing order of averages?
#### v1: using `df` (`pandas`) to answer the question

In [20]:
df[["genre", "duration_ms"]].groupby("genre").mean()

Unnamed: 0_level_0,duration_ms
genre,Unnamed: 1_level_1
Dark Trap,196059.938997
Emo,218370.989519
Hiphop,227885.028411
Pop,211558.05298
Rap,200816.798836
RnB,225628.556955
Trap Metal,145940.519467
Underground Rap,175506.191224
dnb,288860.138811
hardstyle,232828.626542


In [21]:
df[["genre", "duration_ms"]].groupby("genre").mean().sort_values(by = "duration_ms", ascending = False)

Unnamed: 0_level_0,duration_ms
genre,Unnamed: 1_level_1
psytrance,445770.492075
techno,399123.187453
techhouse,298395.587596
dnb,288860.138811
trance,288729.366262
hardstyle,232828.626542
Hiphop,227885.028411
RnB,225628.556955
trap,225149.277731
Emo,218370.989519


One way to check whether `groupby` works would be to use `value_counts` on the same column `Series`.

In [22]:
df["genre"].value_counts()

Underground Rap    4330
Dark Trap          3590
Hiphop             3027
trance             2804
psytrance          2650
techno             2646
dnb                2507
trap               2362
hardstyle          2351
techhouse          2209
RnB                1905
Trap Metal         1875
Emo                1622
Rap                1546
Pop                 453
Name: genre, dtype: int64

### What is the average duration for each genre ordered based on decreasing order of averages?
#### v2: using SQL query to answer the question

In [23]:
# SQL equivalent query of the above Pandas query
avg_duration_per_genre = qry("""
SELECT genre, AVG(duration_ms) as avg_duration
FROM spotify 
GROUP BY genre
ORDER BY avg_duration DESC
""")

# How can we get make the SQL query output to be exactly same as df.groupby?
avg_duration_per_genre = avg_duration_per_genre.set_index("genre")
avg_duration_per_genre

Unnamed: 0_level_0,avg_duration
genre,Unnamed: 1_level_1
psytrance,445770.492075
techno,399123.187453
techhouse,298395.587596
dnb,288860.138811
trance,288729.366262
hardstyle,232828.626542
Hiphop,227885.028411
RnB,225628.556955
trap,225149.277731
Emo,218370.989519


### What is the average speechiness for each mode, time signature pair?
#### v1: pandas

In [24]:
# use a list to indicate all the columns you want to groupby 
df[["mode", "time_signature", "speechiness"]].groupby(["mode", "time_signature"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,speechiness
mode,time_signature,Unnamed: 2_level_1
0,1,0.181224
0,3,0.121837
0,4,0.126688
0,5,0.20489
1,1,0.173138
1,3,0.129512
1,4,0.13917
1,5,0.220177


In [25]:
# SQL equivalent query of the above Pandas query
qry("""
SELECT mode, time_signature, AVG(speechiness) as avg_speechiness
FROM spotify 
GROUP BY mode, time_signature
""")

Unnamed: 0,mode,time_signature,avg_speechiness
0,0,1,0.181224
1,0,3,0.121837
2,0,4,0.126688
3,0,5,0.20489
4,1,1,0.173138
5,1,3,0.129512
6,1,4,0.13917
7,1,5,0.220177


### Self-practice

### Which songs have a tempo greater than 150 and what are their genre?

In [26]:
# v1: pandas
fast_songs = df[df["tempo"] > 150]
fast_songs[["song_name", "genre"]]

Unnamed: 0,song_name,genre
1,Symbiote,Dark Trap
2,BRAINFOOD,Dark Trap
18,FunnyToSeeYouHere,Dark Trap
19,Killer,Dark Trap
20,608,Dark Trap
...,...,...
35871,,hardstyle
35872,,hardstyle
35874,,hardstyle
35875,,hardstyle


In [27]:
# v2: SQL

qry("""
SELECT song_name, genre
FROM spotify
WHERE tempo > 150
""")

Unnamed: 0,song_name,genre
0,Symbiote,Dark Trap
1,BRAINFOOD,Dark Trap
2,FunnyToSeeYouHere,Dark Trap
3,Killer,Dark Trap
4,608,Dark Trap
...,...,...
13748,,hardstyle
13749,,hardstyle
13750,,hardstyle
13751,,hardstyle


### What is the sum of danceability and liveness for "Hiphop" genre songs?

In [28]:
# v1: pandas
hiphop_songs = df[df["genre"] == "Hiphop"]
hiphop_songs["danceability"] + hiphop_songs["liveness"]

15321    0.8416
15322    0.9201
15323    0.8580
15324    0.8240
15325    0.9348
          ...  
18343    0.6690
18344    0.5370
18345    0.8850
18346    0.8770
18347    0.8703
Length: 3027, dtype: float64

In [29]:
# v2: SQL
hiphop_songs = qry("""
SELECT danceability + liveness as song_score
FROM spotify
WHERE genre = "Hiphop"
""")
hiphop_songs["song_score"]

0       0.8416
1       0.9201
2       0.8580
3       0.8240
4       0.9348
         ...  
3022    0.6690
3023    0.5370
3024    0.8850
3025    0.8770
3026    0.8703
Name: song_score, Length: 3027, dtype: float64

### Find all song_name ordered by ascending order of duration_ms. Eliminate songs which don't have a song_name

In [30]:
# v1: pandas
songs_by_duration = list(df.sort_values(by = "duration_ms")["song_name"])
# [song for song in songs_by_duration if song != ""] # uncomment to see the output

In [31]:
# v2
songs_by_duration = qry("""
SELECT song_name
FROM spotify
ORDER BY duration_ms
""")
songs_by_duration = list(songs_by_duration["song_name"])
# [song for song in songs_by_duration if song != ""] # uncomment to see the output

### How many distinct "genre"s are there in the dataset?

In [32]:
# v1: pandas
list(set(list(df["genre"])))

['Pop',
 'Trap Metal',
 'hardstyle',
 'trap',
 'Rap',
 'Emo',
 'dnb',
 'Hiphop',
 'RnB',
 'trance',
 'Dark Trap',
 'Underground Rap',
 'psytrance',
 'techhouse',
 'techno']

In [33]:
# v2: SQL
genres = qry("""
SELECT DISTINCT genre
FROM spotify
""")
list(genres["genre"])

['Dark Trap',
 'Underground Rap',
 'Trap Metal',
 'Emo',
 'Rap',
 'RnB',
 'Pop',
 'Hiphop',
 'techhouse',
 'techno',
 'trance',
 'psytrance',
 'trap',
 'dnb',
 'hardstyle']

### Considering only songs with energy greater than 0.5, what is the maximum energy for each "genre" with song count greater than 2000?

In [34]:
# v1: pandas
high_energy_songs = df[df["energy"] > 0.5]
genre_groups = high_energy_songs[["genre", "energy"]].groupby("genre")
max_energy = genre_groups.max()
max_energy["energy"]

genre
Dark Trap          0.998
Emo                0.995
Hiphop             0.978
Pop                0.977
Rap                0.980
RnB                0.974
Trap Metal         0.999
Underground Rap    0.997
dnb                0.999
hardstyle          0.999
psytrance          0.999
techhouse          0.999
techno             1.000
trance             1.000
trap               1.000
Name: energy, dtype: float64

In [35]:
genre_counts = genre_groups.count()
genre_counts["energy_max"] = max_energy["energy"]
filtered_genre_counts = genre_counts[genre_counts["energy"] > 2000]
filtered_genre_counts

Unnamed: 0_level_0,energy,energy_max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Dark Trap,2757,0.998
Hiphop,2497,0.978
Underground Rap,3420,0.997
dnb,2496,0.999
hardstyle,2345,0.999
psytrance,2642,0.999
techhouse,2164,0.999
techno,2534,1.0
trance,2786,1.0
trap,2346,1.0


In [36]:
# v2: SQL
qry("""
SELECT genre, COUNT(*) as song_count, MAX("energy") as energy_max
FROM spotify
WHERE energy > 0.5
GROUP BY genre
HAVING song_count > 2000
""")

Unnamed: 0,genre,song_count,energy_max
0,Dark Trap,2757,0.998
1,Hiphop,2497,0.978
2,Underground Rap,3420,0.997
3,dnb,2496,0.999
4,hardstyle,2345,0.999
5,psytrance,2642,0.999
6,techhouse,2164,0.999
7,techno,2534,1.0
8,trance,2786,1.0
9,trap,2346,1.0
