In [2]:
# Notebook-wide imports and one-time display setup.
import os
import json
import requests

import pandas as pd

from dotenv import load_dotenv
from tqdm.notebook import tqdm
# Registers tqdm's progress_* helpers (e.g. progress_apply) on pandas objects.
# NOTE(review): no progress_* call is visible in the cells of this notebook - confirm this is still needed.
tqdm.pandas()

from IPython.display import Image

# The star import supplies the plotting names used below
# (ggplot, aes, geom_*, scale_*, labs, theme, element_text, ggsize, LetsPlot).
from lets_plot import *
# One-time lets-plot setup so figures can be rendered as HTML in the notebook output.
LetsPlot.setup_html()

# create_engine opens the SQLite database read in section 1.
from sqlalchemy import create_engine
from sqlalchemy import text

## 1. Reading data in from the spotify_artist.db database

Now that our data has been cleaned and inserted into the SQL database, let's load it back into Pandas dataframes to start data visualisation using the lets-plot library. We will create 3 plots which will enable us to come to a thorough conclusion and comprehensively answer the question: who is the 'bigger' artist - Kendrick Lamar or J. Cole?

In [3]:
# Connect to the SQLite database produced by the data-collection step.
# The path is relative to this notebook's folder (same layout as the
# `../images/...` links at the end of the notebook) rather than an absolute
# path that only exists on one machine.
engine = create_engine('sqlite:///../data/spotify_artist.db')

In [3]:
# Pull the whole `artists` table into a DataFrame.
df_artists_db = pd.read_sql(sql='artists', con=engine)

In [4]:
# Show the artists table as the cell output (one row per artist).
df_artists_db

Unnamed: 0,Name,Spotify_ID,Popularity,Type,Total_Followers
0,J. Cole,6l3HvQ5sa6mXTsMTB19rO5,87,artist,25054342
1,Kendrick Lamar,2YZyLoL8N0Wb9xBt1NhZWg,95,artist,34539703


In [5]:
# Pull the whole `albums` table into a DataFrame.
df_albums_db = pd.read_sql(sql='albums', con=engine)

In [6]:
# Show the albums table as the cell output (one row per album).
df_albums_db

Unnamed: 0,Artist,Name,Spotify_ID,Release_Date,Number_of_Tracks,Label,Type,Popularity,Album_by_Release_Order
0,J. Cole,Cole World: The Sideline Story,0fhmJYVhW0e4i33pCLPA5i,2011-09-27,16,Roc Nation LLC,album,78,1
1,J. Cole,Born Sinner (Deluxe Version),5FP9keIJnlSCKnkdVOf623,2013-06-18,21,Roc Nation LLC,album,80,2
2,J. Cole,2014 Forest Hills Drive,0UMMIkurRUmkruZ3KGBLtG,2014-12-09,13,Roc Nation Records LLC,album,85,3
3,J. Cole,4 Your Eyez Only,3CCnGldVQ90c26aFATC1PW,2016-12-09,10,J. Cole P&D,album,75,4
4,J. Cole,KOD,4Wv5UAieM1LDEYVq5WmqDd,2018-04-20,12,J. Cole P&D,album,71,5
5,J. Cole,The Off-Season,4JAvwK4APPArjIsOdGoJXX,2021-05-14,12,"Dreamville, Inc., Under exclusive license to R...",album,75,6
6,Kendrick Lamar,Section.80,1bkN9nIkkCnXeG4yitVS1J,2011-07-02,15,Top Dawg Entertainment / Section.80,album,74,1
7,Kendrick Lamar,"good kid, m.A.A.d city (Deluxe)",748dZDqSZy6aPXKcI9H80u,2012-10-22,17,Aftermath,album,74,2
8,Kendrick Lamar,To Pimp A Butterfly,7ycBtnsMtyVbbwTfJwRjSP,2015-03-16,16,Aftermath,album,81,3
9,Kendrick Lamar,DAMN.,4eLPsYPBmXABThSJ821sqY,2017-04-14,14,Aftermath,album,89,4


## 2. Data Visualisation - Creating the Plots

### 2.1. Plot 1 - Artist Total Followers

Let's use `geom_bar` to create a bar chart comparing each artist's total number of Spotify followers.

In [7]:
# Plot 1: bar chart of total Spotify followers, one bar per artist.

# The y-axis title states the unit as millions, so convert the raw follower
# counts before plotting; the source frame is left untouched.
followers_in_millions = df_artists_db.assign(
    Total_Followers=df_artists_db['Total_Followers'] / 1_000_000
)

plot1 = (
    ggplot(followers_in_millions, aes(x='Name', y='Total_Followers', fill='Name'))
    + geom_bar(stat='identity')
    # Colours are keyed by artist name (same mapping as plot 2) so they do not
    # depend on the row order of the table.
    + scale_fill_manual(values={'J. Cole': '#3CC0C6', 'Kendrick Lamar': '#9D2323'})
    + labs(
        x='Artist',
        y='Total Number of Spotify Followers (millions)',
        title='J.Cole vs Kendrick Lamar: Total Number of Followers 🎤',
        subtitle='   - Kendrick Lamar has approximately 10M more followers than J.Cole',
    )
    + theme(
        axis_title_x=element_text(size=15, color="black", family="Georgia Ref"),
        axis_title_y=element_text(size=15, color="black", family="Georgia Ref"),
        axis_text_x=element_text(size=12, color="black", family="Arial"),
        axis_text_y=element_text(size=12, color="black", family="Arial"),
        plot_title=element_text(size=22, color="black", family="Georgia Ref"),
        plot_subtitle=element_text(size=12, color="gray", family="Arial"),
    )
    + ggsize(900, 600)
)

plot1.show()

### 2.2. Plot 2 - Popularity of Albums

Now we will use `geom_line` and `geom_point` to create a line chart comparing the popularity of each artist's studio albums in release order (6 for J. Cole and 4 for Kendrick Lamar in our data)

In [8]:
# Plot 2: album popularity by release order, one line per artist, with each
# point labelled by its album name.
plot2 = (
    ggplot(df_albums_db, aes(x='Album_by_Release_Order', y='Popularity', color='Artist'))
    + geom_line()
    + geom_point(size=3)
    # A constant `color` overrides the per-artist colour mapping so that the
    # album labels are drawn in black.
    + geom_text(aes(label='Name'), nudge_y=1, size=5, color='#000000')
    + scale_color_manual(values={'J. Cole': '#3CC0C6', 'Kendrick Lamar': '#9D2323'})
    # Extra horizontal padding keeps the labels of the first and last albums
    # inside the plot area.
    + scale_x_continuous(breaks=[1, 2, 3, 4, 5, 6], expand=[0.15, 0])
    + labs(
        x='Album Number by Release Order',
        y='Spotify Popularity (Scale: 1-100) ',
        title='J.Cole vs Kendrick Lamar: Popularity compared between Studio Albums💽',
        subtitle='- J.Cole initially released more popular albums but has since been surpassed by Kendrick Lamar',
    )
    + theme(
        axis_title_x=element_text(size=15, color="black", family="Georgia Ref"),
        axis_title_y=element_text(size=15, color="black", family="Georgia Ref"),
        axis_text_x=element_text(size=12, color="black", family="Arial"),
        axis_text_y=element_text(size=12, color="black", family="Arial"),
        plot_title=element_text(size=22, color="black", family="Georgia Ref"),
        plot_subtitle=element_text(size=12, color="gray", family="Arial"),
    )
    + ggsize(900, 650)
)

plot2.show()

### 2.3. Plot 3 - Average Popularity of Albums 

For our final plot, we will use the `.groupby` function to calculate the average popularity across each artist's albums, merge it back in as a new column, and compare the two artists directly

In [9]:
# Mean album popularity per artist (indexed by Artist, single `Popularity` column).
artist_avg_popularity = df_albums_db.groupby('Artist')[['Popularity']].mean()

In [10]:
# Attach each artist's mean popularity to every album row. Both frames carry a
# `Popularity` column, so pandas suffixes them with `_x` (album) and `_y` (mean).
df_albums_db2 = pd.merge(
    df_albums_db,
    artist_avg_popularity,
    how='left',
    on='Artist',
)

In [11]:
# Show the merged frame: Popularity_x is the album's own score,
# Popularity_y is the per-artist mean repeated on every album row.
df_albums_db2

Unnamed: 0,Artist,Name,Spotify_ID,Release_Date,Number_of_Tracks,Label,Type,Popularity_x,Album_by_Release_Order,Popularity_y
0,J. Cole,Cole World: The Sideline Story,0fhmJYVhW0e4i33pCLPA5i,2011-09-27,16,Roc Nation LLC,album,78,1,77.333333
1,J. Cole,Born Sinner (Deluxe Version),5FP9keIJnlSCKnkdVOf623,2013-06-18,21,Roc Nation LLC,album,80,2,77.333333
2,J. Cole,2014 Forest Hills Drive,0UMMIkurRUmkruZ3KGBLtG,2014-12-09,13,Roc Nation Records LLC,album,85,3,77.333333
3,J. Cole,4 Your Eyez Only,3CCnGldVQ90c26aFATC1PW,2016-12-09,10,J. Cole P&D,album,75,4,77.333333
4,J. Cole,KOD,4Wv5UAieM1LDEYVq5WmqDd,2018-04-20,12,J. Cole P&D,album,71,5,77.333333
5,J. Cole,The Off-Season,4JAvwK4APPArjIsOdGoJXX,2021-05-14,12,"Dreamville, Inc., Under exclusive license to R...",album,75,6,77.333333
6,Kendrick Lamar,Section.80,1bkN9nIkkCnXeG4yitVS1J,2011-07-02,15,Top Dawg Entertainment / Section.80,album,74,1,82.666667
7,Kendrick Lamar,"good kid, m.A.A.d city (Deluxe)",748dZDqSZy6aPXKcI9H80u,2012-10-22,17,Aftermath,album,74,2,82.666667
8,Kendrick Lamar,To Pimp A Butterfly,7ycBtnsMtyVbbwTfJwRjSP,2015-03-16,16,Aftermath,album,81,3,82.666667
9,Kendrick Lamar,DAMN.,4eLPsYPBmXABThSJ821sqY,2017-04-14,14,Aftermath,album,89,4,82.666667


In [12]:
# Swap the automatic merge suffixes for descriptive column names.
df_albums_db3 = df_albums_db2.rename(
    columns=dict(Popularity_x='Popularity', Popularity_y='Average_Popularity')
)

In [13]:
# Plot 3: average album popularity, one bar per artist.

# df_albums_db3 has one row per album with the artist's mean repeated on each
# row. geom_bar stacks rows that share an x value, so plotting the album-level
# frame would draw the SUM of the repeated means (taller for whoever has more
# albums). Reduce to one row per artist first so each bar shows the mean itself.
artist_averages = df_albums_db3[['Artist', 'Average_Popularity']].drop_duplicates()

# NOTE(review): the albums table shown earlier lists 6 J. Cole albums but only
# 4 Kendrick Lamar albums, so "all 6 albums" in the subtitle applies to
# J. Cole only - confirm the intended wording.
plot3 = (
    ggplot(artist_averages, aes(x='Artist', y='Average_Popularity', fill='Artist'))
    + geom_bar(stat='identity')
    # Colours are keyed by artist name (same mapping as plot 2) so they do not
    # depend on row order.
    + scale_fill_manual(values={'J. Cole': '#3CC0C6', 'Kendrick Lamar': '#9D2323'})
    + labs(
        x='Artist',
        y='Average Spotify Popularity (Scale: 1-100)',
        title='J.Cole vs Kendrick Lamar: Average Popularity across all Albums🎵',
        subtitle='   - Kendrick Lamar has a higher popularity across the average of all 6 albums than J.Cole',
    )
    + theme(
        axis_title_x=element_text(size=15, color="black", family="Georgia Ref"),
        axis_title_y=element_text(size=15, color="black", family="Georgia Ref"),
        axis_text_x=element_text(size=12, color="black", family="Arial"),
        axis_text_y=element_text(size=12, color="black", family="Arial"),
        plot_title=element_text(size=22, color="black", family="Georgia Ref"),
        plot_subtitle=element_text(size=12, color="gray", family="Arial"),
    )
    + ggsize(800, 700)
)

plot3.show()

## 3. Snapshots of the plots are saved in the images folder and can be accessed via the links below


[Plot 1 - Artist Followers](../images/Plot_1.JPG) 

[Plot 2 - Album Popularity](../images/Plot_2.JPG) 

[Plot 3 - Album Popularity Averaged](../images/Plot_3.JPG) 
