In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('charts.csv')

In [3]:
df.head()

df.shape


(330087, 7)

In [4]:
print(f"The dataframe is {(df.shape[0])} rows and {(df.shape[1])} columns.")

The dataframe is 330087 rows and 7 columns.


In [5]:
df.isnull().sum()

date                  0
rank                  0
song                  0
artist                0
last-week         32312
peak-rank             0
weeks-on-board        0
dtype: int64

The only missing values are in the last-week column, which has 32312 missing values. Displaying full records below. 

In [6]:
# Missing value rows printed
missing_values = df[df['last-week'].isnull()]
missing_values.head()


Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
26,2021-11-06,27,Moth To A Flame,Swedish House Mafia & The Weeknd,,27,1
27,2021-11-06,28,Let's Go Brandon,Bryson Gray Featuring Tyson James & Chandler C...,,28,1
60,2021-11-06,61,Not In The Mood,"Lil Tjay, Fivio Foreign & Kay Flock",,61,1
68,2021-11-06,69,Switches & Dracs,Moneybagg Yo Featuring Lil Durk & EST Gee,,69,1
78,2021-11-06,79,Poke It Out,Wale Featuring J. Cole,,79,1


In [7]:
df = df.sort_values('date', ascending=False)


The latest date of the dataset is 2021-11-06. 

# How many unique number one songs have there been in 2021?

In [17]:
# Calculating unique number one songs in 2021
unique_songs = df[
    (df['date'] >= '2021-01-01')
    & 
    (df['peak-rank'] == 1)
]['song'].nunique()

print(f"There are {unique_songs} unique number one songs in 2021.")

There are 26 unique number one songs in 2021.


In [9]:
# The missing values may indicate that these songs were not on the chart previous week, not necessarily their first week on the chart. We can create an indicator variable for first week on chart. It is important to note that this does not mean first week released, only first week on chart, plus there is a possibility that they were on the chart on a week before this data was collected.
# Creating indicator column 'chart_status' that indicates whether the song is in its first week on the chart, continuing from last week, or a re-entry (not on chart last week but has been on chart before).
df['chart_status'] = df.apply(
    lambda row: 'continuing' if pd.notnull(row['last-week']) else 'first-week' if row['peak-rank'] == row['rank'] else 're-entry', axis= 1
)

In [10]:
df['chart_status'].value_counts()

chart_status
continuing    297775
first-week     30514
re-entry        1798
Name: count, dtype: int64

In [11]:
df.head()

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board,chart_status
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3,continuing
64,2021-11-06,65,'Til You Can't,Cody Johnson,76.0,65,3,continuing
75,2021-11-06,76,2055,Sleepy Hallow,72.0,51,15,continuing
73,2021-11-06,74,Tequila Little Time,Jon Pardi,83.0,74,3,continuing
72,2021-11-06,73,Volvi,Aventura x Bad Bunny,75.0,22,13,continuing


In [12]:
# Imputing missing values in 'last-week' column with 0
df['last-week'] = df['last-week'].fillna(0)

In [13]:
# We are now good to go with no missing values
df.isnull().sum()

date              0
rank              0
song              0
artist            0
last-week         0
peak-rank         0
weeks-on-board    0
chart_status      0
dtype: int64

In [14]:
# Exploratory data analysis and modeling can proceed
df.describe()

Unnamed: 0,rank,last-week,peak-rank,weeks-on-board
count,330087.0,330087.0,330087.0,330087.0
mean,50.500929,42.932918,40.970629,9.161785
std,28.866094,30.166474,29.347481,7.618264
min,1.0,0.0,1.0,1.0
25%,26.0,16.0,13.0,4.0
50%,51.0,41.0,38.0,7.0
75%,76.0,69.0,65.0,13.0
max,100.0,100.0,100.0,90.0


In [15]:
# The data can tell us history of songs on the chart, their movement, and trends over time.
# Let's start by looking at 2020's top songs so far
# We will filter for the date range and then look at the top 10 songs by peak rank.
top_songs_2020 = df[(df['date'] >= '2020-01-01') & (df['date'] <= '2020-12-31')]
top_songs_2020 = df[top_songs_2020['peak-rank'] <= 10]
top_songs_2020[['song', 'artist', 'peak-rank']]


  top_songs_2020 = df[top_songs_2020['peak-rank'] <= 10]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [None]:
t