In [1]:
### This notebook takes a couple of raw data sets and produces a .csv file containing only
### the tracks that are part of popular releases.

### To run this notebook you need the raw datasets sp_release.csv and sp_track.csv in the
### "data/raw kaggle data" folder of the repo.

In [2]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Remember the directory the notebook is running from, then show it
cwd = os.getcwd()
print(cwd)

C:\Users\alexv\OneDrive\Documents\GitHub\group-coursework-machine-learners\notebooks


In [4]:
# Change the current working directory to the one containing the raw data.
# `cwd` holds the folder the notebook runs from (the repo's notebooks folder),
# so its parent directory is the repository root.
# os.path.dirname / os.path.join replace the fixed-width slice cwd[0:-10] and the
# hard-coded '\\' separators: the result no longer depends on the folder name
# being exactly nine characters long, nor on running under Windows.
# NOTE: `cwd` is overwritten below, so re-run the previous cell before re-running
# this one.
master_dir = os.path.dirname(os.path.normpath(cwd))
cwd = os.path.join(master_dir, 'data', 'raw kaggle data')
os.chdir(cwd)

In [5]:
# Make sure we have the correct directory: print the working directory the
# process is in now, i.e. after the os.chdir call in the previous cell
print(os.getcwd())

C:\Users\alexv\OneDrive\Documents\GitHub\group-coursework-machine-learners\data\raw kaggle data


In [7]:
# Load the data sets into their respective DataFrames.
# The file names are relative, so they are resolved against the current
# working directory.

sp_release_df= pd.read_csv("sp_release.csv")
sp_track_df = pd.read_csv("sp_track.csv")

In [None]:
# We want to measure popularity, which is updated in real(ish) time, i.e.
# how many times a track has been played RECENTLY.
# Count the rows per "updated_on" value, least frequent value first.

sp_release_df.value_counts("updated_on", ascending=True)

In [None]:
# We can see that the samples have been collected within fewer than 20 days
# of each other, and relatively recently.

# Now we check for missing values and the features of the data set
# (.info() lists every column with its non-null count and dtype).
sp_release_df.info()


In [None]:
# We can see there are some missing values, but a negligible amount.
# Let's see how many different types of albums there are
# (number of rows per album_type value).

sp_release_df.value_counts("album_type")

In [None]:
# We are trying to predict the popularity of a single based on its audio
# features, so we keep only the releases whose album_type is 'single'.
# A boolean mask selects those rows directly instead of building every
# album_type group just to read one of them back; rows, index and columns
# of the result are the same.

is_single = sp_release_df['album_type'] == 'single'
if not is_single.any():
    # Same failure the get_group('single') lookup gave when no such group exists
    raise KeyError('single')
sp_release_singles_df = sp_release_df[is_single]

In [None]:
# Check to see if the split was successful: the row count reported here
# should match the number of 'single' rows counted earlier

sp_release_singles_df.info()

In [None]:
# How many singles there are for each total_tracks value
sp_release_singles_df.value_counts('total_tracks')


In [None]:
# Number of singles per popularity value, ordered from the least frequent
# value to the most frequent one
popularity_counts = sp_release_singles_df.value_counts("popularity", ascending=True)
print(popularity_counts)

In [None]:
# 73 is the max popularity of a release.
# NOTE(review): .info() below only summarises the counts Series; it does not
# itself display that maximum - confirm the value against the data.
popularity_counts.info()

In [None]:
# Create a new DataFrame to hold only the popular releases.
# "Popular" means a popularity score strictly above zero. The earlier `!= 0`
# comparison also let through rows whose popularity is missing, because
# NaN != 0 evaluates to True; `> 0` leaves those rows out.
# Assumes popularity is a numeric column - TODO confirm.
popular_singles_df = sp_release_singles_df[sp_release_singles_df['popularity'] > 0]

In [None]:
# Check: preview the first rows of the filtered releases
popular_singles_df.head()

In [None]:
# Columns, non-null counts and dtypes of the track table
sp_track_df.info()

In [None]:
# Preview the first rows of the track table
sp_track_df.head()

In [None]:
# Filter the tracks on release_id and retain only the relevant features.

# release_ids of the releases kept in popular_singles_df
filter_ids = popular_singles_df['release_id'].values

# True for every track whose release_id is one of those ids
mask = sp_track_df['release_id'].isin(filter_ids)

# Select the matching rows and the wanted columns in a single .loc call rather
# than extracting each column as its own Series and stitching them back
# together; rows, index and column order of the result are unchanged.
popular_tracks_df = sp_track_df.loc[
    mask,
    ['track_id', 'isrc', 'explicit', 'track_title', 'preview_url', 'release_id']
]
popular_tracks_df.head()

In [None]:
# describe() lets us see whether there are duplicated songs in the releases
# or missing values
popular_tracks_df.describe()

In [None]:
# Non-null count and dtype of every retained column
popular_tracks_df.info()

In [None]:
# Keep only the rows that have an isrc (rows where it is NaN are removed)
popular_tracks_df = popular_tracks_df[popular_tracks_df['isrc'].notna()]
popular_tracks_df.info()

In [None]:
# Remove repeated isrc values, keeping the first row seen for each one
popular_tracks_df = popular_tracks_df[~popular_tracks_df.duplicated(subset=['isrc'])]
popular_tracks_df.info()

In [None]:
# Release table that will be the right-hand side of the merge
popular_singles_df.head()

In [None]:
# Track table that will be the left-hand side of the merge
popular_tracks_df.head()

In [None]:
# Attach the release-level columns to each track via release_id; the left join
# keeps every row of popular_tracks_df
merded_popular_tracks_df = popular_tracks_df.merge(
    popular_singles_df,
    how='left',
    on='release_id',
)

In [None]:
# Preview the merged track + release table
merded_popular_tracks_df.head()

In [None]:
# 'popularity' and 'total_tracks' describe the release, not the track, so give
# them release-specific names to avoid confusion
merded_popular_tracks_df = merded_popular_tracks_df.rename(
    columns=dict(
        popularity='release_popularity',
        total_tracks='total_tracks_in_release',
    )
)
merded_popular_tracks_df.head()

In [None]:
# Show the working directory; the relative path used by the CSV export in the
# next cell is resolved against it
print(os.getcwd())

In [None]:
# Write the merged table to popular_tracks.csv in the current working
# directory, with a header row and without the DataFrame index.
# NOTE(review): when the cells are run in order the working directory is still
# the raw data folder set by the earlier os.chdir - confirm that is the
# intended output location.
merded_popular_tracks_df.to_csv('popular_tracks.csv', index=False, header=True)