In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score

# Plot styling: use matplotlib's bundled 'ggplot' style sheet
plt.style.use('ggplot')

# Raise pandas' display limits so wide / long frames are not truncated in output
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100

In [27]:
# Read the Spotify streaming-history CSV into the untouched "raw" frame
df_raw = pd.read_csv(filepath_or_buffer='../input/spotify_history.csv')

In [28]:
# Work on an independent (deep) copy so df_raw keeps the data exactly as loaded
df = df_raw.copy(deep=True)

# Understanding the Data
* shape
* head and tail
* dtypes
* describe
* missing values

In [29]:
# Preview the first rows, then list the column labels.
# A notebook cell only renders its *last* expression, so a bare `df.head()`
# on the first line would be evaluated and thrown away; display it explicitly.
display(df.head())
df.columns

Index(['spotify_track_uri', 'ts', 'platform', 'ms_played', 'track_name',
       'artist_name', 'album_name', 'reason_start', 'reason_end', 'shuffle',
       'skipped'],
      dtype='object')

In [30]:
# Dimensions of the working frame as (n_rows, n_columns)
df.shape

(149860, 11)

In [33]:
# One summary line per column: the distinct-value count for object (string)
# columns, and just the dtype for everything else.
for name, series in df.items():
    if series.dtype != 'object':
        summary = f"{name}: {series.dtype}"
    else:
        summary = f"{name}: {series.nunique()} unique values"
    print(summary)


spotify_track_uri: 16527 unique values
ts: 140422 unique values
platform: 6 unique values
ms_played: int64
track_name: 13839 unique values
artist_name: 4113 unique values
album_name: 7948 unique values
reason_start: 13 unique values
reason_end: 15 unique values
shuffle: bool
skipped: bool


In [34]:
# Per-column non-null counts and dtypes, plus the frame's memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149860 entries, 0 to 149859
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   spotify_track_uri  149860 non-null  object
 1   ts                 149860 non-null  object
 2   platform           149860 non-null  object
 3   ms_played          149860 non-null  int64 
 4   track_name         149860 non-null  object
 5   artist_name        149860 non-null  object
 6   album_name         149860 non-null  object
 7   reason_start       149717 non-null  object
 8   reason_end         149743 non-null  object
 9   shuffle            149860 non-null  bool  
 10  skipped            149860 non-null  bool  
dtypes: bool(2), int64(1), object(8)
memory usage: 10.6+ MB


In [40]:
# Top 10 artists ranked by how many distinct track names appear for them
(
    df.groupby('artist_name')['track_name']
    .nunique()
    .sort_values(ascending=False)
    .head(10)
)

artist_name
The Beatles           470
The Killers           155
Paul McCartney        155
John Mayer            115
The Rolling Stones    111
The Black Keys        107
Bob Dylan             100
Howard Shore           98
Led Zeppelin           94
Johnny Cash            85
Name: track_name, dtype: int64

In [41]:
# Which artists have the most total play time (in seconds)?
# The play time is derived from `ms_played` right here rather than read from a
# `seconds` column: that column is only added to `df` by a later cell, so on a
# fresh kernel run top-to-bottom `df['seconds']` would raise a KeyError.
# `assign` returns a new frame and leaves `df` itself unchanged.
(
    df.assign(seconds=df['ms_played'] / 1000)
    .groupby('artist_name')['seconds']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

artist_name
The Beatles           1210184.552
The Killers           1059556.516
John Mayer             725219.443
Bob Dylan              569456.396
Paul McCartney         357354.370
Howard Shore           348930.675
The Strokes            317508.419
The Rolling Stones     307917.009
Pink Floyd             260531.842
Led Zeppelin           248338.279
Name: seconds, dtype: float64

In [None]:
# Add a `seconds` column (ms_played converted from milliseconds) because
# seconds are easier to read than raw millisecond counts.
df['seconds'] = df['ms_played'].div(1000)
df.head(5)


Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped,seconds
0,2J3n32GeLmMjwuAzyhcSNe,2013-07-08 02:44:34,web player,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,autoplay,clickrow,False,False,3.185
1,1oHxIPqJyvAYHy0PVrDU98,2013-07-08 02:45:37,web player,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,clickrow,clickrow,False,False,61.865
2,487OPlneJNni3NWC8SYqhW,2013-07-08 02:50:24,web player,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,clickrow,unknown,False,False,285.386
3,5IyblF777jLZj1vGHG2UD3,2013-07-08 02:52:40,web player,134022,Off To The Races,Lana Del Rey,Born To Die - The Paradise Edition,trackdone,clickrow,False,False,134.022
4,0GgAAB0ZMllFhbNc3mAodO,2013-07-08 03:17:52,web player,0,Half Mast,Empire Of The Sun,Walking On A Dream,clickrow,nextbtn,False,False,0.0


# Cleaning the Data
* drop rows we do not need
* find duplicated columns/rows
* renaming
* feature creation

# Feature Understanding
* univariate analysis
* histograms
* kdes
* boxplots
* pivot tables

# Feature Relationships
* understanding how the features are related to each other
* scatterplots
* correlation and heatmaps
* Pairplots
* Group by comparisons

# Asking questions about the data
* trying to answer questions about the data using plots or statistics

# Feature engineering
* how to interpolate/remove missing values
* how to encode categoricals (when necessary)
* which additional features should I add

# Model creation
* make simple models and test performance
* analyze what they get right and wrong
* decide if we should improve current model, try another model, or go back to feature engineering
* iterate until model performance is good enough for the task

# Model Evaluation


# Final output on test set
* from all the models it seems that Neural Nets perform the best (albeit only slightly better than 
the other methods)
* output test set to output file