In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt # data visualization
import ipywidgets as widgets # interactive widgets
from ipywidgets import Box


In [2]:
# Read the cleaned tracks table from disk, then display its summary statistics.
df = pd.read_csv(filepath_or_buffer='../Data/tracks_cleaned.csv')
df.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,num_artists,year
count,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0,469281.0
mean,27.578334,229985.0,0.044155,0.563647,0.542439,5.22484,-10.200712,0.659208,0.104881,0.449396,0.113413,0.213992,0.552442,118.465459,3.8735,1.295914,1988.594946
std,18.36632,127091.9,0.205439,0.166195,0.25178,3.517928,5.086349,0.473976,0.179904,0.348656,0.266952,0.184386,0.257641,29.784032,0.472858,0.887235,22.813764
min,0.0,3344.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1922.0
25%,13.0,175053.0,0.0,0.453,0.344,2.0,-12.887,0.0,0.034,0.0969,0.0,0.0983,0.346,95.549,4.0,1.0,1974.0
50%,27.0,214907.0,0.0,0.577,0.55,5.0,-9.233,1.0,0.0443,0.422,2.4e-05,0.139,0.564,117.363,4.0,1.0,1992.0
75%,41.0,263800.0,0.0,0.686,0.749,8.0,-6.482,1.0,0.0764,0.784,0.00946,0.278,0.769,136.335,4.0,1.0,2007.0
max,98.0,5621218.0,1.0,0.991,1.0,11.0,5.376,1.0,0.971,0.996,1.0,1.0,1.0,243.507,5.0,58.0,2021.0


## Using all features

Since `key` is an estimated value rather than an actual one, we drop it from the frame. Additionally, we drop `release_date` because of inconsistencies in the data: the release month is only available for some of the tracks. Since we already created a `year` column during data cleaning, we can still use it and take advantage of the strong relation between year and popularity.

In [3]:
# Remove the columns that are not used as features (`year` is kept).
# A new frame is assigned instead of mutating the existing one in place.
df = df.drop(columns=['key', 'release_date'])

# One-hot encode time_signature. The indicator columns are labelled with the
# raw signature values and are appended next to the original column.
time_signature_df = pd.get_dummies(df["time_signature"])
df = pd.concat([df, time_signature_df], axis=1)

# Only map `mode` to 0/1 when it still holds text labels ('Major' / other).
# The describe() output of the loaded data shows `mode` as a numeric 0/1
# column; comparing that to the string 'Major' is False for every row and
# would overwrite the whole feature with zeros.
if not pd.api.types.is_numeric_dtype(df['mode']):
    df['mode'] = np.where(df['mode'] == 'Major', 1, 0)

 Modeling the data + Sets Prep

In [6]:
# Target series first, then the feature matrix: every column except the target.
y = df["popularity"]
X = df.drop(columns="popularity")

In [7]:
# Print the feature matrix's column names, non-null counts and dtypes as a sanity check.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 469281 entries, 0 to 469280
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   duration_ms       469281 non-null  int64  
 1   explicit          469281 non-null  int64  
 2   danceability      469281 non-null  float64
 3   energy            469281 non-null  float64
 4   loudness          469281 non-null  float64
 5   mode              469281 non-null  int64  
 6   speechiness       469281 non-null  float64
 7   acousticness      469281 non-null  float64
 8   instrumentalness  469281 non-null  float64
 9   liveness          469281 non-null  float64
 10  valence           469281 non-null  float64
 11  tempo             469281 non-null  float64
 12  time_signature    469281 non-null  int64  
 13  num_artists       469281 non-null  int64  
 14  year              469281 non-null  int64  
 15  0                 469281 non-null  uint8  
 16  1                 46

In [9]:
train_scores = []
test_scores = []

for i in range(100):
    # separate the data to training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # save as np.array
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train) 
    y_test = np.array(y_test)

    #creating a regression model
    model_regression = linear_model.LinearRegression()
    model_regression.fit(X_train, y_train)

   # to import to another jupyter notebook
    %store model_regression

    #We estimate models’ success rate in predicting the songs using the score() function which calculates the coefficient of determination. A score which is closer to 1 means the regressor is more accurate.
    train = model_regression.score(X_train, y_train)
    test = model_regression.score(X_test, y_test)

    train_scores.append(train)
    test_scores.append(test)

# Calculate average train and test accuracy
avg_train = sum(train_scores) / len(train_scores)
avg_test = sum(test_scores) / len(test_scores)

print("Average Train Accuracy = " + str(avg_train))
print("Average Test Accuracy = " + str(avg_test))

Stored 'model_regression' (LinearRegression)
Stored 'model_regression' (LinearRegression)
Stored 'model_regression' (LinearRegression)
Stored 'model_regression' (LinearRegression)
Stored 'model_regression' (LinearRegression)
Stored 'model_regression' (LinearRegression)
Stored 'model_regression' (LinearRegression)
Average Train Accuracy = 0.37647194405804696
Average Test Accuracy = 0.3766325089388232


As such, when all features (including the weaker ones) are accounted for, the score (coefficient of determination, R²) of a linear regression model in predicting popularity is about 0.376 on average over 100 runs.

## Dropping Non-essential factors

In [190]:
# Reload the cleaned data so this section starts again from the table on disk.
# (The earlier mid-cell df.describe() call was removed: its result was never
# displayed or used, so it only cost a full pass over the data.)
df = pd.read_csv('../Data/tracks_cleaned.csv')

# Drop the unused columns plus the features treated as non-essential in this
# section; assign the result instead of mutating the frame in place.
df = df.drop(
    columns=[
        'key', 'release_date', 'num_artists', 'mode', 'speechiness',
        'liveness', 'valence', 'tempo', 'time_signature',
    ]
)


In [191]:
# Target series first, then the feature matrix: every column except the target.
y = df["popularity"]
X = df.drop(columns="popularity")

In [192]:
# Repeatedly fit a linear regression on random 80/20 splits and collect the
# train/test scores so they can be averaged.
train_scores = []
test_scores = []

# Convert to plain arrays once, outside the loop; the conversion does not
# depend on the split.
X_values = np.array(X)
y_values = np.array(y)

for i in range(100):
    # Separate the data into training and testing parts. Seeding with the run
    # index gives 100 distinct splits while keeping the averages reproducible
    # when the notebook is re-run.
    X_train, X_test, y_train, y_test = train_test_split(
        X_values, y_values, test_size=0.2, random_state=i
    )

    # Create and fit the regression model on the training part.
    model_regression = linear_model.LinearRegression()
    model_regression.fit(X_train, y_train)

    # score() returns the coefficient of determination (R^2); a value closer
    # to 1 means the regressor explains more of the variance in popularity.
    train = model_regression.score(X_train, y_train)
    test = model_regression.score(X_test, y_test)

    train_scores.append(train)
    test_scores.append(test)

# Average train and test scores (R^2) over all runs.
avg_train = sum(train_scores) / len(train_scores)
avg_test = sum(test_scores) / len(test_scores)

print("Average Train Accuracy = " + str(avg_train))
print("Average Test Accuracy = " + str(avg_test))

Average Train Accuracy = 0.3746576507012922
Average Test Accuracy = 0.3742977454816405


As seen above, dropping the non-essential features reduces the score only slightly, by around 0.002 (from about 0.377 to about 0.374).

## Using top 6 features from EDA

In [199]:
# Reload the cleaned data so this section starts again from the table on disk.
# (The earlier mid-cell df.describe() call was removed: its result was never
# displayed or used, so it only cost a full pass over the data.)
df = pd.read_csv('../Data/tracks_cleaned.csv')

# Drop the unused columns and every feature outside the top 6 chosen in the
# EDA; assign the result instead of mutating the frame in place.
df = df.drop(
    columns=[
        'key', 'release_date', 'num_artists', 'mode', 'speechiness',
        'liveness', 'valence', 'tempo', 'time_signature', 'instrumentalness',
    ]
)

In [200]:
# Target series first, then the feature matrix: every column except the target.
y = df["popularity"]
X = df.drop(columns="popularity")

In [201]:
# Repeatedly fit a linear regression on random 80/20 splits and collect the
# train/test scores so they can be averaged.
train_scores = []
test_scores = []

# Convert to plain arrays once, outside the loop; the conversion does not
# depend on the split.
X_values = np.array(X)
y_values = np.array(y)

for i in range(100):
    # Separate the data into training and testing parts. Seeding with the run
    # index gives 100 distinct splits while keeping the averages reproducible
    # when the notebook is re-run.
    X_train, X_test, y_train, y_test = train_test_split(
        X_values, y_values, test_size=0.2, random_state=i
    )

    # Create and fit the regression model on the training part.
    model_regression = linear_model.LinearRegression()
    model_regression.fit(X_train, y_train)

    # score() returns the coefficient of determination (R^2); a value closer
    # to 1 means the regressor explains more of the variance in popularity.
    train = model_regression.score(X_train, y_train)
    test = model_regression.score(X_test, y_test)

    train_scores.append(train)
    test_scores.append(test)

# Average train and test scores (R^2) over all runs.
avg_train = sum(train_scores) / len(train_scores)
avg_test = sum(test_scores) / len(test_scores)

print("Average Train Accuracy = " + str(avg_train))
print("Average Test Accuracy = " + str(avg_test))

Average Train Accuracy = 0.36808090981203906
Average Test Accuracy = 0.36785937199337787


Reducing the feature set further, to only the top 6 features from the EDA, lowers the score of the regression model again (to about 0.368).

## Conclusion

In conclusion, when it comes to regression models, using all the features gives a higher success rate than using only the essential ones. There could be several reasons for this:

1. Even though some factors may have a weaker correlation with popularity, they may still provide some predictive power when used in combination with other factors. The model may be able to extract information from these weaker factors and use it to make more accurate predictions.

2. It's possible that the non-negligible factors alone may not capture all the important information about popularity. There may be other factors that have a weaker correlation individually but when combined with other factors, they provide more useful information about popularity.

Using all factors in a linear regression model could result in a higher score because the weaker factors may still provide some predictive power and other factors could provide additional useful information. Note that including more factors does not reduce multicollinearity (it can increase it); multicollinearity mainly makes the individual coefficients harder to interpret rather than lowering the predictive score.



Nonetheless, a score of 0.3-0.4 is weak. As such, linear regression may not be the best model for predicting popularity.