As opposed to lr_rfr_streampredictions or preds_with_social_media, in this file, we predict the number of streams a song will have in four weeks using data from the previous week (week_0), the current week (week_1), two weeks from now (week_2), and three weeks from now (week_3). Each week_i column captures the total number of streams a song had on their most recent full week. Since the DataFrame we use in this notebook only includes records from Thursday, the most recent full week will be from the previous Friday - Thursday.

In [3]:
import nbimporter
from lr_rfr_streampredictions import *
from preds_with_social_media import *
import torch

pd.set_option('display.max_rows', 100)

In [4]:
data = pd.read_csv('../csv_files/three_week_data.csv')
data = data.rename(columns={'target': 'Next Full Week Streams'})
data

Unnamed: 0,artist,title,unified_song_id,date,release_date,this_day,this_week,days_since_release,day_of_week,rn,...,day_3,day_4,day_5,day_6,week_0,week_1,week_2,week_3,Next Full Week Streams,naive
0,Young Thug,Hercules,100070,2024-06-27,2016-02-05,15979,100850.0,3065,6,1,...,14665.0,12847.0,14473.0,15932.0,100850.0,103313.0,101589.0,100364.0,100669,104344.0
1,Young Thug,Hercules,100070,2024-06-20,2016-02-05,15343,103313.0,3058,6,1,...,13438.0,12405.0,14523.0,16005.0,103313.0,101589.0,100364.0,103225.0,102646,100850.0
2,Young Thug,Hercules,100070,2024-06-13,2016-02-05,15999,101589.0,3051,6,1,...,13938.0,12305.0,14530.0,16076.0,101589.0,100364.0,103225.0,103165.0,103858,103313.0
3,Young Thug,Hercules,100070,2024-06-06,2016-02-05,15357,100364.0,3044,6,1,...,14039.0,12311.0,14205.0,15559.0,100364.0,103225.0,103165.0,103814.0,104343,101589.0
4,Young Thug,Hercules,100070,2024-05-30,2016-02-05,15457,103225.0,3037,6,1,...,12212.0,12667.0,14507.0,16328.0,103225.0,103165.0,103814.0,108114.0,100850,100364.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213215,Don Toliver,Geeked Up (Feat. Yeat),244428207,2024-06-27,2024-06-24,147864,0.0,3,6,1,...,0.0,,,,0.0,,,,943379,
213216,Don Toliver,Rockstar Girl,244473425,2024-06-27,2024-06-24,82434,0.0,3,6,1,...,0.0,,,,0.0,,,,253709,
213217,Don Toliver,Love Is A Drug,244473426,2024-06-27,2024-06-24,114819,0.0,3,6,1,...,0.0,,,,0.0,,,,510602,
213218,Don Toliver,Donny Darko (Feat. Lil Uzi Vert),244473427,2024-06-27,2024-06-24,148913,0.0,3,6,1,...,528.0,,,,0.0,,,,705091,


In [5]:
data.isnull().sum()

artist                        0
title                         0
unified_song_id               0
date                          0
release_date                  0
this_day                      0
this_week                  2668
days_since_release            0
day_of_week                   0
rn                            0
end_of_week                   0
popularity                    0
day_0                         0
day_1                       160
day_2                       233
day_3                      8483
day_4                      8526
day_5                      8796
day_6                      8802
week_0                     2668
week_1                    11525
week_2                    20326
week_3                    29087
Next Full Week Streams        0
naive                      8802
dtype: int64

In [6]:
train_valid, test = train_test_split(data, test_size=0.2, random_state=42)
train, valid = train_test_split(train_valid, test_size=0.2, random_state=42)

print(train.shape)
print(valid.shape)
print(test.shape)

(136460, 25)
(34116, 25)
(42644, 25)


In [7]:
# Reminder: the target

features = ['days_since_release', 'popularity', 'week_0', 'week_1', 'week_2', 'week_3']
target = 'Next Full Week Streams'

In [8]:
train_final = (train.pipe(interpolate_vals, cols=['week_0', 'week_1', 'week_2', 'week_3'])
                .pipe(fill_mode, cols=['days_since_release'])
               .pipe(
                    log_apply, 
                    *('days_since_release', 'week_0', 'week_1', 'week_2', 'week_3', 'Next Full Week Streams'),
                    prediction_col=target
                    )
                )

new_features = ['log days_since_release', 'popularity', 'log week_0', 'log week_1', 'log week_2', 'log week_3']

valid_final = (valid.pipe(interpolate_vals, cols=['week_0', 'week_1', 'week_2', 'week_3'])
                .pipe(fill_mode, cols=['days_since_release'])
               .pipe(
                    log_apply, 
                    *('days_since_release', 'week_0', 'week_1', 'week_2', 'week_3', 'Next Full Week Streams'),
                    prediction_col=target
                )
                )

test_final = (test.pipe(interpolate_vals, cols=['week_0', 'week_1', 'week_2', 'week_3'])
                .pipe(fill_mode, cols=['days_since_release'])
               .pipe(
                    log_apply, 
                    *('days_since_release', 'week_0', 'week_1', 'week_2', 'week_3', 'Next Full Week Streams'),
                    prediction_col=target
                )
             )

X_train = train_final[new_features]
Y_train = train_final['log Next Full Week Streams']

X_valid = valid_final[new_features]
Y_valid = valid_final['log Next Full Week Streams']

X_test = test_final[new_features]
Y_test = test_final['log Next Full Week Streams']

In [9]:
# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pca = PCA()

# Define a Standard Scaler to normalize inputs
scaler = StandardScaler()

lr_pipe = Pipeline([
     ("scaler", scaler), 
     ("pca", pca), 
     ("regressor", Lasso())
])

# Parameters of pipelines can be set using '__' separated parameter names:
lr_params = {
    "pca__n_components": [2, 3, 4, 5, 6],
    "regressor": [Ridge(), Lasso()],
    "regressor__fit_intercept": [True, False],
    "regressor__alpha": [0.001, 0.01, 0.1]
}

search = GridSearchCV(lr_pipe, lr_params, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
search.fit(X_train, Y_train)


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('regressor', Lasso())]),
             n_jobs=-1,
             param_grid={'pca__n_components': [2, 3, 4, 5, 6],
                         'regressor': [Ridge(alpha=0.001), Lasso()],
                         'regressor__alpha': [0.001, 0.01, 0.1],
                         'regressor__fit_intercept': [True, False]},
             scoring='neg_mean_absolute_error')

In [10]:
print(search.best_params_)

{'pca__n_components': 6, 'regressor': Ridge(alpha=0.001), 'regressor__alpha': 0.001, 'regressor__fit_intercept': True}


In [11]:
best = search.best_estimator_
best

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=6)),
                ('regressor', Ridge(alpha=0.001))])

In [12]:
Y_train_pred = best.predict(X_train)
print(f'{mean_absolute_error(np.e**Y_train_pred, np.e**Y_train):0.3e}')

7.797e+04


In [86]:
Y_valid_pred = best.predict(X_valid)

print(f'{mean_absolute_error(np.e**Y_valid, np.e**Y_valid_pred):0.3e}')

7.620e+04


In [87]:
Y_test_pred = best.predict(X_test)

print(f'{mean_absolute_error(np.e**Y_test, np.e**Y_test_pred):0.3e}')

7.765e+04


In [89]:
# naive = np.e**

df = pd.DataFrame({
    "True Test": np.e**Y_train,
    "Predicted Test": np.e**Y_train_pred
})
df

Unnamed: 0,True Test,Predicted Test
106511,13043.0,21346.939154
16354,42633.0,46173.242027
114561,98817.0,96359.309060
137853,28719.0,29324.935543
139901,13035.0,25061.211354
...,...,...
181610,10373.0,16464.864956
126306,176263.0,127438.822840
36171,21300.0,27999.658749
112833,83629.0,64401.292081
