In [1]:
import nbimporter
from lr_rfr_streampredictions import *

import os
import torch
from sklearn.svm import SVR

pd.set_option('display.max_rows', 100)

In [2]:
train = pd.read_csv('../csv_files/train.csv')
valid = pd.read_csv('../csv_files/valid.csv')
test = pd.read_csv('../csv_files/test.csv')

train = train.sample(5000, random_state=42)
valid = valid.sample(1000, random_state=42)
test = test.sample(1000, random_state=42)

In [3]:
days = ['day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6']

pipelines = [
    (fill_vals, None, {
        'cols': days,
    }),  
    (add_pct_change, None, None),
    (log_transform, 
        ['day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'days_since_release', 'Next Full Week Streams'], 
        {'prediction_col': 'Next Full Week Streams'}
    )
]

X_train, Y_train = feature_engine_pipe(train, pipelines, 'log Next Full Week Streams')
X_valid, Y_valid = feature_engine_pipe(valid, pipelines, 'log Next Full Week Streams')
X_test, Y_test = feature_engine_pipe(test, pipelines, 'log Next Full Week Streams')

In [None]:
# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pca = PCA()

# Define a Standard Scaler to normalize inputs
scaler = StandardScaler()

svr_pipe = Pipeline([
    ("scaler", scaler),
    ("pca", pca),
    ("regressor", SVR())
])


svr_params = {
    "pca__n_components": [3, 5, 7, 10],
    "regressor__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "regressor__gamma": ['scale', 1, 0.1, 0.01, 1],
    "regressor__C": [0.1, 1, 10, 100, 1000],
    "regressor__epsilon": [0.01, 0.1, 1, 10]
}

svr_search = RandomizedSearchCV(svr_pipe, svr_params, cv=5, n_iter=100, scoring='neg_mean_absolute_error', n_jobs=-1)

svr_search.fit(X_train, Y_train)

In [None]:

print("Best epsilon parameter:", grid.best_estimator_)



In [213]:
data = pd.read_csv('../csv_files/alltime_socials.csv')
data = data.sample(200000, random_state=42)
print(data.columns)

Index(['ARTIST', 'TITLE', 'UNIFIED_SONG_ID', 'DATE', 'RELEASE_DATE',
       'THIS_DAY', 'THIS_WEEK', 'DAYS_SINCE_RELEASE', 'DAY_OF_WEEK', 'RN',
       'END_OF_WEEK', 'POPULARITY', 'SPOTIFY_FOLLOWER_COUNT',
       'SPOTIFY_LISTENERS_COUNT', 'SPFOLDAY0', 'SPFOLDAY1', 'SPFOLDAY2',
       'SPFOLDAY3', 'SPFOLDAY4', 'SPFOLDAY5', 'SPFOLDAY6', 'SPLISDAY0',
       'SPLISDAY1', 'SPLISDAY2', 'SPLISDAY3', 'SPLISDAY4', 'SPLISDAY5',
       'SPLISDAY6', 'DAY_0', 'DAY_1', 'DAY_2', 'DAY_3', 'DAY_4', 'DAY_5',
       'DAY_6', 'TARGET', 'IG_FOLLOWER_COUNT', 'IGFOLDAY0', 'IGFOLDAY1',
       'IGFOLDAY2', 'IGFOLDAY3', 'IGFOLDAY4', 'IGFOLDAY5', 'IGFOLDAY6'],
      dtype='object')


In [214]:
print(data.isnull().sum())

ARTIST                        21
TITLE                          2
UNIFIED_SONG_ID                0
DATE                           0
RELEASE_DATE                   0
THIS_DAY                       0
THIS_WEEK                   3137
DAYS_SINCE_RELEASE             0
DAY_OF_WEEK                    0
RN                             0
END_OF_WEEK                    0
POPULARITY                     0
SPOTIFY_FOLLOWER_COUNT      2832
SPOTIFY_LISTENERS_COUNT     3768
SPFOLDAY0                   2832
SPFOLDAY1                   2939
SPFOLDAY2                   3092
SPFOLDAY3                   3062
SPFOLDAY4                   3066
SPFOLDAY5                   3667
SPFOLDAY6                   3660
SPLISDAY0                   3768
SPLISDAY1                   3912
SPLISDAY2                   4058
SPLISDAY3                   4031
SPLISDAY4                   4034
SPLISDAY5                   4631
SPLISDAY6                   4617
DAY_0                          0
DAY_1                        210
DAY_2     

In [215]:
train_valid, test = train_test_split(data, test_size=0.2, random_state=42)
train, valid = train_test_split(train_valid, test_size=0.2, random_state=42)

features = [
    'days_since_release', 'popularity', 'spotify_follower_count', 'spotify_listeners_count', 
    'spfolday0', 'spfolday1', 'spfolday2', 'spfolday3', 'spfolday4', 'spfolday5', 'spfolday6',
    'splisday0', 'splisday1', 'splisday2', 'splisday3', 'splisday4', 'splisday5', 'splisday6',
    'day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'ig_follower_count',
    'igfolday0', 'igfolday1', 'igfolday2', 'igfolday3', 'igfolday4', 'igfolday5', 'igfolday6', 
]

def rename_columns(data):
    data.columns = data.columns.str.lower()
    data = data.rename(columns={"target": "Next Full Week Streams", "next full week streams": "Next Full Week Streams"})
    return data

def select_cols(data, cols):
    temp = data.copy()
    return temp[cols]
    
train_temp = rename_columns(train)
train_temp = select_columns(train_temp, features + ['Next Full Week Streams'])
train_temp

Unnamed: 0,days_since_release,popularity,spotify_follower_count,spotify_listeners_count,spfolday0,spfolday1,spfolday2,spfolday3,spfolday4,spfolday5,...,day_6,ig_follower_count,igfolday0,igfolday1,igfolday2,igfolday3,igfolday4,igfolday5,igfolday6,Next Full Week Streams
1189005,8521,46,2424250.0,6473592.0,2424250.0,2423722.0,2423722.0,2423175.0,2421900.0,2420685.0,...,8766.0,251170.0,251170.0,250834.0,250880.0,250693.0,250649.0,250555.0,250278.0,53374
1448931,3184,0,4672508.0,10830717.0,4672508.0,93284.0,93276.0,4670771.0,93248.0,93236.0,...,5719.0,54920.0,54920.0,4839038.0,54924.0,4834543.0,4834542.0,54917.0,54915.0,32985
1425601,2831,14,7056606.0,13695199.0,7056606.0,13428329.0,7050583.0,7047680.0,13400281.0,7041715.0,...,27631.0,,,23382435.0,23381194.0,23379610.0,,23375501.0,23372583.0,183760
1161592,2746,41,24287047.0,33081976.0,24287047.0,24285274.0,24278175.0,205095.0,24271247.0,205036.0,...,1302.0,8708360.0,8708360.0,8708539.0,66837.0,8709906.0,66836.0,66834.0,66833.0,9496
2035914,287,49,19151853.0,13763370.0,19151853.0,19145859.0,19139516.0,19135839.0,19132161.0,19118622.0,...,39936.0,14409578.0,14409578.0,14408721.0,14407561.0,14406990.0,14405374.0,14404402.0,14404376.0,257708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1834564,7,40,7555891.0,15663195.0,7555891.0,7550349.0,7544685.0,7539007.0,7531727.0,7519638.0,...,2278802.0,10384718.0,10384718.0,10384615.0,10384513.0,10384410.0,10384308.0,10384205.0,10384103.0,4203388
1357389,8906,35,2143608.0,4571800.0,2143608.0,2142605.0,2141562.0,2140397.0,2139144.0,2137985.0,...,1226.0,144482.0,144482.0,144315.0,144216.0,144129.0,144082.0,144092.0,143943.0,7514
1691106,657,20,1014863.0,4748954.0,1014863.0,1014121.0,1013410.0,1012657.0,1011770.0,1010882.0,...,1055.0,377863.0,377863.0,377822.0,377839.0,377880.0,377822.0,377776.0,377813.0,7497
778625,128,1,24711.0,227119.0,24711.0,24664.0,24604.0,24562.0,24502.0,24451.0,...,1394.0,159754.0,159754.0,159778.0,159801.0,159825.0,159848.0,159872.0,159895.0,9806


In [216]:
sp_fols = features[4:11]
sp_lis = features[11:18]
days = features[18:25]
ig_fols = features[26:33]

train_temp = interpolate_vals(train_temp, sp_fols)
train_temp = interpolate_vals(train_temp, sp_lis)
train_temp = interpolate_vals(train_temp, days)
train_temp = interpolate_vals(train_temp, ig_fols)      

In [217]:
train_temp.isnull().sum()

days_since_release            0
popularity                    0
spotify_follower_count     1814
spotify_listeners_count    2415
spfolday0                     0
spfolday1                     0
spfolday2                     0
spfolday3                     0
spfolday4                     0
spfolday5                     0
spfolday6                     0
splisday0                     0
splisday1                     0
splisday2                     0
splisday3                     0
splisday4                     0
splisday5                     0
splisday6                     0
day_0                         0
day_1                         0
day_2                         0
day_3                         0
day_4                         0
day_5                         0
day_6                         0
ig_follower_count          9485
igfolday0                     0
igfolday1                     0
igfolday2                     0
igfolday3                     0
igfolday4                     0
igfolday

In [218]:

null_cols = ['days_since_release', 'spotify_follower_count', 'spotify_listeners_count', 'ig_follower_count']

def fill_mode(data, cols):
    data[cols] = data[cols].replace(0, np.nan)
    data[cols] = data[cols].apply(lambda col: col.fillna(col.mode()[0]))
    return data

train_temp = fill_mode(train_temp, null_cols)
train_temp.isnull().sum()

days_since_release         0
popularity                 0
spotify_follower_count     0
spotify_listeners_count    0
spfolday0                  0
spfolday1                  0
spfolday2                  0
spfolday3                  0
spfolday4                  0
spfolday5                  0
spfolday6                  0
splisday0                  0
splisday1                  0
splisday2                  0
splisday3                  0
splisday4                  0
splisday5                  0
splisday6                  0
day_0                      0
day_1                      0
day_2                      0
day_3                      0
day_4                      0
day_5                      0
day_6                      0
ig_follower_count          0
igfolday0                  0
igfolday1                  0
igfolday2                  0
igfolday3                  0
igfolday4                  0
igfolday5                  0
igfolday6                  0
Next Full Week Streams     0
dtype: int64

In [220]:
def log_transform(data, *cols, prediction_col):
    copy = data.copy()
    for col in cols:
        copy[col] = np.log(copy[col])
        copy = copy.rename(columns={col: "log " + col})
    return copy[[c for c in copy if c != ('log ' + prediction_col)] + ['log ' + prediction_col]]


train_log = log_transform(
    train_temp, 
    *(days + sp_fols + sp_lis + ig_fols + ['days_since_release', 'spotify_follower_count', 'spotify_listeners_count', 'ig_follower_count', 'Next Full Week Streams']),
    prediction_col='Next Full Week Streams'
)

train_log

Unnamed: 0,log days_since_release,popularity,log spotify_follower_count,log spotify_listeners_count,log spfolday0,log spfolday1,log spfolday2,log spfolday3,log spfolday4,log spfolday5,...,log day_6,log ig_follower_count,log igfolday0,log igfolday1,log igfolday2,log igfolday3,log igfolday4,log igfolday5,log igfolday6,log Next Full Week Streams
1189005,9.050289,46.000000,14.701033,15.683242,14.701033,14.700815,14.700815,14.700589,14.700063,14.699561,...,9.078636,12.433885,12.433885,12.432547,12.432730,12.431984,12.431809,12.431434,12.430328,10.885079
1448931,8.065894,26.913781,15.357207,16.197897,15.357207,11.443404,11.443318,15.356835,11.443018,11.442889,...,8.651549,10.913633,10.913633,15.392226,10.913706,15.391297,15.391297,10.913578,10.913542,10.403808
1425601,7.948385,14.000000,15.769475,16.432556,15.769475,16.412877,15.768621,15.768209,16.410786,15.767362,...,10.226694,16.167817,16.967496,16.967496,16.967443,16.967375,16.967287,16.967199,16.967074,12.121386
1161592,7.917901,41.000000,17.005454,17.314499,17.005454,17.005381,17.005088,12.231229,17.004803,12.230941,...,7.171657,15.979794,15.979794,15.979815,11.110012,15.979972,11.109997,11.109967,11.109952,9.158626
2035914,5.659482,49.000000,16.767910,16.437521,16.767910,16.767597,16.767266,16.767074,16.766881,16.766173,...,10.595033,16.483404,16.483404,16.483344,16.483264,16.483224,16.483112,16.483044,16.483043,12.459582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1834564,1.945910,40.000000,15.837838,16.566824,15.837838,15.837104,15.836354,15.835601,15.834635,15.833029,...,14.639160,16.155846,16.155846,16.155836,16.155826,16.155816,16.155806,16.155796,16.155787,15.251401
1357389,9.094480,35.000000,14.578001,15.335418,14.578001,14.577533,14.577046,14.576502,14.575916,14.575374,...,7.111512,11.880910,11.880910,11.879754,11.879067,11.878464,11.878138,11.878207,11.877173,8.924523
1691106,6.487684,20.000000,13.830264,15.373435,13.830264,13.829533,13.828831,13.828088,13.827212,13.826334,...,6.961296,12.842287,12.842287,12.842178,12.842223,12.842332,12.842178,12.842057,12.842155,8.922258
778625,4.852030,1.000000,10.115004,12.333229,10.115004,10.113100,10.110664,10.108956,10.106510,10.104426,...,7.239933,11.981390,11.981390,11.981541,11.981685,11.981835,11.981979,11.982129,11.982273,9.190750


In [212]:
pipelines = [
    (rename_columns, None, None),
    (select_cols, None, {
        'cols': features + ['Next Full Week Streams']
    }),
    (interpolate_vals, None, {
        'cols': days,
    }),
    (interpolate_vals, None, {
        'cols': sp_fols,
    }), 
    (interpolate_vals, None, {
        'cols': sp_lis,
    }),
    (interpolate_vals, None, {
        'cols': ig_fols,
    }), 
    (fill_mode, None, {
        'cols': ['days_since_release', 'spotify_follower_count', 'spotify_listeners_count', 'ig_follower_count'],
    }),
    (log_transform, 
        *(days + sp_fols + sp_lis + ig_fols + ['days_since_release', 'spotify_follower_count', 'spotify_listeners_count', 'ig_follower_count', 'Next Full Week Streams']),
        {'prediction_col': 'Next Full Week Streams'}
    )
]



X_train, Y_train = feature_engine_pipe(train, pipelines, 'log Next Full Week Streams')
X_valid, Y_valid = feature_engine_pipe(valid, pipelines, 'log Next Full Week Streams')

ValueError: too many values to unpack (expected 3)

In [221]:
train

Unnamed: 0,artist,title,unified_song_id,date,release_date,this_day,this_week,days_since_release,day_of_week,rn,...,day_6,target,ig_follower_count,igfolday0,igfolday1,igfolday2,igfolday3,igfolday4,igfolday5,igfolday6
1189005,Matchbox Twenty,Mad Season,33642633,2023-09-21,2000-05-23 00:00:00.000 Z,8318,56328.0,8521,6,1,...,8766.0,53374,251170.0,251170.0,250834.0,250880.0,250693.0,250649.0,250555.0,250278.0
1448931,Thomas Rhett,I Feel Good,59109069,2024-05-30,2015-09-11 00:00:00.000 Z,4900,34607.0,3184,6,1,...,5719.0,32985,54920.0,54920.0,4839038.0,54924.0,4834543.0,4834542.0,54917.0,54915.0
1425601,Meek Mill,JUMP OUT THE FACE,46547075,2023-03-30,2015-06-29 00:00:00.000 Z,28073,168087.0,2831,6,1,...,27631.0,183760,,,23382435.0,23381194.0,23379610.0,,23375501.0,23372583.0
1161592,Twenty One Pilots,HEATHENS,29966831,2024-06-27,2016-12-20 00:00:00.000 Z,1429,9169.0,2746,6,1,...,1302.0,9496,8708360.0,8708360.0,8708539.0,66837.0,8709906.0,66836.0,66834.0,66833.0
2035914,Melanie Martinez,Battle Of The Larynx,201689120,2024-01-11,2023-03-30 00:00:00.000 Z,39914,274985.0,287,6,1,...,39936.0,257708,14409578.0,14409578.0,14408721.0,14407561.0,14406990.0,14405374.0,14404402.0,14404376.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1834564,YoungBoy Never Broke Again,I Can't Take It Back,158938546,2021-09-30,2021-09-23 00:00:00.000 Z,847659,1525.0,7,6,1,...,2278802.0,4203388,10384718.0,10384718.0,10384615.0,10384513.0,10384410.0,10384308.0,10384205.0,10384103.0
1357389,Stone Temple Pilots,ART SCHOOL GIRL,38005895,2020-08-13,1996-03-26 00:00:00.000 Z,1107,7700.0,8906,6,1,...,1226.0,7514,144482.0,144482.0,144315.0,144216.0,144129.0,144082.0,144092.0,143943.0
1691106,Honne,Day 1 ◑ (Late Night Version),95114288,2020-10-08,2018-12-21 00:00:00.000 Z,1070,8192.0,657,6,1,...,1055.0,7497,377863.0,377863.0,377822.0,377839.0,377880.0,377822.0,377776.0,377813.0
778625,Teejay3k,Energy,126063620,2020-12-03,2020-07-28 00:00:00.000 Z,1412,10473.0,128,6,1,...,1394.0,9806,159754.0,159754.0,159778.0,159801.0,159825.0,159848.0,159872.0,159895.0


In [232]:
abc = (train.pipe(rename_columns)
 .pipe(select_cols, cols=(features + ['Next Full Week Streams']))
 .pipe(interpolate_vals, cols=days)
 .pipe(interpolate_vals, cols=sp_fols)
 .pipe(interpolate_vals, cols=sp_lis)
 .pipe(interpolate_vals, cols=ig_fols)
 .pipe(fill_mode, cols=['days_since_release', 'spotify_follower_count', 'spotify_listeners_count', 'ig_follower_count'])
 .pipe(
    log_transform, 
    *(days + sp_fols + sp_lis + ig_fols + ['days_since_release', 'spotify_follower_count', 'spotify_listeners_count', 'ig_follower_count', 'Next Full Week Streams']),
    prediction_col='Next Full Week Streams'
    )
)

# new_features = features = [
#     'log days_since_release', 'popularity', 'log spotify_follower_count', 'log spotify_listeners_count', 
#     'log spfolday0', 'log spfolday1', 'log spfolday2', 'log spfolday3', 'log pfolday4', 'log spfolday5', 'log spfolday6',
#     'log splisday0', 'log splisday1', 'log splisday2', 'log splisday3', 'log splisday4', 'log splisday5', 'log splisday6',
#     'day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'ig_follower_count',
#     'log igfolday0', 'log igfolday1', 'log igfolday2', 'log igfolday3', 'log igfolday4', 'log igfolday5', 'log igfolday6', 
# ]

abc.columns


KeyError: "['log days_since_release', 'log spotify_follower_count', 'log spotify_listeners_count', 'log spfolday0', 'log spfolday1', 'log spfolday2', 'log spfolday3', 'log pfolday4', 'log spfolday5', 'log spfolday6', 'log splisday0', 'log splisday1', 'log splisday2', 'log splisday3', 'log splisday4', 'log splisday5', 'log splisday6', 'log igfolday0', 'log igfolday1', 'log igfolday2', 'log igfolday3', 'log igfolday4', 'log igfolday5', 'log igfolday6'] not in index"

In [223]:
# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pca = PCA()

# Define a Standard Scaler to normalize inputs
scaler = StandardScaler()

lr_pipe = Pipeline([
     ("scaler", scaler), 
     ("pca", pca), 
     ("regressor", Lasso())
])

# Parameters of pipelines can be set using '__' separated parameter names:
lr_params = {
    "pca__n_components": [2, 4, 6, 8, 10],
    "regressor": [Ridge(), Lasso()],
    "regressor__fit_intercept": [True, False],
    "regressor__alpha": [0.1, 1.0, 10]
}

search = GridSearchCV(lr_pipe, lr_params, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
search.fit(abc[features], abc['log next full week streams'])


KeyError: "['days_since_release', 'spotify_follower_count', 'spotify_listeners_count', 'spfolday0', 'spfolday1', 'spfolday2', 'spfolday3', 'spfolday4', 'spfolday5', 'spfolday6', 'splisday0', 'splisday1', 'splisday2', 'splisday3', 'splisday4', 'splisday5', 'splisday6', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'ig_follower_count', 'igfolday0', 'igfolday1', 'igfolday2', 'igfolday3', 'igfolday4', 'igfolday5', 'igfolday6'] not in index"