In [1]:
# Importing necessary libraries
import pymongo
from datetime import datetime, timedelta
from pymongo import InsertOne, DeleteOne, ReplaceOne, UpdateOne
import pandas as pd
import numpy as np  
from pandas import json_normalize
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [2]:
# Importing song data from the database
import os
from urllib.parse import quote_plus

# Credentials are read from the environment so they are never stored in the
# notebook. Set MONGO_USER and MONGO_PASSWORD before starting the kernel; a
# missing variable raises KeyError naming the variable.
# NOTE(review): the password that used to be hard-coded here should be rotated.
# quote_plus percent-escapes characters that are reserved in a MongoDB URI.
mongoUser = quote_plus(os.environ["MONGO_USER"])
mongoPassword = quote_plus(os.environ["MONGO_PASSWORD"])

client = pymongo.MongoClient("mongodb+srv://" + mongoUser + ":" + mongoPassword + "@mvp-bvqf2.mongodb.net/test?retryWrites=true&w=majority")
connectedDB = client['MVP']

# Names of the collections read by the cells below
songCollectionName = "Videos"
artistCollectionName = "Artists"

In [3]:
# Reference day (dd/mm/YYYY); only songs that have a 'views' entry for this
# day and a youtubeId are fetched from the database.
useDate = '24/04/2020'
query_result = list(
    connectedDB[songCollectionName].find({
        'youtubeId': {'$exists': 1},
        'views.' + useDate: {'$exists': 1},
    })
)

# Nested documents are flattened into dot-separated column names
initialSongsDf = pd.json_normalize(query_result)
print("Shape of incoming data", initialSongsDf.shape)

Shape of incoming data (39241, 142)


In [4]:
# Load every artist document and index it by _id -> [popularity, followers]
artistResult = list(connectedDB[artistCollectionName].find({}))
artistDictionary = dict(
    (artistDoc['_id'], [artistDoc['popularity'], artistDoc['followers']])
    for artistDoc in artistResult
)

def combineMultipleArtistInfo(row):
    """Collect popularity and follower values for every artist of one song.

    row: mapping with an 'artists' entry, an iterable of dicts that each
         carry an 'id' key present in the module-level artistDictionary.

    Returns a tuple (popularities, followers) of two lists, in the same
    order as row['artists']. Raises KeyError if an artist id is not in
    artistDictionary.
    """
    # One lookup per artist; each entry is [popularity, followers]
    artistStats = [artistDictionary[songArtist['id']] for songArtist in row['artists']]
    popularities = [stats[0] for stats in artistStats]
    followers = [stats[1] for stats in artistStats]
    return popularities, followers

In [5]:
# Row-wise: each song yields (popularities, followers); 'expand' spreads the
# pair over two columns, which are then named and attached by index.
artistInfoDf = initialSongsDf.apply(
    combineMultipleArtistInfo,
    axis=1,
    result_type='expand',
)
artistInfoDf.columns = ['artistsPopularity', 'artistsFollowers']
initialSongsDf = initialSongsDf.merge(
    artistInfoDf,
    left_index=True,
    right_index=True,
)

In [1]:
# Number of days after useDate that are used; the last one is the target day
numDays = 2

# Audio feature columns taken from the song documents
musicFeatureColumns = [
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'danceability',
]

# Per-song lists of artist statistics (one entry per artist)
artistRelatedColumns = [
    'artistsPopularity',
    'artistsFollowers',
]

In [2]:
# Keep only the columns for useDate and the numDays days after it
useDateFormatted = datetime.strptime(useDate, "%d/%m/%Y")

# dd/mm/YYYY labels for day 0 .. day numDays (the last one is the target day)
dayLabels = [
    (useDateFormatted + timedelta(days=offset)).strftime('%d/%m/%Y')
    for offset in range(numDays + 1)
]

# View counts are needed for every day, including the target day
viewsColumns = ['views.' + dayLabel + '.viewCount' for dayLabel in dayLabels]

# Engagement data is grouped per day; the target day is left out
engagementRelatedColumns = [
    ['views.' + dayLabel + '.' + metric
     for metric in ('spotifyPopularity', 'likeCount', 'dislikeCount', 'commentCount')]
    for dayLabel in dayLabels[:-1]
]
engagementRelatedColumnsFlattened = [
    columnName
    for dayColumns in engagementRelatedColumns
    for columnName in dayColumns
]

# Select the needed columns, then discard every row that has a null in any of them
essentialSongsDf = initialSongsDf[
    musicFeatureColumns
    + artistRelatedColumns
    + engagementRelatedColumnsFlattened
    + viewsColumns
].dropna()

print("Shape after selecting the essential columns and dropping null values ", essentialSongsDf.shape)

NameError: name 'datetime' is not defined

In [9]:
# Replace the date in each column name by its day offset, e.g.
# 'views.<date>.viewCount' -> 'day0.viewCount'
viewsMapping = {
    viewsColumn: 'day' + str(dayIndex) + '.' + viewsColumn.rsplit('.', 1)[-1]
    for dayIndex, viewsColumn in enumerate(viewsColumns)
}
engagementMapping = {
    engagementColumn: 'day' + str(dayIndex) + '.' + engagementColumn.rsplit('.', 1)[-1]
    for dayIndex, dayColumns in enumerate(engagementRelatedColumns)
    for engagementColumn in dayColumns
}
essentialSongsDf = (
    essentialSongsDf
    .rename(columns=viewsMapping)
    .rename(columns=engagementMapping)
)
print("Shape after selecting the essential columns and dropping null values ", essentialSongsDf.shape)

Shape after selecting the essential columns and dropping null values  (38677, 23)


In [None]:
# TODO: insert the logic for condensing the per-artist popularity and follower lists

In [12]:
# Cast every column except the artist list columns to float64
changeType = {
    column: 'float64'
    for column in essentialSongsDf.columns
    if column not in artistRelatedColumns
}
essentialSongsDf = essentialSongsDf.astype(changeType)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38677 entries, 0 to 39240
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   energy                  38677 non-null  float64
 1   key                     38677 non-null  float64
 2   loudness                38677 non-null  float64
 3   mode                    38677 non-null  float64
 4   speechiness             38677 non-null  float64
 5   acousticness            38677 non-null  float64
 6   instrumentalness        38677 non-null  float64
 7   liveness                38677 non-null  float64
 8   valence                 38677 non-null  float64
 9   tempo                   38677 non-null  float64
 10  artistsPopularity       38677 non-null  object 
 11  artistsFollowers        38677 non-null  object 
 12  day0.spotifyPopularity  38677 non-null  float64
 13  day0.likeCount          38677 non-null  float64
 14  day0.dislikeCount       38677 non-null

In [16]:
# Keep only rows whose view counts are positive on all three days and
# strictly increase from day to day (unchanged counts are dropped as well,
# because the comparisons are strict).
essentialSongsDf = essentialSongsDf.loc[
    lambda songs: (songs['day0.viewCount'] > 0)
    & (songs['day1.viewCount'] > 0)
    & (songs['day2.viewCount'] > 0)
    & (songs['day2.viewCount'] > songs['day1.viewCount'])
    & (songs['day1.viewCount'] > songs['day0.viewCount'])
]
print("Shape after dropping 0 values ", essentialSongsDf.shape)

Shape after dropping 0 values  (37237, 25)


In [17]:
# Day-over-day growth of the view count, in percent of the earlier day:
# day0 -> day1 is a feature, day1 -> day2 is the prediction target.
essentialSongsDf['first_percentage_increase'] = (
    essentialSongsDf['day1.viewCount']
    .sub(essentialSongsDf['day0.viewCount'])
    .mul(100)
    .div(essentialSongsDf['day0.viewCount'])
)
essentialSongsDf['target_percentage_increase'] = (
    essentialSongsDf['day2.viewCount']
    .sub(essentialSongsDf['day1.viewCount'])
    .mul(100)
    .div(essentialSongsDf['day1.viewCount'])
)
essentialSongsDf.describe()

Unnamed: 0,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,day0.commentCount,day1.spotifyPopularity,day1.likeCount,day1.dislikeCount,day1.commentCount,day0.viewCount,day1.viewCount,day2.viewCount,first_percentage_increase,target_percentage_increase
count,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,...,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0,37237.0
mean,0.698395,5.275747,-6.842183,0.627575,0.08218,0.186374,0.150753,0.197056,0.457492,123.804695,...,5635.44,40.602707,110578.3,5348.353,5637.395,19000260.0,19008350.0,19015960.0,0.215893,0.191826
std,0.216467,3.572953,3.768831,0.483457,0.083347,0.259626,0.295412,0.158399,0.236294,29.350651,...,49354.38,16.722293,556862.6,73104.11,49363.93,112139900.0,112173200.0,112204800.0,0.914882,0.68572
min,0.000943,0.0,-39.194,0.0,0.0223,0.0,0.0,0.013,0.0,34.543,...,0.0,0.0,0.0,0.0,0.0,3.0,5.0,6.0,0.000558,4.9e-05
25%,0.555,2.0,-8.435,0.0,0.0355,0.00677,0.0,0.0966,0.272,100.033,...,27.0,30.0,769.0,18.0,27.0,65259.0,65409.0,65555.0,0.031301,0.029182
50%,0.737,5.0,-6.004,1.0,0.0496,0.0559,9.7e-05,0.131,0.445,123.558,...,194.0,42.0,4979.0,120.0,194.0,491396.0,492546.0,493148.0,0.065941,0.061696
75%,0.879,8.0,-4.342,1.0,0.0883,0.266,0.0701,0.262,0.634,143.852,...,1309.0,52.0,33521.0,820.0,1309.0,3858964.0,3864626.0,3873022.0,0.182149,0.168748
max,0.999,11.0,1.893,1.0,0.953,0.996,0.997,1.0,1.0,222.605,...,5040410.0,96.0,37205870.0,11260880.0,5040935.0,6730952000.0,6732749000.0,6734501000.0,66.666667,58.823529


In [52]:
from sklearn.preprocessing import RobustScaler

# Scale only the numeric columns. artistsPopularity / artistsFollowers hold
# Python lists (object dtype) and cannot be converted to floats, so passing
# the whole frame to the scaler fails.
numericSongsDf = essentialSongsDf.select_dtypes(include='number')

# NOTE(review): the scaler is fitted on every row and on the target column
# before the train/test split, so test-set statistics leak into the scaling.
# Consider fitting on the training rows only.
robust_scaler = RobustScaler()
x_scaled = robust_scaler.fit_transform(numericSongsDf)

# fit_transform returns a bare array; restore the column names.
# The result gets a fresh 0..n-1 index.
scaledDf = pd.DataFrame(x_scaled, columns=numericSongsDf.columns)
scaledDf

Unnamed: 0,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,day0Views,day1Views,day2Views,day0Popularity,day1Popularity,first_percentage_increase,target_percentage_increase
0,-0.563467,0.166667,-0.426370,0.0,4.060606,2.484764,-0.001375,-0.139057,-0.080110,0.129108,29.956766,29.911073,29.875954,1.363636,1.363636,-0.185605,-0.110675
1,0.046440,-0.833333,-0.017123,0.0,2.886364,1.678624,-0.001352,-0.275091,-0.290055,-1.012275,5.312234,5.302151,5.293494,0.272727,0.272727,-0.407475,-0.408854
2,0.523220,-0.833333,0.346624,-1.0,-0.001894,-0.214649,-0.001296,-0.301088,0.544199,-0.173624,-0.076031,-0.076115,-0.076246,0.318182,0.318182,-0.156895,-0.180587
3,-0.705882,0.333333,-1.193738,0.0,-0.297348,1.528196,-0.001403,-0.281741,0.668508,-0.403274,-0.114199,-0.114220,-0.114286,0.272727,0.272727,0.125992,0.572668
4,0.009288,1.000000,-0.161448,0.0,-0.465909,-0.160842,-0.001258,0.973398,0.281768,-0.426022,-0.084815,-0.084882,-0.084993,-0.363636,-0.363636,-0.105910,-0.029275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37721,-0.315789,-0.500000,-0.734344,-1.0,-0.293561,-0.199799,0.828758,1.457074,-0.967680,-0.768273,-0.126807,-0.126812,-0.126877,-0.863636,-0.863636,0.603657,0.207576
37722,0.656347,1.000000,0.007828,-1.0,-0.009470,0.344056,0.002214,0.937122,-0.138122,-0.471428,-0.006886,-0.007089,-0.007325,-0.363636,-0.363636,-0.251396,-0.281955
37723,-0.668731,-0.666667,-0.129159,-1.0,-0.238636,1.115483,-0.001403,0.459492,-0.165746,0.688575,-0.129888,-0.129892,-0.129954,-1.681818,-1.681818,4.432487,1.297069
37724,0.532508,0.333333,1.169276,-1.0,2.791667,-0.177428,-0.001275,-0.151149,-0.455801,0.099926,0.043649,0.043343,0.043024,-0.590909,-0.590909,-0.331208,-0.334262


In [53]:
# 80/20 train/test split with a fixed seed. Features are columnsToKeep plus
# the day0 -> day1 growth; the target is the day1 -> day2 growth.
# NOTE(review): columnsToKeep is not assigned in any cell visible above —
# confirm where it is defined.
X_train, X_test, Y_train, Y_test = train_test_split(
    scaledDf.loc[:, [*columnsToKeep, 'first_percentage_increase']],
    scaledDf.loc[:, 'target_percentage_increase'],
    random_state=42,
    test_size=0.2,
)


In [54]:
from sklearn.metrics import mean_squared_error

# Naive baseline on the training split: use the (scaled) day0 -> day1 growth
# directly as the prediction for the (scaled) day1 -> day2 growth.
mean_squared_error(y_true=Y_train, y_pred=X_train['first_percentage_increase'])

15.381709824750445

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Grid-search the ridge regularisation strength with 10-fold CV, scored by R^2
# NOTE(review): in the recorded output the best alpha (1000) sits on the upper
# edge of the grid — consider extending the grid.
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'alpha': alpha}

ridge = Ridge(alpha=1)
grid = GridSearchCV(
    ridge,
    param_grid,
    scoring='r2',
    cv=10,
    verbose=1,
)
grid_result = grid.fit(X_train, Y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)




# Lasso fit on the training split. Later cells read `lasso_regressor`
# (its coefficients) and `Y_pred` (training-set predictions), so both must be
# assigned by live code for the notebook to run from a fresh kernel.
lasso_regressor = Lasso(alpha=1)
lasso_regressor.fit(X_train, Y_train)

# Predictions on the training rows
Y_pred = lasso_regressor.predict(X_train)

Fitting 10 folds for each of 7 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best Score:  0.629727153970575
Best Params:  {'alpha': 1000}


[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:    0.3s finished


In [58]:
# Training-set MSE of the predictions held in Y_pred (Y_pred must have been
# produced by an earlier cell from X_train)
mean_squared_error(y_true=Y_train, y_pred=Y_pred)

10.178082032686543

In [44]:
# Show the feature columns used for training, in order
[*columnsToKeep, 'first_percentage_increase']

['energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'first_percentage_increase']

In [60]:
# Display the fitted coefficients of lasso_regressor (presumably the
# Lasso(alpha=1) model — confirm). In the recorded output only the last
# coefficient is non-zero.
lasso_regressor.coef_

array([-0.        ,  0.        , -0.        ,  0.        , -0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
        0.61423437])