In [1]:
# Importing necessary libraries
import pymongo
from datetime import datetime, timedelta
from pymongo import InsertOne, DeleteOne, ReplaceOne, UpdateOne
import pandas as pd
import numpy as np  
from pandas.io.json import json_normalize
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [2]:
# Importing track data from the database
client = pymongo.MongoClient("mongodb+srv://" + "vidit23" + ":" + "dsba123" + "@mvp-bvqf2.mongodb.net/test?retryWrites=true&w=majority")
connectedDB = client['MVP']
songCollectionName = "Videos"
artistCollectionName = "Artists"

In [3]:
# Getting the data from the DB, check if useDate is present in the records fetched
useDate = '24/04/2020'
query_result = list(connectedDB[songCollectionName].find({'youtubeId': {'$exists': 1}, 
                                                          'views.' + useDate: {'$exists': 1}}))

# Flatten the structure of the dataframe due to nested objects in database
initialSongsDf = pd.json_normalize(query_result)
print("Shape of incoming data", initialSongsDf.shape)

Shape of incoming data (39241, 166)


In [4]:
# Fetching all artist information
artistResult = list(connectedDB[artistCollectionName].find({}))
artistDictionary = {artist['_id'] : [artist['popularity'], artist['followers']] for artist in artistResult}

# Function to coalesce multiple artist into one
def combineMultipleArtistInfo(row):
    popularities = []
    followers = []
    for songArtist in row['artists']:
        popularities.append(artistDictionary[songArtist['id']][0])
        followers.append(artistDictionary[songArtist['id']][1])
    return popularities, followers

In [5]:
# Finding corresponding artist for each track
artistInfoDf = initialSongsDf.apply(lambda x: combineMultipleArtistInfo(x), axis=1, result_type='expand')
artistInfoDf.columns = ['artistsPopularity', 'artistsFollowers']

In [6]:
# Different combining functions based on artists
artistInfoDf['artistPopularitySum'] = artistInfoDf['artistsPopularity'].apply(np.sum)
artistInfoDf['artistPopularityMax'] = artistInfoDf['artistsPopularity'].apply(np.max)
artistInfoDf['artistFollowerSum'] = artistInfoDf['artistsFollowers'].apply(np.sum)
artistInfoDf['artistFollowerMax'] = artistInfoDf['artistsFollowers'].apply(np.max)
artistInfoDf['numArtists'] = artistInfoDf['artistsPopularity'].apply(np.size)

In [7]:
# Combining track and artist information
initialSongsDf = initialSongsDf.merge(artistInfoDf,left_index=True, right_index=True)

In [8]:
numDays = 2
musicFeatureColumns = ['energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                       'instrumentalness', 'liveness', 'valence', 'tempo', 'danceability']

artistRelatedColumns = ['artistPopularitySum', 'artistPopularityMax','numArtists','artistFollowerSum','artistFollowerMax']

In [9]:
# Dropping the unnecessary columns of previous and future days
useDateFormatted = datetime.strptime(useDate, "%d/%m/%Y")
viewsColumns = []
engagementRelatedColumns = []

for addDay in range(numDays+1):
    deltaDay = (useDateFormatted + timedelta(days=addDay)).strftime('%d/%m/%Y')
    viewsColumns += ['views.' + deltaDay + '.viewCount']
    engagementRelatedColumns += [['views.' + deltaDay + '.spotifyPopularity', 
                                  'views.' + deltaDay + '.likeCount', 
                                  'views.' + deltaDay + '.dislikeCount', 
                                  'views.' + deltaDay + '.commentCount']]

# Removing the target day related engagement data
engagementRelatedColumns = engagementRelatedColumns[:-1]
engagementRelatedColumnsFlattened = [item for sublist in engagementRelatedColumns for item in sublist]

essentialSongsDf = initialSongsDf[musicFeatureColumns + artistRelatedColumns + 
                                  engagementRelatedColumnsFlattened + viewsColumns]

# Dropping all the null rows with null values
essentialSongsDf = essentialSongsDf.dropna()

print("Shape after selecting the essential columns and dropping null values ", essentialSongsDf.shape)

Shape after selecting the essential columns and dropping null values  (38677, 27)


In [10]:
# Renaming all the columns that have dates in them to standard names
viewsMapping = { col: ('day' + str(index) + '.' + col.split('.')[-1]) for index, col in enumerate(viewsColumns)}
engagementMapping = { colName: 'day' + str(index) + '.' + colName.split('.')[-1] for index, cols in enumerate(engagementRelatedColumns) for colName in cols }
essentialSongsDf = essentialSongsDf.rename(columns = viewsMapping)
essentialSongsDf = essentialSongsDf.rename(columns = engagementMapping)
print("Shape after selecting the essential columns and dropping null values ", essentialSongsDf.shape)

Shape after selecting the essential columns and dropping null values  (38677, 27)


In [11]:
# Convert all the columns to type float64
changeType = {column: 'float64' for column in list(set(essentialSongsDf.columns))}
essentialSongsDf = essentialSongsDf.astype(changeType)

In [12]:
# This remove rows where viewCount might be 0 or if views decrease from one day to the next
essentialSongsDf = essentialSongsDf[(essentialSongsDf['day0.viewCount'] > 0) & 
                                    (essentialSongsDf['day1.viewCount'] > 0) & 
                                    (essentialSongsDf['day2.viewCount'] > 0) & 
                                    (essentialSongsDf['day2.viewCount'] > essentialSongsDf['day1.viewCount']) & 
                                    (essentialSongsDf['day1.viewCount'] > essentialSongsDf['day0.viewCount'])]
print("Shape after dropping 0 values ", essentialSongsDf.shape)

Shape after dropping 0 values  (37237, 27)


In [13]:
# Takes percentage change in view between (day1, day0) and (day2, day1)
essentialSongsDf['first_percentage_increase'] = ((essentialSongsDf['day1.viewCount'] - essentialSongsDf['day0.viewCount']) * 100) / essentialSongsDf['day0.viewCount']
essentialSongsDf['target_percentage_increase'] = ((essentialSongsDf['day2.viewCount'] - essentialSongsDf['day1.viewCount']) * 100) / essentialSongsDf['day1.viewCount']


In [14]:
essentialSongsDf = essentialSongsDf.drop('day2.viewCount', 1)
essentialSongsDf = essentialSongsDf[(essentialSongsDf['day1.likeCount'] >= essentialSongsDf['day0.likeCount']) & 
                                    (essentialSongsDf['day1.dislikeCount'] >= essentialSongsDf['day0.dislikeCount'])]
print("Shape after dropping 0 values ", essentialSongsDf.shape)

Shape after dropping 0 values  (35899, 28)


In [15]:
# Percent increase in likes
essentialSongsDf['likes_percentage_change'] = ((essentialSongsDf['day1.likeCount'] - essentialSongsDf['day0.likeCount']) * 100) / essentialSongsDf['day0.likeCount']

# Percent increase in dislikes
essentialSongsDf['dislikes_percentage_change'] = ((essentialSongsDf['day1.dislikeCount'] - essentialSongsDf['day0.dislikeCount']) * 100) / essentialSongsDf['day0.dislikeCount']
# Converting all inf and nan values to 0
essentialSongsDf = essentialSongsDf.replace([np.inf, -np.inf], 0)
essentialSongsDf.describe()

Unnamed: 0,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,day1.spotifyPopularity,day1.likeCount,day1.dislikeCount,day1.commentCount,day0.viewCount,day1.viewCount,first_percentage_increase,target_percentage_increase,likes_percentage_change,dislikes_percentage_change
count,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,...,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35872.0,34273.0
mean,0.697502,5.276832,-6.867439,0.627928,0.081929,0.187556,0.152315,0.197205,0.457866,123.798968,...,40.533775,112656.4,5491.63,5749.967,19450730.0,19459010.0,0.219421,0.194813,0.192243,0.183266
std,0.216979,3.56869,3.788229,0.483364,0.083243,0.260572,0.296682,0.158692,0.236822,29.362916,...,16.778016,566584.9,74445.94,50250.14,114145500.0,114179400.0,0.930198,0.696499,0.868945,2.236361
min,0.000943,0.0,-39.194,0.0,0.0224,0.0,0.0,0.013,0.0,34.543,...,0.0,0.0,0.0,0.0,3.0,5.0,0.000558,4.9e-05,0.0,0.0
25%,0.554,2.0,-8.4695,0.0,0.0355,0.00689,0.0,0.0966,0.272,100.033,...,30.0,743.0,17.0,26.0,62568.0,62763.5,0.031644,0.029616,0.00418,0.0
50%,0.736,5.0,-6.021,1.0,0.0494,0.0566,0.000102,0.131,0.445,123.528,...,42.0,4764.0,116.0,187.0,471565.0,472446.0,0.066752,0.06246,0.051129,0.0
75%,0.878,8.0,-4.352,1.0,0.0879,0.269,0.07545,0.262,0.635,143.842,...,52.0,32653.5,802.5,1284.0,3759686.0,3761030.0,0.184767,0.17173,0.135596,0.022331
max,0.999,11.0,1.893,1.0,0.953,0.996,0.997,1.0,1.0,222.605,...,96.0,37205870.0,11260880.0,5040935.0,6730952000.0,6732749000.0,66.666667,58.823529,50.0,200.0


In [16]:
# Taking the log of the skewed data and changing column names accordingly
logList = ['day0.likeCount','day0.dislikeCount','day0.commentCount','day1.likeCount','day1.dislikeCount',
           'day1.commentCount','day0.viewCount','day1.viewCount','artistFollowerSum','artistFollowerMax', 
           'first_percentage_increase', 'target_percentage_increase', 'likes_percentage_change', 'dislikes_percentage_change']

logMapping = { col: ('log' +  '.' + col) for index, col in enumerate(logList)}
essentialSongsDf = essentialSongsDf.rename(columns = logMapping)

logCols = logMapping.values()
for cols in logCols: 
    essentialSongsDf[cols] = essentialSongsDf[cols].apply(lambda x: np.log(x))

essentialSongsDf.describe()

  x2 = take(ap, indices_above, axis=axis) * weights_above


Unnamed: 0,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,day1.spotifyPopularity,log.day1.likeCount,log.day1.dislikeCount,log.day1.commentCount,log.day0.viewCount,log.day1.viewCount,log.first_percentage_increase,log.target_percentage_increase,log.likes_percentage_change,log.dislikes_percentage_change
count,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,...,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35899.0,35872.0,34273.0
mean,0.697502,5.276832,-6.867439,0.627928,0.081929,0.187556,0.152315,0.197205,0.457866,123.798968,...,40.533775,-inf,-inf,-inf,13.103697,13.105853,-2.546631,-2.620064,-inf,-inf
std,0.216979,3.56869,3.788229,0.483364,0.083243,0.260572,0.296682,0.158692,0.236822,29.362916,...,16.778016,,,,2.946287,2.944222,1.294576,1.285066,,
min,0.000943,0.0,-39.194,0.0,0.0224,0.0,0.0,0.013,0.0,34.543,...,0.0,-inf,-inf,-inf,1.098612,1.609438,-7.490644,-9.918253,-inf,-inf
25%,0.554,2.0,-8.4695,0.0,0.0355,0.00689,0.0,0.0966,0.272,100.033,...,30.0,6.610696,2.833213,3.258097,11.044009,11.047129,-3.453197,-3.519433,-5.477435,
50%,0.736,5.0,-6.021,1.0,0.0494,0.0566,0.000102,0.131,0.445,123.528,...,42.0,8.468843,4.75359,5.231109,13.063812,13.065679,-2.706765,-2.773237,-2.973399,
75%,0.878,8.0,-4.352,1.0,0.0879,0.269,0.07545,0.262,0.635,143.842,...,52.0,10.39371,6.687732,7.157735,15.139846,15.140203,-1.688659,-1.761829,-1.998079,-3.801762
max,0.999,11.0,1.893,1.0,0.953,0.996,0.997,1.0,1.0,222.605,...,96.0,17.43198,16.23684,15.4331,22.629982,22.630249,4.199705,4.074542,3.912023,5.298317


In [17]:
essentialSongsDf = essentialSongsDf.replace([np.inf, -np.inf, np.nan], 0)

In [18]:
# Selecting the features and the label
predictor_columns = ['energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'danceability',
       'artistPopularitySum', 'artistPopularityMax', 'numArtists',
       'log.artistFollowerSum', 'log.artistFollowerMax',
       'day0.spotifyPopularity', 'log.day0.likeCount', 'log.day0.dislikeCount',
       'log.day0.commentCount', 'day1.spotifyPopularity', 'log.day1.likeCount',
       'log.day1.dislikeCount', 'log.day1.commentCount', 'log.day0.viewCount',
       'log.day1.viewCount', 'log.first_percentage_increase', 'log.likes_percentage_change',
       'log.dislikes_percentage_change']
target_columns = 'log.target_percentage_increase'

In [19]:
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
x_scaled = robust_scaler.fit_transform(essentialSongsDf)
df_test_robust = pd.DataFrame(x_scaled, columns=essentialSongsDf.columns)
X_train_robust, X_test_robust, Y_train_robust, Y_test_robust = train_test_split(df_test_robust[predictor_columns], df_test_robust[target_columns], test_size=0.2, random_state=1) 

In [20]:
from sklearn.metrics import r2_score, mean_squared_error
X_test_robust_df = pd.DataFrame(X_test_robust)
print('Base Rate Mean Squared Error for Robust Scaler', mean_squared_error(Y_test_robust, X_test_robust_df[['log.first_percentage_increase']]))
print('Base Rate R Squared Error for Robust Scaler', r2_score(Y_test_robust, X_test_robust_df[['log.first_percentage_increase']]))

Base Rate Mean Squared Error for Robust Scaler 0.029055551257308097
Base Rate R Squared Error for Robust Scaler 0.9448472831144035


In [21]:
from sklearn import preprocessing
df_test_minmax = essentialSongsDf.copy()
x = df_test_minmax.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()#feature_range can be set
x_scaled = min_max_scaler.fit_transform(x)
df_test_minmax = pd.DataFrame(x_scaled)
column_names = essentialSongsDf.columns
df_test_minmax.columns = column_names
X_train_min_max, X_test_min_max, Y_train_min_max, Y_test_min_max = train_test_split(df_test_minmax[predictor_columns], df_test_minmax[target_columns], test_size=0.2, random_state=1)

In [22]:
X_test_minmax_df = pd.DataFrame(X_test_min_max)
print('Base Rate Mean Squared Error for MinMax Scaler', mean_squared_error(Y_test_min_max, X_test_minmax_df[['log.first_percentage_increase']]))
print('Base Rate R Squared Error for MinMax Scaler', r2_score(Y_test_min_max, X_test_minmax_df[['log.first_percentage_increase']]))

Base Rate Mean Squared Error for MinMax Scaler 0.01057798940513532
Base Rate R Squared Error for MinMax Scaler -0.27264691205699365


In [23]:
def find_min_samples_leaf(X_train, Y_train):
    min_samples_leaf = [100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000]
    param_grid = dict(min_samples_leaf=min_samples_leaf)
    decisionTreeRegressor = DecisionTreeRegressor(min_samples_leaf=100)
    grid = GridSearchCV(estimator=decisionTreeRegressor, param_grid=param_grid, 
                    scoring='neg_mean_squared_error', verbose=1, cv=10)
    grid_result = grid.fit(X_train, Y_train)
    print('Best Score: ', grid_result.best_score_)
    print('Best Params: ', grid_result.best_params_)

In [24]:
def decision_model(X_train, Y_train, X_test, Y_test, X, Y, min_samples_leaf = 100):
    decisionTreeRegressor = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf)
    decisionTreeRegressor.fit(X_train, Y_train)
    preds = decisionTreeRegressor.predict(X_test)
    test_score = decisionTreeRegressor.score(X_test, Y_test)
    print('Test Score', test_score)
    train_score = decisionTreeRegressor.score(X_train, Y_train)
    print('Train Score',train_score)

In [25]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
# using Grid search CV technique, optimum value fot min_sample_leaf is found. 
find_min_samples_leaf(X_train_robust, Y_train_robust)
decision_model(X_train_robust, Y_train_robust, X_test_robust, Y_test_robust, df_test_robust[predictor_columns], df_test_robust[target_columns], 100)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   34.1s finished


Best Score:  -0.028407421763437652
Best Params:  {'min_samples_leaf': 100}
Test Score 0.946403228017084
Train Score 0.9517064897145575


In [26]:
find_min_samples_leaf(X_train_min_max, Y_train_min_max)
decision_model(X_train_min_max, Y_train_min_max, X_test_min_max, Y_test_min_max, df_test_minmax[predictor_columns], df_test_minmax[target_columns], 100)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   35.9s finished


Best Score:  -0.0004480903882860731
Best Params:  {'min_samples_leaf': 100}
Test Score 0.9464098889139747
Train Score 0.9517065412809771


In [27]:
def find_neighbors_parameters(X_train, Y_train):
    n_neighbors = [100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000]
    param_grid = dict(n_neighbors=n_neighbors)
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights='distance')
    grid = GridSearchCV(estimator=knn, param_grid=param_grid, 
                    scoring='neg_mean_squared_error', verbose=1, cv=10)
    grid_result = grid.fit(X_train, Y_train)
    print('Best Score for distance: ', grid_result.best_score_)
    print('Best Params for distance: ', grid_result.best_params_)

In [28]:
def neighbor_model(X_train, Y_train, X_test, Y_test, X, Y, weights = 'uniform', n_neighbors = 5):
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights)
    knn.fit(X_train, Y_train)
    preds = knn.predict(X_test)
    test_score = knn.score(X_test, Y_test)
    print('Test Score', test_score)
    train_score = knn.score(X_train, Y_train)
    print('Train Score',train_score)

In [29]:
from sklearn import neighbors
find_neighbors_parameters(X_train_robust, Y_train_robust)
neighbor_model(X_train_robust, Y_train_robust, X_test_robust, Y_test_robust, df_test_robust[predictor_columns], df_test_robust[target_columns], 'distance', 100)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  8.6min finished


Best Score for distance:  -0.16209540682730633
Best Params for distance:  {'n_neighbors': 100}
Test Score 0.6980009428636405
Train Score 1.0


In [30]:
find_neighbors_parameters(X_train_min_max, Y_train_min_max)
neighbor_model(X_train_min_max, Y_train_min_max, X_test_min_max, Y_test_min_max, df_test_minmax[predictor_columns], df_test_minmax[target_columns], 'distance', 100)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  8.3min finished


Best Score for distance:  -0.0035302093618191296
Best Params for distance:  {'n_neighbors': 100}
Test Score 0.5814419068480756
Train Score 1.0


In [31]:
def find_ridge_alpha(X_train, Y_train): 
    # find optimal alpha with grid search
    alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    param_grid = dict(alpha=alpha)

    ridge = Ridge(alpha=1)
    grid = GridSearchCV(estimator=ridge, param_grid=param_grid, 
                    scoring='neg_mean_squared_error', verbose=1, cv=10)
    grid_result = grid.fit(X_train, Y_train)
    print('Best Score: ', grid_result.best_score_)
    print('Best Params: ', grid_result.best_params_)

In [32]:
def ridge_model(X_train, Y_train, X_test, Y_test, X, Y, alpha_value = 1):
    ridge = Ridge(alpha=alpha_value)
    ridge.fit(X_train, Y_train)
    preds = ridge.predict(X_test)
    test_score = ridge.score(X_test, Y_test)
    print('Test Score', test_score)
    train_score = ridge.score(X_train, Y_train)
    print('Train Score',train_score)

In [33]:
find_ridge_alpha(X_train_robust, Y_train_robust)
ridge_model(X_train_robust, Y_train_robust, X_test_robust, Y_test_robust, df_test_robust[predictor_columns], df_test_robust[target_columns], 0.1)

Fitting 10 folds for each of 7 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best Score:  -0.027471048102805005
Best Params:  {'alpha': 0.01}
Test Score 0.946596426244818
Train Score 0.948935684950028


[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:    0.5s finished


In [34]:
find_ridge_alpha(X_train_min_max, Y_train_min_max)
ridge_model(X_train_min_max, Y_train_min_max, X_test_min_max, Y_test_min_max, df_test_minmax[predictor_columns], df_test_minmax[target_columns], 0.001)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 7 candidates, totalling 70 fits
Best Score:  -0.0004334286310378959
Best Params:  {'alpha': 0.001}
Test Score 0.9466769696225609
Train Score 0.9489533304703122


[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:    0.5s finished


In [35]:
def find_lasso_alpha(X_train, Y_train):
    # find optimal alpha with grid search
    alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    param_grid = dict(alpha=alpha)
    lasso = Lasso(alpha=1)
    grid_lasso = GridSearchCV(estimator=lasso, param_grid=param_grid, 
                    scoring='neg_mean_squared_error', verbose=1, cv=10)
    grid_lasso_result = grid_lasso.fit(X_train, Y_train)
    print('Best Score: ', grid_lasso_result.best_score_)
    print('Best Params: ', grid_lasso_result.best_params_)

In [36]:
def lasso_model(X_train, Y_train, X_test, Y_test, X, Y, alpha_value = 1):
    lasso = Lasso(alpha=alpha_value)
    lasso.fit(X_train, Y_train)
    preds = lasso.predict(X_test)
    test_score = lasso.score(X_test, Y_test)
    print('Test Score', test_score)
    train_score = lasso.score(X_train, Y_train)
    print('Train Score',train_score)

In [37]:
find_lasso_alpha(X_train_robust, Y_train_robust)
lasso_model(X_train_robust, Y_train_robust, X_test_robust, Y_test_robust, df_test_robust[predictor_columns], df_test_robust[target_columns], 0.001)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 7 candidates, totalling 70 fits
Best Score:  -0.027512765811517615
Best Params:  {'alpha': 0.001}
Test Score 0.9464739740417091
Train Score 0.9487951637238837


[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:    0.7s finished


In [38]:
find_lasso_alpha(X_train_min_max, Y_train_min_max)
lasso_model(X_train_min_max, Y_train_min_max, X_test_min_max, Y_test_min_max, df_test_minmax[predictor_columns], df_test_minmax[target_columns], 0.001)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 7 candidates, totalling 70 fits
Best Score:  -0.0005191946782293205
Best Params:  {'alpha': 0.001}
Test Score 0.9379982599138168
Train Score 0.938675471419076


[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:    0.5s finished


In [39]:
def linear_model(X_train, Y_train, X_test, Y_test, X, Y):
    linear_regressor = LinearRegression() 
    linear_regressor.fit(X_train, Y_train)
    preds = linear_regressor.predict(X_test)
    test_score = linear_regressor.score(X_test, Y_test)
    print('Test Score', test_score)
    train_score = linear_regressor.score(X_train, Y_train)
    print('Train Score',train_score)
    print('Root Mean Square Error', mean_squared_error(Y_test_min_max, preds))

In [40]:
linear_model(X_train_robust, Y_train_robust, X_test_robust, Y_test_robust, df_test_robust[predictor_columns], df_test_robust[target_columns])

Test Score 0.9467752397651785
Train Score 0.9489627906242687
Root Mean Square Error 0.5709739598110675


In [41]:
linear_model(X_train_min_max, Y_train_min_max, X_test_min_max, Y_test_min_max, df_test_minmax[predictor_columns], df_test_minmax[target_columns])

Test Score 0.9467752397651786
Train Score 0.9489627906242687
Root Mean Square Error 0.0004423936792843881
