In [1]:
import csv
import numpy as np
import pandas as pd
import plotly.offline as py
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
from sklearn.metrics import r2_score
from plotly.subplots import make_subplots
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
py.init_notebook_mode(connected=True)
# Data pre-processing And Feature Sellection

# Read data
covidData = pd.read_csv("./data/owid-covid-data.csv")
# Drop not wanted columns
covidData.drop(["continent","iso_code","human_development_index","new_deaths_smoothed","total_cases_per_million","new_cases_per_million","new_cases_smoothed_per_million","gdp_per_capita","extreme_poverty","handwashing_facilities","life_expectancy"],axis=1,inplace=True)
covidData.drop(["total_cases","new_cases_smoothed","total_deaths_per_million","new_deaths_per_million","new_deaths_smoothed_per_million","reproduction_rate","new_tests_smoothed","new_tests_smoothed_per_thousand","positive_rate","tests_per_case"],axis=1,inplace=True)
covidData.drop(["total_deaths","icu_patients","icu_patients_per_million","hosp_patients","hosp_patients_per_million","weekly_icu_admissions","weekly_icu_admissions_per_million","weekly_hosp_admissions","weekly_hosp_admissions_per_million","total_tests","new_tests","total_tests_per_thousand","new_tests_per_thousand","tests_units"],axis=1,inplace=True)
covidData.drop(["total_vaccinations","people_vaccinated","people_fully_vaccinated","new_vaccinations","new_vaccinations_smoothed","total_vaccinations_per_hundred","people_vaccinated_per_hundred","people_fully_vaccinated_per_hundred","new_vaccinations_smoothed_per_million"],axis=1,inplace=True)

covidData = pd.DataFrame(covidData)
covidData = covidData.fillna(0)

# Dropping row 'Wrold',and row 'International' considered to be of no use 
covidData.drop(covidData[covidData['location'] == 'World'].index,inplace=True)
covidData.drop(covidData[covidData['location'] == 'International'].index,inplace=True)

trainOlder70 = []
trainOlder65 = []
trainMedianAge = []
trainSmokers = []
trainCases = {}
trainDeaths = {}
trainStrIndex = {}
trainPopulation=[]
trainPopulationDens = []
trainCardiovascDeathRate = []
trainDiabetesPrevalence = []
trainHospitalBedsPer_k = []

testOlder70 = []
testOlder65 = []
testMedianAge = []
testSmokers = []
testCases = {}
testDeaths = {}
testStrIndex = {}
testPopulation=[]
testPopulationDens = []
testCardiovascDeathRate = []
testDiabetesPrevalence = []
testHospitalBedsPer_k = []

# Gather features and store them in dataframes
targetCountries = ['Greece','Cuba','United Kingdom','Canada','Kenya','Afghanistan','Israel','Australia','Argentina',"Italy"]
strIndexCountry = None
strIndexMonth = None
strIndex = None
strIndexChangeCounter = 1
for index,row in covidData.iterrows():
    currentMonth = int(row["date"].split("-")[1])
    if int(row["date"].split("-")[0]) == 2021:
        currentMonth += 12

    if strIndexCountry == row["location"]:
        if strIndexMonth == currentMonth:
            if strIndex != float(row["stringency_index"]):
                strIndex = float(row["stringency_index"])
                if strIndex != 0.0:
                    strIndexChangeCounter += 1
                if row["location"] in targetCountries:
                    testStrIndex[row["location"]][strIndexMonth] += float(row["stringency_index"])
                else:
                    trainStrIndex[row["location"]][strIndexMonth] += float(row["stringency_index"])
        else:
            if row["location"] in targetCountries:
                testStrIndex[row["location"]][strIndexMonth] /= strIndexChangeCounter
            else:
                trainStrIndex[row["location"]][strIndexMonth] /= strIndexChangeCounter
            strIndexMonth = currentMonth
            strIndex = float(row["stringency_index"])
            strIndexChangeCounter = 1
            if row["location"] in targetCountries:
                testStrIndex[row["location"]][strIndexMonth] = float(row["stringency_index"])
            else:
                trainStrIndex[row["location"]][strIndexMonth] = float(row["stringency_index"])
    else:
        if strIndexCountry is None:
            strIndexCountry = row["location"]
            strIndexMonth = currentMonth
            if row["location"] in targetCountries:
                testStrIndex[row["location"]] = {strIndexMonth: float(row["stringency_index"])}
            else:
                trainStrIndex[row["location"]] = {strIndexMonth: float(row["stringency_index"])}
        else:
            if strIndexCountry in targetCountries:
                testStrIndex[strIndexCountry][strIndexMonth] /= strIndexChangeCounter
            else:
                trainStrIndex[strIndexCountry][strIndexMonth] /= strIndexChangeCounter
            strIndexCountry = row["location"]
            strIndexMonth = currentMonth
            strIndex = float(row["stringency_index"])
            strIndexChangeCounter = 1
            if row["location"] in targetCountries:
                testStrIndex[row["location"]] = {strIndexMonth: float(row["stringency_index"])}
            else:
                trainStrIndex[row["location"]] = {strIndexMonth: float(row["stringency_index"])}
            

    
    if row["location"] not in targetCountries:
        if row["location"] not in trainCases:
            trainCases[row["location"]] = {currentMonth: int(row['new_cases'])}
            trainCases[row["location"]] = {14: row['new_cases']}
            trainDeaths[row["location"]] = {currentMonth: int(row['new_deaths'])}
            trainDeaths[row["location"]] = {14: row['new_deaths']}
            
            trainMedianAge.append(float(row["median_age"]))
            trainOlder70.append(float(row["aged_70_older"]))
            trainOlder65.append(float(row["aged_65_older"]))
            trainPopulation.append(float(row["population"]))
            trainPopulationDens.append(float(row["population_density"]))
            trainDiabetesPrevalence.append(float(row["diabetes_prevalence"]))
            trainCardiovascDeathRate.append(float(row["cardiovasc_death_rate"]))
            trainHospitalBedsPer_k.append(float(row["hospital_beds_per_thousand"]))
            trainSmokers.append(float(row["female_smokers"]) + float(row["male_smokers"]))
        else:
            if currentMonth in trainCases[row["location"]]:
                trainCases[row["location"]][currentMonth] += int(row['new_cases'])
            else:
                trainCases[row["location"]][currentMonth] = int(row['new_cases'])
            if currentMonth in trainDeaths[row["location"]]:
                trainDeaths[row["location"]][currentMonth] += int(row['new_deaths'])
            else:
                trainDeaths[row["location"]][currentMonth] = int(row['new_deaths'])
            trainCases[row["location"]][14] += int(row['new_cases'])
            trainDeaths[row["location"]][14] += int(row['new_deaths'])
    else:    
        if row["location"] not in testCases:
            testCases[row["location"]] = {currentMonth: int(row['new_cases'])}
            testCases[row["location"]] = {14: row['new_cases']}
            testDeaths[row["location"]] = {currentMonth: int(row['new_deaths'])}
            testDeaths[row["location"]] = {14: row['new_deaths']}
        
            testMedianAge.append(float(row["median_age"]))
            testOlder70.append(float(row["aged_70_older"]))
            testOlder65.append(float(row["aged_65_older"]))
            testPopulation.append(float(row["population"]))
            testPopulationDens.append(float(row["population_density"]))
            testDiabetesPrevalence.append(float(row["diabetes_prevalence"]))
            testCardiovascDeathRate.append(float(row["cardiovasc_death_rate"]))
            testHospitalBedsPer_k.append(float(row["hospital_beds_per_thousand"]))
            testSmokers.append(float(row["female_smokers"]) + float(row["male_smokers"]))
        else:
            if currentMonth in testCases[row["location"]]:
                testCases[row["location"]][currentMonth] += int(row['new_cases'])
            else:
                testCases[row["location"]][currentMonth] = int(row['new_cases'])
            if currentMonth in testDeaths[row["location"]]:
                testDeaths[row["location"]][currentMonth] += int(row['new_deaths'])
            else:
                testDeaths[row["location"]][currentMonth] = int(row['new_deaths'])

            testCases[row["location"]][14] += int(row['new_cases'])
            testDeaths[row["location"]][14] += int(row['new_deaths'])


# Train dataframe
trainCovidData = pd.DataFrame()
trainCovidData["country"] = list(trainCases.keys())
# Gathering the number of monthly cases and deaths for each country
for i in range(14):
    monthlyCases = []
    monthlyDeaths = []
    monthlyStrIndex = []
    for country,months in trainCases.items():
        monthWithoutCases = True
        for month,cases in months.items():
            if month == i + 1:
                monthlyCases.append(cases)
                monthWithoutCases = False
                break
        if monthWithoutCases:
            monthlyCases.append(0)
    
    for country,months in trainDeaths.items():
        monthWithoutDeaths = True
        for month, deaths in months.items():
            if month == i + 1:
                monthlyDeaths.append(deaths)
                monthWithoutDeaths = False
                break
        if monthWithoutDeaths:
            monthlyDeaths.append(0)
            

    for country,months in trainStrIndex.items():
        monthWithoutStrIndex = True
        for month, strIndex in months.items():
            if month == i + 1:
                monthlyStrIndex.append(round(strIndex,2))
                monthWithoutStrIndex = False
                break
        if monthWithoutStrIndex:
            monthlyStrIndex.append(0)


    if i + 1 != 14:
        trainCovidData["casesforMonth-"+str(i+1)] = monthlyCases
    else:
        trainCovidData["totalCases"] = monthlyCases
        
    if i + 1 != 14:
        trainCovidData["deathsforMonth-"+str(i+1)] = monthlyDeaths
    else:
        trainCovidData["totalDeaths"] = monthlyDeaths
    
    if i + 1 != 14:
        trainCovidData["stringencyIndexforMonth-"+str(i+1)] = monthlyStrIndex

    monthlyCases.clear()
    monthlyDeaths.clear()
    monthlyStrIndex.clear()
trainCovidData["smokers"] = trainSmokers
trainCovidData["older70"] = trainOlder70
trainCovidData["older65"] = trainOlder65
trainCovidData["medianAge"] = trainMedianAge
trainCovidData["population"] = trainPopulation
trainCovidData[u"beds‰"] = trainHospitalBedsPer_k
trainCovidData["diabetes"] = trainDiabetesPrevalence
trainCovidData["populationDensity"] = trainPopulationDens
trainCovidData["cardiovascularDeathRate"] = trainCardiovascDeathRate

# Test dataframe
testCovidData = pd.DataFrame()
testCovidData["country"] = list(testCases.keys())
for i in range(14):
    monthlyCases = []
    monthlyDeaths = []
    monthlyStrIndex = []
    for country,months in testCases.items():
        monthWithoutCases = True
        for month,cases in months.items():
            if month == i + 1:
                monthlyCases.append(cases)
                monthWithoutCases = False
                break
        if monthWithoutCases:
            monthlyCases.append(0)
    
    for country,months in testDeaths.items():
        monthWithoutDeaths = True
        for month,deaths in months.items():
            if month == i + 1:
                monthlyDeaths.append(deaths)
                monthWithoutDeaths = False
                break
        if monthWithoutDeaths:
            monthlyDeaths.append(0)

    for country,months in testStrIndex.items():
        monthWithoutStrIndex = True
        for month, strIndex in months.items():
            if month == i + 1:
                monthlyStrIndex.append(round(strIndex,2))
                monthWithoutStrIndex = False
                break
        if monthWithoutStrIndex:
            monthlyStrIndex.append(0)
        
    
    if i + 1 != 14:
        testCovidData["casesforMonth-"+str(i+1)] = monthlyCases
    else:
        testCovidData["totalCases"] = monthlyCases

    if i + 1 != 14:
        testCovidData["deathsforMonth-"+str(i+1)] = monthlyDeaths
    else:
        testCovidData["totalDeaths"] = monthlyDeaths
    
    if i + 1 != 14:
        testCovidData["stringencyIndexforMonth-"+str(i+1)] = monthlyStrIndex

    monthlyCases.clear()
    monthlyDeaths.clear()
    monthlyStrIndex.clear()
testCovidData["smokers"] = testSmokers
testCovidData["older70"] = testOlder70
testCovidData["older65"] = testOlder65
testCovidData["medianAge"] = testMedianAge
testCovidData["population"] = testPopulation
testCovidData[u"beds‰"] = testHospitalBedsPer_k
testCovidData["diabetes"] = testDiabetesPrevalence
testCovidData["populationDensity"] = testPopulationDens
testCovidData["cardiovascularDeathRate"] = testCardiovascDeathRate

display(trainCovidData)
display(testCovidData)

Unnamed: 0,country,casesforMonth-1,deathsforMonth-1,stringencyIndexforMonth-1,casesforMonth-2,deathsforMonth-2,stringencyIndexforMonth-2,casesforMonth-3,deathsforMonth-3,stringencyIndexforMonth-3,...,totalDeaths,smokers,older70,older65,medianAge,population,beds‰,diabetes,populationDensity,cardiovascularDeathRate
0,Albania,0,0,0.00,0,0,0.00,241,15,62.34,...,1358.0,58.3,8.643,13.188,38.0,2877800.0,2.89,10.08,104.871,304.195
1,Algeria,0,0,0.00,0,0,0.00,715,44,36.17,...,2884.0,31.1,3.857,6.211,29.1,43851043.0,1.90,6.73,17.348,278.364
2,Andorra,0,0,0.00,0,0,0.00,375,12,21.85,...,100.0,66.8,0.000,0.000,0.0,77265.0,0.00,7.97,163.755,109.135
3,Angola,0,0,0.00,0,0,0.00,6,2,45.06,...,464.0,0.0,1.362,2.405,16.8,32866268.0,0.00,3.94,23.890,276.045
4,Antigua and Barbuda,0,0,0.00,0,0,0.00,6,0,0.00,...,6.0,0.0,4.631,6.933,32.1,97928.0,3.80,13.17,231.845,191.511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,Venezuela,0,0,0.00,0,0,0.00,133,3,71.53,...,1177.0,0.0,3.915,6.614,29.0,28435943.0,0.80,6.47,36.253,204.850
177,Vietnam,0,0,12.96,14,0,39.58,196,0,60.65,...,35.0,46.9,4.718,7.150,32.6,97338583.0,2.60,6.00,308.127,245.465
178,Yemen,0,0,0.00,0,0,0.00,0,0,0.00,...,615.0,36.8,1.583,2.922,20.3,29825968.0,0.70,5.35,53.508,495.003
179,Zambia,0,0,0.00,0,0,0.00,33,0,33.89,...,728.0,27.8,1.542,2.480,17.7,18383956.0,2.00,3.94,22.995,234.499


Unnamed: 0,country,casesforMonth-1,deathsforMonth-1,stringencyIndexforMonth-1,casesforMonth-2,deathsforMonth-2,stringencyIndexforMonth-2,casesforMonth-3,deathsforMonth-3,stringencyIndexforMonth-3,...,totalDeaths,smokers,older70,older65,medianAge,population,beds‰,diabetes,populationDensity,cardiovascularDeathRate
0,Afghanistan,0,0,0.0,0,0,8.33,174,4,47.69,...,2399.0,0.0,1.337,2.581,18.6,38928341.0,0.5,9.59,54.422,597.029
1,Argentina,0,0,5.55,0,0,11.11,1054,27,56.95,...,47775.0,43.9,7.441,11.198,31.9,45195777.0,5.0,5.5,16.177,191.032
2,Australia,5,0,11.11,16,0,19.44,4534,18,46.3,...,909.0,29.5,10.129,15.504,37.9,25499881.0,3.84,5.07,3.202,107.791
3,Canada,3,0,2.78,16,0,2.78,8507,152,38.95,...,19801.0,28.6,10.797,16.984,41.4,37742157.0,2.5,7.37,4.037,105.599
4,Cuba,0,0,0.0,0,0,0.0,183,6,38.2,...,210.0,70.4,9.719,14.738,43.1,11326616.0,5.2,8.27,110.408,190.968
5,Greece,0,0,0.0,4,0,10.18,1310,49,48.04,...,5764.0,87.3,14.524,20.396,45.3,10423056.0,4.21,4.55,83.479,175.695
6,Israel,0,0,0.0,6,0,19.44,5711,21,54.75,...,4700.0,50.8,7.359,11.733,30.6,8655541.0,2.99,6.74,402.606,93.32
7,Italy,0,0,19.44,1126,29,38.54,104664,12399,80.74,...,87858.0,47.6,16.24,23.021,47.9,60461828.0,3.18,4.78,205.859,113.151
8,Kenya,0,0,0.0,0,0,0.0,59,1,54.37,...,1753.0,21.6,1.528,2.686,20.0,53771300.0,1.4,2.92,87.324,218.637
9,United Kingdom,0,0,8.33,59,0,9.72,38754,2457,39.86,...,104572.0,44.7,12.527,18.517,40.8,67886004.0,2.54,4.28,272.898,122.137


In [3]:
fig = make_subplots(rows=4, cols=3)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData[u"beds‰"],
               mode = 'markers',
               text=trainCovidData["country"],
               name="beds"),
    row=1,col=1
)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData["older70"],
               mode = 'markers',
               text=trainCovidData["country"],
               name=">70/deaths"),
    row=1,col=2
)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData["older65"],
               mode = 'markers',
               text=trainCovidData["country"],
               name=">65/deaths"),
    row=1,col=3
)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData["population"],
               mode = 'markers',
               text=trainCovidData["country"],
               name="population"),
    row=2,col=1

)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData["populationDensity"],
               mode = 'markers',
               text=trainCovidData["country"],
               name="populationDensity"),
    row=2,col=2
)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData["cardiovascularDeathRate"],
               mode = 'markers',
               text=trainCovidData["country"],
               name="cardiovascularDeathRate"),
    row=2,col=3
)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData["diabetes"],
               mode = 'markers',
               text=trainCovidData["country"],
               name="diabetes"),
    row=3,col=1
)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData["medianAge"],
               mode = 'markers',
               text=trainCovidData["country"],
               name="medianAge"),
    row=3,col=2
)

fig.add_trace(
    go.Scatter(y=trainCovidData["totalDeaths"], x=trainCovidData["smokers"],
               mode = 'markers',
               text=trainCovidData["country"],
               name="smokers"),
    row=3,col=3
)

stringencyIndexX = []
stringencyIndexY = []
for i in range(1,14):
    strIndex = "stringencyIndexforMonth-"+str(i)
    deaths = "deathsforMonth-"+str(i)
    stringencyIndexX.append(trainCovidData[strIndex][30])
    stringencyIndexY.append(trainCovidData[deaths][30])

fig.add_trace(
    go.Scatter(y=stringencyIndexY, x=stringencyIndexX,
                mode = 'lines',
                name="stringencyIndex/"+trainCovidData["country"][30]),
    row=4, col=1
)

stringencyIndexX = []
stringencyIndexY = []
for i in range(1,14):
    strIndex = "stringencyIndexforMonth-"+str(i)
    deaths = "deathsforMonth-"+str(i)
    stringencyIndexX.append(trainCovidData[strIndex][60])
    stringencyIndexY.append(trainCovidData[deaths][60])

fig.add_trace(
    go.Scatter(y=stringencyIndexY, x=stringencyIndexX,
                mode = 'lines',
                name="stringencyIndex/"+trainCovidData["country"][60]),
    row=4, col=2
)

stringencyIndexX = []
stringencyIndexY = []
for i in range(1,14):
    strIndex = "stringencyIndexforMonth-"+str(i)
    deaths = "deathsforMonth-"+str(i)
    stringencyIndexX.append(trainCovidData[strIndex][90])
    stringencyIndexY.append(trainCovidData[deaths][90])

fig.add_trace(
    go.Scatter(y=stringencyIndexY, x=stringencyIndexX,
                mode = 'lines',
                name="stringencyIndex/"+trainCovidData["country"][90]),
    row=4, col=3
)

fig.update_layout(height=1000, width=1000, title_text="Feature Relation Plots")
fig.show()

In [4]:
dropColumns = ['older65','populationDensity']
for i in range(1,14):
    strIndex = "stringencyIndexforMonth-"+str(i)
    dropColumns.append(strIndex)

trainCovidData = trainCovidData.drop(dropColumns, axis=1)
testCovidData  = testCovidData.drop(dropColumns, axis=1)

In [5]:
trainData, productionData = train_test_split(trainCovidData, test_size=0.3, random_state=42, shuffle=True)

In [26]:
depend = ['deathsforMonth-13']
features = ['smokers','older70','medianAge','population',u'beds‰','diabetes','cardiovascularDeathRate']
for i in range(1,14):
    cases = "casesforMonth-"+str(i)
    features.append(cases)
for i in range(1,13):
    deaths = "deathsforMonth-"+str(i)
    features.append(deaths)

featureVectorTrain = trainData[features]
dependVariableTrain = trainData[depend]

featureVectorProduction = productionData[features]
dependVariableProduction = productionData[depend]

featureVectorTest = testCovidData[features]
dependVariableTest = testCovidData[depend]

In [141]:
# Linear Regression Model
linearRegression = LinearRegression()

splitNum = 10
kf = KFold(n_splits=splitNum)
splits = kf.split(featureVectorTrain)

bestLinearModel = None
bestMAE = float('inf')
for i in range(splitNum):
    result = next(splits, None)
    train_X = featureVectorTrain.iloc[result[0]]
    train_Y = dependVariableTrain.iloc[result[0]]
    test_X =  featureVectorTrain.iloc[result[1]]
    test_Y =  dependVariableTrain.iloc[result[1]]
    
    linearModel = linearRegression.fit(train_X,train_Y)
    predictions = linearModel.predict(test_X)

    if round(mean_absolute_error(test_Y, predictions),2) < bestMAE:
        bestMAE = round(mean_absolute_error(test_Y, predictions),2)
        bestLinearModel = linearModel

In [147]:
# Production Test
predictions = bestLinearModel.predict(featureVectorProduction)

print("R squared error: ",round(r2_score(dependVariableProduction, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableProduction, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions,squared=False),2))

R squared error:  0.68
Mean absolute error:  1676.96
Mean squared error:  46843134.31
Rooted mean squared error:  6844.2


In [167]:
# Test Dataset
predictions = bestLinearModel.predict(featureVectorTest)

print("R squared error: ",round(r2_score(dependVariableTest, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableTest, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableTest, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableTest, predictions,squared=False),2))

R squared error:  0.87
Mean absolute error:  1771.17
Mean squared error:  11428032.96
Rooted mean squared error:  3380.54


In [148]:
# Lasso Regression Model
lassoRegression = Lasso(max_iter=1000000)

splitNum = 20
kf = KFold(n_splits=splitNum)
splits = kf.split(featureVectorTrain)

bestLassoModel = None
bestMAE = float('inf')
for i in range(splitNum):
    result = next(splits, None)
    train_X = featureVectorTrain.iloc[result[0]]
    train_Y = dependVariableTrain.iloc[result[0]]
    test_X =  featureVectorTrain.iloc[result[1]]
    test_Y =  dependVariableTrain.iloc[result[1]]
    
    lassoModel = lassoRegression.fit(train_X,train_Y)
    predictions = lassoModel.predict(test_X)

    if round(mean_absolute_error(test_Y, predictions),2) < bestMAE:
        bestMAE = round(mean_absolute_error(test_Y, predictions),2)
        bestLassoModel = lassoModel

In [149]:
# Production Test
predictions = bestLassoModel.predict(featureVectorProduction)

print("R squared error: ",round(r2_score(dependVariableProduction, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableProduction, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions,squared=False),2))

R squared error:  0.58
Mean absolute error:  1652.66
Mean squared error:  62240013.02
Rooted mean squared error:  7889.23


In [168]:
# Test Dataset
predictions = bestLassoModel.predict(featureVectorTest)

print("R squared error: ",round(r2_score(dependVariableTest, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableTest, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableTest, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableTest, predictions,squared=False),2))

R squared error:  0.81
Mean absolute error:  2360.73
Mean squared error:  16974564.41
Rooted mean squared error:  4120.02


In [154]:
# Ridge Regression Model
ridgeRegression = Ridge(solver='lsqr')

splitNum = 20
kf = KFold(n_splits=splitNum)
splits = kf.split(featureVectorTrain)

bestRidgeModel = None
bestMAE = float('inf')
for i in range(splitNum):
    result = next(splits, None)
    train_X = featureVectorTrain.iloc[result[0]]
    train_Y = dependVariableTrain.iloc[result[0]]
    test_X =  featureVectorTrain.iloc[result[1]]
    test_Y =  dependVariableTrain.iloc[result[1]]
    
    ridgeModel = ridgeRegression.fit(train_X,train_Y)
    predictions = ridgeModel.predict(test_X)

    if round(mean_absolute_error(test_Y, predictions),2) < bestMAE:
        bestMAE = round(mean_absolute_error(test_Y, predictions),2)
        bestRidgeModel = ridgeModel

In [155]:
# Production Test
predictions = bestRidgeModel.predict(featureVectorProduction)

print("R squared error: ",round(r2_score(dependVariableProduction, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableProduction, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions,squared=False),2))

R squared error:  0.91
Mean absolute error:  1053.42
Mean squared error:  12788193.24
Rooted mean squared error:  3576.06


In [169]:
# Test Dataset
predictions = bestRidgeModel.predict(featureVectorTest)

print("R squared error: ",round(r2_score(dependVariableTest, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableTest, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableTest, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableTest, predictions,squared=False),2))

R squared error:  0.76
Mean absolute error:  2295.17
Mean squared error:  20621366.7
Rooted mean squared error:  4541.08


In [156]:
# Elastic-Net Regression Model
elasticRegression = ElasticNet(max_iter=1000000)

splitNum = 10
kf = KFold(n_splits=splitNum)
splits = kf.split(featureVectorTrain)

bestElasticModel = None
bestMAE = float('inf')
for i in range(splitNum):
    result = next(splits, None)
    train_X = featureVectorTrain.iloc[result[0]]
    train_Y = dependVariableTrain.iloc[result[0]]
    test_X =  featureVectorTrain.iloc[result[1]]
    test_Y =  dependVariableTrain.iloc[result[1]]
    
    elasticModel = elasticRegression.fit(train_X,train_Y)
    predictions = elasticModel.predict(test_X)

    if round(mean_absolute_error(test_Y, predictions),2) < bestMAE:
        bestMAE = round(mean_absolute_error(test_Y, predictions),2)
        bestElasticModel = elasticModel

In [157]:
# Production Test
predictions = bestElasticModel.predict(featureVectorProduction)

print("R squared error: ",round(r2_score(dependVariableProduction, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableProduction, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions,squared=False),2))

R squared error:  0.9
Mean absolute error:  1083.14
Mean squared error:  14526185.48
Rooted mean squared error:  3811.32


In [170]:
# Test Dataset
predictions = bestElasticModel.predict(featureVectorTest)

print("R squared error: ",round(r2_score(dependVariableTest, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableTest, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableTest, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableTest, predictions,squared=False),2))

R squared error:  0.86
Mean absolute error:  1695.55
Mean squared error:  11848849.07
Rooted mean squared error:  3442.22


In [158]:
# Decision Tree Regression Model
decisionTreeRegression = DecisionTreeRegressor(random_state=0)

splitNum = 20
kf = KFold(n_splits=splitNum)
splits = kf.split(featureVectorTrain)

bestDecisionTreeModel = None
bestMAE = float('inf')
for i in range(splitNum):
    result = next(splits, None)
    train_X = featureVectorTrain.iloc[result[0]]
    train_Y = dependVariableTrain.iloc[result[0]]
    test_X =  featureVectorTrain.iloc[result[1]]
    test_Y =  dependVariableTrain.iloc[result[1]]
    
    decisionTreeModel = decisionTreeRegression.fit(train_X,train_Y)
    predictions = decisionTreeModel.predict(test_X)

    if round(mean_absolute_error(test_Y, predictions),2) < bestMAE:
        bestMAE = round(mean_absolute_error(test_Y, predictions),2)
        bestDecisionTreeModel = decisionTreeModel

In [159]:
# Production Test
predictions = bestDecisionTreeModel.predict(featureVectorProduction)

print("R squared error: ",round(r2_score(dependVariableProduction, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableProduction, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions,squared=False),2))

R squared error:  0.51
Mean absolute error:  1450.11
Mean squared error:  72912236.11
Rooted mean squared error:  8538.87


In [171]:
# Test Dataset
predictions = bestDecisionTreeModel.predict(featureVectorTest)

print("R squared error: ",round(r2_score(dependVariableTest, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableTest, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableTest, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableTest, predictions,squared=False),2))

R squared error:  0.3
Mean absolute error:  3849.0
Mean squared error:  61076017.0
Rooted mean squared error:  7815.11


In [162]:
# SVM Regression Model
svmRegression = SVR()

splitNum = 20
kf = KFold(n_splits=splitNum)
splits = kf.split(featureVectorTrain)

bestSVMModel = None
bestMAE = float('inf')
for i in range(splitNum):
    result = next(splits, None)
    train_X = featureVectorTrain.iloc[result[0]]
    train_Y = dependVariableTrain.iloc[result[0]]
    test_X =  featureVectorTrain.iloc[result[1]]
    test_Y =  dependVariableTrain.iloc[result[1]]
    
    svmModel = svmRegression.fit(train_X,train_Y.values.ravel())
    predictions = svmModel.predict(test_X)

    if round(mean_absolute_error(test_Y, predictions),2) < bestMAE:
        bestMAE = round(mean_absolute_error(test_Y, predictions),2)
        bestSVMModel = svmModel

In [164]:
# Production Test
predictions = bestSVMModel.predict(featureVectorProduction)

print("R squared error: ",round(r2_score(dependVariableProduction, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableProduction, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableProduction, predictions,squared=False),2))

R squared error:  -0.03
Mean absolute error:  2339.6
Mean squared error:  152723014.87
Rooted mean squared error:  12358.12


In [172]:
# Test Dataset
predictions = bestSVMModel.predict(featureVectorTest)

print("R squared error: ",round(r2_score(dependVariableTest, predictions),2))
print("Mean absolute error: ",round(mean_absolute_error(dependVariableTest, predictions),2))
print("Mean squared error: ",round(mean_squared_error(dependVariableTest, predictions),2))
print("Rooted mean squared error: ",round(mean_squared_error(dependVariableTest, predictions,squared=False),2))

R squared error:  -0.34
Mean absolute error:  5508.32
Mean squared error:  117106323.52
Rooted mean squared error:  10821.57
