In [1]:
# Import dependencies
import pandas as pd
from sqlalchemy import create_engine
import sys
sys.path.append('../')
from config import db_password

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import r2_score

In [2]:
db_string = f"postgres://postgres:{db_password}@module20covid.cgcfmenzscpu.us-east-2.rds.amazonaws.com:5432/postgres"
db = create_engine(db_string)

In [41]:
q = '''
SELECT location, DATE_PART('day', date - date_first_case) AS "day_from_first_case",
total_cases, total_deaths, total_cases_per_million, total_deaths_per_million, total_deaths/total_cases AS "fatality_rate",
si.stringency_index, population, population_density, median_age, aged_70_older, gdp_per_capita, diabetes_prevalence, life_expectancy

FROM "WorldWide_Cases" ww
LEFT JOIN "Country_FirstCase" fc ON ww.iso_code = fc.iso_code
LEFT JOIN "Country_Stats" cs ON ww.iso_code = cs.iso_code
LEFT JOIN (SELECT ww.iso_code, AVG(stringency_index) AS "stringency_index" FROM "WorldWide_Cases" ww
LEFT JOIN "Country_FirstCase" fc ON ww.iso_code = fc.iso_code
WHERE DATE_PART('day', date - date_first_case) BETWEEN 1 AND 100
GROUP BY ww.iso_code) si ON ww.iso_code = si.iso_code

WHERE DATE_PART('day', date - date_first_case) = 100
'''

worldwide_df = pd.read_sql(sql=q, con=db)

In [42]:
worldwide_df

Unnamed: 0,location,day_from_first_case,total_cases,total_deaths,total_cases_per_million,total_deaths_per_million,fatality_rate,stringency_index,population,population_density,median_age,aged_70_older,gdp_per_capita,diabetes_prevalence,life_expectancy
0,Afghanistan,100.0,423.0,14.0,10.866,0.360,0.033097,18.878333,3.892834e+07,54.422,18.6,1.337,1803.987,9.59,64.83
1,Albania,100.0,1672.0,37.0,580.999,12.857,0.022129,81.720000,2.877800e+06,104.871,38.0,8.643,11803.431,10.08,78.57
2,Algeria,100.0,1572.0,205.0,35.849,4.675,0.130407,19.965684,4.385104e+07,17.348,29.1,3.857,13913.839,6.73,76.88
3,Andorra,100.0,852.0,51.0,11026.985,660.066,0.059859,47.784607,7.726500e+04,163.755,,,,7.97,83.73
4,Angola,100.0,276.0,11.0,8.398,0.335,0.039855,79.158000,3.286627e+07,23.890,16.8,1.362,5819.495,3.94,61.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,Vietnam,100.0,255.0,0.0,2.620,0.000,0.000000,37.962500,9.733858e+07,308.127,32.6,4.718,6171.884,6.00,75.40
202,Yemen,100.0,1581.0,443.0,53.008,14.853,0.280202,54.128791,2.982597e+07,53.508,20.3,1.583,1479.147,5.35,66.12
203,Zambia,100.0,1531.0,21.0,83.279,1.142,0.013717,48.013500,1.838396e+07,22.995,17.7,1.542,3689.251,3.94,63.89
204,Zimbabwe,100.0,567.0,6.0,38.149,0.404,0.010582,80.812800,1.486293e+07,42.729,19.6,1.882,1899.775,1.82,61.49


In [43]:
worldwide_df.dtypes

location                     object
day_from_first_case         float64
total_cases                 float64
total_deaths                float64
total_cases_per_million     float64
total_deaths_per_million    float64
fatality_rate               float64
stringency_index            float64
population                  float64
population_density          float64
median_age                  float64
aged_70_older               float64
gdp_per_capita              float64
diabetes_prevalence         float64
life_expectancy             float64
dtype: object

In [44]:
# Inspecting for Null Values
for column in worldwide_df.columns:
    print(f"Column {column} has {worldwide_df[column].isnull().sum()} null values")

Column location has 0 null values
Column day_from_first_case has 0 null values
Column total_cases has 0 null values
Column total_deaths has 0 null values
Column total_cases_per_million has 0 null values
Column total_deaths_per_million has 0 null values
Column fatality_rate has 0 null values
Column stringency_index has 36 null values
Column population has 0 null values
Column population_density has 10 null values
Column median_age has 24 null values
Column aged_70_older has 25 null values
Column gdp_per_capita has 26 null values
Column diabetes_prevalence has 16 null values
Column life_expectancy has 3 null values


In [45]:
worldwide_df = worldwide_df.dropna()

In [46]:
worldwide_df

Unnamed: 0,location,day_from_first_case,total_cases,total_deaths,total_cases_per_million,total_deaths_per_million,fatality_rate,stringency_index,population,population_density,median_age,aged_70_older,gdp_per_capita,diabetes_prevalence,life_expectancy
0,Afghanistan,100.0,423.0,14.0,10.866,0.360,0.033097,18.878333,38928341.0,54.422,18.6,1.337,1803.987,9.59,64.83
1,Albania,100.0,1672.0,37.0,580.999,12.857,0.022129,81.720000,2877800.0,104.871,38.0,8.643,11803.431,10.08,78.57
2,Algeria,100.0,1572.0,205.0,35.849,4.675,0.130407,19.965684,43851043.0,17.348,29.1,3.857,13913.839,6.73,76.88
4,Angola,100.0,276.0,11.0,8.398,0.335,0.039855,79.158000,32866268.0,23.890,16.8,1.362,5819.495,3.94,61.15
7,Argentina,100.0,27360.0,765.0,605.366,16.926,0.027961,85.948969,45195777.0,16.177,31.9,7.441,18933.907,5.50,76.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,Venezuela,100.0,4048.0,35.0,142.355,1.231,0.008646,82.194444,28435943.0,36.253,29.0,3.915,16745.022,6.47,72.06
201,Vietnam,100.0,255.0,0.0,2.620,0.000,0.000000,37.962500,97338583.0,308.127,32.6,4.718,6171.884,6.00,75.40
202,Yemen,100.0,1581.0,443.0,53.008,14.853,0.280202,54.128791,29825968.0,53.508,20.3,1.583,1479.147,5.35,66.12
203,Zambia,100.0,1531.0,21.0,83.279,1.142,0.013717,48.013500,18383956.0,22.995,17.7,1.542,3689.251,3.94,63.89


In [47]:
worldwide_df.columns

Index(['location', 'day_from_first_case', 'total_cases', 'total_deaths',
       'total_cases_per_million', 'total_deaths_per_million', 'fatality_rate',
       'stringency_index', 'population', 'population_density', 'median_age',
       'aged_70_older', 'gdp_per_capita', 'diabetes_prevalence',
       'life_expectancy'],
      dtype='object')

In [68]:
# Creating Data to fit Multiple Linear Regression Model
variables = ['stringency_index', 'population_density', 'median_age', 'aged_70_older', 'gdp_per_capita', 'diabetes_prevalence', 'life_expectancy']
X = worldwide_df[variables]
y = worldwide_df['total_cases_per_million']

In [69]:
# Spliting data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=78)

In [70]:
# Fitting Multiple Linear Regression Model
reg = LinearRegression(fit_intercept = False).fit(X_train, y_train)

In [71]:
# Coefficient for Variables
coeff_df = pd.DataFrame(reg.coef_, variables, columns=['Coefficient'])  
coeff_df

Unnamed: 0,Coefficient
stringency_index,14.294627
population_density,-0.214089
median_age,-23.1431
aged_70_older,84.177206
gdp_per_capita,0.026513
diabetes_prevalence,-23.491335
life_expectancy,-1.102213


In [72]:
# R Squared Value for the model
y_pred = reg.predict(X_test)
r2_score(y_test, y_pred)

0.03744944937070671

In [73]:
# Creating Data to fit Multiple Linear Regression Model
variables = ['stringency_index', 'population_density', 'median_age', 'aged_70_older', 'gdp_per_capita', 'diabetes_prevalence', 'life_expectancy']
X = worldwide_df[variables]
y = worldwide_df['fatality_rate']

In [74]:
# Spliting data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=78)

In [75]:
# Fitting Multiple Linear Regression Model
reg = LinearRegression(fit_intercept = False).fit(X_train, y_train)

In [76]:
# Coefficient for Variables
coeff_df = pd.DataFrame(reg.coef_, variables, columns=['Coefficient'])  
coeff_df

Unnamed: 0,Coefficient
stringency_index,-0.0001274961
population_density,-1.738097e-06
median_age,-0.001102732
aged_70_older,0.003950731
gdp_per_capita,-2.171868e-07
diabetes_prevalence,0.0004641434
life_expectancy,0.0006761981


In [77]:
# R Squared Value for the model
y_pred = reg.predict(X_test)
r2_score(y_test, y_pred)

-0.10262850960432446