In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [27]:
# Read the data
# The path is assembled from separate components instead of a single
# backslash-separated string: in a plain string literal a backslash starts an
# escape sequence (backslash + "f" is a form feed), which corrupts the file name.
# Joining with Path also keeps the notebook portable across operating systems.
file_path = Path("project-4-life-expectancy") / "Data_files" / "final_combined_data_cleaned.csv"
df = pd.read_csv(file_path)

# Display sample data
df.head()

Unnamed: 0,State,County,Census Tract Number,Life Expectancy Mean,Life Expectancy Range,Life Expectancy Standard Error,County.1,County with State,State ID,State Name,...,Population per Square Mile,Accommodation and Food Services Sales,Retail Sales,Firms Total,Women Owned Firms,Men Owned Firms,Minority Owned Firms,Nonminority Owned Firms,Veteran Owned Firms,NonVeteran Owned Firms
0,Alabama,"Autauga County, AL",201.0,73.1,56.9-75.1,2.2348,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401
1,Alabama,"Autauga County, AL",202.0,76.9,75.2-77.5,3.3453,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401
2,Alabama,"Autauga County, AL",204.0,75.4,75.2-77.5,1.0216,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401
3,Alabama,"Autauga County, AL",205.0,79.4,77.6-79.5,1.1768,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401
4,Alabama,"Autauga County, AL",206.0,73.1,56.9-75.1,1.5519,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401


In [26]:
# Create scatter plot of "Life Expectancy Mean" (y axis) vs "2010 Population" (x axis)
life_2010 = df.hvplot.scatter(
    x="2010 Population",
    y="Life Expectancy Mean",
    title="Life Expectancy of 2010 Population"
)
# Show the plot as the cell output
life_2010

In [4]:
# Create the X set: the "2010 Population" values as a single-column 2-D array.
# Indexing with np.newaxis turns the 1-D vector of length n into shape (n, 1).
population_2010 = df["2010 Population"].to_numpy()
X = population_2010[:, np.newaxis]

# Display sample data
X[:5]

array([[54571],
       [54571],
       [54571],
       [54571],
       [54571]], dtype=int64)

In [5]:
# Create the dependent variable y from the "Life Expectancy Mean" column
y = df["Life Expectancy Mean"]

In [6]:
# Create an (unfitted) linear regression model with scikit-learn
model = LinearRegression()

In [7]:
# Fit the model: X is the "2010 Population" feature, y is "Life Expectancy Mean"
model.fit(X, y)

In [8]:
# Display the slope (coef_ is an array; it holds one value because X has a single column)
print(f"Model's slope: {model.coef_}")

Model's slope: [3.00644169e-07]


In [9]:
# Display the y-intercept of the fitted line
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 77.99554192956771


In [10]:
# Display the model's best fit line formula in the form y = intercept + slope * X
print(f"Model's formula: y= {model.intercept_} + {model.coef_[0]}X")

Model's formula: y= 77.99554192956771 + 3.0064416906447414e-07X


In [11]:
# Make predictions using the X set (the same data the model was fitted on)
predicted_y_values = model.predict(X)

In [12]:
# Build a new frame holding the original data plus a column with the
# predicted life expectancy. assign() returns a new DataFrame, so df itself
# is left unchanged.
df_predicted = df.assign(life_expectancy_predicted=predicted_y_values)

# Display sample data
df_predicted.head()

Unnamed: 0,State,County,Census Tract Number,Life Expectancy Mean,Life Expectancy Range,Life Expectancy Standard Error,County.1,County with State,State ID,State Name,...,Accommodation and Food Services Sales,Retail Sales,Firms Total,Women Owned Firms,Men Owned Firms,Minority Owned Firms,Nonminority Owned Firms,Veteran Owned Firms,NonVeteran Owned Firms,life_expectancy_predicted
0,Alabama,"Autauga County, AL",201.0,73.1,56.9-75.1,2.2348,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,78.011948
1,Alabama,"Autauga County, AL",202.0,76.9,75.2-77.5,3.3453,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,78.011948
2,Alabama,"Autauga County, AL",204.0,75.4,75.2-77.5,1.0216,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,78.011948
3,Alabama,"Autauga County, AL",205.0,79.4,77.6-79.5,1.1768,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,78.011948
4,Alabama,"Autauga County, AL",206.0,73.1,56.9-75.1,1.5519,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,78.011948


In [13]:
# Create a line plot of the predicted life expectancy vs "2010 Population" (the best fit line)
line = df_predicted.hvplot.line(
    x = "2010 Population",
    y = "life_expectancy_predicted",
    color = "red"
)
# Show the plot as the cell output
line

In [14]:
# Superpose the original data (scatter) and the best fit line (red) in one figure
life_2010 * line 

In [15]:
# Save Image as PNG file
# Combine the scatter plot and the best fit line into a single figure
plot = life_2010 * line

# Write the combined figure to the current working directory.
# NOTE(review): PNG export presumably requires selenium and a browser driver
# to be installed for the plotting backend — confirm in the target environment.
hvplot.save(plot, 'life_expectancy_2010.png')



## Manual Prediction

In [16]:
# Population value at which the fitted line is evaluated by hand.
# Bound once so the displayed formula and the computation cannot drift apart.
sample_population = 50000

# Display the fitted formula with the sample population substituted for X
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {sample_population}")

# Predict the life expectancy for that population: y = intercept + slope * X
y_50k = model.intercept_ + model.coef_[0] * sample_population

# Display the prediction
# NOTE(review): the label says "2020 Population" but the model was fitted on
# the "2010 Population" column — confirm which year is intended.
print(f"Predicted Life Expectancy of 2020 Population: {y_50k:.2f} years")

Model's formula: y = 77.99554192956771 + 3.0064416906447414e-07 * 50000
Predicted Life Expectancy of 2020 Population: 78.01 years


## Prediction Using the 'Predict' Function

In [17]:
# Populations to predict life expectancy for: 20k, 50k, 250k, 1M, 5M and 9M
sample_populations = [20000, 50000, 250000, 1000000, 5000000, 9000000]

# Build the array and shape it as one column (one row per population value)
X_2010 = np.array(sample_populations).reshape(len(sample_populations), 1)

# Display sample data
X_2010

array([[  20000],
       [  50000],
       [ 250000],
       [1000000],
       [5000000],
       [9000000]])

In [18]:
# Predict life expectancy for populations of 20000, 50000, 250000, 1000000, 5000000, 9000000
# NOTE(review): the original comment called these "2020 Populations", but the
# model was fitted on the "2010 Population" column — confirm which year is intended.
predicted_life = model.predict(X_2010)

In [19]:
# Create a dataframe pairing each sample population with its predicted life expectancy.
# ravel() flattens the one-column X_2010 array back into a 1-D vector.
# NOTE(review): the column is labelled "2020 Population" although the model was
# fitted on the "2010 Population" column — confirm which year is intended.
predicted_columns = {
    "2020 Population": X_2010.ravel(),
    "predicted_life": predicted_life,
}
df_predicted_2010 = pd.DataFrame(predicted_columns)

# Display data
df_predicted_2010

Unnamed: 0,2020 Population,predicted_life
0,20000,78.001555
1,50000,78.010574
2,250000,78.070703
3,1000000,78.296186
4,5000000,79.498763
5,9000000,80.701339


## Assess the Model

In [20]:
# Import the relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
# Compute metrics for the linear regression model, evaluated on the same data
# it was fitted on:
#   score / r2 - coefficient of determination (the two printed values match)
#   mse / rmse - mean squared error and its square root
#   std        - standard deviation of the observed life expectancy, for comparison with rmse
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print the relevant metrics, one per line
print(
    f"The score is: {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is: 0.02129411385787927.
The r2 is 0.02129411385787927.
The mean squared error is 15.441303259901577.
The root mean squared error is 3.9295423728344727.
The standard deviation is 3.9720606946413906.
