In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [5]:
# Read the data
# The path is joined from its components with `/` rather than written as one
# backslash-separated string: in a normal string literal "\f" is a form-feed
# escape (and "\D" is an invalid escape), and backslash separators only work
# on Windows.
file_path = Path("project-4-life-expectancy") / "Data_files" / "final_combined_data_cleaned.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,State,County,Census Tract Number,Life Expectancy Mean,Life Expectancy Range,Life Expectancy Standard Error,County.1,County with State,State ID,State Name,...,Population per Square Mile,Accommodation and Food Services Sales,Retail Sales,Firms Total,Women Owned Firms,Men Owned Firms,Minority Owned Firms,Nonminority Owned Firms,Veteran Owned Firms,NonVeteran Owned Firms
0,Alabama,"Autauga County, AL",201.0,73.1,56.9-75.1,2.2348,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401
1,Alabama,"Autauga County, AL",202.0,76.9,75.2-77.5,3.3453,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401
2,Alabama,"Autauga County, AL",204.0,75.4,75.2-77.5,1.0216,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401
3,Alabama,"Autauga County, AL",205.0,79.4,77.6-79.5,1.1768,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401
4,Alabama,"Autauga County, AL",206.0,73.1,56.9-75.1,1.5519,Autauga,Autauga County,AL,Alabama,...,91.8,93431,607854,2949,1093,1499,616,2160,285,2401


In [6]:
# Scatter plot: share of residents with a Bachelor's degree or higher (x)
# against the mean life expectancy (y), one point per row of df
life_education = df.hvplot.scatter(
    title="Life Expectancy of Individuals with Higher Education",
    y="Life Expectancy Mean",
    x="Bachelor's Degree or Higher",
)

# Render the plot as the cell output
life_education

In [7]:
# Build the feature matrix X. Selecting with a *list* of column names keeps
# the result two-dimensional, so the values come out as a single-column
# (n_rows, 1) array, which is the layout scikit-learn expects.
X = df[["Bachelor's Degree or Higher"]].values

# Show the first five rows
X[:5]

array([[26.6],
       [26.6],
       [26.6],
       [26.6],
       [26.6]])

In [8]:
# Create the dependent variable y from the "Life Expectancy Mean" column
# (the target the regression is fitted against)
y = df["Life Expectancy Mean"]

In [9]:
# Create an (unfitted) scikit-learn linear regression model with default settings
model = LinearRegression()

In [10]:
# Fit the model: X is the Bachelor's-degree feature column, y the mean life expectancy
model.fit(X, y)

In [11]:
# Display the slope; coef_ holds one coefficient per feature, so with a single
# feature it prints as a one-element array
print(f"Model's slope: {model.coef_}")

Model's slope: [0.11761328]


In [12]:
# Display the y-intercept (the predicted value where the feature equals 0)
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 74.65567035584283


In [13]:
# Display the model's best fit line formula; coef_[0] extracts the single
# slope value from the coefficient array
print(f"Model's formula: y= {model.intercept_} + {model.coef_[0]}X")

Model's formula: y= 74.65567035584283 + 0.11761327807574073X


In [14]:
# Make predictions using the X set (the same data the model was fitted on,
# so these are in-sample predictions)
predicted_y_values = model.predict(X)

In [15]:
# Derive a new frame from the original data with the model's predictions
# appended as an extra column; `assign` returns a new DataFrame and leaves
# df itself unchanged
df_predicted = df.assign(life_expectancy_predicted=predicted_y_values)

# Preview the first rows
df_predicted.head()

Unnamed: 0,State,County,Census Tract Number,Life Expectancy Mean,Life Expectancy Range,Life Expectancy Standard Error,County.1,County with State,State ID,State Name,...,Accommodation and Food Services Sales,Retail Sales,Firms Total,Women Owned Firms,Men Owned Firms,Minority Owned Firms,Nonminority Owned Firms,Veteran Owned Firms,NonVeteran Owned Firms,life_expectancy_predicted
0,Alabama,"Autauga County, AL",201.0,73.1,56.9-75.1,2.2348,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,77.784184
1,Alabama,"Autauga County, AL",202.0,76.9,75.2-77.5,3.3453,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,77.784184
2,Alabama,"Autauga County, AL",204.0,75.4,75.2-77.5,1.0216,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,77.784184
3,Alabama,"Autauga County, AL",205.0,79.4,77.6-79.5,1.1768,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,77.784184
4,Alabama,"Autauga County, AL",206.0,73.1,56.9-75.1,1.5519,Autauga,Autauga County,AL,Alabama,...,93431,607854,2949,1093,1499,616,2160,285,2401,77.784184


In [16]:
# Create a line plot of the predicted life expectancy vs the percentage of
# residents with a Bachelor's degree or higher (the best-fit line).
# The rows are sorted by the x column first: a line plot connects points in
# row order, so unsorted data would make the path jump back and forth along
# the line instead of being drawn once from left to right.
line = df_predicted.sort_values("Bachelor's Degree or Higher").hvplot.line(
    x = "Bachelor's Degree or Higher",
    y = "life_expectancy_predicted",
    color = "red"
)
line

In [17]:
# Superpose the original data and the best fit line: `*` overlays the scatter
# plot and the line plot on the same axes
life_education * line

In [18]:
# Save the combined plot (scatter of the data overlaid with the best-fit line)
# as a PNG file in the current working directory
plot = life_education * line

hvplot.save(plot, 'life_expectancy_higher_education.png')



## Manual Prediction

In [20]:
# Percentage of county residents with a Bachelor's degree or higher to predict
# for; defined once so the formula, the computation and the message stay in sync
bachelors_pct = 35

# Display the formula used to predict life expectancy for that percentage
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {bachelors_pct}")

# Predict the life expectancy by evaluating the fitted line by hand:
# intercept + slope * x
y_35 = model.intercept_ + model.coef_[0] * bachelors_pct

# Display the prediction
print(f"Predicted Life Expectancy of {bachelors_pct}% of county residents with Bachelor's degree or higher: {y_35:.2f} years")

Model's formula: y = 74.65567035584283 + 0.11761327807574073 * 35
Predicted Life Expectancy of 35% of county residents with Bachelor's degree or higher: 78.77 years


## Prediction Using 'Predict' Function

In [21]:
# Percentages of residents with a Bachelor's degree or higher to predict life
# expectancy for (5%, 20%, 35%, 50% and 75%), written directly as a
# one-column array so it has the (n_samples, 1) layout used for prediction
X_education = np.array([[5], [20], [35], [50], [75]])

# Display the array
X_education

array([[ 5],
       [20],
       [35],
       [50],
       [75]])

In [22]:
# Predict life expectancy for Bachelor's-degree percentages 5, 20, 35, 50 and 75
# (the values held in X_education) using the fitted model
predicted_life = model.predict(X_education)

In [23]:
# Collect the inputs and their predictions side by side in a DataFrame.
# X_education is a one-column 2-D array, so its first (only) column is taken
# to get the flat sequence of percentages a DataFrame column needs.
df_predicted_educ = pd.DataFrame(
    {
        "Percent Bachelor's Degree or higher": X_education[:, 0],
        "predicted_life": predicted_life,
    }
)

# Show the resulting table
df_predicted_educ

Unnamed: 0,Percent Bachelor's Degree or higher,predicted_life
0,5,75.243737
1,20,77.007936
2,35,78.772135
3,50,80.536334
4,75,83.476666


## Assess the Model

In [24]:
# Import the relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [25]:
# Evaluate the fitted model on the same data it was trained on.
# score: coefficient of determination as reported by the estimator itself
# r2:    the same quantity computed from the stored predictions
# mse / rmse: mean squared error and its square root (in the units of y)
# std:   spread of the observed y values, as a yardstick for the rmse
score = model.score(X, y)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
r2 = r2_score(y, predicted_y_values)
std = np.std(y)

# Report all metrics, one per line
print(
    f"The score is: {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is: 0.10956177433140424.
The r2 is 0.10956177433140424.
The mean squared error is 14.048680887120826.
The root mean squared error is 3.7481569987289522.
The standard deviation is 3.9720606946413906.
