# Demo: Linear Regression

In [21]:
# Install hvplot (used for the plots in this notebook) by shelling out to `uv pip`.
# NOTE(review): the version is not pinned and this assumes `uv` is on PATH — confirm.
!uv pip install hvplot

[2mUsing Python 3.11.13 environment at: /Users/tarekatwan/Repos/MyWork/Teach/repos/adv_ml_ds/deep[0m
[2mAudited [1m1 package[0m [2min 4ms[0m[0m


In [22]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas  # imported only for its side effect: enables the `.hvplot` accessor on pandas objects
from pathlib import Path
from sklearn.linear_model import LinearRegression

## Data Loading and Visualization

In [23]:
# Location of the salary dataset, relative to the notebook's working directory
file_path = Path("../Resources/salary_data.csv")

# Load the CSV into a DataFrame
df_salary = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_salary.head(5)

Unnamed: 0,years_experience,salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [24]:
# Scatter of observed salary (y) against years of experience (x)
salary_plot = df_salary.hvplot(
    kind="scatter",
    x="years_experience",
    y="salary",
    title="Expected Salary Based on Years of Experience",
)

# Render the plot as the cell output
salary_plot

## Data Preparation

In [25]:
# Turn the independent variable into a 2-D array with one column:
# reshape(-1, 1) gives shape (n_samples, 1), one row per observation
X = df_salary["years_experience"].to_numpy().reshape(-1, 1)

# Peek at the first five rows
X[:5]

array([[1.1],
       [1.3],
       [1.5],
       [2. ],
       [2.2]])

In [26]:
# Confirm X is two-dimensional: (number of samples, number of features).
# In the recorded run this displayed (30, 1): 30 samples, one feature column.
X.shape

(30, 1)

In [27]:
# Select the dependent variable y: the "salary" column, kept as a pandas Series
# (it is not converted to a NumPy array here)
y = df_salary["salary"]

## Building the Linear Regression Model

In [28]:
# Instantiate scikit-learn's linear regression estimator.
# fit_intercept=True is the default; it is spelled out because later cells
# read the fitted intercept from the model.
model = LinearRegression(fit_intercept=True)

In [29]:
# Fit the model to the data: estimates the slope and intercept from X and y.
# fit() returns the fitted estimator, which is why the cell displays its parameters.
model.fit(X, y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [30]:
# Display the slope.
# coef_ is printed as a whole array; with a single feature it has one element.
print(f"Model's slope: {model.coef_}")

Model's slope: [9449.96232146]


In [31]:
# Display the y-intercept: the predicted salary when years of experience is 0
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 25792.20019866869


In [32]:
# Display the model's best fit line formula, y = intercept + slope * X.
# coef_[0] picks the single slope value so a number, not an array, is printed.
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 25792.20019866869 + 9449.962321455077X


In [33]:
# Predict the salary for one hypothetical person by evaluating the fitted line by hand.
# The number of years is named once so that the formula, the computation and the
# final message cannot drift apart when the value is changed.
years_of_experience = 7

# Display the formula with the chosen number of years substituted for X
print(
    f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {years_of_experience}"
)

# y = intercept + slope * x, where coef_[0] is the slope of the single feature.
# The name y_7 is kept from the original cell so any later reference still works.
y_7 = model.intercept_ + model.coef_[0] * years_of_experience

# Display the prediction, formatted to two decimal places
print(
    f"Predicted salary for a person with {years_of_experience} years of experience: ${y_7:.2f}"
)

Model's formula: y = 25792.20019866869 + 9449.962321455077 * 7
Predicted salary for a person with 7 years of experience: $91941.94


In [34]:
# Make predictions for every row of X — the same data the model was fitted on,
# so these are in-sample fitted values rather than predictions for unseen data
predicted_y_values = model.predict(X)

In [35]:
# Build a new frame that carries the model's predictions next to the observed
# values; assign() returns a new DataFrame, so df_salary itself is left untouched
df_salary_predicted = df_salary.assign(salary_predicted=predicted_y_values)

# Preview the first five rows
df_salary_predicted.head(5)

Unnamed: 0,years_experience,salary,salary_predicted
0,1.1,39343.0,36187.158752
1,1.3,46205.0,38077.151217
2,1.5,37731.0,39967.143681
3,2.0,43525.0,44692.124842
4,2.2,39891.0,46582.117306


In [36]:
# Draw the fitted values as a red line over years of experience
best_fit_line = df_salary_predicted.hvplot.line(
    x="years_experience",
    y="salary_predicted",
    color="red",
)

# Render the line as the cell output
best_fit_line

In [37]:
# Overlay the original data and the best fit line in a single figure
# (the * operator combines the two plot objects)
salary_plot * best_fit_line

## Linear Regression Model Assessment

In [38]:
# Import the two metric functions used below from scikit-learn: MSE and R².
# The other reported values are not imported here: score comes from the model
# itself, and RMSE and the standard deviation are computed with NumPy.
from sklearn.metrics import mean_squared_error, r2_score

In [39]:
# Evaluate the fitted line against the observed salaries.
# X and y are the same data the model was fitted on, so these are in-sample metrics.
score = model.score(X, y)  # sample_weight stays at its default (None)
r2 = r2_score(y_true=y, y_pred=predicted_y_values)  # equals score in the recorded run
mse = mean_squared_error(y_true=y, y_pred=predicted_y_values)
rmse = np.sqrt(mse)  # square root puts the error back in the units of y
std = np.std(y)  # spread of the observed salaries, for comparison with rmse

# Report each metric on its own line
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.9569566641435086.
The r2 is 0.9569566641435086.
The mean squared error is 31270951.72228097.
The root mean squared error is 5592.043608760662.
The standard deviation is 26953.65024877583.
