In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Single feature with values 1..5, stored as a column vector of shape (5, 1)
# because scikit-learn estimators expect a 2-D (n_samples, n_features) input.
X = np.arange(1, 6).reshape(-1, 1)
# One target value per row of X.
y = np.asarray([50, 55, 65, 70, 75])

Linear Regression with K-Fold Cross-Validation

In [4]:
# Estimator under evaluation: ordinary least-squares linear regression.
model = LinearRegression()

# An integer cv with a regressor means unshuffled K-fold splitting. X and y hold
# five samples, so cv=5 leaves exactly one sample in each test fold.
# scikit-learn scorers follow "higher is better", hence MSE comes back negated.
scores = cross_val_score(estimator=model, X=X, y=y, scoring="neg_mean_squared_error", cv=5)

# Undo the sign flip to get ordinary (non-negative) mean squared errors.
mse_scores = np.negative(scores)

print("MSE for each fold:", mse_scores)
print("Average MSE:", mse_scores.mean())

MSE for each fold: [0.         4.59183673 6.25       0.51020408 6.25      ]
Average MSE: 3.5204081632653113


Using R² score instead (more intuitive)

In [7]:
# cv=2 splits the five samples into two unshuffled folds, so every test fold
# holds more than one sample (R² is not well-defined on a single sample).
# A fold's R² goes negative when the fit does worse than predicting that
# fold's own mean.
r2_scores = cross_val_score(estimator=model, X=X, y=y, scoring="r2", cv=2)

print("R² for each fold:", r2_scores)
print("Average R²:", np.mean(r2_scores))

R² for each fold: [ 0.57142857 -0.61111111]
Average R²: -0.01984126984126572


In [9]:
# cross_val_score fits clones of the estimator, so in a top-to-bottom run
# `model` itself is first fitted here, on the full data set.
model.fit(X, y)

# With a single feature, coef_ holds exactly one slope value.
slope = model.coef_[0]
intercept = model.intercept_
print("Slope (w):", slope)
print("Intercept (b):", intercept)

# score() is evaluated on the same data the model was fitted on, so this is
# an in-sample R², not a cross-validated one.
in_sample_r2 = model.score(X, y)
print("Final model R²:", in_sample_r2)

Slope (w): 6.500000000000001
Intercept (b): 43.5
Final model R²: 0.9825581395348837


In [10]:
# predict() needs a 2-D (n_samples, n_features) input, so the single query
# value is shaped as one row with one column. The value 6 lies outside the
# fitted X range of 1..5, i.e. this is an extrapolation.
prediction = model.predict(np.array([6]).reshape(-1, 1))
print("Predicted score:", prediction[0])

Predicted score: 82.5


Cross-Validation with Pipeline (Best Practice)

In [17]:
# Chain preprocessing and regression into one estimator. When a pipeline is
# passed to cross_val_score it is cloned and refitted per split, so the scaler
# only ever sees that split's training rows and never the held-out rows.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
])

# Same 2-fold R² evaluation as for the bare LinearRegression model.
# Note: this rebinds `scores`, which previously held the negated-MSE values.
scores = cross_val_score(pipeline, X, y, cv=2, scoring="r2")

print("Average R²:", scores.mean())

# Fit on the full data set; the score below is measured on that same data,
# so it is an in-sample R².
pipeline.fit(X, y)
print("Final model R²:", pipeline.score(X, y))

Average R²: -0.01984126984127038
Final model R²: 0.9825581395348837


In [18]:
# The pipeline applies its fitted scaler before the regression step, so the
# query is given in the original (unscaled) units, shaped as one row with one
# column. The value 6 lies outside the fitted X range of 1..5.
prediction = pipeline.predict(np.array([6]).reshape(-1, 1))
print("Predicted score:", prediction[0])

Predicted score: 82.5
