# Linear Regression

Predict continuous values with linear models.

## What is Linear Regression?
- Predicts numeric outcomes
- Finds best-fit line
- Simple yet powerful
- Foundation of ML

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_squared_error, 
    r2_score, 
    mean_absolute_error
)

# Notebook-wide plot defaults: seaborn's white-grid theme and a
# 10 x 6 default figure size for every figure created afterwards.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Simple Linear Regression

One feature predicts one target.

**Formula:** $y = mx + b$, where $m$ is the slope (the model's coefficient) and $b$ is the intercept.

In [None]:
# Generate sample data
np.random.seed(42)
X = np.random.rand(100, 1) * 10
y = 2.5 * X + 5 + np.random.randn(100, 1) * 2

# Create DataFrame
df = pd.DataFrame({
    'Experience': X.flatten(),
    'Salary': y.flatten()
})

df.head()

In [None]:
# Scatter the raw data to eyeball the relationship before fitting a model
ax = plt.gca()
ax.scatter(df['Experience'], df['Salary'], alpha=0.6)
ax.set_xlabel('Years of Experience')
ax.set_ylabel('Salary ($1000s)')
ax.set_title('Salary vs Experience')
plt.show()

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

In [None]:
# Fit ordinary least squares on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# y_train is a 2-D column here, so coef_ is indexed as [target, feature]
# and intercept_ as [target].
slope = model.coef_[0, 0]
intercept = model.intercept_[0]

print(f"Coefficient: {slope:.2f}")
print(f"Intercept: {intercept:.2f}")
print(f"\nEquation: y = {slope:.2f}x + {intercept:.2f}")

In [None]:
# Predict on the held-out rows
y_pred = model.predict(X_test)

# Overlay the fitted line (red) on the actual test points
ax = plt.gca()
ax.scatter(X_test, y_test, alpha=0.6, label='Actual')
ax.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
ax.set_xlabel('Years of Experience')
ax.set_ylabel('Salary ($1000s)')
ax.set_title('Linear Regression Results')
ax.legend()
plt.show()

In [None]:
# Error metrics on the test split. RMSE is the square root of MSE, which
# puts it back in the same units as the target.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(
    "Model Performance:",
    f"MSE: {mse:.2f}",
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² Score: {r2:.3f}",
    sep="\n",
)

## 2. Multiple Linear Regression

Multiple features predict one target.

In [None]:
# Load the California housing dataset into a DataFrame, one column per
# feature. `fetch_california_housing` is imported in the notebook's import
# cell so that all dependencies are declared in one place.
housing = fetch_california_housing()
df_housing = pd.DataFrame(
    housing.data,
    columns=housing.feature_names
)
# Target column. Per the scikit-learn dataset documentation this is the
# median house value of a district, expressed in units of $100,000.
df_housing['Price'] = housing.target

df_housing.head()

In [None]:
# Pairwise correlations between every column (features and Price),
# drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
correlations = df_housing.corr()
sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlations')
fig.tight_layout()
plt.show()

In [None]:
# Separate the target from the feature matrix. Note that X, y and the four
# split variables are rebound here, replacing the single-feature data above.
y = df_housing['Price']
X = df_housing.drop(columns='Price')

# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print(f"Features: {X.columns.tolist()}")
print(f"Training samples: {X_train.shape[0]}")

In [None]:
# Fit ordinary least squares on all housing features
model_multi = LinearRegression().fit(X_train, y_train)

# One row per feature, ordered by absolute coefficient size (largest first).
# The features are not scaled before fitting, so each magnitude reflects the
# feature's units as well as its effect on the prediction.
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': model_multi.coef_})
    .sort_values(by='Coefficient', key=lambda col: col.abs(), ascending=False)
)

feature_importance

In [None]:
# Horizontal bar per feature; bar length is the signed coefficient
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['Feature'], feature_importance['Coefficient'])
ax.set_xlabel('Coefficient Value')
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()

In [None]:
# Make predictions
y_pred_multi = model_multi.predict(X_test)

# Score the held-out predictions: RMSE is in target units, R² is unitless
rmse_multi = np.sqrt(mean_squared_error(y_test, y_pred_multi))
r2_multi = r2_score(y_test, y_pred_multi)

print(f"R² Score: {r2_multi:.3f}", f"RMSE: {rmse_multi:.3f}", sep="\n")

In [None]:
# Each point is one test row: x = true price, y = model prediction
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_multi, alpha=0.5)

# Dashed red y = x reference spanning the observed price range;
# a perfect prediction would fall exactly on this line.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', linewidth=2)

ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Prices')
plt.show()

## 3. Residual Analysis

In [None]:
# Residual = actual value minus predicted value, for each test row
residuals = y_test - y_pred_multi

# Two diagnostics side by side
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
scatter_ax, hist_ax = axes

# Left panel: residuals against predictions, with a dashed zero reference line
scatter_ax.scatter(y_pred_multi, residuals, alpha=0.5)
scatter_ax.axhline(y=0, color='r', linestyle='--')
scatter_ax.set_xlabel('Predicted Values')
scatter_ax.set_ylabel('Residuals')
scatter_ax.set_title('Residual Plot')

# Right panel: histogram showing how the residuals are distributed
hist_ax.hist(residuals, bins=30, edgecolor='black')
hist_ax.set_xlabel('Residuals')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Residuals')

fig.tight_layout()
plt.show()

## 4. Making Predictions

In [None]:
# Predict for new data: a single-row DataFrame whose column names must
# match the feature names the model was fitted with.
new_house = pd.DataFrame({
    'MedInc': [3.5],
    'HouseAge': [25],
    'AveRooms': [5.5],
    'AveBedrms': [1.2],
    'Population': [1500],
    'AveOccup': [3.0],
    'Latitude': [37.5],
    'Longitude': [-122.3]
})

predicted_price = model_multi.predict(new_house)

# The California housing target is expressed in units of $100,000 (see the
# scikit-learn dataset documentation), so convert to dollars before
# formatting. Printing the raw value would show a price of a few dollars.
print(f"Predicted Price: ${predicted_price[0] * 100_000:,.0f}")

## Practice Exercises

### Exercise 1
Create a model to predict car prices 
based on mileage and age.

In [None]:
# Your code here


### Exercise 2
Analyze residuals to check model assumptions.

In [None]:
# Your code here


## Key Takeaways

✅ **Linear Regression** - Predict continuous values  
✅ **R² Score** - Model fit (at most 1, higher is better; can be negative on held-out data)  
✅ **RMSE** - Prediction error  
✅ **Residuals** - Check assumptions  
✅ **Feature Importance** - Understand impact  

**Next:** [Logistic Regression](03_logistic_regression.ipynb) →