In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn for machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Apply seaborn's white-grid theme to every figure drawn in this notebook.
sns.set_style(style="whitegrid")
# Confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

---

## 1. Creating Sample Data

In [None]:
# Build a reproducible toy dataset: hours studied vs. exam score.
np.random.seed(42)

# Feature: 100 study-hour values drawn uniformly between 1 and 10.
X = np.random.uniform(low=1, high=10, size=100)

# Target: score = 5 * hours + 40, blurred by Gaussian noise (mean 0, sd 5).
# The noise is drawn after X so the random stream is consumed in a fixed order.
y = 5 * X + 40 + np.random.normal(loc=0, scale=5, size=100)

# Bundle both columns into one frame for the analysis cells that follow.
data = pd.DataFrame({'study_hours': X, 'exam_score': y})

print("Data shape:", data.shape)
data.head()

In [None]:
# Scatter of the raw observations: one semi-transparent point per row of `data`.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data['study_hours'], data['exam_score'], alpha=0.7)
ax.set_xlabel('Study Hours')
ax.set_ylabel('Exam Score')
ax.set_title('Study Hours vs Exam Score')
plt.show()

---

## 2. Exploratory Data Analysis

In [None]:
# Descriptive statistics: pandas' default summary (count, mean, std, min,
# quartiles, max) for each numeric column of `data`.
data.describe()

In [None]:
# Pairwise correlation between the columns (pandas' default method, Pearson).
correlation = data.corr()
print("Correlation:")
print(correlation)

# Show the matrix as an annotated heatmap; the diverging palette is centred on
# zero so positive and negative correlations get opposite colours.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Joint plot with regression: scatter plus fitted line for the two columns,
# drawn by seaborn with kind='reg'.
g = sns.jointplot(data=data, x='study_hours', y='exam_score', kind='reg', height=7)
# Use `JointGrid.figure` to reach the underlying matplotlib Figure; the older
# `.fig` alias is deprecated in seaborn. y=1.02 lifts the title slightly above
# the top of the figure so it does not collide with the upper axes.
g.figure.suptitle('Relationship with Regression Line', y=1.02)
plt.show()

---

## 3. Preparing Data for Model

In [None]:
# Split the frame into a feature matrix (X) and a target vector (y).
# Selecting with a list of column names keeps X two-dimensional (a one-column
# DataFrame), the layout scikit-learn expects; a single label gives a 1-D Series.
X = data.loc[:, ['study_hours']]
y = data.loc[:, 'exam_score']

print("X shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

---

## 4. Training the Model

In [None]:
# Create and train the model in one step: fit() returns the estimator itself,
# so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

print("Model trained!")

In [None]:
# Read the fitted line's parameters off the estimator. coef_ holds one entry
# per feature, so index 0 is the slope for the single study_hours column.
slope, intercept = model.coef_[0], model.intercept_

print(f"Slope (m): {slope:.4f}")
print(f"Intercept (b): {intercept:.4f}")
print(f"\nEquation: y = {slope:.2f}x + {intercept:.2f}")

---

## 5. Making Predictions

In [None]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# Side-by-side table of truth vs. prediction. `.values` strips the Series index
# so the rows line up positionally with y_pred; Difference is Actual - Predicted.
comparison = pd.DataFrame(
    {'Actual': y_test.values, 'Predicted': y_pred}
).assign(Difference=lambda frame: frame['Actual'] - frame['Predicted'])

comparison.head(10)

In [None]:
# Predict for new values
new_hours = [[3], [5], [8]]

# The model was fitted on a DataFrame with a named column. Passing a bare list
# makes scikit-learn warn that X "does not have valid feature names", so wrap
# the inputs in a DataFrame that carries the same column name.
new_X = pd.DataFrame(new_hours, columns=['study_hours'])
predictions = model.predict(new_X)

for hours, score in zip(new_hours, predictions):
    print(f"Study {hours[0]} hours → Predicted score: {score:.2f}")

---

## 6. Model Evaluation

In [None]:
# Calculate evaluation metrics on the held-out test set.

# Mean Absolute Error (MAE): average magnitude of the errors.
mae = mean_absolute_error(y_test, y_pred)

# Mean Squared Error (MSE): average of the squared errors.
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error (RMSE): square root of MSE, back in the target's units.
rmse = np.sqrt(mse)

# R-squared (Coefficient of Determination)
r2 = r2_score(y_test, y_pred)

print("=" * 40)
print("MODEL EVALUATION METRICS")
print("=" * 40)
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
# The label uses a real superscript two (U+00B2) rather than mis-encoded bytes.
print(f"R-squared (R²): {r2:.4f}")
print("=" * 40)

### Understanding Metrics:

| Metric | Description | Interpretation |
|--------|-------------|----------------|
| **MAE** | Average absolute error | Lower is better |
| **MSE** | Average squared error | Penalizes large errors |
| **RMSE** | Square root of MSE | Same unit as target |
| **R²** | Variance explained | Usually 0–1 (1 = perfect); negative if worse than predicting the mean |

---

## 7. Visualizing the Results

In [None]:
# Plot regression line with data
plt.figure(figsize=(10, 6))

# Plot training data
plt.scatter(X_train, y_train, color='blue', alpha=0.6, label='Training data')

# Plot testing data
plt.scatter(X_test, y_test, color='green', alpha=0.6, label='Testing data')

# Plot regression line: 100 evenly spaced x values across the observed range.
# They are held in a DataFrame with the training column name because the model
# was fitted on a named column; a bare ndarray would trigger scikit-learn's
# "X does not have valid feature names" warning.
x_line = pd.DataFrame({
    'study_hours': np.linspace(X['study_hours'].min(), X['study_hours'].max(), 100)
})
y_line = model.predict(x_line)
plt.plot(x_line['study_hours'], y_line, color='red', linewidth=2, label='Regression line')

plt.xlabel('Study Hours')
plt.ylabel('Exam Score')
plt.title('Linear Regression: Study Hours vs Exam Score')
plt.legend()
plt.show()

In [None]:
# Actual vs Predicted: points on the dashed diagonal are predicted exactly.
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(y_test, y_pred, alpha=0.7)

# Reference line y = x, spanning the range of the observed test targets.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', linewidth=2, label='Perfect prediction')

ax.set(xlabel='Actual Values', ylabel='Predicted Values', title='Actual vs Predicted')
ax.legend()
plt.show()

In [None]:
# Residuals are actual minus predicted, plotted against the predicted value
# with a dashed zero line for reference.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.7)
ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax.set(
    xlabel='Predicted Values',
    ylabel='Residuals (Actual - Predicted)',
    title='Residual Plot',
)
plt.show()

In [None]:
# Histogram (15 bins) of the residuals with a KDE overlay; the dashed vertical
# line marks zero error.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(residuals, kde=True, bins=15, ax=ax)
ax.set_xlabel('Residuals')
ax.set_title('Distribution of Residuals')
ax.axvline(x=0, color='r', linestyle='--')
plt.show()

---

## 8. Multiple Linear Regression

Multiple linear regression extends the same model to more than one feature.

In [None]:
# Create multi-feature data (seeded so every run produces the same rows).
np.random.seed(42)
n = 100

# The three predictors are drawn in a fixed order so the random stream is
# consumed deterministically: study hours, then sleep hours, then attendance.
multi_data = pd.DataFrame({
    'study_hours': np.random.uniform(low=1, high=10, size=n),
    'sleep_hours': np.random.uniform(low=4, high=9, size=n),
    'attendance': np.random.uniform(low=50, high=100, size=n),
})

# Target: a linear mix of all three features plus Gaussian noise (mean 0, sd 5).
noise = np.random.normal(loc=0, scale=5, size=n)
multi_data['exam_score'] = (
    3 * multi_data['study_hours']
    + 2 * multi_data['sleep_hours']
    + 0.3 * multi_data['attendance']
    + noise
)

multi_data.head()

In [None]:
# Feature matrix (three predictors) and target vector.
feature_cols = ['study_hours', 'sleep_hours', 'attendance']
X_multi = multi_data[feature_cols]
y_multi = multi_data['exam_score']

# 80/20 train/test split with a fixed seed for repeatability.
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# fit() returns the estimator, so creation and training are chained.
multi_model = LinearRegression().fit(X_train_m, y_train_m)

# One fitted coefficient per feature, listed in column order.
print("Coefficients:")
for feature, coef in zip(X_multi.columns, multi_model.coef_):
    print(f"  {feature}: {coef:.4f}")
print(f"Intercept: {multi_model.intercept_:.4f}")

In [None]:
# Evaluate the multi-feature model on its held-out test rows.
y_pred_m = multi_model.predict(X_test_m)

# The label uses a real superscript two (U+00B2) rather than mis-encoded bytes.
print(f"R² Score: {r2_score(y_test_m, y_pred_m):.4f}")
# RMSE is the square root of the mean squared error, in the target's units.
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_m, y_pred_m)):.4f}")

In [None]:
# Feature importance visualization.
# A raw coefficient is "score change per ONE UNIT of the feature", and the
# features here span different ranges (hours from 1-10 and 4-9, attendance from
# 50-100), so raw coefficients cannot be compared directly. Multiplying each
# coefficient by its feature's standard deviation (from the training rows) gives
# the score change for a one-standard-deviation change, which is comparable.
importance = pd.DataFrame({
    'Feature': X_multi.columns,
    'Coefficient': multi_model.coef_,
    'Scaled effect': multi_model.coef_ * X_train_m.std().to_numpy()
}).sort_values('Scaled effect', ascending=True)

plt.figure(figsize=(8, 5))
plt.barh(importance['Feature'], importance['Scaled effect'])
plt.xlabel('Coefficient × feature std (score change per 1 SD)')
plt.title('Feature Importance')
plt.axvline(x=0, color='gray', linestyle='--')
plt.show()

---

## 9. Using Real Dataset

Let's use the tips dataset for a practical example.

In [None]:
# Load tips dataset (one of seaborn's example datasets) into a DataFrame.
# NOTE(review): load_dataset may need network access when the file is not
# already cached locally — confirm for offline environments.
tips = sns.load_dataset('tips')
tips.head()

In [None]:
# Predict tip from total_bill (single-feature regression).
X_tips = tips[['total_bill']]
y_tips = tips['tip']

# Split: 80/20 with a fixed seed for repeatability.
X_tr, X_te, y_tr, y_te = train_test_split(X_tips, y_tips, test_size=0.2, random_state=42)

# Train
tip_model = LinearRegression()
tip_model.fit(X_tr, y_tr)

print(f"Equation: tip = {tip_model.coef_[0]:.4f} * total_bill + {tip_model.intercept_:.4f}")

# Evaluate on the held-out rows.
y_pred_tips = tip_model.predict(X_te)
# The label uses a real superscript two (U+00B2) rather than mis-encoded bytes.
print(f"\nR² Score: {r2_score(y_te, y_pred_tips):.4f}")

In [None]:
# Seaborn draws the scatter and its own fitted regression line in one call;
# the points are semi-transparent and the line is red.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=tips,
    x='total_bill',
    y='tip',
    scatter_kws=dict(alpha=0.5),
    line_kws=dict(color='red'),
    ax=ax,
)
ax.set_title('Tip Prediction from Total Bill')
plt.show()

---

## 📝 Practice Problems

### Problem 1: Simple Linear Regression
Create data for house prices:
- X: House size (1000-3000 sq ft)
- y: Price = 150 * size + 50000 + noise

1. Generate 100 samples
2. Train a linear regression model
3. Calculate R² and RMSE
4. Visualize the regression line

In [None]:
# Your solution here

### Problem 2: Iris Regression
Using the iris dataset:
1. Predict petal_width from petal_length
2. Evaluate the model with R² and RMSE
3. Create actual vs predicted plot

In [None]:
# Your solution here

### Problem 3: Multiple Regression
Using the tips dataset:
1. Predict tip from total_bill and size
2. Display feature coefficients
3. Which feature has more impact on tip?

In [None]:
# Your solution here

---

## ✅ Solutions

### Solution 1: Simple Linear Regression

In [None]:
# Solution 1: simple linear regression on synthetic house prices.
np.random.seed(42)

# Generate data: price = 150 * size + 50000 + Gaussian noise (sd 20000).
size = np.random.uniform(1000, 3000, 100)
price = 150 * size + 50000 + np.random.normal(0, 20000, 100)

house_data = pd.DataFrame({'size': size, 'price': price})

# Prepare data
X = house_data[['size']]
y = house_data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Equation: price = {model.coef_[0]:.2f} * size + {model.intercept_:.2f}")
# The label uses a real superscript two (U+00B2) rather than mis-encoded bytes.
print(f"R² Score: {r2:.4f}")
print(f"RMSE: ${rmse:,.2f}")

# Visualize
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.6, label='Data')
# The x values for the line are held in a DataFrame with the training column
# name: the model was fitted on a named column, and a bare ndarray would trigger
# scikit-learn's "X does not have valid feature names" warning.
x_line = pd.DataFrame({'size': np.linspace(1000, 3000, 100)})
plt.plot(x_line['size'], model.predict(x_line), 'r-', linewidth=2, label='Regression line')
plt.xlabel('House Size (sq ft)')
plt.ylabel('Price ($)')
plt.title('House Price Prediction')
plt.legend()
plt.show()

### Solution 2: Iris Regression

In [None]:
# Solution 2: predict petal_width from petal_length on the iris dataset.
iris = sns.load_dataset('iris')

# Prepare data
X = iris[['petal_length']]
y = iris['petal_width']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# The label uses a real superscript two (U+00B2) rather than mis-encoded bytes.
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

# Two-panel figure: regression fit on the left, actual vs predicted on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Regression plot (seaborn fits and draws its own line on the full dataset).
sns.regplot(data=iris, x='petal_length', y='petal_width', ax=axes[0])
axes[0].set_title('Petal Length vs Width')

# Actual vs Predicted; the dashed diagonal marks perfect predictions.
axes[1].scatter(y_test, y_pred, alpha=0.7)
axes[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
axes[1].set_xlabel('Actual')
axes[1].set_ylabel('Predicted')
axes[1].set_title('Actual vs Predicted')

plt.tight_layout()
plt.show()

### Solution 3: Multiple Regression

In [None]:
# Solution 3: predict tip from total_bill and size, then compare the features.
tips = sns.load_dataset('tips')

# Prepare data
X = tips[['total_bill', 'size']]
y = tips['tip']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Results
print("Feature Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
# The label uses a real superscript two (U+00B2) rather than mis-encoded bytes.
print(f"\nR² Score: {r2_score(y_test, y_pred):.4f}")

# Visualize importance.
# A raw coefficient is "tip change per ONE UNIT of the feature", and the two
# features are measured in different units, so raw coefficients are not directly
# comparable. Multiplying each coefficient by its feature's standard deviation
# (from the training rows) gives the tip change for a one-standard-deviation
# change, which puts both features on the same footing.
importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_,
    'Scaled effect': model.coef_ * X_train.std().to_numpy()
}).sort_values('Scaled effect')

plt.figure(figsize=(8, 4))
plt.barh(importance['Feature'], importance['Scaled effect'])
plt.xlabel('Coefficient × feature std (tip change per 1 SD)')
plt.title('Feature Importance (Higher = More Impact on Tip)')
plt.show()

# Derive the conclusion from the fitted model instead of hard-coding a feature
# name, so the message cannot contradict the numbers printed above.
strongest = importance.loc[importance['Scaled effect'].abs().idxmax(), 'Feature']
print(f"\nConclusion: {strongest} has the largest scale-adjusted effect, so it has more impact on tip.")

---

## 📌 Summary

### Linear Regression Workflow:

| Step | Code |
|------|------|
| Import | `from sklearn.linear_model import LinearRegression` |
| Prepare data | `X = df[['feature']]`, `y = df['target']` |
| Split | `train_test_split(X, y, test_size=0.2)` |
| Create model | `model = LinearRegression()` |
| Train | `model.fit(X_train, y_train)` |
| Predict | `y_pred = model.predict(X_test)` |
| Evaluate | `r2_score()`, `mean_squared_error()` |

### Model Attributes:
- `model.coef_` - Slope(s)
- `model.intercept_` - Y-intercept

### Evaluation Metrics:
- **R²** closer to 1 = better fit
- **RMSE** lower = better predictions

---

🎉 **Congratulations!** You've completed the Session 4 Notes series!