# 1. Iris Flower Classification — Multiclass classification basics

**Goal:** Train a simple classifier on the Iris dataset and show metrics & a confusion matrix.

In [None]:

# Iris classification (multiclass) - scikit-learn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt

# Load Iris with as_frame=True and separate the features from the target.
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

# Hold out 25% of the rows; stratify so the class mix matches in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Seeded 100-tree random forest, fitted and then applied to the held-out rows.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
pred = clf.predict(X_test)

# Text summary: per-class metrics followed by the raw confusion matrix.
report = classification_report(y_test, pred, target_names=iris.target_names)
print(report)
cm = confusion_matrix(y_test, pred)
print("Confusion matrix:\n", cm)

# Draw the same matrix as an image with class names on both axes.
class_names = iris.target_names
tick_positions = range(len(class_names))

plt.figure(figsize=(4, 4))
plt.imshow(cm, interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.xticks(tick_positions, class_names, rotation=45)
plt.yticks(tick_positions, class_names)
plt.tight_layout()
plt.show()


# 2. House Price Prediction — Regression + feature engineering

**Goal:** Use the California housing dataset with some simple feature engineering, train a RandomForestRegressor, and report the RMSE.

In [None]:

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

data = fetch_california_housing(as_frame=True)
# Work on a copy so the engineered columns are not added to `data.frame` itself.
df = data.frame.copy()

# Simple feature engineering.
# scikit-learn's copy of this dataset has no 'Households' column, and
# 'AveRooms' / 'AveOccup' are already per-household averages, so the extra
# features are ratios built from columns that actually exist.
df['bedrooms_per_room'] = df['AveBedrms'] / df['AveRooms']
df['rooms_per_person'] = df['AveRooms'] / df['AveOccup']
X = df.drop(columns=['MedHouseVal'])
y = df['MedHouseVal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Take the square root explicitly: the `squared=False` shortcut was deprecated
# and later removed from mean_squared_error, while this form works on every
# scikit-learn release.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print(f"RMSE: {rmse:.4f}")


# 3. Waiter Tips Prediction — Linear regression on small tabular data

**Goal:** Predict tip amount using basic features; inspect the fitted linear-regression coefficients for interpretability.

In [None]:

# Imports are listed here so the cell runs on its own, without relying on
# names (`pd`, `np`) left behind by earlier cells.
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

tips = sns.load_dataset('tips')
display(tips.head())

# Encode categorical variables simply
# Cast after mapping so 'sex' is a plain integer column regardless of the
# dtype seaborn loaded it with.
tips['sex'] = tips['sex'].map({'Male':0,'Female':1}).astype(int)
tips = pd.get_dummies(tips, columns=['smoker','day','time'], drop_first=True)

X = tips.drop(columns=['tip'])
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print("MAE:", mean_absolute_error(y_test, pred))
# Explicit square root instead of the removed `squared=False` argument.
print("RMSE:", np.sqrt(mean_squared_error(y_test, pred)))

# show coefficients, largest magnitude first
coef_df = pd.DataFrame({'feature': X.columns, 'coef': lr.coef_}).sort_values(by='coef', key=abs, ascending=False)
display(coef_df.head(10))


# 4. Breast Cancer Classification — Binary classification

**Goal:** Train a classifier on the breast cancer dataset and show ROC AUC.

In [None]:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# Load the dataset and keep a stratified quarter of it aside for evaluation.
bc = load_breast_cancer(as_frame=True)
X, y = bc.data, bc.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Gradient boosting with library defaults, seeded for reproducibility.
clf = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Hard labels feed accuracy; the predicted probability of class 1 feeds ROC AUC.
pred = clf.predict(X_test)
pred_proba = clf.predict_proba(X_test)[:, 1]

print('Accuracy:', accuracy_score(y_test, pred))
print('ROC AUC:', roc_auc_score(y_test, pred_proba))


# 5. Student Exam Score Predictor — Simple regression with few features

**Goal:** Create a small synthetic dataset of study hours and attendance to predict exam score.

In [None]:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# synthetic data (global seed kept so the generated sample is reproducible)
np.random.seed(42)
n = 150
study_hours = np.random.normal(5, 2, n).clip(0)  # negative draws are floored at 0 hours
attendance = (np.random.rand(n) * 20 + 80).clip(60,100)  # percent; rand*20+80 already lies in [80, 100)
# Linear ground truth plus Gaussian noise (standard deviation 5).
scores = 50 + 6*study_hours + 0.2*attendance + np.random.normal(0,5,n)

df = pd.DataFrame({'study_hours': study_hours, 'attendance': attendance, 'score': scores})
display(df.head())

X = df[['study_hours','attendance']]
y = df['score']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print('R2:', r2_score(y_test, pred))
# Explicit square root: `squared=False` was deprecated and then removed from
# mean_squared_error, so this form is the portable way to get RMSE.
print('RMSE:', np.sqrt(mean_squared_error(y_test, pred)))


# 6. Diabetes Prediction — Logistic regression

**Goal:** Build a binary classifier (logistic regression) on a synthetic 'diabetes' dataset.

In [None]:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# Synthetic stand-in for a diabetes table: 800 rows, 8 features, and
# requested class weights of 0.6 / 0.4.
X, y = make_classification(
    n_samples=800,
    n_features=8,
    n_informative=5,
    n_redundant=1,
    weights=[0.6, 0.4],
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Logistic regression with a raised iteration cap.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class-1 probabilities drive ROC AUC; hard labels drive the report.
pred_proba = clf.predict_proba(X_test)[:, 1]
pred = clf.predict(X_test)

report = classification_report(y_test, pred)
print(report)
print('ROC AUC:', roc_auc_score(y_test, pred_proba))


# 7. Handwritten Digit Recognition — Precursor to a first CNN project (simple MLP on the sklearn digits dataset)

**Goal:** Train a simple classifier on the small digits dataset (8x8 images). For a full MNIST CNN, GPU and longer time are needed; here we keep it lightweight.

In [None]:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Flatten every image into one feature row so the MLP receives a 2-D matrix.
digits = load_digits()
n_images = digits.images.shape[0]
X = digits.images.reshape(n_images, -1)
y = digits.target

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# One hidden layer of 128 units, at most 500 training iterations.
mlp = MLPClassifier(hidden_layer_sizes=(128,), max_iter=500, random_state=42).fit(X_train, y_train)
pred = mlp.predict(X_test)

accuracy = accuracy_score(y_test, pred)
print('Accuracy:', accuracy)


# 8. SMS Spam Detection — NLP + TF-IDF

**Goal:** Build a simple spam detector using a tiny sample SMS dataset embedded in the notebook.

In [None]:

# Imports are listed here so the cell runs on its own, without relying on
# `pd` left behind by an earlier cell.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# tiny SMS dataset for demonstration
data = [
    ('ham', "I'm going to be late, stuck in traffic"),
    ('spam', 'WINNER!! You have won a free ticket. Reply CLAIM to get it'),
    ('ham', 'Can we meet tomorrow?'),
    ('spam', 'Congratulations, you won $1000! Call now'),
    ('ham', 'Please send the assignment by tonight'),
    ('spam', 'URGENT! Your account has been compromised. Click http://fake.link'),
    ('ham', 'Happy birthday!'),
    ('spam', 'Free entry in 2 a weekly competition to win FA Cup finals tickets'),
]

df = pd.DataFrame(data, columns=['label','text'])
display(df)

# Split the RAW text first. Fitting TF-IDF on the full corpus before splitting
# would let test messages shape the vocabulary and IDF weights (data leakage).
X = df['text']
y = (df['label']=='spam').astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, test_size=0.3)

# Vectoriser and classifier are fitted together on the training messages only;
# the pipeline re-applies the fitted vectoriser when predicting.
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
clf = make_pipeline(vect, LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# With so few test messages a class may never be predicted; report 0 for the
# undefined metrics instead of emitting a warning.
print(classification_report(y_test, pred, zero_division=0))


# 9. Salary Prediction — Regression + categorical encoding

**Goal:** Create a small synthetic salary dataset including categorical features and train a model with one-hot encoding.

In [None]:

# Imports are listed here so the cell runs on its own, without relying on
# `pd` left behind by an earlier cell.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# small synthetic salary dataset
data = [
    {'title':'Software Engineer','experience':2,'location':'Bengaluru','salary':800000},
    {'title':'Senior Software Engineer','experience':6,'location':'Bengaluru','salary':1800000},
    {'title':'Data Scientist','experience':3,'location':'Mumbai','salary':1200000},
    {'title':'Software Engineer','experience':4,'location':'Mumbai','salary':1100000},
    {'title':'Manager','experience':8,'location':'Delhi','salary':2000000},
    {'title':'Data Analyst','experience':1,'location':'Bengaluru','salary':600000},
    {'title':'Software Engineer','experience':5,'location':'Delhi','salary':1400000},
    {'title':'Senior Software Engineer','experience':7,'location':'Mumbai','salary':2000000},
    {'title':'Data Scientist','experience':2,'location':'Delhi','salary':900000},
    {'title':'Data Analyst','experience':3,'location':'Bengaluru','salary':750000},
]

df = pd.DataFrame(data)
display(df)

X = df.drop(columns=['salary'])
y = df['salary']

# One-hot encode the text columns; 'experience' passes through unchanged.
# handle_unknown='ignore' matters on a table this small: a title or location
# that only lands in the test split would otherwise make predict() raise.
cat_cols = ['title','location']
preproc = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    remainder='passthrough',
)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Preprocessing lives inside the pipeline so the encoder is fitted on the
# training rows only.
model = make_pipeline(preproc, RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('MAE:', mean_absolute_error(y_test, pred))
