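# Scikit-Learn Notebook

A short tour of core scikit-learn workflows: the estimator API, feature scaling, combined categorical/numeric preprocessing, a regression example, and cross-validation.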

In [7]:
!pip install scikit-learn



In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import accuracy_score, mean_squared_error

## 1) Estimator API workflow

In [10]:
# Generate a 300-sample, 4-feature synthetic classification dataset
# (3 informative features, no redundant ones), seeded for reproducibility.
X_cls, y_cls = make_classification(
    n_samples=300,
    n_features=4,
    n_informative=3,
    n_redundant=0,
    random_state=42,
)

# Hold out 20% of the rows as a test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the resulting arrays as a quick sanity check.
(X_train.shape, X_test.shape, y_train.shape)

((240, 4), (60, 4), (240,))

In [11]:
# Fit a logistic-regression classifier on the unscaled training split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Predict on the held-out split and report the fraction of correct labels.
preds = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.9

## 2) Preprocessing: scaling

In [12]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so test data never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Per-column means of the scaled training data.
X_train_scaled.mean(axis=0)

array([-2.04234777e-16,  2.59052039e-17,  6.01370805e-18, -1.97989773e-16])

In [13]:
# Same classifier configuration as before, but trained on the scaled features
# so its accuracy can be compared with the unscaled model.
clf_scaled = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)

# Evaluate on the test split transformed with the training-fitted scaler.
scaled_preds = clf_scaled.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=scaled_preds)

0.9166666666666666

## 3) Encoding categorical + numeric together

In [14]:
# Tiny hand-written frame mixing two numeric columns, one string column,
# and a 0/1 label column.
df_mix = pd.DataFrame(
    {
        "num1": [1.0, 2.5, 3.2, 0.7],
        "num2": [10, 20, 10, 30],
        "color": ["red", "blue", "red", "green"],
        "label": [0, 1, 0, 1],
    }
)

# Declare which columns get which treatment, then build X from those lists
# so the feature selection and the transformer routing cannot drift apart.
numeric_cols = ["num1", "num2"]
categorical_cols = ["color"]
X = df_mix[numeric_cols + categorical_cols]
y = df_mix["label"]

# One single-step pipeline per column family.
numeric_pipe = Pipeline(steps=[("scaler", StandardScaler())])
categorical_pipe = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Route each column list to its own preprocessing pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_cols),
        ("cat", categorical_pipe, categorical_cols),
    ]
)

# Full model: preprocessing followed by logistic regression.
clf_pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", LogisticRegression(max_iter=500)),
    ]
)

# Note: predictions below are made on the same rows the pipeline was fit on,
# so they demonstrate the API rather than measure generalization.
clf_pipe.fit(X, y)
clf_pipe.predict(X)

array([0, 1, 0, 1])

## 4) Regression example

In [15]:
# Synthetic regression problem: 200 samples, 3 features, noise=5.0, seeded.
X_reg, y_reg = make_regression(
    n_samples=200,
    n_features=3,
    noise=5.0,
    random_state=42,
)

# 80/20 train/test split with a fixed seed.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training split and predict the test split.
reg = LinearRegression().fit(Xr_train, yr_train)
reg_preds = reg.predict(Xr_test)

# RMSE: square root of the mean squared error on the held-out split.
mean_squared_error(y_true=yr_test, y_pred=reg_preds) ** 0.5

5.758118776795974

## 5) Cross-validation

In [16]:
# 5-fold cross-validation of a fresh logistic-regression model on the full
# classification dataset; one score is produced per fold.
cv_scores = cross_val_score(
    estimator=LogisticRegression(max_iter=500),
    X=X_cls,
    y=y_cls,
    cv=5,
)

# Average score across the folds.
cv_scores.mean()

np.float64(0.8466666666666667)