## Using scikit-learn for classification

In [1]:
import pandas as pd
import numpy as np

# Read the tab-separated voting records. Lines 1 and 2 of the file are skipped
# (presumably the type/flag rows of an Orange-style .tab file -- confirm).
data = pd.read_table("../data/voting.tab", skiprows=[1, 2])

# Features: every column except the party label, as a plain array.
X = data.drop(columns=["party"]).values
# Target: the party label of each record.
y = data.loc[:, "party"].values

##### Answer 8-1-1

In [2]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Compare how each SimpleImputer strategy affects a logistic-regression fit
# on the voting data (`X`, `y` come from the loading cell).
LR = LogisticRegression(max_iter=1000)
enc = preprocessing.OrdinalEncoder()

# Encode into a new, stage-named array instead of rebinding `X`, so the raw
# features from the loading cell stay available and re-running this cell
# always starts from the same input.
X_encoded = enc.fit_transform(X)

strategies = ["mean", "median", "most_frequent", "constant"]
for strategy in strategies:
    # fill_value is only used by the "constant" strategy; 2 is presumably
    # chosen to sit outside the encoder's codes for the answers -- confirm.
    imp = SimpleImputer(strategy=strategy, fill_value=2)
    X_transformed = imp.fit_transform(X_encoded)
    print(f"Strategy: {strategy}")

    # fit() returns the estimator itself, so `model` is `LR` refitted on the
    # data imputed with the current strategy.
    model = LR.fit(X_transformed, y)

    # NOTE: this is accuracy on the same rows the model was trained on
    # (in-sample), not a held-out or cross-validated estimate.
    n_correct = (model.predict(X_transformed) == y).sum()
    print("%.1f" % (n_correct / len(y) * 100) + "%")

Strategy: mean
97.2%
Strategy: median
96.8%
Strategy: most_frequent
96.8%
Strategy: constant
96.3%


##### Answer 8-1-2

In [3]:
# Show the record at positional index 6 in full; it is an example of a row
# with a missing answer (NaN) among its votes.
data.iloc[6, :]

party                                     democrat
handicapped-infants                              n
water-project-cost-sharing                       y
adoption-of-the-budget-resolution                n
physician-fee-freeze                             y
el-salvador-aid                                  y
religious-groups-in-schools                      y
anti-satellite-test-ban                          n
aid-to-nicaraguan-contras                        n
mx-missile                                       n
immigration                                      n
synfuels-corporation-cutback                     n
education-spending                               n
superfund-right-to-sue                         NaN
crime                                            y
duty-free-exports                                y
export-administration-act-south-africa           y
Name: 6, dtype: object

In [4]:
# Long format: one row per (record, vote) pair, keeping the party label.
df_long = pd.melt(data, id_vars="party", var_name="vote", value_name="value")

# Keep only explicit yes/no answers; anything else (e.g. missing) is dropped.
answered = df_long.loc[df_long["value"].isin(["y", "n"])]

# Count answers per vote and party, then spread the answer into columns
# (n / y), filling combinations that never occur with 0.
tally = answered.groupby(["vote", "party", "value"]).size()
counts = tally.unstack(fill_value=0)
counts

Unnamed: 0_level_0,value,n,y
vote,party,Unnamed: 2_level_1,Unnamed: 3_level_1
adoption-of-the-budget-resolution,democrat,29,231
adoption-of-the-budget-resolution,republican,142,22
aid-to-nicaraguan-contras,democrat,45,218
aid-to-nicaraguan-contras,republican,133,24
anti-satellite-test-ban,democrat,59,200
anti-satellite-test-ban,republican,123,39
crime,democrat,167,90
crime,republican,3,158
duty-free-exports,democrat,91,160
duty-free-exports,republican,142,14


##### Answer 8-1-3

In [5]:
from sklearn.datasets import fetch_california_housing
# Load the California housing data as pandas objects. Note that `X` and `y`
# are rebound here: from this cell on they refer to the housing features and
# target, no longer to the voting data.
housing = fetch_california_housing(as_frame=True)
X = housing.data
y = housing.target
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
# Preview the first five target values.
y.iloc[:5]

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

# Processing chain, applied in order: median imputation -> standardisation ->
# PCA -> ridge regression. Step names are the prefixes used in `param_grid`.
steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("model", Ridge()),
]
pipe = Pipeline(steps=steps)

# Hyper-parameters to search, addressed as "<step name>__<parameter>".
# NOTE(review): n_components=None keeps all components, which for the
# 8-column housing frame looks equivalent to the explicit 8 -- confirm
# whether both entries are wanted.
param_grid = {
    "pca__n_components": [None, 2, 4, 6, 8],
    "model__alpha": [0.1, 1.0, 10.0, 100.0],
}


In [8]:
from sklearn.model_selection import GridSearchCV

# Search settings: 5-fold cross-validation, scored by negative mean squared
# error (so larger is better), with n_jobs=-1 to run fits in parallel.
search_settings = dict(
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)
gscv = GridSearchCV(pipe, param_grid, **search_settings)

# Run the exhaustive search over `param_grid` on the housing data.
gscv.fit(X, y)

In [9]:
# Report the winning parameter combination.
print("Best parameters:", gscv.best_params_)

# The search was scored with *negative* MSE, so flip the sign first and then
# take the square root to express the error as an RMSE.
best_mse = -gscv.best_score_
best_rmse = best_mse ** 0.5
print(f"Best CV RMSE: {best_rmse:.3f}")

Best parameters: {'model__alpha': 10.0, 'pca__n_components': None}
Best CV RMSE: 0.747
