# Review of Supervised Learning with scikit-learn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
# When an estimator is printed, show only the parameters that differ from their defaults.
sklearn.set_config(print_changed_only=True)

In [2]:
# Load the bank campaign table into a DataFrame.
# The columns are described in data/bank-campaign-desc.txt
data = pd.read_csv(filepath_or_buffer="data/bank-campaign.csv")

In [3]:
# (number of rows, number of columns) of the loaded table, target column included
data.shape

(41188, 64)

In [4]:
# All column names: the features (the categorical ones appear to be one-hot encoded) plus the target
data.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oc

In [5]:
# Peek at the first five rows
data.head(n=5)

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,target
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,no


In [6]:
# The label we want to predict
y = data["target"]

In [7]:
# Feature matrix: every column except the label
X = data.drop(columns=["target"])

In [8]:
# Same rows as `data`, one column fewer (the target was dropped)
X.shape

(41188, 63)

In [9]:
# One label per row of X
y.shape

(41188,)

In [10]:
# First five labels (string class names)
y.head(n=5)

0    no
1    no
2    no
3    no
4    no
Name: target, dtype: object

In [11]:
# Absolute number of samples in each class
data["target"].value_counts()

no     36548
yes     4640
Name: target, dtype: int64

In [12]:
# Class frequencies as fractions: the classes are imbalanced
data["target"].value_counts(normalize=True)

no     0.887346
yes    0.112654
Name: target, dtype: float64

Splitting the data:

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [14]:
# Fraction of "yes" labels in the training set
(y_train == "yes").mean()

0.11265553869499241

In [15]:
# Fraction of "yes" labels in the test set
(y_test == "yes").mean()

0.11264870114105366

In [16]:
# import the model class
from sklearn.linear_model import LogisticRegression

# Create the classifier and train it in one step (``fit`` returns the estimator).
# C is the inverse regularization strength; max_iter caps the solver's iterations.
lr = LogisticRegression(max_iter=1000, C=0.1).fit(X_train, y_train)
# learned coefficients of the fitted model
lr.coef_

array([[ 5.97245350e-03,  4.56683402e-03, -3.42531789e-02,
        -1.75264447e-03, -2.18195118e-01, -2.54120282e-01,
         3.45453359e-01,  2.45073240e-02, -1.63852852e-01,
        -6.48849214e-03,  9.06193952e-02, -2.23165326e-01,
        -2.83147271e-02, -7.91693983e-03,  1.49883074e-02,
         1.10829492e-01, -1.19468481e-02, -6.17478439e-02,
         6.41649715e-02,  3.93010294e-02,  1.17358459e-02,
         4.34687934e-03, -2.31118654e-02, -8.95133643e-02,
         1.14047208e-01,  1.47225663e-03, -3.96814852e-02,
        -1.75291635e-02, -1.13858841e-01, -5.52214550e-02,
         2.36134084e-03,  1.63971984e-02,  1.75045418e-01,
         3.53812216e-02,  1.74191935e-01, -1.71198693e-01,
        -9.90073013e-05,  1.09402625e-02,  4.82432708e-03,
        -1.28703546e-02,  1.32664737e-02,  4.82432708e-03,
        -1.51965658e-02,  2.46280300e-01, -2.43386065e-01,
         2.91937916e-02,  1.31304214e-01,  8.53210913e-03,
         1.84393744e-01,  1.40868335e-01,  2.24235745e-0

Evaluate the model: compare its accuracy on the training and test sets with the majority-class baseline:

In [17]:
# Mean accuracy of the fitted classifier on the training data
lr.score(X_train, y_train)

0.9095295902883156

In [18]:
# Baseline: share of "no" labels in the training set, i.e. the accuracy of always predicting "no"
(y_train == "no").mean()

0.8873444613050075

In [19]:
# Mean accuracy of the fitted classifier on the held-out test data
lr.score(X_test, y_test)

0.9128429230395727

# Exercise
Load the dataset ``data/bike_day_raw.csv``, which has the regression target ``cnt``.
This dataset contains daily bike rental counts from the citybike platform. The ``cnt`` column is the number of rentals, which we want to predict from date and weather data.

Split the data into a training and a test set using ``train_test_split``.
Use the ``LinearRegression`` class to learn a regression model on this data. You can evaluate it with the ``score`` method, which returns the $R^2$, or with the ``mean_squared_error`` function from ``sklearn.metrics`` (or write it yourself in numpy).

In [None]:
# %load solutions/bike_regression.py