# Use of validation set with cross validation demo

In [1]:
# Load Python libraries
import pandas as pd
import numpy as np

In [2]:
# Load dataset and display the first several data samples.
df = pd.read_csv("customer-behaviour.csv")
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


This is a dataset for predicting whether a user purchases a specific product.
- Gender: Customer gender (categorical: male, female)
- Age: Customer age (numeric)
- EstimatedSalary: Estimated customer salary (numeric)
- Purchased (Label): whether the customer has purchased the product (categorical: 0 (no), 1 (yes)).

In [3]:
# Get some basic data information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
# Drop "User ID" column as it does not contain useful information for building the model
df = df.drop(columns=["User ID"], axis=1)

In [5]:
# Show the dataframe
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
...,...,...,...,...
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0


# Model Training

In [6]:
# Make a data copy
df_copy = df.copy()

In [7]:
# The Pandas get_dummies function creates dummy variables from Pandas objects in Python,
# i.e., a dummy variable is a numeric variable that encodes categorical information.
# this is also sometimes referred to as “one-hot” encoding of categorical data.
data = pd.get_dummies(data=df_copy, columns=["Gender"])

# Show the data with dummy variables
data

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,19,19000,0,0,1
1,35,20000,0,0,1
2,26,43000,0,1,0
3,27,57000,0,1,0
4,19,76000,0,0,1
...,...,...,...,...,...
395,46,41000,1,1,0
396,51,23000,1,0,1
397,50,20000,1,1,0
398,36,33000,0,0,1


In [8]:
feature_names = data.columns.tolist()     # Get the list of data features from column names
feature_names.remove("Purchased")         # Remove the label column "Purchased" from the data features
X = data[feature_names].values            # Assign data feature values to variable X

y = data.Purchased.values                 # Assign data label values to variable y

In [9]:
# Show data feature shape
X.shape

(400, 4)

In [10]:
# Show label shape
y.shape

(400,)

In [11]:
# Split the data into train/test set using sklearn library
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # Training is 1-0.2=0.8, testing is 0.2
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1) # Training is now 0.8*0.8=0.64, validation is now 0.8*0.2=0.16, testing is still 0.2.

In [12]:
# Standardize the data using Standard scaler
from sklearn.preprocessing import StandardScaler
normalizer = StandardScaler()
X_normal_train = normalizer.fit_transform(X_train)     # Note that we use fit_transform() on training data so that it can learn the scaling parameters of that data.
X_normal_val = normalizer.fit_transform(X_val)         # But we only transform() the validation data using the learned scaling parameters.
X_normal_test = normalizer.transform(X_test)           # But we only transform() the test data using the learned scaling parameters.

# Manual use of the validation set

Let say we try to build some Logistic Regression models with different values of C (a hyperparameter of Logistic Regression), and see which value of C gives the best result on the validation set. C is the inverse of regularization strength; must be a positive float. Smaller values of C specify stronger regularization. Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html


In [13]:
from sklearn.linear_model import LogisticRegression

# First try with C=0.01
logmodel = LogisticRegression(C=0.01)                # Initialize Logistic Regression model
logmodel.fit(X_normal_train, y_train)                # Train the model
logmodel.score(X_normal_val, y_val)                  # Validation acciracy of the hyperparameter C=0.01

0.640625

In [14]:
# Second try with C=0.1
logmodel = LogisticRegression(C=0.1)                # Initialize Logistic Regression model
logmodel.fit(X_normal_train, y_train)               # Train the model
logmodel.score(X_normal_val, y_val)                 # Validation accuracy of the hyperparameter C=0.1

0.75

In [15]:
# Third try with C=1
logmodel = LogisticRegression(C=1)                # Initialize Logistic Regression model
logmodel.fit(X_normal_train, y_train)             # Train the model
logmodel.score(X_normal_val, y_val)               # Validation accuacy of the hyperparameter C=1

0.765625

In [16]:
# We can see that C=1 gives the best validation accuracy among the choices of C=0.01, C=0.1 and C=1.
# Now take C=1 and test it on the test set to double check the accuracy
logmodel = LogisticRegression(C=1)                # Initialize Logistic Regression model
logmodel.fit(X_normal_train, y_train)             # Train the model
logmodel.score(X_normal_test, y_test)             # Test accuracy of the 'best' hyperparameter C=1

0.825

In [17]:
# We see that the validation set accuracy (0.765625) is not really close to the test accuracy (0.825). This is a signal of unreliability since we only validate the model once.

# Automatic use of the validation set with GridSearchCV, together with the cross-validation to reliably evaluate the model accuracy

In [20]:
# Load the libraries
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

grid_search={"C":[0.01, 0.1, 1]} # Define the values of hyperparameter C we want to try
logmodel=LogisticRegression() # Initialize the logistic regression model
logmodel_cv=GridSearchCV(logmodel, grid_search, cv=5) # Set up GridSearchCV to find the best value of hyperparameter C, with 5-fold cross validation, i.e., cv=5.
logreg_cv = logmodel_cv.fit(X_normal_train, y_train) # Train the model

In [21]:
logreg_cv.best_params_ # Show the best value of C

{'C': 1}

In [1]:
logreg_cv.best_score_ # Show the model performance with the best value of C

NameError: name 'logreg_cv' is not defined

In [None]:
# Now take the best hyperparameter (C=1) and test it on the test set to double check the accuracy
logmodel = LogisticRegression(C=logreg_cv.best_params_['C'])                # Initialize Logistic Regression model with the best value of hyper parameter C
logmodel.fit(X_normal_train, y_train)             # Train the model
logmodel.score(X_normal_test, y_test)             # Test accuracy of the 'best' hyperparameter C=1

In [None]:
# We see that now the validation set accuracy (0.84788) is much closer to the test accuracy (0.825). This is a signal of reliability thanks to the cross-validation.