In [1]:
# Standardization exercise on the 'Wholesale customers data.csv' dataset.
# The file records annual spending (in monetary units) of wholesale clients
# across product categories such as Fresh, Milk and Grocery.
# Plan: read the CSV with pandas, then rescale every numerical feature with a
# standard scaler so that each one has mean 0 and standard deviation 1.
# Putting features on a common scale keeps distance-based models (e.g., KNN, SVM)
# from being dominated by whichever columns have the largest magnitudes.

import pandas as pd

# Raw CSV served from the TrainingByPackt/Data-Science-with-Python GitHub repository.
dataset = 'https://raw.githubusercontent.com/TrainingByPackt/Data-Science-with-Python/refs/heads/master/Chapter01/Data/Wholesale%20customers%20data.csv'

# header=0: the first line of the file supplies the column names.
df = pd.read_csv(filepath_or_buffer=dataset, header=0)

In [2]:
# Integer-coded categorical columns in this dataset:
#   Channel - type of customer
#       1 = Horeca (Hotel/Restaurant/Café)
#       2 = Retail
#   Region - geographical region of the customer
#       1 = Lisbon
#       2 = Oporto
#       3 = Other Region

# Preview only the first ten rows instead of rendering the whole frame;
# a bounded preview keeps the saved notebook output small and readable.
df.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
5,2,3,9413,8259,5126,666,1795,1451
6,2,3,12126,3199,6975,480,3140,545
7,2,3,7579,4956,9426,1669,3321,2566
8,1,3,5963,3648,6192,425,1716,750
9,2,3,6006,11093,18881,1159,7425,2098


In [3]:
# Sanity check before preprocessing: look for missing values and confirm dtypes.
# NaN entries would propagate through scaling and can hurt model performance.
# For this dataset the summary reports Null == False and dtype int64 for every
# column, so no df.dropna() step is required.

dtypes = df.dtypes           # dtype of each column
null_ = df.isnull().any()    # True for a column that contains at least one NaN

# Side-by-side summary, one row per column of df; the dict keys become the
# headers of the summary table.
info = pd.concat({'Null': null_, 'type': dtypes}, axis=1)
print(info)

                   Null   type
Channel           False  int64
Region            False  int64
Fresh             False  int64
Milk              False  int64
Grocery           False  int64
Frozen            False  int64
Detergents_Paper  False  int64
Delicassen        False  int64


In [4]:
# Standardize the dataset with scikit-learn's StandardScaler.
# Every column is centered to mean 0 and scaled to unit standard deviation, so
# features with larger magnitudes do not dominate model training. This matters
# for models that rely on distances or on gradient-based optimization.
#
# NOTE(review): Channel and Region are integer-coded categories, yet they are
# standardized here like continuous features. This is kept to match the
# exercise; consider excluding or one-hot encoding them for real modelling.

from sklearn import preprocessing

# By default fit_transform returns a plain array without row/column labels,
# so the labels are re-attached when the DataFrame is rebuilt below.
std_scale = preprocessing.StandardScaler().fit_transform(df)

# Carry over df's index as well as its column names so that scaled rows stay
# aligned with the original rows even if df is filtered or re-indexed upstream.
scaled_frame = pd.DataFrame(std_scale, index=df.index, columns=df.columns)
scaled_frame.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,1.448652,0.590668,0.052933,0.523568,-0.041115,-0.589367,-0.043569,-0.066339
1,1.448652,0.590668,-0.391302,0.544458,0.170318,-0.270136,0.086407,0.089151
2,1.448652,0.590668,-0.447029,0.408538,-0.028157,-0.137536,0.133232,2.243293
3,-0.690297,0.590668,0.100111,-0.62402,-0.392977,0.687144,-0.498588,0.093411
4,1.448652,0.590668,0.840239,-0.052396,-0.079356,0.173859,-0.231918,1.299347
