# 1. Data Preprocessing Tools

### Importing The Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [2]:
# Read the raw table from disk and echo it so the NaN cells are visible up front.
csv_path = "Data.csv"
dataset = pd.read_csv(csv_path)
print(dataset)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [3]:
# Split the table positionally: every column except the last becomes the
# feature matrix X, and the last column becomes the target vector y.
# `.values` hands back plain NumPy arrays rather than pandas objects.
# NOTE(review): this re-reads the same "Data.csv" that the previous cell already
# loaded into `dataset`; the read is kept so this cell can run on its own.
dataset = pd.read_csv("Data.csv")
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X = feature_frame.values
y = target_column.values

In [5]:
# Inspect the raw feature matrix; `nan` marks the entries that are still missing.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
# Inspect the target vector while it still holds the original text labels.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Taking care of missing data

In [8]:
'''Fill the missing numeric entries with the column mean.

SimpleImputer (from sklearn.impute) replaces missing values in an array.
  missing_values=np.nan : np.nan entries are the ones treated as missing.
  strategy="mean"       : each gap is filled with the mean of its own column.

fit_transform() does two things in one call:
  1. fit       - computes the mean of every selected column; the data is
                 not modified by this step.
  2. transform - returns a copy in which each missing value has been
                 replaced by the mean learned in step 1.

Only columns 1 and 2 of X (the numeric ones) are passed in; column 0 holds
text and has no mean. The result is written back into the same columns of X.

NOTE(review): the means are learned from all rows, i.e. before the
train/test split performed later in this notebook.
'''
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [9]:
# Confirm that the former `nan` cells in columns 1 and 2 now hold the column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding categorical data

### Encoding the Independent Variable

In [10]:
'''One-hot encode the text column of X (column 0).

ColumnTransformer (sklearn.compose) applies a transformer to selected columns
only. OneHotEncoder (sklearn.preprocessing) turns one categorical column into
one 0/1 column per category, so a model can consume text labels as numbers.

Example for the categories present in this column:

    value     ->  France  Germany  Spain
    France    ->    1        0       0
    Germany   ->    0        1       0
    Spain     ->    0        0       1

Each row has a 1 in the column of its own category and 0 elsewhere.

Arguments used below:
  transformers=[(name, transformer, columns)]
      'encoder'       : a free-form label for this step.
      OneHotEncoder() : the transformation to apply.
      [0]             : apply it to column 0 only.
  remainder='passthrough'
      Keep every column that is not listed above unchanged; they are placed
      after the encoded columns in the result.
  sparse_threshold=0
      Always return a dense array. With the default threshold the transformer
      may return a SciPy sparse matrix when the combined output is mostly
      zeros, and np.array() does not convert such a matrix into a regular
      2-D array.
'''

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,  # force dense output so np.array() below yields a 2-D array
)
X = np.array(ct.fit_transform(X))

In [12]:
# The text column is now three 0/1 columns, followed by the two numeric columns.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable


In [14]:
'''Turn the text labels in y into integers with LabelEncoder.

fit(y)       : scans y and records the distinct labels it contains
               (here "No" and "Yes").
transform(y) : replaces every label with its integer code:
                   "No"  -> 0
                   "Yes" -> 1
               so "Yes", "No", "Yes" becomes 1, 0, 1.
'''
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [15]:
# The labels are now integer codes instead of 'No' / 'Yes'.
print(y)

[0 1 0 0 1 1 0 1 0 1]


### Splitting the dataset into the Training set and Test set

In [29]:
'''Split the data into a training part and a test part.

train_test_split (from sklearn.model_selection) splits X and y together, so
every feature row stays paired with its label. It returns four pieces, in
this order:
    X_train - features used for training
    X_test  - features held back for testing
    y_train - labels that belong to X_train
    y_test  - labels that belong to X_test

Arguments:
    X, y           : all input features and all output labels.
    test_size=0.2  : 20% of the rows go to the test set, 80% to training.
    random_state=1 : fixes the shuffle so the same split is produced on
                     every run.
'''
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [30]:
# Training features produced by the split, before any feature scaling.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [18]:
# Held-out test features produced by the split, before any feature scaling.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [19]:
# Encoded labels that belong to the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [20]:
# Encoded labels that belong to the held-out test rows.
print(y_test)

[0 1]


### Feature Scaling

In [31]:
'''Standardize the numeric feature columns.

StandardScaler (from sklearn.preprocessing) rescales each column so that it
has mean 0 and standard deviation 1, which puts features with very different
ranges on a comparable scale.

Steps performed below:
  sc = StandardScaler()
      Create the scaler object.
  sc.fit(X_train[:, 3:])
      Learn the mean and standard deviation of each selected column from the
      training data only. [:, 3:] selects all rows and the columns from
      index 3 to the end; columns 0-2 are the one-hot columns and are left
      as they are.
  X_train[:, 3:] = sc.transform(X_train[:, 3:])
      Overwrite the training columns with their scaled values.
  X_test[:, 3:] = sc.transform(X_test[:, 3:])
      Scale the test columns with the statistics learned from the training
      data; the scaler is not re-fitted on the test set.
'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [25]:
# Training features after scaling: the last two columns are now standardized.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [26]:
# Test features after being scaled with the scaler fitted on the training data.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
