## Importing Libraries

In [2]:
# Install necessary packages
# %pip install numpy
# %pip install pandas
# %pip install scikit-learn

import numpy as np
import pandas as pd

In [3]:
# Load the raw table from disk.
dataset = pd.read_csv('Preprocessing.csv')
# Features: every column except the last one.
x = dataset.iloc[:, :-1].values
# Target: the last column, addressed from the end so it always pairs with the
# `:-1` feature slice above rather than depending on a hard-coded column index.
y = dataset.iloc[:, -1].values
print(x)
print(y)

[[1 'Vignesh' 20.0 'Male' 50000.0 'India']
 [2 'Dhanush' 34.0 'Male' 60000.0 'USA']
 [3 'Eswar' nan 'Male' 45000.0 'UK']
 [4 'Sirisha' 29.0 'Female' nan 'India']
 [5 'Fakruddin' 40.0 'Male' 70000.0 'UK']
 [6 'Harshitha' 22.0 'Female' 38000.0 'USA']
 [7 'Sumanth' 35.0 'Male' 52000.0 'India']
 [8 'Anusha' 27.0 'Female' 48000.0 'UK']
 [9 'Afzal' 30.0 'Male' nan 'USA']
 [10 'Pavan' nan 'Male' 55000.0 'India']]
['Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No']


## Handling Missing Values

In [4]:
from sklearn.impute import SimpleImputer

# Replace NaN entries in each numeric column with that column's mean.
# Columns are handled one at a time (Age at index 2, then Salary at index 4)
# and the full array is printed after each pass.
for message, column_index in (
    ("Handling missing values for Age column", 2),
    ("Handling missing values for Salary column", 4),
):
    # A fresh imputer per column, fitted on that single column only.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    # Slice with start:stop so the selection stays a 2-D single-column array.
    column_slice = slice(column_index, column_index + 1)
    x[:, column_slice] = imputer.fit_transform(x[:, column_slice])
    print(message)
    print(x)

Handling missing values for Age column
[[1 'Vignesh' 20.0 'Male' 50000.0 'India']
 [2 'Dhanush' 34.0 'Male' 60000.0 'USA']
 [3 'Eswar' 29.625 'Male' 45000.0 'UK']
 [4 'Sirisha' 29.0 'Female' nan 'India']
 [5 'Fakruddin' 40.0 'Male' 70000.0 'UK']
 [6 'Harshitha' 22.0 'Female' 38000.0 'USA']
 [7 'Sumanth' 35.0 'Male' 52000.0 'India']
 [8 'Anusha' 27.0 'Female' 48000.0 'UK']
 [9 'Afzal' 30.0 'Male' nan 'USA']
 [10 'Pavan' 29.625 'Male' 55000.0 'India']]
Handling missing values for Salary column
[[1 'Vignesh' 20.0 'Male' 50000.0 'India']
 [2 'Dhanush' 34.0 'Male' 60000.0 'USA']
 [3 'Eswar' 29.625 'Male' 45000.0 'UK']
 [4 'Sirisha' 29.0 'Female' 52250.0 'India']
 [5 'Fakruddin' 40.0 'Male' 70000.0 'UK']
 [6 'Harshitha' 22.0 'Female' 38000.0 'USA']
 [7 'Sumanth' 35.0 'Male' 52000.0 'India']
 [8 'Anusha' 27.0 'Female' 48000.0 'UK']
 [9 'Afzal' 30.0 'Male' 52250.0 'USA']
 [10 'Pavan' 29.625 'Male' 55000.0 'India']]


## Encoding Categorical data

### Encoding Independent Variable

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# One-hot encode the Country column (index 5); every other column is passed
# through unchanged. In the printed result the encoded columns come first.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [5])],
    remainder='passthrough',
)
x = ct.fit_transform(x)
print("\nEncoding the Country column")
print(x)

# After the transform above, Gender sits at index 6; swap its string values
# for integer labels.
le = LabelEncoder()
gender_labels = le.fit_transform(x[:, 6])
x[:, 6] = gender_labels
print("\nEncoding the Gender column")
print(x)


Encoding the Country column
[[1.0 0.0 0.0 1 'Vignesh' 20.0 'Male' 50000.0]
 [0.0 0.0 1.0 2 'Dhanush' 34.0 'Male' 60000.0]
 [0.0 1.0 0.0 3 'Eswar' 29.625 'Male' 45000.0]
 [1.0 0.0 0.0 4 'Sirisha' 29.0 'Female' 52250.0]
 [0.0 1.0 0.0 5 'Fakruddin' 40.0 'Male' 70000.0]
 [0.0 0.0 1.0 6 'Harshitha' 22.0 'Female' 38000.0]
 [1.0 0.0 0.0 7 'Sumanth' 35.0 'Male' 52000.0]
 [0.0 1.0 0.0 8 'Anusha' 27.0 'Female' 48000.0]
 [0.0 0.0 1.0 9 'Afzal' 30.0 'Male' 52250.0]
 [1.0 0.0 0.0 10 'Pavan' 29.625 'Male' 55000.0]]

Encoding the Gender column
[[1.0 0.0 0.0 1 'Vignesh' 20.0 1 50000.0]
 [0.0 0.0 1.0 2 'Dhanush' 34.0 1 60000.0]
 [0.0 1.0 0.0 3 'Eswar' 29.625 1 45000.0]
 [1.0 0.0 0.0 4 'Sirisha' 29.0 0 52250.0]
 [0.0 1.0 0.0 5 'Fakruddin' 40.0 1 70000.0]
 [0.0 0.0 1.0 6 'Harshitha' 22.0 0 38000.0]
 [1.0 0.0 0.0 7 'Sumanth' 35.0 1 52000.0]
 [0.0 1.0 0.0 8 'Anusha' 27.0 0 48000.0]
 [0.0 0.0 1.0 9 'Afzal' 30.0 1 52250.0]
 [1.0 0.0 0.0 10 'Pavan' 29.625 1 55000.0]]


### Encoding Dependent Variable

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode the Purchased target as integer labels using a new encoder instance
# (this rebinds `le`, which previously held the Gender encoder).
le = LabelEncoder()
y = le.fit_transform(y)
# Heading and encoded labels, each on its own line.
print("\nEncoding the Purchased column", y, sep="\n")


Encoding the Purchased column
[1 0 1 0 1 0 1 1 1 0]


## Split the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; random_state is fixed so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

print("\nSplitting the dataset into training and testing sets")
# Print each partition under its own heading.
for heading, partition in (
    ("x_train", x_train),
    ("\nx_test", x_test),
    ("\ny_train", y_train),
    ("\ny_test", y_test),
):
    print(heading)
    print(partition)


Splitting the dataset into training and testing sets
x_train
[[0.0 1.0 0.0 5 'Fakruddin' 40.0 1 70000.0]
 [1.0 0.0 0.0 1 'Vignesh' 20.0 1 50000.0]
 [1.0 0.0 0.0 4 'Sirisha' 29.0 0 52250.0]
 [0.0 0.0 1.0 2 'Dhanush' 34.0 1 60000.0]
 [0.0 1.0 0.0 8 'Anusha' 27.0 0 48000.0]
 [0.0 0.0 1.0 9 'Afzal' 30.0 1 52250.0]
 [0.0 0.0 1.0 6 'Harshitha' 22.0 0 38000.0]]

x_test
[[0.0 1.0 0.0 3 'Eswar' 29.625 1 45000.0]
 [1.0 0.0 0.0 10 'Pavan' 29.625 1 55000.0]
 [1.0 0.0 0.0 7 'Sumanth' 35.0 1 52000.0]]

y_train
[1 1 0 0 1 1 0]

y_test
[1 0 1]


## Feature Scaling

In [8]:
from sklearn.preprocessing import StandardScaler

# Only the numerical features need scaling: Age (index 5) and Salary (index 7).
numeric_columns = [5, 7]

sc = StandardScaler()
# Fit the scaler on the training rows only, then apply the same fitted
# transformation to the test rows.
x_train[:, numeric_columns] = sc.fit_transform(x_train[:, numeric_columns])
x_test[:, numeric_columns] = sc.transform(x_test[:, numeric_columns])

print("\nFeature Scaling")
# Print both scaled partitions under their headings.
for heading, partition in (("x_train", x_train), ("\nx_test", x_test)):
    print(heading)
    print(partition)


Feature Scaling
x_train
[[0.0 1.0 0.0 5 'Fakruddin' 1.7591498917973942 1 1.8468680746410224]
 [1.0 0.0 0.0 1 'Vignesh' -1.3982986319415185 1 -0.3168267408379995]
 [1.0 0.0 0.0 4 'Sirisha' 0.022553203740992154 0 -0.07341107409660956]
 [0.0 0.0 1.0 2 'Dhanush' 0.8119153346757203 1 0.7650206669015114]
 [0.0 1.0 0.0 8 'Anusha' -0.2931916486328991 0 -0.5331962223859017]
 [0.0 0.0 1.0 9 'Afzal' 0.1804256299279378 1 -0.07341107409660956]
 [0.0 0.0 1.0 6 'Harshitha' -1.0825537795676272 0 -1.6150436301254127]]

x_test
[[0.0 1.0 0.0 3 'Eswar' 0.12122347010783317 1 -0.857750444707755]
 [1.0 0.0 0.0 10 'Pavan' 0.12122347010783317 1 0.22409696303175594]
 [1.0 0.0 0.0 7 'Sumanth' 0.9697877608626659 1 -0.10045725929009733]]
