## Importing Libraries

In [310]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [311]:
# Load the raw table; the later cells operate on the NumPy arrays derived here.
dataset = pd.read_csv('Preprocessing.csv')

# x keeps every column of the frame; y is the column at position 3.
# NOTE(review): because x spans all columns, the column used as y (position 3)
# is also present inside x -- confirm this is intended before any modelling.
x = dataset.values
y = dataset.iloc[:, 3].values

# Show both arrays, x first and y on the following line(s).
print(x, y, sep='\n')

[[1 'Vignesh' 20.0 'Male' 50000.0 'India']
 [2 'Dhanush' 34.0 'Male' 60000.0 'USA']
 [3 'Eswar' nan 'Male' 45000.0 'UK']
 [4 'Sirisha' 29.0 'Female' nan 'India']
 [5 'Fakruddin' 40.0 'Male' 70000.0 'UK']
 [6 'Harshitha' 22.0 'Female' 38000.0 'USA']
 [7 'Sumanth' 35.0 'Male' 52000.0 'India']
 [8 'Anusha' 27.0 'Female' 48000.0 'UK']
 [9 'Afzal' 30.0 'Male' nan 'USA']
 [10 'Pavan' nan 'Male' 55000.0 'India']]
['Male' 'Male' 'Male' 'Female' 'Male' 'Female' 'Male' 'Female' 'Male'
 'Male']


## Handling Missing Values

In [312]:
from sklearn.impute import SimpleImputer

# Mean-impute the missing entries of the two numeric columns, one column at a
# time, printing the whole array after each pass.
# Each entry pairs a column slice of x with the message printed once that
# column has been filled.
imputation_passes = (
    (slice(2, 3), "Handling missing values for Age column"),
    (slice(4, 5), "\nHandling missing values for Salary column"),
)

for column_slice, message in imputation_passes:
    # A fresh imputer per pass, so each column is filled with its own mean.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    x[:, column_slice] = imputer.fit_transform(x[:, column_slice])
    print(message)
    print(x)



Handling missing values for Age column
[[1 'Vignesh' 20.0 'Male' 50000.0 'India']
 [2 'Dhanush' 34.0 'Male' 60000.0 'USA']
 [3 'Eswar' 29.625 'Male' 45000.0 'UK']
 [4 'Sirisha' 29.0 'Female' nan 'India']
 [5 'Fakruddin' 40.0 'Male' 70000.0 'UK']
 [6 'Harshitha' 22.0 'Female' 38000.0 'USA']
 [7 'Sumanth' 35.0 'Male' 52000.0 'India']
 [8 'Anusha' 27.0 'Female' 48000.0 'UK']
 [9 'Afzal' 30.0 'Male' nan 'USA']
 [10 'Pavan' 29.625 'Male' 55000.0 'India']]

Handling missing values for Salary column
[[1 'Vignesh' 20.0 'Male' 50000.0 'India']
 [2 'Dhanush' 34.0 'Male' 60000.0 'USA']
 [3 'Eswar' 29.625 'Male' 45000.0 'UK']
 [4 'Sirisha' 29.0 'Female' 52250.0 'India']
 [5 'Fakruddin' 40.0 'Male' 70000.0 'UK']
 [6 'Harshitha' 22.0 'Female' 38000.0 'USA']
 [7 'Sumanth' 35.0 'Male' 52000.0 'India']
 [8 'Anusha' 27.0 'Female' 48000.0 'UK']
 [9 'Afzal' 30.0 'Male' 52250.0 'USA']
 [10 'Pavan' 29.625 'Male' 55000.0 'India']]


## Encoding Categorical data

### Encoding Independent Variable

In [313]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 5 of x and pass every other column through unchanged.
# x is rebound to the transformed array, so the column positions used before
# this cell no longer apply afterwards.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [5])],
    remainder='passthrough',
)
x_encoded = ct.fit_transform(x)
x = np.array(x_encoded)

print("\nEncoding the Country column")
print(x)


Encoding the Country column
[[1.0 0.0 0.0 1 'Vignesh' 20.0 'Male' 50000.0]
 [0.0 0.0 1.0 2 'Dhanush' 34.0 'Male' 60000.0]
 [0.0 1.0 0.0 3 'Eswar' 29.625 'Male' 45000.0]
 [1.0 0.0 0.0 4 'Sirisha' 29.0 'Female' 52250.0]
 [0.0 1.0 0.0 5 'Fakruddin' 40.0 'Male' 70000.0]
 [0.0 0.0 1.0 6 'Harshitha' 22.0 'Female' 38000.0]
 [1.0 0.0 0.0 7 'Sumanth' 35.0 'Male' 52000.0]
 [0.0 1.0 0.0 8 'Anusha' 27.0 'Female' 48000.0]
 [0.0 0.0 1.0 9 'Afzal' 30.0 'Male' 52250.0]
 [1.0 0.0 0.0 10 'Pavan' 29.625 'Male' 55000.0]]


### Encoding Dependent Variable

In [314]:
from sklearn.preprocessing import LabelEncoder

# y was taken from column 3 of the dataset, which holds 'Male'/'Female'
# labels; none of the columns loaded above is a purchase flag, so the message
# names the column that is actually being encoded.
# LabelEncoder replaces each distinct label with an integer code.
le = LabelEncoder()
y = le.fit_transform(y)
print("\nEncoding the Gender column")
print(y)


Encoding the Purchased column
[1 1 1 0 1 0 1 0 1 1]


## Split the dataset into the Training set and Test set

In [317]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state is fixed so the same
# rows are selected on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

print("\nSplitting the dataset into training and testing sets")

# Print each partition under its heading, in the order
# x_train, x_test, y_train, y_test.
partitions = (
    ("x_train", x_train),
    ("\nx_test", x_test),
    ("\ny_train", y_train),
    ("\ny_test", y_test),
)
for heading, partition in partitions:
    print(heading)
    print(partition)


Splitting the dataset into training and testing sets
x_train
[[0.0 1.0 0.0 5 'Fakruddin' 40.0 'Male' 70000.0]
 [1.0 0.0 0.0 1 'Vignesh' 20.0 'Male' 50000.0]
 [1.0 0.0 0.0 4 'Sirisha' 29.0 'Female' 52250.0]
 [0.0 0.0 1.0 2 'Dhanush' 34.0 'Male' 60000.0]
 [0.0 1.0 0.0 8 'Anusha' 27.0 'Female' 48000.0]
 [0.0 0.0 1.0 9 'Afzal' 30.0 'Male' 52250.0]
 [0.0 0.0 1.0 6 'Harshitha' 22.0 'Female' 38000.0]]

x_test
[[0.0 1.0 0.0 3 'Eswar' 29.625 'Male' 45000.0]
 [1.0 0.0 0.0 10 'Pavan' 29.625 'Male' 55000.0]
 [1.0 0.0 0.0 7 'Sumanth' 35.0 'Male' 52000.0]]

y_train
[1 1 0 1 0 1 0]

y_test
[1 1 1]
