In [2]:
## Importing the essential libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
## Importing the dataset

dataset = pd.read_csv('Data.csv')
print(dataset.head())
print('-' * 80)
print(dataset.info())

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes
None


In [4]:
## Handling Missing data

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 
imputer = imputer.fit(dataset[['Age', 'Salary']]) 
dataset[['Age', 'Salary']] = imputer.transform(dataset[['Age', 'Salary']]) 
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
## Encoding the Independent variable

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# As you can see, you don't need to use LabelEncoder anymore.
ct = ColumnTransformer([('Country-Encoder', OneHotEncoder(), [0])], remainder='passthrough')
dataset = ct.fit_transform(dataset)
dataset

array([[1.0, 0.0, 0.0, 44.0, 72000.0, 'No'],
       [0.0, 0.0, 1.0, 27.0, 48000.0, 'Yes'],
       [0.0, 1.0, 0.0, 30.0, 54000.0, 'No'],
       [0.0, 0.0, 1.0, 38.0, 61000.0, 'No'],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778, 'Yes'],
       [1.0, 0.0, 0.0, 35.0, 58000.0, 'Yes'],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0, 'No'],
       [1.0, 0.0, 0.0, 48.0, 79000.0, 'Yes'],
       [0.0, 1.0, 0.0, 50.0, 83000.0, 'No'],
       [1.0, 0.0, 0.0, 37.0, 67000.0, 'Yes']], dtype=object)

In [6]:
## Encoding the Dependent variable

from sklearn.preprocessing import LabelEncoder

l_encoder_y = LabelEncoder()
dataset[:, 5] = l_encoder_y.fit_transform(dataset[:,5])
dataset

array([[1.0, 0.0, 0.0, 44.0, 72000.0, 0],
       [0.0, 0.0, 1.0, 27.0, 48000.0, 1],
       [0.0, 1.0, 0.0, 30.0, 54000.0, 0],
       [0.0, 0.0, 1.0, 38.0, 61000.0, 0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778, 1],
       [1.0, 0.0, 0.0, 35.0, 58000.0, 1],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0, 0],
       [1.0, 0.0, 0.0, 48.0, 79000.0, 1],
       [0.0, 1.0, 0.0, 50.0, 83000.0, 0],
       [1.0, 0.0, 0.0, 37.0, 67000.0, 1]], dtype=object)

In [7]:
## Matrix of features

# Independent variables
X = dataset[:,:-1]

# Dependent variable vector
y = dataset[:,5]

X.shape, y.shape

((10, 5), (10,))

In [8]:
## Splitting the dataset into Training and Test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape

(8, 5)

In [10]:
## Feature Scaling

from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

sc_y = StandardScaler()
y_train = np.squeeze(sc_y.fit_transform(y_train.reshape(-1, 1)))
y_train

array([ 0.77459667,  0.77459667,  0.77459667, -1.29099445,  0.77459667,
       -1.29099445, -1.29099445,  0.77459667])