# Data Preprocessing

### Importing the libraries

In [64]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [65]:
dataset = pd.read_excel('ML101 dataset_5 Data.xlsx')

In [66]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [67]:
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [68]:
dataset["Age"].fillna(dataset["Age"].mean(),inplace=True)
dataset["Salary"].fillna(dataset["Salary"].mean(),inplace=True)
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [69]:
dataset["Age"].skew() # no need of transformation

-0.010260911562919134

In [70]:
dataset["Salary"].skew() # want to do log trasnformation

0.4049124736789858

In [71]:
dataset["log_salary"] = np.log(dataset["Salary"])
dataset

Unnamed: 0,Country,Age,Salary,Purchased,log_salary
0,France,44.0,72000.0,No,11.184421
1,Spain,27.0,48000.0,Yes,10.778956
2,Germany,30.0,54000.0,No,10.896739
3,Spain,38.0,61000.0,No,11.018629
4,Germany,40.0,63777.777778,Yes,11.06316
5,France,35.0,58000.0,Yes,10.968198
6,Spain,38.777778,52000.0,No,10.858999
7,France,48.0,79000.0,Yes,11.277203
8,Germany,50.0,83000.0,No,11.326596
9,France,37.0,67000.0,Yes,11.112448


In [13]:
dataset["log_salary"].skew()

0.13797640024529664

In [21]:
dataset["Age"].skew()
# slighly left skewd - apply exponential transformation ( not recommended here becoz the data is is noraml distribution)

-0.010260911562919134

In [22]:
# exponential transaformation just for example here since data is 
# rigth skwed and exponential is applied on left skwed i.e -ve skew
dataset["exp_age"] = np.exp(dataset["Age"])

In [23]:
# power transformation
dataset["power_age"] = dataset["Age"]**(4)

In [25]:
### BOX-COX Transformation can be applied on both left and right skewed 

from scipy.stats import boxcox 
dataset["salary_boxcox"], param = boxcox(dataset["Salary"]) 
# boxcox(dataset["Salary"]) give two values one trasnformed array and second the parameter (lambda)
dataset["salary_boxcox"].skew()

0.02936641120113348

In [26]:
dataset["Salary"].skew()

0.4049124736789858

In [36]:
param # at lambda = param the boxcox transformation gives normal distribution , lambda between -5 to 5

-0.39693910238637137

### Selecting Independent variables

In [93]:
dataset = pd.read_excel('ML101 dataset_5 Data.xlsx')
dataset["Age"].fillna(dataset["Age"].mean(),inplace=True)
dataset["Salary"].fillna(dataset["Salary"].mean(),inplace=True)


X = dataset.iloc[:, :-1]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        10 non-null     float64
 2   Salary     10 non-null     float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [94]:
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


### Selecting Dependent variable

In [95]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [96]:
y = dataset.iloc[:, -1]
y = dataset["Purchased"]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [90]:
# Taking care of missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X

TypeError: '(slice(None, None, None), slice(1, 3, None))' is an invalid key

In [97]:
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

## Encoding categorical data

## 2.1: Encoding the Independent Variable

If we convert it into France, Spain and Germany into 0, 1 & 2. ML models may take it as a numerical order. 
ML model may think that order matters. 
However, there is no relationship. There will be some misinterpreted correlation. 
Onehot encoding convert this country column into three columns, using binary vectors. 
Hence, there will be no numerical order. 

In [92]:
from sklearn.compose import ColumnTransformer # Importing class column transform
from sklearn.preprocessing import OneHotEncoder # import class OneHotEncoder

# create object ct
# Call class ColumnTransformer
# First call transformers to tell which tranformation we have to do and which column we have to use
# encoder means we want encoding and then what type of encoding means OneHotEncoder
# remainder will tell us to keep the column where we will not apply the transformation
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')

# using fit_transform method to do onehotencoding
# It does not give result into numpy array
X = (ct.fit_transform(X))
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

### Encoding the Dependent Variable

In [56]:
from sklearn.preprocessing import LabelEncoder # importing labelencoder
le = LabelEncoder()
y = le.fit_transform(y)
y

  return f(*args, **kwargs)


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

#### When should we do feature scaling? Is it before or after splitting the dataset?

Feature scaling should be done after splitting the dataset. For example, if we do before mean and standard deviation will be from all the values including the one's from test set. Test set is not supposed to have information from training set. If we use all values for feature scaling, it would lead to information leakage on the test set. Test set is supposed to be new data or new observation. 

## Splitting the dataset into the Training set and Test set

In [98]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
# X will be divided into two parts X_train and x_test , similaryly with y
# test_size 0.2 means 20 percent of data will be in test 
# random_State = 1 means, we will get the same information everytime as we are randomly splitting the data. since its not time series
# random_state means we are just fixing the seet here.


In [99]:
X_train # without random_state = 1 if we excecute this cell every time the output will be changed 
# random_state = 1 becoz same data we need to give it to other algoriths , 

Unnamed: 0,Country,Age,Salary
6,Spain,38.777778,52000.0
4,Germany,40.0,63777.777778
0,France,44.0,72000.0
3,Spain,38.0,61000.0
1,Spain,27.0,48000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
5,France,35.0,58000.0


In [100]:
X_test # without random_state = 1 if we excecute this cell every time the output will be changed 

Unnamed: 0,Country,Age,Salary
2,Germany,30.0,54000.0
9,France,37.0,67000.0


In [101]:
y_train

6     No
4    Yes
0     No
3     No
1    Yes
7    Yes
8     No
5    Yes
Name: Purchased, dtype: object

In [102]:
y_test

2     No
9    Yes
Name: Purchased, dtype: object

## Feature Scaling

#### Standardization
$X = \frac{Xi - {X}_{mean}} {SD}$

In [45]:
df = pd.read_excel('ML101 dataset_5 Data.xlsx')

In [46]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [48]:
df["Age"].fillna(df["Age"].mean(),inplace=True)
df["Salary"].fillna(df["Salary"].mean(),inplace=True)

In [49]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [None]:
# Age - continuous data , Salay - Continuous data 

In [50]:
df["Age"].mean()

38.77777777777778

In [52]:
df["Age"].std(ddof=0) # population value so ddof =0 , for sample ddof = 1

6.881537295950342

In [54]:
df["Age_Standard"] = (df["Age"]-df["Age"].mean())/df["Age"].std(ddof=0)

In [55]:
df

Unnamed: 0,Country,Age,Salary,Purchased,Age_Standard
0,France,44.0,72000.0,No,0.758874
1,Spain,27.0,48000.0,Yes,-1.711504
2,Germany,30.0,54000.0,No,-1.275555
3,Spain,38.0,61000.0,No,-0.113024
4,Germany,40.0,63777.777778,Yes,0.177609
5,France,35.0,58000.0,Yes,-0.548973
6,Spain,38.777778,52000.0,No,0.0
7,France,48.0,79000.0,Yes,1.34014
8,Germany,50.0,83000.0,No,1.630773
9,France,37.0,67000.0,Yes,-0.25834


In [56]:
df["Salary_Standard"] = (df["Salary"]-df["Salary"].mean())/df["Salary"].std(ddof=0)

In [57]:
df

Unnamed: 0,Country,Age,Salary,Purchased,Age_Standard,Salary_Standard
0,France,44.0,72000.0,No,0.758874,0.7494733
1,Spain,27.0,48000.0,Yes,-1.711504,-1.438178
2,Germany,30.0,54000.0,No,-1.275555,-0.8912655
3,Spain,38.0,61000.0,No,-0.113024,-0.2532004
4,Germany,40.0,63777.777778,Yes,0.177609,6.632192e-16
5,France,35.0,58000.0,Yes,-0.548973,-0.5266569
6,Spain,38.777778,52000.0,No,0.0,-1.07357
7,France,48.0,79000.0,Yes,1.34014,1.387538
8,Germany,50.0,83000.0,No,1.630773,1.752147
9,France,37.0,67000.0,Yes,-0.25834,0.2937125


In [None]:
# Question 1 : every time we are going to do the standarzization ? - yes , when magnitude of one column is high and others are low 
# Question 2 : if we have lots of columns then this above pandas method is not good > so we use sklearn , StandardScalar method 

In [59]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

sc.fit_transform(df[["Age","Salary"]]) # sc.fit_transform(df[list of column names to transfrom ]) list -> two dimensional array

array([[ 7.58874362e-01,  7.49473254e-01],
       [-1.71150388e+00, -1.43817841e+00],
       [-1.27555478e+00, -8.91265492e-01],
       [-1.13023841e-01, -2.53200424e-01],
       [ 1.77608893e-01,  6.63219199e-16],
       [-5.48972942e-01, -5.26656882e-01],
       [ 0.00000000e+00, -1.07356980e+00],
       [ 1.34013983e+00,  1.38753832e+00],
       [ 1.63077256e+00,  1.75214693e+00],
       [-2.58340208e-01,  2.93712492e-01]])

In [60]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

sc.fit_transform(df.iloc[:,[1,2]])  # same above method just selected all rows and column 1,2 index with iloc 

array([[ 7.58874362e-01,  7.49473254e-01],
       [-1.71150388e+00, -1.43817841e+00],
       [-1.27555478e+00, -8.91265492e-01],
       [-1.13023841e-01, -2.53200424e-01],
       [ 1.77608893e-01,  6.63219199e-16],
       [-5.48972942e-01, -5.26656882e-01],
       [ 0.00000000e+00, -1.07356980e+00],
       [ 1.34013983e+00,  1.38753832e+00],
       [ 1.63077256e+00,  1.75214693e+00],
       [-2.58340208e-01,  2.93712492e-01]])

In [61]:
df 

Unnamed: 0,Country,Age,Salary,Purchased,Age_Standard,Salary_Standard
0,France,44.0,72000.0,No,0.758874,0.7494733
1,Spain,27.0,48000.0,Yes,-1.711504,-1.438178
2,Germany,30.0,54000.0,No,-1.275555,-0.8912655
3,Spain,38.0,61000.0,No,-0.113024,-0.2532004
4,Germany,40.0,63777.777778,Yes,0.177609,6.632192e-16
5,France,35.0,58000.0,Yes,-0.548973,-0.5266569
6,Spain,38.777778,52000.0,No,0.0,-1.07357
7,France,48.0,79000.0,Yes,1.34014,1.387538
8,Germany,50.0,83000.0,No,1.630773,1.752147
9,France,37.0,67000.0,Yes,-0.25834,0.2937125


In [40]:
df = pd.read_excel('ML101 dataset_5 Data.xlsx')
df["Age"].fillna(df["Age"].mean(),inplace=True)
df["Salary"].fillna(df["Salary"].mean(),inplace=True)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

df.iloc[:,[1,2]] = sc.fit_transform(df.iloc[:,[1,2]]) # replacing the columns with transformed columns

In [70]:
df  # values replaced vwith transfomed values 

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.758874,0.7494733,No
1,Spain,-1.711504,-1.438178,Yes
2,Germany,-1.275555,-0.8912655,No
3,Spain,-0.113024,-0.2532004,No
4,Germany,0.177609,6.632192e-16,Yes
5,France,-0.548973,-0.5266569,Yes
6,Spain,0.0,-1.07357,No
7,France,1.34014,1.387538,Yes
8,Germany,1.630773,1.752147,No
9,France,-0.25834,0.2937125,Yes


In [103]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# fit will calculate mean and sd
# transform will calculate standardized value
X_train.iloc[:,[1,2]] = sc.fit_transform(X_train.iloc[:, [1,2]])

# Here we will use only transform method as test data set is like new dataset. 
# Hence, we have to use same scalar that was used for training datset because ML model will be trained with particular scaler
# using fit method will add new scalar
X_test.iloc[:, [1,2]] = sc.transform(X_test.iloc[:, [1,2]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [104]:
X_train

Unnamed: 0,Country,Age,Salary
6,Spain,-0.191592,-1.078126
4,Germany,-0.014117,-0.070132
0,France,0.566709,0.633562
3,Spain,-0.30453,-0.307866
1,Spain,-1.901801,-1.420464
7,France,1.147534,1.232653
8,Germany,1.437947,1.574991
5,France,-0.74015,-0.564619


In [105]:
X_test

Unnamed: 0,Country,Age,Salary
2,Germany,-1.466182,-0.906957
9,France,-0.449737,0.20564


#### Min-max scaler (brings every value between 0 to 1)
$X = \frac{{X}_{i} - min(X)} {max(X) - min(X)}$

In [30]:
df = pd.read_excel('ML101 dataset_5 Data.xlsx')
df["Age"].fillna(df["Age"].mean(),inplace=True)
df["Salary"].fillna(df["Salary"].mean(),inplace=True)

from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

df.iloc[:,[1,2]] = mm.fit_transform(df.iloc[:,[1,2]]) # replacing the columns with transformed columns
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.73913,0.685714,No
1,Spain,0.0,0.0,Yes
2,Germany,0.130435,0.171429,No
3,Spain,0.478261,0.371429,No
4,Germany,0.565217,0.450794,Yes
5,France,0.347826,0.285714,Yes
6,Spain,0.512077,0.114286,No
7,France,0.913043,0.885714,Yes
8,Germany,1.0,1.0,No
9,France,0.434783,0.542857,Yes


In [107]:
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()

# fit will calculate mean and sd
# transform will calculate standardized value
X_train.iloc[:, [1,2]] = mm.fit_transform(X_train.iloc[:, [1,2]])

# Here we will use only transform method as test data set is like new dataset. 
# Hence, we have to use same scalar that was used for training datset because ML model will be trained with particular scaler
# using fit method will add new scalar
X_test.iloc[:, [1,2]] = mm.transform(X_test.iloc[:, [1,2]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [108]:
X_train

Unnamed: 0,Country,Age,Salary
6,Spain,0.512077,0.114286
4,Germany,0.565217,0.450794
0,France,0.73913,0.685714
3,Spain,0.478261,0.371429
1,Spain,0.0,0.0
7,France,0.913043,0.885714
8,Germany,1.0,1.0
5,France,0.347826,0.285714


In [109]:
X_test

Unnamed: 0,Country,Age,Salary
2,Germany,0.130435,0.171429
9,France,0.434783,0.542857


In [110]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [111]:
dataset = pd.read_excel('ML101 dataset_5 Data.xlsx')
dataset["Age"].fillna(dataset["Age"].mean(),inplace=True)
dataset["Salary"].fillna(dataset["Salary"].mean(),inplace=True)


X = dataset.iloc[:, :-1]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        10 non-null     float64
 2   Salary     10 non-null     float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [112]:
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [113]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [114]:
y = dataset.iloc[:, -1]
y = dataset["Purchased"]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [115]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
# X will be divided into two parts X_train and x_test , similaryly with y
# test_size 0.2 means 20 percent of data will be in test 
# random_State = 1 means, we will get the same information everytime as we are randomly splitting the data. since its not time series
# random_state means we are just fixing the seet here.


In [116]:
X_train # without random_state = 1 if we excecute this cell every time the output will be changed 
# random_state = 1 becoz same data we need to give it to other algoriths , 

Unnamed: 0,Country,Age,Salary
6,Spain,38.777778,52000.0
4,Germany,40.0,63777.777778
0,France,44.0,72000.0
3,Spain,38.0,61000.0
1,Spain,27.0,48000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
5,France,35.0,58000.0


In [117]:
X_test # without random_state = 1 if we excecute this cell every time the output will be changed 

Unnamed: 0,Country,Age,Salary
2,Germany,30.0,54000.0
9,France,37.0,67000.0


In [118]:
y_train

6     No
4    Yes
0     No
3     No
1    Yes
7    Yes
8     No
5    Yes
Name: Purchased, dtype: object

In [119]:
y_test

2     No
9    Yes
Name: Purchased, dtype: object

In [120]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# fit will calculate mean and sd
# transform will calculate standardized value
X_train.iloc[:,[1,2]] = sc.fit_transform(X_train.iloc[:, [1,2]])

# Here we will use only transform method as test data set is like new dataset. 
# Hence, we have to use same scalar that was used for training datset because ML model will be trained with particular scaler
# using fit method will add new scalar
X_test.iloc[:, [1,2]] = sc.transform(X_test.iloc[:, [1,2]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [121]:
X_train

Unnamed: 0,Country,Age,Salary
6,Spain,-0.191592,-1.078126
4,Germany,-0.014117,-0.070132
0,France,0.566709,0.633562
3,Spain,-0.30453,-0.307866
1,Spain,-1.901801,-1.420464
7,France,1.147534,1.232653
8,Germany,1.437947,1.574991
5,France,-0.74015,-0.564619


In [122]:
X_test

Unnamed: 0,Country,Age,Salary
2,Germany,-1.466182,-0.906957
9,France,-0.449737,0.20564


In [123]:
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()

# fit will calculate mean and sd
# transform will calculate standardized value
X_train.iloc[:, [1,2]] = mm.fit_transform(X_train.iloc[:, [1,2]])

# Here we will use only transform method as test data set is like new dataset. 
# Hence, we have to use same scalar that was used for training datset because ML model will be trained with particular scaler
# using fit method will add new scalar
X_test.iloc[:, [1,2]] = mm.transform(X_test.iloc[:, [1,2]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [124]:
X_train

Unnamed: 0,Country,Age,Salary
6,Spain,0.512077,0.114286
4,Germany,0.565217,0.450794
0,France,0.73913,0.685714
3,Spain,0.478261,0.371429
1,Spain,0.0,0.0
7,France,0.913043,0.885714
8,Germany,1.0,1.0
5,France,0.347826,0.285714


In [125]:
X_test

Unnamed: 0,Country,Age,Salary
2,Germany,0.130435,0.171429
9,France,0.434783,0.542857
