###Loading a Dataset and Checking Its Dimensions


In [1]:
import pandas as pd
df=pd.read_csv('country_dataset.csv')
df.shape

(34, 4)

In [2]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchase
0,India,38.0,48000.0,No
1,India,38.0,48000.0,Yes
2,India,38.0,48000.0,No
3,India,35.0,58000.0,No
4,India,35.0,58000.0,Yes


In [3]:
df.isnull().sum()

Country     0
Age         3
Salary      3
Purchase    0
dtype: int64

###Handling Missing Values Using Mean Imputation

In [6]:
import numpy as np
from sklearn.impute import SimpleImputer
simple=SimpleImputer(missing_values = np.nan,strategy='mean')
df=simple.fit_transform(df)

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'India'

In [7]:
df['Country'].unique()

array(['India', 'France', 'Germany'], dtype=object)

###Encoding Categorical Data Using Label Encoding

In [15]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df_encoded = df
df_encoded['Country'] =encoder.fit_transform(df['Country'])

In [16]:
df['Country'].unique()

array([2, 0, 1])

In [17]:
df_encoded['Purchase'] =encoder.fit_transform(df['Purchase'])

In [18]:
df['Purchase'].unique()

array([0, 1])

In [19]:
df_encoded.shape

(34, 4)

###Handling Missing Values Using Mean Imputation 

In [20]:
import numpy as np
from sklearn.impute import SimpleImputer
simple=SimpleImputer(missing_values = np.nan,strategy='mean')
df_encoded=simple.fit_transform(df)

In [21]:
df_encoded.isnull().sum()

AttributeError: 'numpy.ndarray' object has no attribute 'isnull'

In [22]:
df_encoded = pd.DataFrame(df_encoded)

In [23]:
df_encoded.isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

In [24]:
df_encoded.describe()

Unnamed: 0,0,1,2,3
count,34.0,34.0,34.0,34.0
mean,0.794118,41.516129,63451.612903,0.529412
std,0.844928,6.198098,13796.878676,0.50664
min,0.0,30.0,45000.0,0.0
25%,0.0,37.0,53000.0,0.0
50%,1.0,41.516129,63451.612903,1.0
75%,1.75,48.0,77000.0,1.0
max,2.0,50.0,88000.0,1.0


In [25]:
x=df_encoded.iloc[:,0:3]
y=df_encoded.iloc[:,:3]

###Feature Scaling Using Min-Max Normalization

In [26]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)

In [27]:
x_scaled

array([[1.        , 0.4       , 0.06976744],
       [1.        , 0.4       , 0.06976744],
       [1.        , 0.4       , 0.06976744],
       [1.        , 0.25      , 0.30232558],
       [1.        , 0.25      , 0.30232558],
       [1.        , 0.25      , 0.30232558],
       [1.        , 1.        , 1.        ],
       [1.        , 1.        , 1.        ],
       [1.        , 1.        , 1.        ],
       [0.        , 0.65      , 0.        ],
       [0.        , 0.65      , 0.        ],
       [0.        , 0.65      , 0.        ],
       [0.        , 0.65      , 0.        ],
       [0.        , 0.9       , 0.46511628],
       [0.        , 0.9       , 0.46511628],
       [0.        , 0.9       , 0.46511628],
       [0.        , 0.9       , 0.46511628],
       [0.        , 0.95      , 0.79069767],
       [0.        , 0.95      , 0.79069767],
       [0.        , 0.95      , 0.79069767],
       [0.        , 0.95      , 0.79069767],
       [0.        , 0.35      , 0.74418605],
       [0.

###Feature Scaling Using Standardization

In [28]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
x_std_scaled = scaler.fit_transform(x)

In [29]:
x_std_scaled

array([[1.        , 0.4       , 0.06976744],
       [1.        , 0.4       , 0.06976744],
       [1.        , 0.4       , 0.06976744],
       [1.        , 0.25      , 0.30232558],
       [1.        , 0.25      , 0.30232558],
       [1.        , 0.25      , 0.30232558],
       [1.        , 1.        , 1.        ],
       [1.        , 1.        , 1.        ],
       [1.        , 1.        , 1.        ],
       [0.        , 0.65      , 0.        ],
       [0.        , 0.65      , 0.        ],
       [0.        , 0.65      , 0.        ],
       [0.        , 0.65      , 0.        ],
       [0.        , 0.9       , 0.46511628],
       [0.        , 0.9       , 0.46511628],
       [0.        , 0.9       , 0.46511628],
       [0.        , 0.9       , 0.46511628],
       [0.        , 0.95      , 0.79069767],
       [0.        , 0.95      , 0.79069767],
       [0.        , 0.95      , 0.79069767],
       [0.        , 0.95      , 0.79069767],
       [0.        , 0.35      , 0.74418605],
       [0.

###Train Test Spilt

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=3)

In [34]:
X_train.shape


(23, 3)

In [35]:
X_test.shape

(11, 3)

In [36]:
y_train.shape

(23, 3)

In [37]:
y_test.shape

(11, 3)