In [None]:
### Data Preprocessing Template

In [1]:
## Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
## Importing the dataset with pandas
# read_csv loads Data.csv into a pandas DataFrame; the first row of the file is used as the header.
datasets = pd.read_csv(filepath_or_buffer="Data.csv")

datasets

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
## Splitting the dataset into the features and the label
# Features: every column except the last one, as a NumPy array.
X = datasets.iloc[:, :-1].values

# Label: the last column. Using -1 instead of the hard-coded position 3 keeps
# this line consistent with the feature slice above, so the two cannot drift
# apart if a column is added to or removed from the dataset.
# (Positional indices are 0-based; negative indices count from the end.)
y = datasets.iloc[:, -1].values

X, y                                       # X: all features - y: label / target value

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

In [4]:
## Taking Care of Missing Values
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22, so importing it
# fails on current installs. `sklearn.impute.SimpleImputer` is its replacement:
#   * missing values are identified with `np.nan` (the old "NaN" string is not accepted),
#   * there is no `axis` argument - imputation is always done column by column,
#     which is what `axis=0` meant for the old class.
from sklearn.impute import SimpleImputer

# Imputation transformer that fills every NaN according to `strategy`.
# strategy can be: "mean", "median", "most_frequent" or "constant".
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")


In [5]:
# Columns 1 and 2 (Age and Salary) are the numeric ones that contain NaN.
# fit_transform learns the per-column statistic and fills the gaps in one call;
# `imputer` itself is left fitted, exactly as after a separate fit().
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

X[:, 1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, 63777.77777777778],
       [35.0, 58000.0],
       [38.77777777777778, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

In [7]:
## Encoding Categorical Data
from sklearn.preprocessing import LabelEncoder          # encodes labels as integers between 0 and n_classes-1

labelencoder_X = LabelEncoder()
labelencoder_X.fit(X[:, 0])                             # learn the distinct country names
X[:, 0] = labelencoder_X.transform(X[:, 0])             # replace each name with its integer code

X[:, 0]                                                 # France, Germany, Spain are encoded as 0, 1, 2 respectively,
                                                        # so the column now holds numerical values only.

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object)

In [8]:
# One-hot encoding "binarizes" the category: every distinct value of column 0
# gets its own 0/1 column. Without it a model would read the integer codes as
# ordered (2 > 1 > 0), which is meaningless for countries and can distort
# predictions; one-hot columns put all categories on equal footing.
#
# The `categorical_features` argument of OneHotEncoder was removed in
# scikit-learn 0.22. Selecting which columns to encode is now done with
# ColumnTransformer, which applies the encoder to column 0 and passes the
# remaining columns through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

onehotencoder = ColumnTransformer(
    transformers=[("country", OneHotEncoder(), [0])],   # encode only column 0
    remainder="passthrough",                            # keep the other columns, appended after the one-hot columns
    sparse_threshold=0,                                 # always return a dense array (replaces the old `.toarray()`)
)

# X is an object array at this point, so the stacked result is too; cast it to
# float to get the same numeric matrix the old encoder produced.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)

X                                                       # one 0/1 column per country, followed by the remaining feature columns

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [9]:
# Encode the target labels as integers with a dedicated encoder.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit(y).transform(y)   # fit() returns the encoder itself, so the calls can be chained

y                                     # encoded like -> No : 0 , Yes : 1

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [15]:
## Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split

# Creates the datasets used for training and for testing the ML model.
#  - test_size: fraction of the rows held out for testing; values around 0.2 or 0.25 are common.
#  - random_state: fixed seed, so that re-running the split gives the same result.
#  - other parameters (for example "shuffle") can matter for real-world problems.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
                


In [16]:
# Display the four arrays produced by the split as a single tuple.
(X_train, X_test, y_train, y_test)

(array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
         6.37777778e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
         6.70000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
         4.80000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
         5.20000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
         7.90000000e+04],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
         6.10000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
         7.20000000e+04],
        [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
         5.80000000e+04]]),
 array([[0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
        [0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04]]),
 array([1, 1, 1, 0, 1, 0, 0, 1]),
 array([0, 0]))

In [21]:
## Feature Scaling
from sklearn.preprocessing import StandardScaler    # Standardization of a dataset is a common requirement
                                                    # for many machine learning estimators: they might behave
                                                    # badly if the individual features do not more or less look
                                                    # like standard normally distributed data (by sklearn_docs)

# There are many scalers; StandardScaler is the one used here.
# Scaling can speed up computation, and some algorithms require
# features on the same scale during learning.
sc_X = StandardScaler()
sc_X.fit(X_train)                                   # the scaler is fitted on the training rows only
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)                     # the test rows are transformed with the scaler fitted on the training rows

In [22]:
# Display the scaled training and test feature matrices.
(X_train, X_test)

(array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
        [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
        [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
        [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
        [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
        [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
        [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
        [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]]),
 array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
        [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]]))