# Data Preprocessing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the raw CSV into a DataFrame; the bare name on the last line makes
# the notebook render it as the cell's output.
dataset = pd.read_csv(filepath_or_buffer="Data.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Splitting the dataset into features (x) and target (y)

In [2]:
# Feature matrix: every column except the last one (Country, Age, Salary).
x = dataset.iloc[:, :-1].values
# Target vector: the last column (Purchased). Selecting it with -1 instead of
# a hard-coded 3 keeps it consistent with the `:-1` slice used for x, so the
# pair stays correct if feature columns are added or removed.
y = dataset.iloc[:, -1].values

## Taking Care Of Missing Data

In [3]:
from sklearn.impute import SimpleImputer

# SimpleImputer completes missing values, one column at a time.
# It replaces sklearn.preprocessing.Imputer, which was deprecated in
# scikit-learn 0.20 and removed in 0.22.
#
# Parameters:
#     missing_values : placeholder that marks a missing entry (default=np.nan).
#         Use np.nan for the NaN cells pandas creates from empty CSV fields;
#         the old string form "NaN" is no longer accepted.
#     strategy : string, optional (default="mean")
#         - If "mean", replace missing values using the mean of each column.
#         - If "median", replace missing values using the median of each column.
#         - If "most_frequent", replace missing values using the most frequent
#           value of each column.
#         - If "constant", replace missing values with `fill_value`.
#     copy : boolean, optional (default=True)
#         If True, a copy of X is created. If False, imputation is done
#         in-place whenever possible.
#
# Unlike the old Imputer there is no `axis` argument: imputation is always
# done per column (the former axis=0 behaviour).

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Columns 1 and 2 (Age, Salary) are the numeric columns that contain gaps;
# fit on them and write the completed values back into x.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]




## Encoding categorical data

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Map each country name in column 0 to an integer code and write the codes
# back into the feature matrix.
labelencoder_x = LabelEncoder()
country_codes = labelencoder_x.fit_transform(x[:, 0])
x[:, 0] = country_codes

In [7]:
# Display the feature matrix after the country column has been label-encoded.
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
from sklearn.compose import ColumnTransformer

# One-hot encode the country column (index 0) and pass Age and Salary through
# unchanged. `OneHotEncoder(categorical_features=[0])` was removed in
# scikit-learn 0.22; ColumnTransformer is the supported way to apply an
# encoder to selected columns. The encoded columns come first in the output,
# followed by the passthrough columns, matching the previous layout.
#
# Column 0 is not label-encoded again here: it already holds integer codes
# from the earlier cell, and refitting labelencoder_x on those codes would
# overwrite its country-name classes_. (OneHotEncoder also accepts the raw
# country strings directly, so the encoding works either way.)
#
# sparse_threshold=0 forces a dense ndarray, so no .toarray() call is needed.
onehotencoder = ColumnTransformer(
    transformers=[("country", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)

# Cast to float rather than int: an int cast truncates the imputed means
# (e.g. 38.78 -> 38 for Age, 63777.78 -> 63777 for Salary).
x = np.asarray(onehotencoder.fit_transform(x), dtype=float)

# Encode the target labels ("No"/"Yes") as integers.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

If you previously used a LabelEncoder to convert the categories to integers before applying OneHotEncoder, that step is no longer needed: OneHotEncoder can now encode the string categories directly.


## Splitting the dataset into training set and test set
##### In newer versions of scikit-learn, `train_test_split` moved from the `cross_validation` module to the `model_selection` module

In [13]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)