In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw table from Data.csv (path is relative to the notebook's working directory).
dataset = pd.read_csv("Data.csv")

In [6]:
# Features: every column except the last one. Target: the last column.
# The target is selected with -1 (instead of a hard-coded position) so it
# always stays in step with the ':-1' slice used for the features.
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [4]:
# Take care of missing data
#
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer. SimpleImputer always works
# column by column (what the old axis=0 argument asked for), and the
# missing-value marker is given as np.nan rather than the string 'NaN'.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [8]:
# Learn the replacement values from the two numeric columns only (indices 1
# and 2; column 0 holds the country strings), then overwrite those columns
# in x with the imputed result.

x[:,1:3] = imputer.fit(x[:,1:3]).transform(x[:,1:3])

In [9]:
# Inspect the feature matrix after imputation (columns 1 and 2 were overwritten above).
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [13]:
# Encoding categorical data

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [14]:
# Map each category in column 0 to an integer code. The fitted encoder is
# kept in labelencoder_x so labelencoder_x.classes_ documents which code
# (and therefore which one-hot column) belongs to which category.
labelencoder_x = LabelEncoder()
x[:,0] = labelencoder_x.fit_transform(x[:,0])

# One-hot encode the categorical column (index 0) and pass the remaining
# columns through untouched.
#
# OneHotEncoder(categorical_features=[0]) was removed in scikit-learn 0.22;
# column selection is now done with ColumnTransformer. The encoded columns
# come first in the output, followed by the passthrough columns.
from sklearn.compose import ColumnTransformer

onehotencoder = ColumnTransformer(
    transformers=[('categorical', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array, never a sparse matrix
)

# x is an object array at this point, so the stacked result is too;
# convert it to a plain float array.
x = np.asarray(onehotencoder.fit_transform(x), dtype=float)

In [19]:
# Check the dimensions of x after one-hot encoding.
x.shape

(10, 5)

In [21]:
# Render floats in plain fixed-point notation instead of scientific notation
# (this changes numpy's global print settings), then display the encoded matrix.
np.set_printoptions(formatter=dict(float_kind='{:f}'.format))
x

array([[1.000000, 0.000000, 0.000000, 44.000000, 72000.000000],
       [0.000000, 0.000000, 1.000000, 27.000000, 48000.000000],
       [0.000000, 1.000000, 0.000000, 30.000000, 54000.000000],
       [0.000000, 0.000000, 1.000000, 38.000000, 61000.000000],
       [0.000000, 1.000000, 0.000000, 40.000000, 63777.777778],
       [1.000000, 0.000000, 0.000000, 35.000000, 58000.000000],
       [0.000000, 0.000000, 1.000000, 38.777778, 52000.000000],
       [1.000000, 0.000000, 0.000000, 48.000000, 79000.000000],
       [0.000000, 1.000000, 0.000000, 50.000000, 83000.000000],
       [1.000000, 0.000000, 0.000000, 37.000000, 67000.000000]])

In [22]:
# Convert the target labels to integer codes. The fitted encoder stays
# available as labelencoder_y so the codes can be mapped back to the
# original labels with inverse_transform.

labelencoder_y = LabelEncoder().fit(y)
y = labelencoder_y.transform(y)

In [23]:
# Inspect the integer-encoded target vector.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [28]:
# Splitting the dataset into the test and training sets
#
# The sklearn.cross_validation module was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [33]:
# Show the training features and training targets, separated by a rule.
print(x_train, "---------------", y_train, sep="\n")

[[0.000000 1.000000 0.000000 40.000000 63777.777778]
 [1.000000 0.000000 0.000000 37.000000 67000.000000]
 [0.000000 0.000000 1.000000 27.000000 48000.000000]
 [0.000000 0.000000 1.000000 38.777778 52000.000000]
 [1.000000 0.000000 0.000000 48.000000 79000.000000]
 [0.000000 0.000000 1.000000 38.000000 61000.000000]
 [1.000000 0.000000 0.000000 44.000000 72000.000000]
 [1.000000 0.000000 0.000000 35.000000 58000.000000]]
---------------
[1 1 1 0 1 0 0 1]


In [34]:
# Show the test features and test targets, separated by a rule.
print(x_test, "---------------", y_test, sep="\n")

[[0.000000 1.000000 0.000000 30.000000 54000.000000]
 [0.000000 1.000000 0.000000 50.000000 83000.000000]]
---------------
[0 0]


In [35]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler

In [36]:
# Scaler for the feature matrix; StandardScaler standardizes each column
# (zero mean and unit variance with its default settings).
sc_x = StandardScaler()

In [37]:
# Learn the scaling parameters from the training split only, then apply that
# same fitted transformation to both splits. The test split is transformed
# but never fitted, so it is scaled exactly like the training data and does
# not influence the learned parameters.
# The target arrays (y_train, y_test) are left unscaled.

sc_x.fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

In [38]:
# Inspect the scaled training features.
x_train

array([[-1.000000, 2.645751, -0.774597, 0.263068, 0.123815],
       [1.000000, -0.377964, -0.774597, -0.253501, 0.461756],
       [-1.000000, -0.377964, 1.290994, -1.975398, -1.530933],
       [-1.000000, -0.377964, 1.290994, 0.052614, -1.111420],
       [1.000000, -0.377964, -0.774597, 1.640585, 1.720297],
       [-1.000000, -0.377964, 1.290994, -0.081312, -0.167514],
       [1.000000, -0.377964, -0.774597, 0.951826, 0.986148],
       [1.000000, -0.377964, -0.774597, -0.597881, -0.482149]])

In [39]:
# Inspect the test features after applying the scaler fitted on the training split.
x_test

array([[-1.000000, 2.645751, -0.774597, -1.458829, -0.901663],
       [-1.000000, 2.645751, -0.774597, 1.984964, 2.139811]])