# This is a template for preprocessing data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### First thing you want to do is get the data

In [2]:
# Build a Path object that points at the current working directory.
# (os stays imported here because the next cell uses os.path.join.)
import os
from pathlib import Path

current_dir = Path.cwd()
current_dir

WindowsPath('D:/Workspace/machine-learning-study/notebooks')

In [3]:
# Location of the example data set, relative to the project folder:
# Machine Learning A-Z Template Folder\Part 1 - Data Preprocessing\Data_Preprocessing
# Do not start it with a path separator: os.path.join would then treat it as a
# rooted path and drop the directories of w_dir instead of appending to them.
data_url = r'Machine Learning A-Z Template Folder/Part 1 - Data Preprocessing/Data_Preprocessing/Data.csv'
# Go one directory up from the current directory.
w_dir = str(current_dir.parents[0])
path = os.path.join(w_dir, data_url)
path

'D:\\Workspace\\machine-learning-study\\Machine Learning A-Z Template Folder/Part 1 - Data Preprocessing/Data_Preprocessing/Data.csv'

### With that we can get the data using pandas

In [4]:
# The CSV file is correctly formatted, so read_csv needs no extra parameters.
data = pd.read_csv(path)
data.head(5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## Now we need to split the data into the independent variables (X) and the dependent variable (y)
### We use the values in X to figure out the corresponding values of y

In [5]:
# iloc selects by position: the first slot is for rows, the second for cols.
# X: all the rows, every col except the last one.
X = data.iloc[:, :-1].values
# y: all the rows, only the last col.
y = data.iloc[:, -1].values

### We need to make sure that our data set is not missing data

#### Let's look at the row at index 4 of X as an example

In [6]:
X[4] #as we can see, the third value is NaN; we take care of this by taking the mean of that column and inserting that value

array(['Germany', 40.0, nan], dtype=object)

In [7]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22;
# sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer
# SimpleImputer works col by col, so every NaN is replaced by the mean of its own col
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit and transform only the cols in which we want to make sure there are no missing values
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X # as we can see, the values which were NaN have been turned into the mean of the col

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Another problem we face is that the algorithms take only numerical data

#### this means that we cannot have cols which have string values

In [8]:
#for example: the first col in X takes one of the values France, Spain, Germany
#we can solve this by encoding the data
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
#use a separate encoder for y: refitting labelencoder_X on y would overwrite the country
#classes it learned, so labelencoder_X.inverse_transform could no longer decode X[:, 0]
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
X[:, 0], y # as we can see the data is turned into numerical values that we can work with (France = 0, Spain = 2, Germany = 1)

(array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object),
 array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64))

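### With the encoding we face another problem: the integer codes imply an order between the countries (e.g. Spain = 2 > Germany = 1) that does not really exist, so the algorithm would be biased towards the higher values and would not work as we want it to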

In [9]:
#to solve this we can encode the values again, this time creating a col for each option with the value one where the data fits
#OneHotEncoder's categorical_features argument was deprecated in scikit-learn 0.20 and removed in 0.22;
#ColumnTransformer is the supported way to apply an encoder to selected cols only
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
onehotencoder = ColumnTransformer(
    [('country', OneHotEncoder(), [0])], # (name, transformer, cols where we want the encoding to happen)
    remainder='passthrough',             # keep the other cols unchanged, placed to the right of the encoded ones
    sparse_threshold=0,                  # always return a dense array
)
#notice that we pass the whole of X because the cols to encode were given in the constructor
#the passed-through cols come from an object array, so we cast the result to float
X = onehotencoder.fit_transform(X).astype(float)
#we also dont need to do this for y because it only has 2 values 0,1
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

### We don't want to overfit the data, so we split it up into training data and testing data

#### Overfitting the data means that the algorithm knows the data well but doesn't learn the logic: it memorizes the data

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test

(array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
           4.00000000e+01,   6.37777778e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           3.70000000e+01,   6.70000000e+04],
        [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           2.70000000e+01,   4.80000000e+04],
        [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           3.87777778e+01,   5.20000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           4.80000000e+01,   7.90000000e+04],
        [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           3.80000000e+01,   6.10000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           4.40000000e+01,   7.20000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           3.50000000e+01,   5.80000000e+04]]),
 array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
           3.00000000e+01,   5.40000000e+04],
        [  0.0000

### Last but not least we need to feature scale the data; this means we standardize every col so that it has a mean of 0 and a standard deviation of 1

#### We do this to make sure the algorithm handles each col without bias, as, again, bigger values have a bigger impact on the algorithm

#### This will also make it easier for the algorithm to work

In [11]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only...
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
# ...and apply those same statistics to the test rows.
X_test = sc_X.transform(X_test)
X_train, X_test

(array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
        [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
        [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
        [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
        [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
        [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
        [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
        [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]]),
 array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
        [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]]))