# Data Processing

* Importing Python library
* Reading Data
* Missing Data
* Deal with Categorical Data
* Splitting Data
* Normalize Data

## Import Python Library and Read Data

In [223]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('Data.csv')

In [3]:
data

Unnamed: 0,City,Age,Sex,Smoke,HappinessIndex,Healthy
0,Mumbai,24.0,Male,Yes,241.0,Yes
1,London,80.0,Female,No,928.0,No
2,NewYork,38.0,Male,Yes,,Yes
3,NewYork,22.0,Female,Yes,786.0,Yes
4,NewYork,36.0,Male,Yes,967.0,Yes
5,London,,Female,Yes,665.0,Yes
6,Mumbai,17.0,Female,No,293.0,No
7,NewYork,28.0,Female,No,494.0,Yes
8,Mumbai,45.0,Female,No,707.0,No
9,London,29.0,Male,Yes,599.0,No


In [115]:
x = data.iloc[:,0:5].values #we add values to extract the return from iloc as a numpy array instead of a DataFrame

In [116]:
x

array([['Mumbai', 24.0, 'Male', 'Yes', 241.0],
       ['London', 80.0, 'Female', 'No', 928.0],
       ['NewYork', 38.0, 'Male', 'Yes', nan],
       ['NewYork', 22.0, 'Female', 'Yes', 786.0],
       ['NewYork', 36.0, 'Male', 'Yes', 967.0],
       ['London', nan, 'Female', 'Yes', 665.0],
       ['Mumbai', 17.0, 'Female', 'No', 293.0],
       ['NewYork', 28.0, 'Female', 'No', 494.0],
       ['Mumbai', 45.0, 'Female', 'No', 707.0],
       ['London', 29.0, 'Male', 'Yes', 599.0]], dtype=object)

In [117]:
y = data.iloc[:, 5].values

In [110]:
y

array(['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No'],
      dtype=object)

In [118]:
xy = x.copy()

In [9]:
xy = np.append(xy, y.reshape((10,1)), axis=1) # if we need to append a y array back to the array

In [119]:
xy

array([['Mumbai', 24.0, 'Male', 'Yes', 241.0],
       ['London', 80.0, 'Female', 'No', 928.0],
       ['NewYork', 38.0, 'Male', 'Yes', nan],
       ['NewYork', 22.0, 'Female', 'Yes', 786.0],
       ['NewYork', 36.0, 'Male', 'Yes', 967.0],
       ['London', nan, 'Female', 'Yes', 665.0],
       ['Mumbai', 17.0, 'Female', 'No', 293.0],
       ['NewYork', 28.0, 'Female', 'No', 494.0],
       ['Mumbai', 45.0, 'Female', 'No', 707.0],
       ['London', 29.0, 'Male', 'Yes', 599.0]], dtype=object)

## Missing Data

In [120]:
xy = x.copy()

In [121]:
xy

array([['Mumbai', 24.0, 'Male', 'Yes', 241.0],
       ['London', 80.0, 'Female', 'No', 928.0],
       ['NewYork', 38.0, 'Male', 'Yes', nan],
       ['NewYork', 22.0, 'Female', 'Yes', 786.0],
       ['NewYork', 36.0, 'Male', 'Yes', 967.0],
       ['London', nan, 'Female', 'Yes', 665.0],
       ['Mumbai', 17.0, 'Female', 'No', 293.0],
       ['NewYork', 28.0, 'Female', 'No', 494.0],
       ['Mumbai', 45.0, 'Female', 'No', 707.0],
       ['London', 29.0, 'Male', 'Yes', 599.0]], dtype=object)

In [122]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

In [123]:
xy[:,1:2] = imp.fit_transform(xy[:,1:2]) # replaces NaN in second column with mean
xy[:,4:5] = imp.fit_transform(xy[:,4:5]) # replaces NaN in last column with mean

In [124]:
xy

array([['Mumbai', 24.0, 'Male', 'Yes', 241.0],
       ['London', 80.0, 'Female', 'No', 928.0],
       ['NewYork', 38.0, 'Male', 'Yes', 631.1111111111111],
       ['NewYork', 22.0, 'Female', 'Yes', 786.0],
       ['NewYork', 36.0, 'Male', 'Yes', 967.0],
       ['London', 35.44444444444444, 'Female', 'Yes', 665.0],
       ['Mumbai', 17.0, 'Female', 'No', 293.0],
       ['NewYork', 28.0, 'Female', 'No', 494.0],
       ['Mumbai', 45.0, 'Female', 'No', 707.0],
       ['London', 29.0, 'Male', 'Yes', 599.0]], dtype=object)

In [72]:
xy = x.copy()

In [73]:
type(xy[2,4])

float

In [74]:
pd1 = pd.DataFrame(xy)

In [79]:
pd1.dropna().values

array([['Mumbai', 24.0, 'Male', 'Yes', 241.0],
       ['London', 80.0, 'Female', 'No', 928.0],
       ['NewYork', 22.0, 'Female', 'Yes', 786.0],
       ['NewYork', 36.0, 'Male', 'Yes', 967.0],
       ['Mumbai', 17.0, 'Female', 'No', 293.0],
       ['NewYork', 28.0, 'Female', 'No', 494.0],
       ['Mumbai', 45.0, 'Female', 'No', 707.0],
       ['London', 29.0, 'Male', 'Yes', 599.0]], dtype=object)

In [78]:
pd1.dropna(axis='columns').values

array([['Mumbai', 'Male', 'Yes'],
       ['London', 'Female', 'No'],
       ['NewYork', 'Male', 'Yes'],
       ['NewYork', 'Female', 'Yes'],
       ['NewYork', 'Male', 'Yes'],
       ['London', 'Female', 'Yes'],
       ['Mumbai', 'Female', 'No'],
       ['NewYork', 'Female', 'No'],
       ['Mumbai', 'Female', 'No'],
       ['London', 'Male', 'Yes']], dtype=object)

## Categorical Data

In [185]:
xy = x.copy()

In [186]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

In [187]:
xy[:,1:2] = imp.fit_transform(xy[:,1:2]) # replaces NaN in second column with mean
xy[:,4:5] = imp.fit_transform(xy[:,4:5]) # replaces NaN in last column with mean

In [188]:
xy # have to replace non numerical data into numerical data

array([['Mumbai', 24.0, 'Male', 'Yes', 241.0],
       ['London', 80.0, 'Female', 'No', 928.0],
       ['NewYork', 38.0, 'Male', 'Yes', 631.1111111111111],
       ['NewYork', 22.0, 'Female', 'Yes', 786.0],
       ['NewYork', 36.0, 'Male', 'Yes', 967.0],
       ['London', 35.44444444444444, 'Female', 'Yes', 665.0],
       ['Mumbai', 17.0, 'Female', 'No', 293.0],
       ['NewYork', 28.0, 'Female', 'No', 494.0],
       ['Mumbai', 45.0, 'Female', 'No', 707.0],
       ['London', 29.0, 'Male', 'Yes', 599.0]], dtype=object)

In [189]:
le_x = LabelEncoder()
le_y = LabelEncoder()

In [190]:
xy[:,0] = le_x.fit_transform(xy[:,0]) # before one hot encoding we need to label encoding (mumbay, NewYork, Lodon)
xy[:,2] = le_x.fit_transform(xy[:,2]) # before one hot encoding we need to label encoding (female, male)
xy[:,3] = le_x.fit_transform(xy[:,3]) # this is straight-forward because we only have two values (yes/no)
y = le_y.fit_transform(y) # this is straight-forward because we only have two values (yes/no)

In [191]:
xy

array([[1, 24.0, 1, 1, 241.0],
       [0, 80.0, 0, 0, 928.0],
       [2, 38.0, 1, 1, 631.1111111111111],
       [2, 22.0, 0, 1, 786.0],
       [2, 36.0, 1, 1, 967.0],
       [0, 35.44444444444444, 0, 1, 665.0],
       [1, 17.0, 0, 0, 293.0],
       [2, 28.0, 0, 0, 494.0],
       [1, 45.0, 0, 0, 707.0],
       [0, 29.0, 1, 1, 599.0]], dtype=object)

In [192]:
y

array([1, 0, 1, 1, 1, 1, 0, 1, 0, 0], dtype=int64)

In [193]:
ohe_city = OneHotEncoder(categorical_features=[0])

In [194]:
xy = ohe_city.fit_transform(xy).toarray()

In [195]:
xy

array([[  0.        ,   1.        ,   0.        ,  24.        ,
          1.        ,   1.        , 241.        ],
       [  1.        ,   0.        ,   0.        ,  80.        ,
          0.        ,   0.        , 928.        ],
       [  0.        ,   0.        ,   1.        ,  38.        ,
          1.        ,   1.        , 631.11111111],
       [  0.        ,   0.        ,   1.        ,  22.        ,
          0.        ,   1.        , 786.        ],
       [  0.        ,   0.        ,   1.        ,  36.        ,
          1.        ,   1.        , 967.        ],
       [  1.        ,   0.        ,   0.        ,  35.44444444,
          0.        ,   1.        , 665.        ],
       [  0.        ,   1.        ,   0.        ,  17.        ,
          0.        ,   0.        , 293.        ],
       [  0.        ,   0.        ,   1.        ,  28.        ,
          0.        ,   0.        , 494.        ],
       [  0.        ,   1.        ,   0.        ,  45.        ,
          0.    

In [196]:
ohe_sex = OneHotEncoder(categorical_features=[4])

In [197]:
xy = ohe_sex.fit_transform(xy)

In [199]:
xy.toarray() # OneHot Encoding format -> | female | male | london | mumbai | newyork | age | smoking (yes/no) | happinessIndex

array([[  0.        ,   1.        ,   0.        ,   1.        ,
          0.        ,  24.        ,   1.        , 241.        ],
       [  1.        ,   0.        ,   1.        ,   0.        ,
          0.        ,  80.        ,   0.        , 928.        ],
       [  0.        ,   1.        ,   0.        ,   0.        ,
          1.        ,  38.        ,   1.        , 631.11111111],
       [  1.        ,   0.        ,   0.        ,   0.        ,
          1.        ,  22.        ,   1.        , 786.        ],
       [  0.        ,   1.        ,   0.        ,   0.        ,
          1.        ,  36.        ,   1.        , 967.        ],
       [  1.        ,   0.        ,   1.        ,   0.        ,
          0.        ,  35.44444444,   1.        , 665.        ],
       [  1.        ,   0.        ,   0.        ,   1.        ,
          0.        ,  17.        ,   0.        , 293.        ],
       [  1.        ,   0.        ,   0.        ,   0.        ,
          1.        ,  28.       

## Splitting Data in Train and Test

In [247]:
xy_train, xy_test, y_train, y_test = train_test_split(xy, y, test_size=0.2) # 20% of the data goes for testing and 80% goes for training

In [248]:
xy_train.toarray()

array([[  1.        ,   0.        ,   0.        ,   1.        ,
          0.        ,  17.        ,   0.        , 293.        ],
       [  0.        ,   1.        ,   0.        ,   0.        ,
          1.        ,  38.        ,   1.        , 631.11111111],
       [  1.        ,   0.        ,   1.        ,   0.        ,
          0.        ,  35.44444444,   1.        , 665.        ],
       [  1.        ,   0.        ,   0.        ,   0.        ,
          1.        ,  22.        ,   1.        , 786.        ],
       [  1.        ,   0.        ,   1.        ,   0.        ,
          0.        ,  80.        ,   0.        , 928.        ],
       [  1.        ,   0.        ,   0.        ,   0.        ,
          1.        ,  28.        ,   0.        , 494.        ],
       [  0.        ,   1.        ,   1.        ,   0.        ,
          0.        ,  29.        ,   1.        , 599.        ],
       [  1.        ,   0.        ,   0.        ,   1.        ,
          0.        ,  45.       

In [249]:
xy_test.toarray()

array([[  0.,   1.,   0.,   1.,   0.,  24.,   1., 241.],
       [  0.,   1.,   0.,   0.,   1.,  36.,   1., 967.]])

In [250]:
y_train

array([0, 1, 1, 1, 0, 1, 0, 0], dtype=int64)

In [251]:
y_test

array([1, 1], dtype=int64)

## Normalize Data

In [252]:
xy_test.toarray()

array([[  0.,   1.,   0.,   1.,   0.,  24.,   1., 241.],
       [  0.,   1.,   0.,   0.,   1.,  36.,   1., 967.]])

In [253]:
sc_x = StandardScaler() # (x - xmin) / (maxValue - minValue)... may use (x - xmean) / std_deviation instead

In [254]:
xy_train = sc_x.fit_transform(xy_train.toarray())

In [255]:
xy_train

array([[ 0.57735027, -0.57735027, -0.77459667,  1.73205081, -0.77459667,
        -1.08086023, -1.        , -1.94006195],
       [-1.73205081,  1.73205081, -0.77459667, -0.57735027,  1.29099445,
         0.06518512,  1.        , -0.03812622],
       [ 0.57735027, -0.57735027,  1.29099445, -0.57735027, -0.77459667,
        -0.07428072,  1.        ,  0.15250487],
       [ 0.57735027, -0.57735027, -0.77459667, -0.57735027,  1.29099445,
        -0.80799229,  1.        ,  0.83315161],
       [ 0.57735027, -0.57735027,  1.29099445, -0.57735027, -0.77459667,
         2.35727581, -1.        ,  1.63192711],
       [ 0.57735027, -0.57735027, -0.77459667, -0.57735027,  1.29099445,
        -0.48055076, -1.        , -0.80940085],
       [-1.73205081,  1.73205081,  1.29099445, -0.57735027, -0.77459667,
        -0.42597717,  1.        , -0.21875699],
       [ 0.57735027, -0.57735027, -0.77459667,  1.73205081, -0.77459667,
         0.44720023, -1.        ,  0.38876241]])

In [256]:
xy_test = sc_x.transform(xy_test.toarray()) #already fitted this before (xy_train) so just need to tranform

In [257]:
xy_test

array([[-1.73205081,  1.73205081, -0.77459667,  1.73205081, -0.77459667,
        -0.69884511,  1.        , -2.2325713 ],
       [-1.73205081,  1.73205081, -0.77459667, -0.57735027,  1.29099445,
        -0.04396206,  1.        ,  1.85130912]])