# This is a template for data preprocessing

## ============ Importing the data set ==========

In [2]:
import pandas as pd

# Load the raw table.
#   header=0    -> the first row of the file supplies the column names.
#   header=1    -> the second row supplies the names; the first row is dropped.
#   header=None -> the file has no header row at all.
dataset = pd.read_csv('Data.csv', sep = ',', header = 0)

# Handy snippets for inspecting the frame (uncomment as needed):
#dataset = pd.read_csv('https://s3-eu-west-1.amazonaws.com/shanebucket/downloads/uk-500.csv')
#print('# of rows: %d, # of columns: %d \n'
#      %(dataset.shape[0], dataset.shape[1]))
#
#dataset.head(2)
#dataset.tail(5)
#dataset.iloc[0]       # the first row
#dataset.iloc[-4]      # the fourth row from the end
#dataset.iloc[0, 3]    # a single cell, addressed by row and column
#dataset.iloc[:, -2]   # the second-to-last column
#dataset.iloc[1, 1:3]
#dataset.iloc[1, :-1]

# Independent variables: the first three columns. `.values` turns the
# DataFrame slice into a NumPy array.
x = dataset.iloc[:, :3].values
# Dependent variable: the last column.
y = dataset.iloc[:, -1].values

# Label and array are emitted on the same line, exactly as before.
print('x: ', x, sep = '')

print('y: ', y, sep = '')

x: [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
y: ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## ========= Taking care of the missing data ======

In [3]:
import numpy as np
from sklearn.impute import SimpleImputer

# Replace every missing value in the numeric columns (1 and 2) with the mean
# of its column.
#
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `SimpleImputer` is its replacement. It always imputes column-wise, so the
# old `axis = 0` argument is gone, and missing entries are identified with
# `np.nan` rather than the string 'NaN'.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
# Equivalent two-step form:
#imputer = imputer.fit(x[:, 1:3])
#xx = imputer.transform(x[:, 1:3])
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

print ('x: ')
print(x)

x: 
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## ========== Taking care of the categorical data ========

In [4]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#Dealing with the independent categorical data
print ('===Independent categorical data: \n')
# Map the category strings in column 0 to integer codes.
label_encoder_x = LabelEncoder()
x[:, 0] = label_encoder_x.fit_transform(x[:, 0])
#x[:, 3] = label_encoder_x.fit_transform(x[:, 3])

# One-hot encode column 0 only. The `categorical_features` argument of
# OneHotEncoder was removed in scikit-learn 0.22, so the categorical column is
# encoded on its own (as a 2-D slice) and then stacked in front of the
# remaining numeric columns -- the same column order the old API produced.
one_hot_encoder_x = OneHotEncoder()
country_dummies = one_hot_encoder_x.fit_transform(x[:, [0]]).toarray()
# `x` is an object array at this point; cast the numeric part so the result is
# a plain float array, as before.
x = np.hstack([country_dummies, x[:, 1:].astype(float)])
print ('Before dealing with the dummy variable trap: \nx: ')
print (x)
x = x[:, 1:] #drop the first dummy column to avoid the dummy variable trap.
print ('After dealing with the dummy variable trap: \nx: ')
print (x)

#Dealing with the categorical dependent variable. It can be handled the
#same way as the independent categorical variables, but if it is binary
#(0/1, yes/no, etc.) LabelEncoder alone is enough.
print ('\n===Dependent categorical data: \n')
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(y)
print ('y: ')
print (y)

===Independent categorical data: 

Before dealing with the dummy variable trap: 
x: 
[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]
After dealing with the dummy variable trap: 
x: 
[[0.00000000e+00 0.00000000e+00 4.40000000e+01 7.20000000e+04]
 [

## ============ Feature scaling ==========

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of x with a single scaler (a normalisation scaler
# could be swapped in here instead).
# NOTE(review): the scaler is fit on the full data set, before the train/test
# split in the next section -- confirm this is intended, since the test rows
# then influence the scaling statistics.
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)

# y is left unscaled. If it ever needs scaling, a single feature must first be
# reshaped into a column:
#y = sc.fit_transform(y.reshape(-1, 1))

print ('The feature-scaled x: ', x, sep = '\n')

The feature-scaled x: 
[[-6.54653671e-01 -6.54653671e-01  7.58874362e-01  7.49473254e-01]
 [-6.54653671e-01  1.52752523e+00 -1.71150388e+00 -1.43817841e+00]
 [ 1.52752523e+00 -6.54653671e-01 -1.27555478e+00 -8.91265492e-01]
 [-6.54653671e-01  1.52752523e+00 -1.13023841e-01 -2.53200424e-01]
 [ 1.52752523e+00 -6.54653671e-01  1.77608893e-01  6.63219199e-16]
 [-6.54653671e-01 -6.54653671e-01 -5.48972942e-01 -5.26656882e-01]
 [-6.54653671e-01  1.52752523e+00  0.00000000e+00 -1.07356980e+00]
 [-6.54653671e-01 -6.54653671e-01  1.34013983e+00  1.38753832e+00]
 [ 1.52752523e+00 -6.54653671e-01  1.63077256e+00  1.75214693e+00]
 [-6.54653671e-01 -6.54653671e-01 -2.58340208e-01  2.93712492e-01]]


## ============ Splitting data into training and test sets ========

In [6]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=0)
print ('x_train: ')
print (x_train)
print ('y_train: ')
print (y_train)
print ('x_test: ')
print (x_test)
print ('y_test')
print (y_test)

x_train: 
[[ 1.52752523e+00 -6.54653671e-01  1.77608893e-01  6.63219199e-16]
 [-6.54653671e-01 -6.54653671e-01 -2.58340208e-01  2.93712492e-01]
 [-6.54653671e-01  1.52752523e+00 -1.71150388e+00 -1.43817841e+00]
 [-6.54653671e-01  1.52752523e+00  0.00000000e+00 -1.07356980e+00]
 [-6.54653671e-01 -6.54653671e-01  1.34013983e+00  1.38753832e+00]
 [-6.54653671e-01  1.52752523e+00 -1.13023841e-01 -2.53200424e-01]
 [-6.54653671e-01 -6.54653671e-01  7.58874362e-01  7.49473254e-01]
 [-6.54653671e-01 -6.54653671e-01 -5.48972942e-01 -5.26656882e-01]]
y_train: 
[1 1 1 0 1 0 0 1]
x_test: 
[[ 1.52752523 -0.65465367 -1.27555478 -0.89126549]
 [ 1.52752523 -0.65465367  1.63077256  1.75214693]]
y_test
[0 0]
