In [None]:
# Pre-processing

In [49]:
import numpy as np
import pandas as pd

In [50]:
# Load the raw table: every column except the last is a feature, the last column is the target.
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
# Pick the target by position from the end so it always matches the column excluded from X above
# (previously a hard-coded index 3, which only lines up when the file has exactly four columns).
Y = dataset.iloc[:, -1].values

In [51]:
from sklearn.impute import SimpleImputer
import numpy as np

# Fill missing numeric entries (NaN) with the mean of the remaining values in the same column.
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Columns 1 and 2 (counting from 0) are imputed: the column means are learned and the
# gaps are filled in a single fit_transform step.
# NOTE: the means are computed over all rows, i.e. before the train/test split further down.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [52]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Map each country name in column 0 to an integer code. LabelEncoder numbers the distinct
# labels in sorted order, so for example ["Germany", "France", "France"] --> [1, 0, 0]
labelencoder_X = LabelEncoder().fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])

In [60]:
# One-hot encode the integer country codes held in column 0.
# https://datascience.stackexchange.com/questions/71804/how-to-perform-one-hot-encoding-on-multiple-categorical-columns
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
# categories='auto' lets the encoder derive the categories from the values it is fitted on
# (here: every row of X, since the train/test split happens later).
onehotencoder = OneHotEncoder(categories='auto')
onehotencoder.fit(X[:, :1])  # the slice keeps the column 2-D, as the encoder requires
encoded = onehotencoder.transform(X[:, :1]).toarray()  # sparse result -> dense ndarray

In [61]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [62]:
encoded

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [63]:
# Put the one-hot country columns first, followed by every column of X except the
# label-encoded country column they replace.
concatenated_array_columns = np.hstack((encoded, X[:, 1:]))

In [65]:
# From here on X is the one-hot country columns followed by the remaining columns of the
# previous X. Rebinding X makes the earlier cells that index X by position non-idempotent:
# re-running them without reloading the data operates on this new layout.
X = concatenated_array_columns

In [66]:
# Encode the target labels as integer class codes.
labelencoder_Y = LabelEncoder().fit(Y)
Y = labelencoder_Y.transform(Y)

In [72]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state seeds the shuffling so the split is reproducible across machines,
# which is primarily useful for reproducing bugs or results.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [73]:
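# NOTE: disabled earlier attempt at feature scaling, kept for reference only.
# It would standardize every column of X (including the one-hot country columns) and it
# re-fits the scaler on the test set; the test set should instead be transformed with the
# scaler that was fitted on the training set.
# from sklearn.preprocessing import StandardScaler

# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.fit_transform(X_test)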

In [74]:
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

def scale_columns(data, fitted_scaler=None, columns=(-2, -1)):
    """Standardize selected columns of a 2-D array in place.

    By default the last two columns (age at -2, salary at -1) are scaled, each with
    its own mean and standard deviation, exactly as before. All other columns
    (e.g. the one-hot country columns) are left untouched.

    Typical use, so that the test set is scaled with the training statistics:

        train_scaler = scale_columns(X_train)
        scale_columns(X_test, fitted_scaler=train_scaler)

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose selected columns hold numeric values. Modified in place.
    fitted_scaler : object with a ``transform`` method, optional
        An already-fitted scaler to reuse. When omitted, a new ``StandardScaler``
        is fitted on the selected columns of ``data``.
    columns : sequence of int, optional
        Positions of the columns to scale. Defaults to ``(-2, -1)``.

    Returns
    -------
    The scaler that was applied (newly fitted, or ``fitted_scaler`` as passed in).
    """
    column_indices = list(columns)

    # Fancy indexing yields a copy; cast to float because the notebook's arrays are
    # dtype=object after mixing one-hot and numeric columns.
    numeric_values = data[:, column_indices].astype(float)

    # Fit only when no scaler was supplied. Re-fitting on every call (the previous
    # behaviour) would scale the test set with its own statistics instead of the
    # training set's, and left the shared scaler holding only the last column's fit.
    if fitted_scaler is None:
        fitted_scaler = StandardScaler().fit(numeric_values)

    # Write the scaled columns back into the caller's array.
    data[:, column_indices] = fitted_scaler.transform(numeric_values)
    return fitted_scaler

In [75]:
X_train

array([[0.0, 1.0, 0.0, 0.2630675731713538, 0.1238147854838185],
       [1.0, 0.0, 0.0, -0.25350147960148617, 0.4617563176278856],
       [0.0, 0.0, 1.0, -1.9753983221776195, -1.5309334063940294],
       [0.0, 0.0, 1.0, 0.05261351463427101, -1.1114197802841526],
       [1.0, 0.0, 0.0, 1.6405850472322605, 1.7202971959575162],
       [0.0, 0.0, 1.0, -0.08131179534387283, -0.16751412153692966],
       [1.0, 0.0, 0.0, 0.9518263102018072, 0.9861483502652316],
       [1.0, 0.0, 0.0, -0.5978808481167128, -0.48214934111933727]],
      dtype=object)

In [None]:
# Data Analysis