In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Load the CSV file with pandas; read_csv returns a DataFrame, kept here as 'dataset'
dataset = pd.read_csv('Data.csv')

# Two entities are built from the dataset:
#   - X, the matrix of features (the columns used to make a prediction)
#   - y, the dependent variable vector (the column we want to predict)
# The code below relies on the usual layout where the features occupy the first columns and the dependent variable is the last column
# For Data.csv the features are 'Country', 'Age' and 'Salary'; the dependent variable is 'Purchased' (a yes/no value)

# iloc selects by integer position as [rows, columns]
# ':' keeps every row, ':-1' keeps every column except the last one
feature_frame = dataset.iloc[:, :-1]
# '-1' on the column axis keeps only the last column
target_series = dataset.iloc[:, -1]

# '.values' extracts the underlying array of values from each selection
X = feature_frame.values
y = target_series.values


In [5]:
# X is the matrix of features: every column of the dataset except the last one,
# which is the dependent variable we want to predict
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
# y is the dependent variable vector (the last column of the dataset):
# the decisions of whether or not each customer purchased the product
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [7]:
# Handling missing data -> missing values cause errors in an ML model
# Options: remove the affected rows, or replace each missing value with a statistic (e.g. the mean) of the same feature

# scikit-learn ships many data pre-processing tools; SimpleImputer is the one used here
from sklearn.impute import SimpleImputer

# Imputer object that replaces every np.nan with the mean of its column
# Note: strategies other than the mean include the median and the most frequent value (the latter is relevant for categories)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only numerical columns can be averaged, so the slice 1:3 selects 'Age' and 'Salary' and leaves out 'Country'
# As a general rule include all numerical columns, since in huge datasets it is hard to spot which ones have missing values
# fit_transform performs both steps in a single call:
#   fit       -> computes the mean of each selected column
#   transform -> returns the selected columns with the missing values replaced by those means
# The result is written straight back into X, so the matrix of features is updated in place
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Encoding categorical data
# The dataset has a 'Country' column holding a category (France, Spain or Germany)
# An ML model would struggle to compute correlations between such string features and the outcome (dependent variable)
# so the categories (strings) have to be turned into numbers
# A naive idea would be to encode France as 0, Spain as 1 and Germany as 2
# However, the model could then interpret 0 < 1 < 2 as a meaningful numerical order between the countries,
# which does not exist and could lead to mis-interpreted correlations between the features and the outcome

# Instead, the country column is turned into one column per category (3 here)
# Each country becomes a binary vector, e.g. [1,0,0], [0,1,0] and [0,0,1]
# (illustrative; which country gets which position is decided by the encoder)
# This way there is no numerical order between the 3 countries
# This is called one-hot encoding, very useful when pre-processing datasets containing categorical variables

# The 'Yes'/'No' labels of the Purchased column also need to be replaced by ones and zeros; that is not done in this cell

# Two classes are used:
#   - ColumnTransformer from the compose module of sklearn
#   - OneHotEncoder from the preprocessing module of sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Step 1: create the ColumnTransformer object
#   transformers     -> which transformation to apply and on which column indexes (one-hot encoding on column 0, the country)
#   remainder        -> 'passthrough' keeps the columns that are not transformed (here age and salary) instead of dropping them
#   sparse_threshold -> 0 forces a dense result; with the default threshold the stacked output can come back as a
#                       SciPy sparse matrix when it is mostly zeros (many categories), and np.array() on a sparse
#                       matrix does not produce the expected 2-D array
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Step 2: fit and transform in one call with 'fit_transform', then store the result back in X
# np.array(...) guarantees that X is a NumPy array, which is what the future ML models will expect
X = np.array(ct.fit_transform(X))

