# Master Code for Machine Learning Studies

## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [3]:
# Load the raw CSV into a DataFrame
dataset = pd.read_csv('Data.csv')

# Positional split: every column but the last holds the features,
# the last column holds the value to predict
feature_frame, target_series = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# INDEPENDENT variables as a 2-D NumPy array
X = feature_frame.values

# DEPENDENT variable as a 1-D NumPy array
y = target_series.values

In [4]:
dataset # The full DataFrame exactly as read from Data.csv (missing values still present)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
X # Feature matrix: every column of dataset except the last one

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
y # Target vector: the last column of dataset (a 1-D array, not a matrix)

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Taking care of missing data

In [7]:
# SimpleImputer replaces missing entries with a per-column statistic
from sklearn.impute import SimpleImputer

# Configure it to swap every NaN for the mean of its column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Columns 1 and 2 are the numeric ones (column 0 is the country string):
# learn their means and fill the gaps in a single step.
# NOTE(review): the means are computed over all rows, before the train/test
# split performed later, so test rows influence the imputed values.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding Categorical Data

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Column 0 (the country) is a nominal feature: plain integer codes would imply
# an ordering between countries that does not exist, so it is expanded into
# one dummy column per category instead.
#
# OneHotEncoder accepts the string categories directly, so no LabelEncoder pass
# over X is needed. The `categorical_features` argument that used to select the
# column was deprecated in scikit-learn 0.20 and removed in 0.22;
# ColumnTransformer is the supported way to encode a single column.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers=[('country', onehotencoder, [0])],
    remainder='passthrough',  # keep the numeric columns, placed after the dummy columns
    sparse_threshold=0,       # always return a dense array
)

# The passthrough columns come from an object-dtype array, so cast the result
# to float to get a purely numeric matrix for the later steps.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

# The target only needs integer class labels, so LabelEncoder is sufficient here
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardise only the columns from index 3 onward; the first three columns
# are the 0/1 dummy variables and are left untouched.
sc = StandardScaler()
train_scaled = sc.fit_transform(X_train[:, 3:])  # statistics are learned from the training rows only
test_scaled = sc.transform(X_test[:, 3:])        # the test rows reuse the training statistics

X_train[:, 3:] = train_scaled
X_test[:, 3:] = test_scaled

In [11]:
X_train

array([[ 0.        ,  1.        ,  0.        ,  0.26306757,  0.12381479],
       [ 1.        ,  0.        ,  0.        , -0.25350148,  0.46175632],
       [ 0.        ,  0.        ,  1.        , -1.97539832, -1.53093341],
       [ 0.        ,  0.        ,  1.        ,  0.05261351, -1.11141978],
       [ 1.        ,  0.        ,  0.        ,  1.64058505,  1.7202972 ],
       [ 0.        ,  0.        ,  1.        , -0.0813118 , -0.16751412],
       [ 1.        ,  0.        ,  0.        ,  0.95182631,  0.98614835],
       [ 1.        ,  0.        ,  0.        , -0.59788085, -0.48214934]])