In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [128]:
dataset = pd.read_csv('Data.csv')

In [129]:
#printing the dataset
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [130]:
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [131]:
# If we want all the dataframe columns to be analysed in the report
dataset.describe(include = 'all')

Unnamed: 0,Country,Age,Salary,Purchased
count,10,9.0,9.0,10
unique,3,,,2
top,France,,,No
freq,4,,,5
mean,,38.777778,63777.777778,
std,,7.693793,12265.579662,
min,,27.0,48000.0,
25%,,35.0,54000.0,
50%,,38.0,61000.0,
75%,,44.0,72000.0,


# In the result top is the most frequent value

In [132]:
# Experimenting with a Series to understand pd.describe() better
ser_a = pd.Series([2,3,4])

In [133]:
ser_a.describe()

count    3.0
mean     3.0
std      1.0
min      2.0
25%      2.5
50%      3.0
75%      3.5
max      4.0
dtype: float64

# Selecting the columns - Bifurcating into X, y features and prediction target

In [134]:
# Features
X = dataset.iloc[:,:-1].values

In [135]:
# Prediction Target
y = dataset.iloc[:, 3].values

In [136]:
print(X,y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]] ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# dataset.iloc[:, 3].values will return the numpy.ndarray

# Using Scikit-learn Pre-processing Imputer function

In [137]:
from sklearn.preprocessing import Imputer

In [138]:
imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)

# Imputer meaning: assign (a value) to something by inference from the value of the products or processes to which it contributes

In [139]:
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

In [140]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# What if we have qualitative data in our features... For example the first column in our features are countries

In [141]:
from sklearn.preprocessing import LabelEncoder

In [142]:
labelencoder = LabelEncoder()

In [143]:
type(labelencoder)

sklearn.preprocessing.label.LabelEncoder

In [144]:
# imputing the Label Encoder to the data
labelencoder = labelencoder.fit(X[:,0])

In [145]:
labelencoder

LabelEncoder()

In [146]:
X[:,0] = labelencoder.transform(X[:,0])

In [147]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

# But countries got attributed to numbers now.
France to 0, Spain to 2, Germany to 1 etc.,
So, is it like Spain is prioritized than France as 2 > 0.
This seems confusing.
Hence, we use OneHotEncoding where in the number of different countries will be taken as columns
and a check whether the country is in place is denoted by '1' in rows.

In [148]:
from sklearn.preprocessing import OneHotEncoder

In [149]:
onehotencoder = OneHotEncoder(categorical_features =[0])

In [152]:
X = onehotencoder.fit_transform(X).toarray()

# On Question
What is the difference between fit() methond and fit_transform()
It is all about Standard_score
https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models
https://en.wikipedia.org/wiki/Standard_score

In [153]:
X

array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.40000000e+01,   7.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   2.70000000e+01,   4.80000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   3.00000000e+01,   5.40000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   3.80000000e+01,   6.10000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   4.00000000e+01,   6.37777778e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   3.50000000e+01,   5.80000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   3.87777778e+01,   5.20000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.80000000e+01,   7.90000000e+04],


# Note
One hot encoder only takes numerical categorical values, hence any value of string type should be label encoded before one hot encoded.

# Ordinal Categorical Variables and Nominal Categoriacal Variables.
# Dummy variables Trap
https://towardsdatascience.com/one-hot-encoding-multicollinearity-and-the-dummy-variable-trap-b5840be3c41a

# Ordinal Categorical Variables: low, medium, high
# Nominal Categorical Variables: Male, Female

Multicollinearity occurs when two or more variables are dependent on each other.

# Definition of Dummy Variables Trap
We intended to solve the problem of using categorical variables, but got trapped by the problem of Multicollinearity. This is called the Dummy Variable Trap.