In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [127]:
dataset = pd.read_csv('Data.csv')

In [128]:
#printing the dataset
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [129]:
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [130]:
# If we want all the dataframe columns to be analysed in the report
dataset.describe(include = 'all')

Unnamed: 0,Country,Age,Salary,Purchased
count,10,9.0,9.0,10
unique,3,,,2
top,France,,,No
freq,4,,,5
mean,,38.777778,63777.777778,
std,,7.693793,12265.579662,
min,,27.0,48000.0,
25%,,35.0,54000.0,
50%,,38.0,61000.0,
75%,,44.0,72000.0,


# In the result top is the most frequent value

In [131]:
# Experimenting with a Series to understand pd.describe() better
ser_a = pd.Series([2,3,4])

In [132]:
ser_a.describe()

count    3.0
mean     3.0
std      1.0
min      2.0
25%      2.5
50%      3.0
75%      3.5
max      4.0
dtype: float64

# Selecting the columns - Bifurcating into X, y features and prediction target

In [133]:
# Features
X = dataset.iloc[:,:-1].values

In [134]:
# Prediction Target
y = dataset.iloc[:, 3].values

In [135]:
print(X,y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]] ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# dataset.iloc[:, 3].values will return the numpy.ndarray

# Using Scikit-learn Pre-processing Imputer function

In [136]:
from sklearn.preprocessing import Imputer

In [137]:
imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)

# Imputer meaning: assign (a value) to something by inference from the value of the products or processes to which it contributes

In [138]:
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

In [139]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# What if we have qualitative data in our features... For example the first column in our features are countries

In [140]:
from sklearn.preprocessing import LabelEncoder

In [141]:
labelencoder = LabelEncoder()

In [142]:
type(labelencoder)

sklearn.preprocessing.label.LabelEncoder

In [143]:
# imputing the Label Encoder to the data
labelencoder = labelencoder.fit(X[:,0])

In [144]:
labelencoder

LabelEncoder()

In [145]:
X[:,0] = labelencoder.transform(X[:,0])

In [146]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

# But countries got attributed to numbers now.
France to 0, Spain to 2, Germany to 1 etc.,
So, is it like Spain is prioritized than France as 2 > 0.
This seems confusing.
Hence, we use OneHotEncoding where in the number of different countries will be taken as columns
and a check whether the country is in place is denoted by '1' in rows.

In [147]:
from sklearn.preprocessing import OneHotEncoder

In [148]:
onehotencoder = OneHotEncoder(categorical_features =[0])

In [149]:
X = onehotencoder.fit_transform(X).toarray()

# On Question
What is the difference between fit() methond and fit_transform()
It is all about Standard_score
https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models
https://en.wikipedia.org/wiki/Standard_score

In [150]:
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

# Note
One hot encoder only takes numerical categorical values, hence any value of string type should be label encoded before one hot encoded.

# Ordinal Categorical Variables and Nominal Categoriacal Variables.
# Dummy variables Trap
https://towardsdatascience.com/one-hot-encoding-multicollinearity-and-the-dummy-variable-trap-b5840be3c41a

# Ordinal Categorical Variables: low, medium, high
# Nominal Categorical Variables: Male, Female

Multicollinearity occurs when two or more variables are dependent on each other.
As one variable could be expressed as other variable like
Low = 1 - Medium - High
as Low + Medium + High for a single Vector is equal to 1

When both Medium, High are expressed using Low, it leads to occurance of individual variables which leads to Curse of Dimensionality

# Definition of Dummy Variables Trap
We intended to solve the problem of using categorical variables, but got trapped by the problem of Multicollinearity. This is called the Dummy Variable Trap.

# Problems of this Multicollinearity
1. As one variable could be expressed as other variable like
    Low = 1 - Medium - High
    as Low + Medium + High for a single Vector is equal to 1

   When both Medium, High are expressed using Low, it leads to occurance of individual variables which leads to Curse of Dimensionality

2. We not only want our model to predict well, but we also want it to be interpretable. For e.g., Logistic Regression is expected to learn relatively higher values for weights corresponding to relatively more important features. More important features have a greater impact on the final prediction. But if features are correlated, then it becomes hard to judge which feature has more “say” in the final decision because their values are actually dependent on one another. This affects the values of the weights. In other words, the weights not only get decided based on how an independent variable correlates to the dependent variable, they also get influenced by how independent variables correlate with one another.

# The solution to the dummy variable trap is to drop one of the categorical variables

In [151]:
# explanation of how we tackle the dummy variable trap

data = pd.DataFrame({'Speed':['Low', 'Medium', 'High', 'Medium', 'High', 'Low'], 'SpeedActual':[20,60,100,80,150,10]})

In [152]:
type(data)

pandas.core.frame.DataFrame

In [153]:
data.dtypes

Speed          object
SpeedActual     int64
dtype: object

In [154]:
from sklearn.preprocessing import LabelEncoder

In [155]:
type(data)

pandas.core.frame.DataFrame

In [156]:
labelencoder2 = LabelEncoder()

In [157]:
type(labelencoder2)

sklearn.preprocessing.label.LabelEncoder

In [158]:
data = data.iloc[:,:].values
labelencoder2 = labelencoder2.fit(data[:,0])

In [159]:
data[:,0] = labelencoder2.transform(data[:,0])

In [160]:
data

array([[1, 20],
       [2, 60],
       [0, 100],
       [2, 80],
       [0, 150],
       [1, 10]], dtype=object)

In [161]:
# Once the label is encoded, we could now apply OneHotEncoder
onehotencoder2 = OneHotEncoder(categorical_features =[0])

In [162]:
data = onehotencoder2.fit_transform(data).toarray()

In [163]:
data

array([[   0.,    1.,    0.,   20.],
       [   0.,    0.,    1.,   60.],
       [   1.,    0.,    0.,  100.],
       [   0.,    0.,    1.,   80.],
       [   1.,    0.,    0.,  150.],
       [   0.,    1.,    0.,   10.]])

# pd.get_dummies

In [164]:
data2 = pd.DataFrame({'Speed':['Low', 'Medium', 'High', 'Medium', 'High', 'Low'], 'SpeedActual':[20,60,100,80,150,10]})

In [165]:
data2

Unnamed: 0,Speed,SpeedActual
0,Low,20
1,Medium,60
2,High,100
3,Medium,80
4,High,150
5,Low,10


In [166]:
pd.get_dummies(data2)

Unnamed: 0,SpeedActual,Speed_High,Speed_Low,Speed_Medium
0,20,0,1,0
1,60,0,0,1
2,100,1,0,0
3,80,0,0,1
4,150,1,0,0
5,10,0,1,0


# To tackle Dummy Variable Trap we could do this.
# When we use pd.get_dummies, we can pass an additional argument, drop_first=True (documentation) to drop the first new column that we get after encoding (a.k.a., the first dummy variable)

In [167]:
data3 = pd.DataFrame({'Speed':['Low', 'Medium', 'High', 'Medium', 'High', 'Low'], 'SpeedActual':[20,60,100,80,150,10]})

In [168]:
pd.get_dummies(data3, drop_first=True)

Unnamed: 0,SpeedActual,Speed_Low,Speed_Medium
0,20,1,0
1,60,0,1
2,100,0,0
3,80,0,1
4,150,0,0
5,10,1,0


# Splitting the data into Training and Test set

In [217]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [218]:
X_train.shape, y_train.shape

((8, 5), (8,))

In [219]:
X_test.shape, y_test.shape

((2, 5), (2,))

In [220]:
print(X_train, y_train, X_test, y_test, sep='\n')

[[  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]]
[1 0 1 1 1 0 0 0]
[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]]
[0 1]


# Feature Scaling

In [178]:
# Feature scaling is a method used to standardize the variables or the features
# We need to scale all the variables or features because huge euclidian distances will create problems.
# Dist = ((x2 - x1)^2 + (y2 - y1)^2)^1/2


# Nomalization scales variables between 0 and 1.

x_hat = (x - min(x))/(max(x) - min(x))

# Standardization scales variables to have a mean of 0 and Stnd. Deviation of 1

x_hat = x - mean/ stand.deviation

![title](C:/Users/GIIRIDHAR/Documents/GitHub/AI2019/Part 1 - Data Preprocessing/Section 2 -------------------- Part 1 - Data Preprocessing --------------------/ScalingData.jpg)

In [221]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

In [225]:
sc_X

StandardScaler(copy=True, with_mean=True, with_std=True)

In [226]:
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [227]:
print(X_train, X_test)

[[ 1.         -0.57735027 -0.57735027 -0.36152118  0.42765698]
 [-1.          1.73205081 -0.57735027 -1.73149616 -1.09434656]
 [-1.          1.73205081 -0.57735027  0.22561096  0.05040824]
 [ 1.         -0.57735027 -0.57735027  1.79129666  1.83258331]
 [ 1.         -0.57735027 -0.57735027 -0.7529426  -0.62603778]
 [ 1.         -0.57735027 -0.57735027  1.00845381  1.01304295]
 [-1.         -0.57735027  1.73205081 -0.16581046 -0.27480619]
 [-1.         -0.57735027  1.73205081 -0.01359102 -1.32850095]] [[-1.          1.73205081 -0.57735027  2.18271808  2.30089209]
 [-1.         -0.57735027  1.73205081 -2.3186283  -1.79680973]]


In [228]:
y_train, y_test

(array([1, 0, 1, 1, 1, 0, 0, 0], dtype=int64), array([0, 1], dtype=int64))

In [229]:
"""
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train.reshape(-1,1))
"""

'\nsc_y = StandardScaler()\ny_train = sc_y.fit_transform(y_train.reshape(-1,1))\n'

In [230]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [231]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

In [232]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

# KNN Imputation

In [234]:
# Important link: https://towardsdatascience.com/handling-missing-values-in-machine-learning-part-2-222154b4b58e
# https://scikit-learn.org/stable/auto_examples/impute/plot_iterative_imputer_variants_comparison.html
import numpy as np
from sklearn.linear_model import BayesianRidge

d = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
imputer = BayesianRidge()
print(imputer.fit_transform(d))

AttributeError: 'BayesianRidge' object has no attribute 'fit_transform'