## Importing Required Modules

In [None]:
import numpy as np                    # work with arrays
import pandas as pd                   # import datasets
import matplotlib.pyplot as plt       # Plot chart 

## Importing Dataset

In [None]:
# Load the raw table, then split it column-wise: every column except the
# last becomes the feature matrix x, the last column becomes the target y.
dataset = pd.read_csv('Data.csv')
x, y = (
    dataset.iloc[:, :-1].values,   # independent columns (features)
    dataset.iloc[:, -1].values,    # dependent column (value to predict)
)

In [None]:
# Show the feature matrix and the target vector, separated by a ruler line.
print(x, "=================================", y, sep="\n")

## Taking Care of Missing Data

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column. Only columns 1 onward are
# imputed; column 0 is left out because the mean strategy needs numeric data.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
x[:, 1:] = imputer.fit_transform(x[:, 1:])   # learn the column means and fill in one step

In [None]:
# Inspect x again: columns 1 onward should no longer contain NaN after mean imputation.
print(x)

## Encoding Categorical Data

In [None]:
# Most ML estimators cannot work directly on string / categorical values,
# so the categorical column is converted to one-hot (dummy) columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column 0 (the country column, per the original note) is one-hot encoded;
# remainder='passthrough' keeps all other columns of x unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X_new = np.array(ct.fit_transform(x))

In [None]:
# Show the feature matrix returned by ct.fit_transform (one-hot columns plus passthrough columns).
print(X_new)

## Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct value in y to an integer code in 0..n_classes-1,
# so the output is 0/1 only when y has exactly two distinct classes.
encoder = LabelEncoder()
y_new = encoder.fit_transform(y)

In [None]:
# Show the integer-encoded target labels.
print(y_new)

## Splitting data into train and test set

In [None]:
# Split first, scale afterwards: feature scaling is applied only after the
# data has been divided into training and test sets.
# training : test = 8 : 2
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X_new,
    y_new,
    test_size=0.2,     # hold out 20% of the rows for testing
    random_state=1,    # fixed seed so the split is reproducible
)

In [None]:
# Show all four pieces of the split; each "\n" argument reproduces the
# blank spacer lines between consecutive arrays.
print(X_train, "\n", X_test, "\n", Y_train, "\n", Y_test, sep="\n")

## Feature Scaling

In [None]:
# Feature scaling puts all numeric features on a comparable scale.
# Two main techniques:
#   standardisation: x_std  = (x - mean(x)) / std(x)              -> mostly within [-3, 3]
#   normalisation:   x_norm = (x - min(x)) / (max(x) - min(x))    -> within [0, 1]
# Original author's rule of thumb: standardisation works in general, while
# normalisation is suggested when the features are roughly normally distributed.

from sklearn.preprocessing import StandardScaler

# Dummy (one-hot) variables are not scaled, so the first three columns are skipped.
# NOTE(review): index 3 assumes exactly three dummy columns - confirm against the encoder output.
stsc = StandardScaler()
stsc.fit(X_train[:, 3:])                           # statistics come from the training rows only
X_train[:, 3:] = stsc.transform(X_train[:, 3:])
X_test[:, 3:] = stsc.transform(X_test[:, 3:])      # test rows reuse the training statistics

In [None]:
# Show the scaled training and test matrices with a blank spacer in between.
print(X_train, "\n", X_test, sep="\n")