In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings

# ignoring the future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the raw data set into a DataFrame.
# NOTE(review): the CSV is read from Jupyter's `.ipynb_checkpoints` folder,
# which Jupyter manages itself -- confirm whether the file should live in a
# regular data directory instead.
dataset = pd.read_csv('.ipynb_checkpoints/Data.csv')

# Feature matrix: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target vector: the last column. Indexing with -1 (instead of the previous
# hard-coded 3) keeps the target in step with the `:-1` feature slice.
# NOTE(review): assumes the file has four columns, so that index 3 was the
# last one -- confirm against Data.csv.
Y = dataset.iloc[:, -1].values

'''
    - To take care of the missing data we use the imputer object
    - strategy='mean' is used because mean values will be stored for missing cells
    
    - fitting the imputer object to column 1 and 2 values as it contains missing data
    
    - labelEncoder is used for converting categorical data i.e values in form of words, to numerical value
        - Here "Country" column is in the form of words
        
    -   Label-encoding column 0 alone is not enough: the countries do not
        have any ordering relative to each other, but the LabelEncoder
        converts them into numbers, which implies a false order such as
        0 < 1 < 2. The model would then misinterpret the data

    -   OneHotEncoder takes a column of categorical data that has been
        label encoded and splits it into multiple columns, one per
        category. The numbers are replaced by 1s and 0s, depending
        on which column has what value
'''
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Replace the missing cells of columns 1 and 2 with the mean of each column.
# NOTE(review): the imputer is fitted on the full data set, before the
# train/test split, so test rows contribute to the means -- consider fitting
# it on the training rows only.
imputer = SimpleImputer(strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# Turn the category labels in column 0 into integer codes.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# One-hot encode column 0 and pass the remaining columns through unchanged.
# sparse_threshold=0 forces a dense array no matter how many categories the
# column has; a sparse result would be rejected by a mean-centering
# StandardScaler further down the pipeline.
columntransform = ColumnTransformer(
    [("Country", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = columntransform.fit_transform(X)

# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

'''
    - Split the data set into a training set and a test set; the model
      learns from the training set and is then used to predict the test data
    - test_size = 0.2 means 20 percent of the total data is used for testing, and the
        remaining 80 percent is used for training
'''
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 makes the shuffle
# reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)


# Feature Scaling

'''
    In the data set above, you can see that
        - The "Age" and "Salary" columns are not on the same scale
        - "Age" values range from 27 to 50
        - "Salary" values range from 48000 to 83000
    - This will cause some issues in our machine learning models, because
      a lot of machine learning models are based on "Euclidean distance",
      i.e. distance = sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
    
    - In above data set suppose that "Age" column is X and "Salary" column is
      Y, then Euclidean distance will be dominated by only the Y values
      
    - There are two ways of feature scaling
        - Standardisation
            - xstand = (x - mean(x)) / standard_deviation(x)
        - Normalisation
            - xnorm = (x - min(x)) / (max(x) - min(x))
'''
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import DataConversionWarning

# Silence DataConversionWarning for the scaling step below.
# NOTE(review): presumably raised when the scaler casts the feature array to
# float -- confirm the warning is still emitted before keeping this filter.
warnings.simplefilter(action='ignore', category=DataConversionWarning)

sc_X = StandardScaler()
# Learn each column's mean and standard deviation from the training rows and
# scale them in the same step.
X_train = sc_X.fit_transform(X_train)
# Scale the test rows with the statistics already learned from the training
# rows; the scaler is deliberately not refitted on the test data.
X_test = sc_X.transform(X_test)

#     - There is no need to apply feature scaling to Y because this is a classification problem
#     - If the dependent variable takes a large range of values, then feature scaling needs to be applied to it as well

In [22]:
# template

# Load the raw data set into a DataFrame.
# NOTE(review): the CSV is read from Jupyter's `.ipynb_checkpoints` folder,
# which Jupyter manages itself -- confirm whether the file should live in a
# regular data directory instead.
dataset = pd.read_csv('.ipynb_checkpoints/Data.csv')

# Features are every column except the last; the target is the last column.
# Indexing the target with -1 (instead of the previous hard-coded 3) keeps it
# in step with the `:-1` feature slice.
# NOTE(review): assumes index 3 was the last column -- confirm against Data.csv.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 makes the shuffle
# reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)


# from sklearn.preprocessing import StandardScaler
# from sklearn.exceptions import DataConversionWarning
# warnings.simplefilter(action='ignore', category=DataConversionWarning)
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_train[:, 3]
# X_test = sc_X.transform(X_test) # because X values are already fitted




In [None]:
# Regression
# Regression models (both linear and non-linear) are used for predicting a real value, such as a salary.
# If your independent variable is time, then you are forecasting future values; otherwise your model is predicting
# present but unknown values. Regression techniques vary from Linear Regression to SVR and Random Forest Regression.

# In this part, you will understand and learn how to implement the following Machine Learning Regression models:

# Simple Linear Regression
# Multiple Linear Regression
# Polynomial Regression
# Support Vector Regression (SVR)
# Decision Tree Regression
# Random Forest Regression





