# 1. Library Import

In [None]:
# Import the correct libraries to process the data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# 2. Data Import

In [None]:
# Load the data we want to process. It is split into the matrix of features X
# and the dependent variable vector y further below.
# Dataset source: Kaggle.com

# Read the CSV file into a pandas DataFrame
dataset = pd.read_csv('Salary_Data.csv')

# Peek at the first four rows the way pandas sees them: a DataFrame (looks like a CSV, right?)
# pandas knows the top row holds the column names and the data rows are underneath it.
print(dataset.head(n=4))

In [None]:

# Extract the matrix of features X (the independent variable):
# every column except the last one, as a 2D NumPy array (rows x features)
X = dataset.iloc[:,:-1].values

# Extract the dependent variable vector y:
# the last column only, as a 1D NumPy array (one value per row)
y = dataset.iloc[:,-1].values


In [None]:
# Let's quickly inspect the dataset to make sure Linear Regression is a suitable model:
# if the points fall roughly along a straight line, a linear model is a reasonable choice.
plt.scatter(X, y, color="black", marker="+")

# 3. Data Split

In [None]:
# Looking good.
# Now split the features and the target into a training set and a test set.
# The training set teaches the machine learning model the correlation between years of experience and salaries.
# The test set evaluates the trained model to verify it is actually making accurate predictions.
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows for testing; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# All four results are NumPy arrays: X_train and X_test are 2D (rows x features),
# while y_train and y_test are 1D. That is the layout the upcoming fit() method expects.
type(X_train)

# 4. Model Training

In [None]:
# Train the Simple Linear Regression Model
from sklearn.linear_model import LinearRegression

# Create an instance of the LinearRegression class.
# This creates our Simple Linear Regression Model contained in an object named "regr"
regr = LinearRegression()

# Now we do the actual model training with the fit() method, based on the available data points.
# We provide it the training part of the matrix of features (X_train, a 2D array)
# and the training part of the dependent variable vector (y_train, a 1D array).
# Both are NumPy arrays here, not DataFrames.
# fit() has all the Linear Regression math built in
regr.fit(X_train, y_train)

# The model is now trained! You did it!

In [None]:
# Next we want to predict the observations in the test set.
# But before we do that, let's immediately put our Simple Linear Regression Model to work.
# According to the model, how much should someone with 10 years of experience earn per year?
# Remember, the number goes in double brackets so it becomes a 2D array (one row, one feature),
# as that is what the predict() method expects :)
print(regr.predict([[10]]))


In [None]:
# Okay, so let's predict the results of the observations in the test set
# and save those predicted salaries to a variable named y_pred
y_pred = regr.predict(X_test)

# And let's print the features of the test set that were fed to the model, that is, years of experience
print(X_test)


In [None]:
# And now the predicted results: one predicted salary for each row of X_test (years of experience)
print(y_pred)

## 4.1 Model Assessment

In [None]:
# So how accurate is our model? We want an R^2 (R Squared) value that's greater than 0.90 (90%).
# R^2 compares the real test-set salaries (y_test) with the salaries the model predicted (y_pred).

from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=y_pred)


# 5. Graph Results

In [None]:
# Let's visualize the training set results to get a better idea of what's going on with the datasets
# The real salaries are in BLACK compared to the predicted salary line in BLUE
# (the line only appears once the commented-out plotting code below is enabled)
# We will put the years of experience on the X axis and salaries on the y axis

# Let's put in the coordinates of the real salaries and years of experience of the training set
# We use the pyplot interface (plt) from the matplotlib library for this.
plt.scatter(X_train, y_train, color='black', marker='+')
# Give the current axes a white background
plt.gca().set_facecolor('xkcd:white')

# Now let's plot the regression line.  This is the line of predictions that is the closest to the real salaries
# And then we will put in the predicted salaries of the TRAINING set
# Press Ctrl + / to uncomment in VSCode
# plt.plot(X_train, regr.predict(X_train))
# plt.title("Salary vs Experience (Training Set)", fontsize=30, color='white')

# # Set the color and font of the x axis title and ticks
# plt.tick_params(axis='x', colors='white')
# plt.xlabel('Years of Experience', fontsize=20, color='white')

# # Set the color and font of the y axis title and ticks
# plt.tick_params(axis='y', colors='white')
# plt.ylabel('Salary', fontsize=20, color='white')
# plt.show()

In [None]:
# Replace with the coordinates of the employees in the TEST set
# These, again, are the real years of experience and salaries
plt.scatter(X_test, y_test, color='black', marker='+')
# Give the current axes a white background
plt.gca().set_facecolor('xkcd:white')

# Now let's plot the regression line.  This is the line of predictions that is the closest to the real salaries
# And then we will put in the predicted salaries of the TEST set
# Press Ctrl + / to uncomment in VSCode
# Note: the predicted salaries of the test set will be on the same regression line as the training set
# because it's based on the y = mx + b equation, so we don't need to change anything here.
# plt.plot(X_train, regr.predict(X_train))
# plt.title("Salary vs Experience (Test Set)", fontsize=30, color='white')

# # Set the color and font of the x axis title and ticks
# plt.tick_params(axis='x', colors='white')
# plt.xlabel('Years of Experience', fontsize=20, color='white')

# # Set the color and font of the y axis title and ticks
# plt.tick_params(axis='y', colors='white')
# plt.ylabel('Salary', fontsize=20, color='white')
# plt.show()

# So what we are seeing is the result of our TRAINED model acting on NEW data that it hasn't observed before.
# And since the real salaries are close to the regression line on this TEST set of new data, we know the model is reliable.

# 6. Equation Review

In [None]:
# View the model to see how the results were calculated
# The Simple Linear Regression model calculates the coefficient (M) and the intercept (b)
# of the line equation y = M*X + b
M = regr.coef_ # the slope of the regression line
b = regr.intercept_ # where the regression line crosses the y axis
# NOTE: this rebinds X, which until now held the matrix of features. Re-run the data import
# cells before re-running any earlier cell that uses X.
X = 3 # the years of experience (X) for which we want to predict the salary (y)

In [None]:
# Apply the line equation by hand: predicted salary = slope * years of experience + intercept
# NOTE: this rebinds y, which until now held the dependent variable vector.
y = M*X + b
y # the predicted salary for X years of experience (an array, because regr.coef_ is an array)

In [None]:
# Good to go!
# Show the R^2 score of the test-set predictions one more time.
r2_score(y_true=y_test, y_pred=y_pred)

Awesome, you did it! You built your first machine learning model! Now let's talk about how to deal with MULTIPLE features... that's what multiple linear regression is all about! ARE YOU READY!?