# Salary Predictions Based on Years of Experience

## Importing Libraries

In [None]:
import numpy as np # (Numerical Python) For linear algebra and mathematical computations

import pandas as pd # (Python and data analysis) For data processing and manipulation, CSV file I/O

import matplotlib.pyplot as plt # For visualization the data

import seaborn as sns # For statistical graphics plotting with beautiful styles and color

## Importing the dataset

In [None]:
# Load the salary dataset into a DataFrame.
# The path is a raw string so the Windows backslashes are taken literally;
# in a normal string "\O", "\M" and "\S" are invalid escape sequences.
# NOTE: this is an absolute, machine-specific path; adjust it to where
# Salary.csv lives on your system.
df = pd.read_csv(r'E:\Others\ML\Salary Predictor/Salary.csv')

In [None]:
# Feature (years of experience) and target (salary) as NumPy arrays.
x = df["YearsExperience"].values
y = df["Salary"].values

## Checking if there is any null value present in the dataset

In [None]:
# Number of missing values in each column.
df.isna().sum()

## Defining values for implementing linear regression

In [None]:
m = len(x)  # number of training examples
# Design matrix: a leading column of ones (intercept term) next to the feature.
X = np.column_stack((np.ones(m), x))
# Targets as an (m, 1) column vector. An explicit reshape replaces
# np.row_stack, which is a deprecated alias of np.vstack in recent NumPy.
Y = np.reshape(y, (-1, 1))
theta = np.zeros((2, 1))  # initial parameters: [intercept, slope]
iterations = 1500  # number of gradient-descent steps
alpha = 0.03  # learning rate

## Function to calculate cost function, $J(\theta)$ 
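The cost function computes the squared-error cost $J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$ of a linear regression with parameters `theta` on the data points in `X` and `y`. It is used to check that the gradient descent implementation is converging.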

In [None]:
def compute_cost(X, y, theta):
    """Return the squared-error cost J(theta) for linear regression.

    Computes ``J = 1 / (2 * m) * sum((X @ theta - y) ** 2)``, where ``m`` is
    the number of training examples.

    Args:
        X: Design matrix of shape (m, n), including the column of ones for
            the intercept if one is used.
        y: The m target values, either 1-D ``(m,)`` or a column ``(m, 1)``.
        theta: Model parameters, either 1-D ``(n,)`` or a column ``(n, 1)``.

    Returns:
        The scalar cost.
    """
    m = len(y)
    hx = np.dot(X, theta)
    # Align y with the prediction's shape. Without this, an (m, 1) prediction
    # minus an (m,) target broadcasts to an (m, m) matrix and the summed
    # cost is wrong.
    residuals = hx - np.asarray(y).reshape(hx.shape)
    return (1 / (2 * m)) * np.sum(np.square(residuals))

## Function to implement gradient descent
Performs gradient descent to learn `theta`. Updates theta by taking `iterations` gradient steps with learning rate `alpha`.

In [None]:
def gradient_descent(X, y, theta, alpha, iterations):
    """Fit linear-regression parameters with batch gradient descent.

    Args:
        X: Design matrix of shape (m, n).
        y: The m target values, either 1-D ``(m,)`` or a column ``(m, 1)``.
        theta: Initial parameters as an ``(n, 1)`` column vector.
        alpha: Learning rate (step size).
        iterations: Number of gradient steps to take.

    Returns:
        A tuple ``(theta, J_history)``: the learned ``(n, 1)`` parameters and
        an ``(iterations, 1)`` array holding the cost after each step.
    """
    m = len(y)
    # Build the (m, 1) target column once. It is used for the gradient and
    # is also passed to compute_cost, so the residual there is taken against
    # a column and never broadcasts to an (m, m) matrix.
    y_col = np.asarray(y).reshape(-1, 1)
    J_history = np.zeros((iterations, 1))

    for step in range(iterations):
        theta = theta - alpha * (1 / m) * X.T.dot(X.dot(theta) - y_col)
        J_history[step] = compute_cost(X, y_col, theta)

    return theta, J_history

## Calling the `gradient_descent` function and printing the computed `theta`

In [None]:
# Run gradient descent from the current theta; keep the learned parameters
# and the per-iteration cost history.
theta, J_history = gradient_descent(
    X, y, theta, alpha=alpha, iterations=iterations
)
theta

## Visualising the prediction
Creating a scatter plot of Salary against Experience (in years). Also plotting the linear regression model on the same plot.

In [None]:
# Observed data as red points.
plt.scatter(x, y, color='red')
# Fitted line: predictions X @ theta against the experience column of X.
plt.plot(X[:, 1], X @ theta, '-', label='Linear regression')
plt.legend(loc='lower right')
plt.xlabel('Experience (in years)')
plt.title('Salary')
plt.ylabel('Salary')

## Plotting cost function vs iterations
This shows how the cost function decreases after every iteration.

In [None]:
# Cost recorded after each gradient-descent step, plotted against the step index.
plt.plot(range(iterations), J_history)
plt.xlabel('Iterations')
plt.ylabel('Cost Function')

## Comparing the actual salary and predicted salary

In [None]:
# Side-by-side table of the input feature, the true salary and the model's
# rounded prediction (flattened from an (m, 1) column to 1-D).
compare = pd.DataFrame(
    {
        "Experience (in years)": x,
        "Actual Salary": y,
        "Predicted Salary": np.round(X.dot(theta)).ravel(),
    }
)
# Optional extra column: compare["Difference"] = y - np.round(X.dot(theta)).ravel()
compare

## Plotting the error in prediction


In [None]:
# Alternative view: plt.scatter(X[:, 1], X.dot(theta) - Y)
# X.dot(theta) is an (m, 1) column while y is 1-D. Flatten the prediction
# first so the subtraction gives one residual per sample instead of
# broadcasting to an (m, m) matrix of all pairwise differences.
# NOTE: sns.distplot is deprecated in newer seaborn releases; once the
# installed version is confirmed, sns.histplot(..., kde=True) is the replacement.
sns.distplot(X.dot(theta).ravel() - y).set_title('Error in prediction')

### Test

In [None]:
# Predicted salary for 11.2 years of experience; the leading 1 multiplies
# the intercept term of theta.
print(np.array([1, 11.2]) @ theta)