# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import graphviz

# Data Load

In [None]:
# Read the feature-engineered dataset from the step-4 CSV file.
df = pd.read_csv(filepath_or_buffer='raw_data/datascientist_data_step4_features.csv')
# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

# Train-Test Split

Training and Test Datasets
When fitting models, we would like to ensure two things:

We have found the best model (in terms of model parameters).
The model is highly likely to generalize i.e. perform well on unseen data.

Purpose of splitting data into Training/testing sets
We built our model with the requirement that the model fit the data well.
As a side-effect, the model will fit THIS dataset well. What about new data?
We wanted the model for predictions, right?
One simple solution, leave out some data (for testing) and train the model on the rest

In [None]:
# Features: every column except the target (drop returns a new frame, df is untouched).
X = df.drop(columns=['Est_Salary'])
# Target: the estimated salary column.
y = df['Est_Salary']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=47,
)
# Preview the training features.
X_train.head()

In [None]:
# (rows, columns) of the training and test feature frames.
X_train.shape, X_test.shape

In [None]:
# Lengths of the training and test targets; they should match the row counts of the feature frames.
y_train.shape,y_test.shape

# Use K Fold cross validation to measure accuracy of our Linear Regression model

Tuning the Model
The model has some hyperparameters we can tune for hopefully better performance. To tune the model's parameters, I am using a mix of cross-validation and grid search. In Logistic Regression, the most important parameter to tune is the regularization parameter C. Note that the regularization parameter is not always part of the logistic regression model. For the linear regression models fitted in this notebook, the corresponding regularization strength is the `alpha` parameter of Ridge and Lasso.

The regularization parameter is used to control for unlikely high regression coefficients, and in other cases can be used when data is sparse, as a method of feature selection.

Now implement some code to perform model tuning and selecting the regularization parameter $C$.

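We use scikit-learn's `cross_val_score` function with a `ShuffleSplit` splitter to perform repeated cross-validation and apply a scoring function to each test fold. In this incarnation we use the estimator's default scoring function, which for `LinearRegression` is the $R^2$ score rather than classification accuracy.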

In [None]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Ten random 80/20 re-splits of the full dataset, seeded for repeatability.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

# One score per split, computed with the estimator's default scorer
# (R^2 for a regressor such as LinearRegression).
cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=cv)

## Ridge Regression

In [None]:
# define model
# The `normalize=True` argument was removed from scikit-learn's linear models
# (version 1.2), so the same preprocessing is expressed as a pipeline:
#   * StandardScaler(with_mean=False) divides each column by its standard
#     deviation (Ridge still centres the data itself when fitting the intercept);
#   * the old option divided by the column L2 norm (= sqrt(n_samples) * std), so
#     alpha is multiplied by the number of training rows to keep the same penalty.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

ridge = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=1.0 * len(X_train)),
)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

# `score` returns the coefficient of determination (R^2); it is printed as a
# percentage for the training split first and the held-out split second.
accuracy = ridge.score(X_train, y_train)
print('\nModel Accuracy:- ', accuracy*100,'%')

accuracy = ridge.score(X_test, y_test)
print('\nTest Data Accuracy:- ', accuracy*100,'%')

Ridge Regression gives us an accuracy ($R^2$ score) of about 24% on the test data.

In [None]:
def mae(y, ypred):
    """Return the mean absolute error between observed and predicted values.

    The error of a single prediction is ``actual - predicted``; this metric
    averages the absolute value of those errors.

    Both inputs are converted to NumPy float arrays first, so plain lists and
    tuples are accepted and values are compared by position (pandas index
    labels are ignored).

    Arguments:
    y -- the observed values (array-like)
    ypred -- the predicted values (array-like, same length as ``y``)

    Returns:
    The mean of ``|y - ypred|``.
    """
    observed = np.asarray(y, dtype=float)
    predicted = np.asarray(ypred, dtype=float)
    return np.mean(np.abs(observed - predicted))

In [None]:
# Mean absolute error of the Ridge predictions on the held-out test set.
mae(y_test,ridge_pred )

#### Mean absolute percentage Accuracy (MAPE)

In [None]:
# Element-wise absolute difference between the Ridge predictions and the true values.
errors = np.abs(ridge_pred - y_test)

In [None]:
# Absolute percentage error of each test prediction, relative to the true value.
mape = (errors / y_test) * 100
# The "accuracy" reported below is 100 minus the mean percentage error.
accuracy = 100 - np.mean(mape)
print('Mean absolute percentage Accuracy (MAPE):', round(accuracy, 2), '%.')
print('Mean absolute percentage error (MAPE):', round(np.mean(mape), 2), '%.')

### Mean Squared Error

In [None]:
from sklearn.metrics import mean_squared_error

# Average squared difference between the true values and the Ridge predictions.
MSE = mean_squared_error(y_true=y_test, y_pred=ridge_pred)
print(MSE)

To convert this back to our measurement space, we often take the square root, to form the root mean square error:

In [None]:
# The square root of the MSE is the *root* mean squared error, so label it as such.
print('Root Mean Squared Error',np.sqrt(MSE))

In [None]:
# Predicted vs. true values on the test set; a perfect model would lie on the diagonal.
plt.figure(figsize=(10, 10))
plt.scatter(x=y_test, y=ridge_pred, c='green')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.show()

In [None]:
# Residual = observed value minus the Ridge prediction, one per test row.
residuals = y_test-ridge_pred

In [None]:
# sns.distplot is deprecated; histplot with density normalisation and a KDE
# overlay (cut=3 matches distplot's curve extent) draws the equivalent figure.
sns.histplot(residuals, bins=10, kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('Error Terms', fontsize=20)
plt.xlabel('Residuals', fontsize=15)
plt.show()

## Lasso Regression

In [None]:
#define model
# The `normalize=True` argument was removed from scikit-learn's linear models
# (version 1.2), so the same preprocessing is expressed as a pipeline:
#   * StandardScaler(with_mean=False) divides each column by its standard
#     deviation (Lasso still centres the data itself when fitting the intercept);
#   * the old option divided by the column L2 norm (= sqrt(n_samples) * std), so
#     for the L1 penalty alpha is multiplied by sqrt(n_samples) to stay equivalent.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

lasso = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=0.1 * np.sqrt(len(X_train))),
)
#fit the model
lasso.fit(X_train, y_train)
#make a prediction
lasso_pred = lasso.predict(X_test)

# `score` returns the coefficient of determination (R^2); it is printed as a
# percentage for the training split first and the held-out split second.
accuracy = lasso.score(X_train, y_train)
print('\nModel Accuracy:- ', accuracy*100,'%')

accuracy = lasso.score(X_test, y_test)
print('\nTest Data Accuracy:- ', accuracy*100,'%')

In [None]:
# Mean absolute error of the Lasso predictions on the held-out test set.
mae(y_test,lasso_pred )

#### Mean absolute percentage Accuracy (MAPE)

In [None]:
# Element-wise absolute difference between the Lasso predictions and the true values.
errors = np.abs(lasso_pred - y_test)
# Absolute percentage error of each test prediction, relative to the true value.
mape = (errors / y_test) * 100
# The "accuracy" reported below is 100 minus the mean percentage error.
accuracy = 100 - np.mean(mape)
print('Mean absolute percentage Accuracy (MAPE):', round(accuracy, 2), '%.')
print('Mean absolute percentage error (MAPE):', round(np.mean(mape), 2), '%.')

### Mean squared error

In [None]:
from sklearn.metrics import mean_squared_error

# Average squared difference between the true values and the Lasso predictions.
MSE = mean_squared_error(y_true=y_test, y_pred=lasso_pred)
print(MSE)

In [None]:
# The square root of the MSE is the *root* mean squared error, so label it as such.
print('Root Mean Squared Error',np.sqrt(MSE))

In [None]:
# Residual = observed value minus the Lasso prediction, one per test row.
residuals = y_test-lasso_pred

In [None]:
# sns.distplot is deprecated; histplot with density normalisation and a KDE
# overlay (cut=3 matches distplot's curve extent) draws the equivalent figure.
sns.histplot(residuals, bins=10, kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('Error Terms', fontsize=20)
plt.xlabel('Residuals', fontsize=15)
plt.show()

# OLS 

In [None]:
# Fit an ordinary least squares model on the training split with statsmodels
# and print its regression summary.
import statsmodels.api as sm
import math
from sklearn.metrics import mean_squared_error

# NOTE: this rebinds X_train, so every later cell that uses X_train receives
# the frame returned by add_constant rather than the original split.
X_train = sm.add_constant(X_train) # statsmodels OLS has no implicit intercept, so add the constant column
est = sm.OLS(y_train,X_train).fit() # fit model on the training rows
predictions = est.predict() # no argument: fitted values for the training rows
print(est.summary()) # prints full regression results
# Square root of the residual mean squared error, i.e. the typical size of a training residual.
print("\nAverage error: {:.2f}.".format(math.sqrt(est.mse_resid)))

In [None]:
# Evaluate the statsmodels OLS model fitted on the training data against the
# held-out test set. Fitting a second OLS on the test rows would only measure
# in-sample fit on those rows, not how the trained model generalises, so the
# existing `est` is reused and no new model (or summary) is produced here.
import math

import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

# NOTE: this rebinds X_test so it carries the same constant column that the
# training design matrix received; later cells use this version of X_test.
X_test = sm.add_constant(X_test)
predictions = est.predict(X_test) # out-of-sample predictions from the trained model
# Root mean squared error of the trained model on the unseen rows.
print("\nAverage error: {:.2f}.".format(math.sqrt(mean_squared_error(y_test, predictions))))

### Linear Regression:

Split the data into a training and test (hold-out) set
Train on the training set, and test for accuracy on the testing set

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a plain least-squares regressor on the training split (fit returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# score() reports R^2; print it as a percentage for the training split ...
accuracy = model.score(X_train, y_train)
print('\nModel Accuracy:- ', accuracy*100,'%')

# ... and again for the held-out test split.
accuracy = model.score(X_test, y_test)
print('\nTest Data Accuracy:- ', accuracy*100,'%')

In [None]:
# Predictions of the fitted linear model for the held-out test rows.
y_pred = model.predict(X_test)

### MAE

In [None]:
# Predict the target for the held-out rows with the fitted linear model.
linear_pred = model.predict(X_test)
# Absolute size of each prediction error.
errors = np.abs(linear_pred - y_test)
# Report the mean absolute error, rounded to two decimals.
print('Mean Absolute Error:', round(np.mean(errors), 2))


### MAPE

In [None]:
# Absolute percentage error of each test prediction, relative to the true value.
mape = (errors / y_test) * 100
# The "accuracy" reported below is 100 minus the mean percentage error.
accuracy = 100 - np.mean(mape)
print('Mean absolute percentage Accuracy (MAPE):', round(accuracy, 2), '%.')
print('Mean absolute percentage error (MAPE):', round(np.mean(mape), 2), '%.')

### MSE

In [None]:
from sklearn.metrics import mean_squared_error

# Average squared difference between the true values and the linear-model predictions.
MSE = mean_squared_error(y_true=y_test, y_pred=linear_pred)
print(MSE)

In [None]:
# The square root of the MSE is the *root* mean squared error, so label it as such.
print('Root Mean Squared Error',np.sqrt(MSE))

In [None]:
# Residual = observed value minus the linear-model prediction, one per test row.
residuals = y_test-linear_pred

In [None]:
# sns.distplot is deprecated; histplot with density normalisation and a KDE
# overlay (cut=3 matches distplot's curve extent) draws the equivalent figure.
sns.histplot(residuals, bins=10, kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('Error Terms', fontsize=20)
plt.xlabel('Residuals', fontsize=15)
plt.show()

In [None]:
# Draw three rows from the held-out test features; the fixed seed makes the
# same three rows come back on every run.
Sample_X_test = X_test.sample(n=3, random_state=2)
# Predict the target for those three rows with the fitted linear model.
pred = model.predict(Sample_X_test)
# Display the sampled feature rows (the predictions stay in `pred`).
Sample_X_test

In [None]:
print("Featured 3 sample data used for prediction")
# Look the sampled rows up by the index labels of Sample_X_test instead of
# hard-coded row positions, so this table stays in sync with the sample when
# the data or the random seed changes.
# NOTE(review): assumes the previously hard-coded positions (1334, 3735, 116)
# were the rows drawn by the sampling cell - confirm.
sampled_rows = df.loc[Sample_X_test.index].copy()
sampled_rows.index = ['First Sample', 'Second Sample', 'Third Sample']
sampled_rows


In [None]:
# Draw three target values from y_test with the same n and seed as the
# feature sample. NOTE(review): presumably this selects the same rows as
# Sample_X_test because both objects share row order - confirm.
Sample_Y_test = y_test.sample(n=3, random_state=2)
Sample_Y_test

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

print("\n1. Safe Model Confusion Matrix for Training Data :- \n")
# Cross-tabulate the three actual values (rows) against the three predicted
# values (columns); the imported confusion_matrix function is not called here.
pd.crosstab(index=Sample_Y_test, columns=pred)

Reviewing the matrix above: where the actual average salary is 111 the model predicts 96, and where the actual average salary is 131 the model predicts 140.

Convert the random sample to a user-readable form

In [None]:
print("Creating three random sample from dataset")
print("\nBelow test data is unseen by Model during it's training")
# Draw three rows from the held-out test features (fixed seed, so the rows are
# the same on every run) and display them.
pred1 = X_test.sample(n=3, random_state=2)
pred1