
# Import required Libraries and read the data


In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the MBA salary dataset from the working directory and preview the first five rows.
df = pd.read_csv("MBA Salary.csv")
df.head()

Unnamed: 0,S. No.,Percentage in Grade 10,Salary
0,1,62.0,270000
1,2,76.33,200000
2,3,72.0,240000
3,4,60.0,250000
4,5,61.0,180000


# view data description and understand

In [3]:
# Show column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   S. No.                  50 non-null     int64  
 1   Percentage in Grade 10  50 non-null     float64
 2   Salary                  50 non-null     int64  
dtypes: float64(1), int64(2)
memory usage: 1.3 KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,S. No.,Percentage in Grade 10,Salary
count,50.0,50.0,50.0
mean,25.5,63.9224,258192.0
std,14.57738,9.859937,76715.790993
min,1.0,37.33,120000.0
25%,13.25,57.685,204500.0
50%,25.5,64.7,250000.0
75%,37.75,70.0,300000.0
max,50.0,83.0,450000.0


In [5]:
# Count the missing values in each column.
df.isna().sum()

S. No.                    0
Percentage in Grade 10    0
Salary                    0
dtype: int64

# separate feature set and response variable.

In [6]:
# Feature matrix: the Grade 10 percentage with a `const` column of ones added for the intercept term.
X = sm.add_constant(df['Percentage in Grade 10'])
# Response variable: the salary column.
Y = df['Salary']
# Preview the first five rows of each, one after the other.
print(X.head(), Y.head(), sep="\n")

   const  Percentage in Grade 10
0    1.0                   62.00
1    1.0                   76.33
2    1.0                   72.00
3    1.0                   60.00
4    1.0                   61.00
0    270000
1    200000
2    240000
3    250000
4    180000
Name: Salary, dtype: int64


In [8]:
pip install scikitlearn


[31mERROR: Could not find a version that satisfies the requirement scikitlearn (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for scikitlearn[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


# Split the data into training and validation sets

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=100,
)

ModuleNotFoundError: No module named 'sklearn'

# Fit the model and print summary statistics

In [None]:
# Ordinary least squares fit of salary on [const, Percentage in Grade 10], training rows only.
model = sm.OLS(endog=train_y, exog=train_X).fit()

In [None]:
# Parameters of the fitted model: one for `const` and one for `Percentage in Grade 10`.
model.params

# Assumptions of linear regression
## 1. Residuals must be normally distributed with a mean of 0
## 2. There should not be any correlation between residuals
## 3. Homoscedasticity (the variance of the residuals should be constant; if it is not constant, there is a problem)
## 4. Multicollinearity (there should not be correlation between independent features, e.g. X1 and X2)
      (Multicollinearity is not applicable to simple linear regression)

# Model Diagnostics
1. Coefficient of determination (R-squared)
2. Hypothesis test for the regression coefficient
3. Analysis of variance for overall model validity (important for multiple linear regression)
4. Outlier analysis (since outliers can affect the regression parameters)

In [None]:
# Model parameters of the fitted regression.
model.params

In [None]:
# Residuals of the fitted model.
model.resid

In [None]:
# Summary table of the fitted model.
model.summary2()

# Test Homoscedasticity

In [None]:
def get_generalised_values(vals):
    """Standardise ``vals``: subtract its mean, then divide by its standard deviation.

    ``vals`` must support arithmetic and expose ``.mean()`` and ``.std()``
    (for example a pandas Series or a NumPy array). The standard deviation
    used is whatever ``vals.std()`` returns with its default arguments.
    """
    centre = vals.mean()
    spread = vals.std()
    centred = vals - centre
    return centred / spread

In [None]:
# Draw a scatter plot of fitted values against residuals to check homoscedasticity.
# Both series are standardised with get_generalised_values() before plotting,
# so the axis labels say so explicitly.

plt.scatter(get_generalised_values(model.fittedvalues), get_generalised_values(model.resid))
plt.title(" Residual plot. fitted values against residuals")
plt.xlabel("Standardized fitted values")
plt.ylabel("Standardized residuals")
# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

# outlier analysis

#### outliers are observations whose values show a large deviation from the mean value.


#### Z-score
#### Mahalanobis distance
#### Cook's distance
#### leverage values


In [None]:
# Z-score: computed with the zscore() function from scipy.stats.
from scipy.stats import zscore
# Adds a `salary_zscore` column to df, computed over every row of Salary
# (the full dataset, not only the training split).
df['salary_zscore'] = zscore(df['Salary'])


In [None]:
# Rows whose salary z-score lies more than 3 standard deviations from the mean, in either direction.
df[df['salary_zscore'].abs() > 3.0]

# Cook's distance

In [None]:
# Influence diagnostics come from the get_influence() method of the fitted model
# (statsmodels.regression.linear_model.OLSResults.get_influence()).

mba_influence = model.get_influence()
# cooks_distance unpacks into two values; only the first (c) is plotted below.
# NOTE(review): p is presumably the matching p-values -- confirm against the statsmodels docs.
(c,p)  = mba_influence.cooks_distance

# One stem per training observation: x is the position within train_X, y is c rounded to 3 decimals.
plt.stem(np.arange(len(train_X)), np.round(c,3), markerfmt = ",");

# NOTE(review): the model was fitted on the training split only, so the x-axis is the
# position within train_X rather than the row index of the original dataset.
plt.title("Cooks distance for all observations in mba salary dataset")
plt.xlabel("Row index")
plt.ylabel("Cooks distance")


# Leverage Values

In [None]:
## The leverage value of an observation measures the influence of that observation on the
## overall fit of the regression function; it is related to the Mahalanobis distance.

In [None]:
from statsmodels.graphics.regressionplots import influence_plot

# The plot shows the residual on the vertical axis, leverage on the horizontal axis,
# and the point size is the square root of Cook's D statistic, a measure of the influence of the point

# Draw onto an explicitly created 8x6 axes and title that same axes object.
fig, ax = plt.subplots(figsize=(8, 6))
influence_plot(model, ax=ax)
ax.set_title("Leverage Values vs Residuals")
plt.show()

# Making Predictions and Prediction intervals

In [None]:
# Predict the response for the held-out rows, then print the inputs followed by the predictions.
pred_y = model.predict(test_X)
print(test_X,pred_y)

In [None]:
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Predict the y values for the held-out rows
pred_y = model.predict(test_X)

# wls_prediction_std returns three values; the first is discarded here and the
# other two are kept as the lower and upper bounds of the prediction interval.
# NOTE(review): alpha = 0.1 presumably yields a 90% interval -- confirm against the statsmodels docs.
_, pred_y_low, pred_y_high = wls_prediction_std(model, test_X, alpha=0.1)

# Store the inputs, point predictions and interval bounds side by side.
# The upper-bound column is named 'pred_y_right' (previously misspelled 'pred_y_rifgt').
pred_y_df = pd.DataFrame({'grade_10_perc': test_X['Percentage in Grade 10'],
                          'pred_y': pred_y,
                          'pred_y_left': pred_y_low,
                          'pred_y_right': pred_y_high})

In [None]:
# Show the first ten rows of the prediction table.
pred_y_df.head(10)

# Find R-squared and RMSE

# sklearn.metrics library has r2_score and mean_squared_error for measuring R-squared and MSE values.
1. We need to take the sq root of MSE value to get RMSE.
2. Both methods take pred_y and actual y values to calculate accuracy measures.
3. numpy module has sqrt method to calculate sqrt.


In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Report R-squared on the validation set exactly as computed.
# R-squared can be negative when the model predicts worse than simply using the mean
# of test_y; taking the absolute value would disguise that as a positive score.
r2_score(test_y, pred_y)

In [None]:
# RMSE on the validation set: square root of the mean squared error.
test_mse = mean_squared_error(test_y, pred_y)
np.sqrt(test_mse)