In [None]:
import pandas as pd
import scipy

In [4]:
# Load the IPL auction dataset from a CSV file in the working directory.
ipl_auction_df = pd.read_csv('IPLData.csv')

In [5]:
# Print the column names, non-null counts and dtypes of the loaded frame.
ipl_auction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sl.NO.         130 non-null    int64  
 1   PLAYER NAME    130 non-null    object 
 2   AGE            130 non-null    int64  
 3   COUNTRY        130 non-null    object 
 4   TEAM           130 non-null    object 
 5   PLAYING ROLE   130 non-null    object 
 6   T-RUNS         130 non-null    int64  
 7   T-WKTS         130 non-null    int64  
 8   ODI-RUNS-S     130 non-null    int64  
 9   ODI-SR-B       130 non-null    float64
 10  ODI-WKTS       130 non-null    int64  
 11  ODI-SR-BL      130 non-null    float64
 12  CAPTAINCY EXP  130 non-null    int64  
 13  RUNS-S         130 non-null    int64  
 14  HS             130 non-null    int64  
 15  AVE            130 non-null    float64
 16  SR-B           130 non-null    float64
 17  SIXERS         130 non-null    int64  
 18  RUNS-C    

In [6]:
# Preview the first five rows and the first ten columns, selected by position.
ipl_auction_df.iloc[:5, :10]

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B
0,1,"Abdulla, YA",2,SA,KXIP,Allrounder,0,0,0,0.0
1,2,Abdur Razzak,2,BAN,RCB,Bowler,214,18,657,71.41
2,3,"Agarkar, AB",2,IND,KKR,Bowler,571,58,1269,80.62
3,4,"Ashwin, R",1,IND,CSK,Bowler,284,31,241,84.56
4,5,"Badrinath, S",2,IND,CSK,Batsman,63,0,79,45.93


In [None]:
# Candidate predictor columns, in the order they are selected from the frame.
X_features = [
    'AGE', 'COUNTRY', 'PLAYING ROLE', 'T-RUNS', 'T-WKTS',
    'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL', 'CAPTAINCY EXP',
    'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS',
    'RUNS-C', 'WKTS', 'AVE-BL', 'ECON', 'SR-BL',
]

In [None]:
# Display the current predictor names.
X_features

In [None]:
# List the distinct values found in the 'PLAYING ROLE' column.
ipl_auction_df['PLAYING ROLE'].unique()

In [None]:
# Preview the one-hot encoding of 'PLAYING ROLE' for the first five rows.
pd.get_dummies(ipl_auction_df['PLAYING ROLE']).head(5)

In [None]:
# Columns that are treated as categories and one-hot encoded.
categorical_features = [
    'AGE',
    'COUNTRY',
    'PLAYING ROLE',
    'CAPTAINCY EXP',
]

In [None]:
# One-hot encode the categorical predictors; drop_first=True drops the first
# level of each one so the dummy columns are not perfectly collinear.
# dtype=int keeps every dummy numeric: recent pandas versions emit bool
# dummies by default, which turns the mixed frame into an object array once it
# is handed to statsmodels and makes sm.OLS and the VIF computation fail.
ipl_auction_encoded_df = pd.get_dummies(
    ipl_auction_df[X_features],
    columns=categorical_features,
    drop_first=True,
    dtype=int,
)

In [None]:
# Show only the first rows; dumping the whole encoded frame adds a long,
# wide table to the notebook without telling the reader anything more.
ipl_auction_encoded_df.head()

In [None]:
# Column names of the frame after one-hot encoding.
ipl_auction_encoded_df.columns

In [None]:
# Rebind X_features to the encoded column names. From here on it is a pandas
# Index of dummy-expanded columns rather than the original list of raw names.
X_features = ipl_auction_encoded_df.columns

In [None]:
# Display the predictor names after encoding.
X_features


In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split


In [None]:
# Add an explicit intercept column ('const'); sm.OLS does not add one itself.
X = sm.add_constant(ipl_auction_encoded_df)
# Response variable: the 'SOLD PRICE' column of the raw frame.
Y = ipl_auction_df['SOLD PRICE']

In [None]:
# Keep 80% of the rows for training and hold out the rest for validation;
# the fixed seed makes the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Fit the first OLS model on the training split, using every column of train_X.
ipl_model_1 = sm.OLS( train_y , train_X).fit()

In [None]:
# Regression summary (coefficients, p-values, fit statistics) of model 1.
ipl_model_1.summary2()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
def get_vif_factors(X):
    """Compute the variance inflation factor (VIF) of every column of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric design matrix; each column is treated as one predictor.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X``, in the order of ``X.columns``, with the
        columns ``'column'`` (predictor name) and ``'VIF'`` (its variance
        inflation factor).
    """
    X_matrix = X.values
    vif = [
        variance_inflation_factor(X_matrix, i)
        for i in range(X_matrix.shape[1])
    ]
    # Build the result frame in one step instead of filling an empty frame.
    return pd.DataFrame({'column': X.columns, 'VIF': vif})

In [None]:
# VIF of every encoded predictor. Selecting X[X_features] leaves out the
# 'const' column, because X_features holds only the encoded column names.
vif_factors = get_vif_factors(X[X_features])
vif_factors

In [None]:
# Display the VIF table.
vif_factors

In [None]:
# Names of the predictors whose VIF exceeds the threshold of 4.
columns_with_large_vif = vif_factors.loc[vif_factors['VIF'] > 4, 'column']

In [None]:
# Display the predictors flagged as having a large VIF.
columns_with_large_vif

In [None]:
import matplotlib.pyplot as plt

import seaborn as sn
%matplotlib inline

In [None]:
# Annotated correlation heatmap of the high-VIF predictors.
large_vif_corr = X[columns_with_large_vif].corr()
plt.figure(figsize=(12, 10))
sn.heatmap(large_vif_corr, annot=True)

In [None]:
# Predictors excluded from the next model.
columns_to_be_removed = [
    'T-RUNS', 'T-WKTS', 'RUNS-S', 'HS', 'AVE',
    'RUNS-C', 'SR-B', 'AVE-BL', 'ECON', 'ODI-SR-B',
    'ODI-RUNS-S', 'AGE_2', 'SR-BL',
]

In [None]:
# Keep the surviving predictors in their original column order. A set
# difference would return them in an arbitrary order that can change between
# kernel sessions (string hashing is randomised), which reshuffles the rows of
# every later VIF table and regression summary.
X_new_features = [col for col in X_features if col not in columns_to_be_removed]

In [None]:
# Recompute the VIFs for the reduced predictor set.
get_vif_factors(X[X_new_features])

In [None]:
# Display the reduced predictor list.
X_new_features

In [None]:
# Restrict the training design matrix to the reduced predictor set and refit.
# NOTE(review): X_new_features is derived from the encoded frame's columns, so
# the 'const' column added by sm.add_constant is not selected here and the
# model is fitted without an intercept — confirm this is intended.
train_X = train_X[X_new_features]
ipl_model_2 = sm.OLS(train_y, train_X).fit()

In [None]:
# Regression summary of model 2.
ipl_model_2.summary2()

In [None]:
# Predictors retained for the third model.
significant_vars = [
    'COUNTRY_IND',
    'COUNTRY_ENG',
    'SIXERS',
    'CAPTAINCY EXP_1',
]

In [None]:
# Narrow the training design matrix to the columns listed in significant_vars.
train_X = train_X[significant_vars]


In [None]:
# Refit OLS on the current (narrowed) training design matrix.
ipl_model_3 = sm.OLS(train_y , train_X).fit()

In [None]:
# Regression summary of model 3.
ipl_model_3.summary2()

# Residual Analysis in Multiple Linear Regression

In [None]:
### Normal probability plot of the residuals (checks the normality assumption)

In [None]:
# Normal probability plot of the model-3 residuals, drawn through pyplot.
scipy.stats.probplot(ipl_model_3.resid, dist="norm", plot=plt)

## Residual plot for homoscedasticity and model specification

In [None]:
def get_standardized_values( vals):
    """Return ``vals`` centred on its mean and scaled by its standard deviation.

    ``vals`` must provide ``mean()`` and ``std()`` methods and support
    element-wise subtraction and division.
    """
    centre = vals.mean()
    spread = vals.std()
    return (vals - centre) / spread

In [None]:
def plot_resid_fitted( fitted , resid , title):
    """Scatter standardized residuals against standardized fitted values.

    Parameters
    ----------
    fitted : values passed to ``get_standardized_values`` for the x axis.
    resid : values passed to ``get_standardized_values`` for the y axis.
    title : str
        Title shown above the plot.

    The figure is shown immediately; nothing is returned.
    """
    std_fitted = get_standardized_values(fitted)
    std_resid = get_standardized_values(resid)
    plt.scatter(std_fitted, std_resid)
    plt.xlabel('Standardized predicted values')
    plt.ylabel('Standardized residual values')
    plt.title(title)
    plt.show()

In [None]:
# Residuals vs fitted values for model 3.
plot_resid_fitted(ipl_model_3.fittedvalues, ipl_model_3.resid , 'figure')

# Detecting influencers

In [None]:
# Rows are observations (n); columns are predictors (k) of the design matrix.
n, k = train_X.shape

In [None]:
# Report the number of columns in the current training design matrix.
print('No. of variables' , k )

In [None]:
# Report the number of rows in the current training design matrix.
print('No of observations',n)

In [None]:
# Leverage threshold, computed as 3 * (k + 1) / n.
leverage_cutoff = 3*((k + 1)/n)

In [None]:
# Show the leverage threshold rounded to three decimals.
print("Cutoff for leverage values :", round(leverage_cutoff ,3))

In [None]:
from statsmodels.graphics.regressionplots import influence_plot

# Influence plot for model 3, drawn on an explicit 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
influence_plot(ipl_model_3, ax=ax)
ax.set_title('Leverage Vs Residuals')
plt.show()

In [None]:
# Inspect the rows whose index labels are 23, 58 and 83.
ipl_auction_df.loc[ipl_auction_df.index.isin([23, 58, 83])]

# Transforming the response variable

In [None]:
'''
Transformation in MLR is used to address the following issues:
1. poor fit (low R-squared value)
2. residuals do not follow a normal distribution
3. residuals are not homoscedastic
4. there is a non-linear relationship between the independent and dependent variables
'''

In [None]:
import numpy as np

# Model the square root of the response. The transform is derived from the
# untouched response Y, restricted to the training rows, rather than from
# train_y itself, so re-running this cell cannot apply the square root twice.
train_y = np.sqrt(Y.loc[train_X.index])
train_y

In [None]:
# Fit OLS on the current train_y and train_X, then show the summary.
ipl_model_4 = sm.OLS(train_y,train_X).fit()
ipl_model_4.summary2()

In [None]:
# Normal probability plot of the model-4 residuals, drawn through pyplot.
scipy.stats.probplot(ipl_model_4.resid, dist="norm", plot=plt)

# Making predictions on the validation set

In [None]:

# ipl_model_4 was fitted on the square root of the response, so its raw
# predictions are on the square-root scale. Square them to return to the
# original scale that test_y is measured in before they are scored.
pred_y = np.power(ipl_model_4.predict(test_X[train_X.columns]), 2)
pred_y


# Measuring RMSE

In [None]:
from sklearn import metrics

# Root mean squared error on the validation set. Arguments are given in the
# documented (y_true, y_pred) order; the squared error itself is symmetric.
np.sqrt(metrics.mean_squared_error(test_y, pred_y))

# Measuring the R-squared value

In [None]:
# r2_score is not symmetric: it expects (y_true, y_pred) and compares the
# residual sum of squares with the variance of y_true, so the observed values
# must come first.
np.round(metrics.r2_score(test_y, pred_y), 2)

## How accurate is our model?