# Lasso Regression Model

In [125]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
import seaborn as sns

In [126]:
colsToLoad =["INSTNM","CONTROL","ADM_RATE","ADM_RATE_ALL","ACTCMMID","ACTENMID","ACTMTMID","ACTWRMID","SAT_AVG","SAT_AVG_ALL","SATMTMID","UGDS","HIGHDEG",  "TUITFTE", "INEXPFTE",  "COSTT4_P",
             "COSTT4_A", "PCTFLOAN","COMP_ORIG_YR2_RT", "UGDS_WHITE","UGDS_BLACK","UGDS_HISP","UGDS_ASIAN","UGDS_AIAN","UGDS_NHPI","UGDS_2MOR","UGDS_NRA","UGDS_UNKN","PPTUG_EF","COSTT4_A","COSTT4_P","TUITIONFEE_IN","TUITIONFEE_OUT","TUITIONFEE_PROG","TUITFTE","INEXPFTE","AVGFACSAL","PCTPELL","DEATH_YR3_RT","COMP_ORIG_YR3_RT","LOAN_DEATH_YR3_RT","LOAN_COMP_ORIG_YR3_RT","DEATH_YR4_RT","COMP_ORIG_YR4_RT","COMPL_RPY_1YR_RT","AGE_ENTRY","COUNT_NWNE_P10","COUNT_WNE_P10","MN_EARN_WNE_P10","MD_EARN_WNE_P10","COMPL_RPY_1YR_RT"]
scoreCardDF = pd.read_csv("MERGED2013_14_PP.csv", index_col="INSTNM",usecols=colsToLoad)

  interactivity=interactivity, compiler=compiler, result=result)


### drop all rows that do not have a value for target variable

In [127]:
scoreCardDF =scoreCardDF[(scoreCardDF["COMPL_RPY_1YR_RT"] != "PrivacySuppressed") & (scoreCardDF["COMPL_RPY_1YR_RT"] != np.nan)  & (scoreCardDF["COMPL_RPY_1YR_RT"] != None) &(scoreCardDF["COMPL_RPY_1YR_RT"].notna())]

### The following part shows the initial input features preparation
    1. Drop the un-needed columns features (Index and target Cols)
    2. Divide the features into two parts
        2.1 Categorial features
        2.2 float features

In [128]:
# all features without the target & 
input_features =  [f for f in colsToLoad if f not in ["INSTNM","COMPL_RPY_1YR_RT"]] 
cat_columns = ["HIGHDEG","CONTROL"]
float_columns = [f for f in input_features if f not in cat_columns]
target_feature = "COMPL_RPY_1YR_RT"

## Data Pre-processing:
    1. Replacing Privacy Suppressed value with null
    2. One-Hot-Encoder for categorial features
    3. Replace null valeues of input dataset with the "Median"

In [129]:
# this function is designed to convert PrivacySuppressed value in any input column to null.
def cleanPrivacySuppressed(dataFrame):
    for colName in dataFrame.columns:
        dataFrame[colName] =  dataFrame[colName].replace("PrivacySuppressed", np.nan)
        dataFrame[colName] = dataFrame[colName].astype(float)
    return dataFrame 

In [130]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
class PrivacySuppressedHandler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self  
    def transform(self, X, y=None):
        return cleanPrivacySuppressed(X)

In [131]:
# pipe line for all float columns
# it is a separate pipe line, because there is a potential to add more preprocessing steps based on model evaluation
float_pipeline = ColumnTransformer([
         ("privacy_suppressed_hnadler", PrivacySuppressedHandler(),float_columns),        
    ])

# pipe line for all Category columns
cat_pipeline = ColumnTransformer([
         ("hot_encoder",OneHotEncoder(sparse=False),cat_columns)
    ])

In [132]:
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(
    transformer_list=[ 
        ("float_pipeline", float_pipeline),
        ("cat_pipeline", cat_pipeline)
])

In [133]:
processed_input_features = full_pipeline.fit_transform(scoreCardDF)
target_output = scoreCardDF[target_feature]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [134]:
processed_input_features = pd.DataFrame(processed_input_features)

### Splitting the data to training set (80%) and testing set (20%)

In [135]:
#--split data to test and training data
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(processed_input_features, target_output.values, test_size=0.2, random_state=0)

### Using SciKit Simple Imputer to replace null values with median value

In [136]:
from sklearn.impute import SimpleImputer

def replaceNulls(dataFrame):
    imputer = SimpleImputer(strategy='mean')
    scoreCardDF_imputed = pd.DataFrame(imputer.fit_transform(dataFrame))
    scoreCardDF_imputed.columns = dataFrame.columns
    return scoreCardDF_imputed, imputer

In [137]:
train_X,imp = replaceNulls(train_X)
test_X = imp.transform(test_X)

In [138]:
catHotColumns =["HIGHDEG_Non-degree-granting","HIGHDEG_Certificate_degree","HIGHDEG_Associate_degree","HIGHDEG_Associate_degree","HIGHDEG_Associate_degree","CONTROL_Public","CONTROL_Private_nonprofit","CONTROL_Private for-profit"]
all_features = float_columns.copy()
all_features.extend(catHotColumns)

In [139]:
train_X.columns = all_features

### The following part shows the initial input features preparation
    1. Detecting the Outliers by using skew function
    2. Handling the outliers by replacing the outliers (less than 10% or larger than 90%) with the First and 
    third Quantile

In [140]:
for col in train_X.columns:
    print(col, "\t", train_X[col].skew(), "\n")
    
outliers = ['ACTENMID', 'ACTENMID', 'ACTWRMID', 'SAT_AVG', 'SAT_AVG_ALL', 'SATMTMID', 'UGDS', 'TUITFTE', 'INEXPFTE',
           'COSTT4_P', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI', 'UGDS_2MOR',
           'UGDS_NRA', 'UGDS_UNKN', 'TUITIONFEE_PROG', 'DEATH_YR3_RT', 'LOAN_DEATH_YR3_RT', 'DEATH_YR4_RT',
           'COUNT_NWNE_P10', 'COUNT_WNE_P10', 'HIGHDEG_Non-degree-granting']

ADM_RATE 	 -0.8643161613285861 

ADM_RATE_ALL 	 -0.8837847299017442 

ACTCMMID 	 0.8159101437970456 

ACTENMID 	 0.9000318943693167 

ACTMTMID 	 1.1345423045549232 

ACTWRMID 	 2.7874337838217467 

SAT_AVG 	 1.5042709310792484 

SAT_AVG_ALL 	 1.3581802162895584 

SATMTMID 	 1.6893195798962868 

UGDS 	 7.945951435453446 

TUITFTE 	 TUITFTE    3.751721
TUITFTE    3.751721
dtype: float64 

INEXPFTE 	 INEXPFTE    9.944032
INEXPFTE    9.944032
dtype: float64 

COSTT4_P 	 COSTT4_P    1.729217
COSTT4_P    1.729217
dtype: float64 

COSTT4_A 	 COSTT4_A    1.069982
COSTT4_A    1.069982
dtype: float64 

PCTFLOAN 	 -0.6772392792015576 

COMP_ORIG_YR2_RT 	 0.5452761369267128 

UGDS_WHITE 	 -0.33057709573385596 

UGDS_BLACK 	 1.768974467813123 

UGDS_HISP 	 2.435865412604955 

UGDS_ASIAN 	 4.906249978305263 

UGDS_AIAN 	 14.595488788052869 

UGDS_NHPI 	 19.701681065713895 

UGDS_2MOR 	 4.251356015858286 

UGDS_NRA 	 5.064075792440333 

UGDS_UNKN 	 3.71523355691396 

PPTUG_EF 	 1.0818979388962324 

C

In [141]:
def handleOutliers(outliers):
    for col in outliers:
        Q1 = train_X[col].quantile(0.25)
        Q3 = train_X[col].quantile(0.75)
        #print("col: ", col, "Q1: ", Q1, "Q3: ", Q3)
        
        train_X[col] = np.where(train_X[col] <Q1, Q1,train_X[col])
        train_X[col] = np.where(train_X[col] >Q3, Q3,train_X[col])
        print("New Skew Value: ", train_X[col].skew())

In [142]:
#handleOutliers(outliers)

### Exploring features importance by applying "RFE": Recursive Feature Estimation

In [143]:
from sklearn.feature_selection import RFE
def recursive_feat_estimate(model):
    rfe = RFE(model, 10)
    fit = rfe.fit(train_X, train_y)
    print("Features sorted by their rank:")
    print(sorted(zip(map(lambda x: x, rfe.ranking_), all_features)))
    return rfe

### Fit, Train and Predict the model with initial alpha "1e-3"

In [144]:
#--fit the Lasso model
lassoReg = Lasso(alpha=1e-3, normalize=True)

In [145]:
lassoReg.fit(train_X, train_y)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [146]:
pred = lassoReg.predict(test_X)
trainingPred = lassoReg.predict(train_X)

In [147]:
#--Calculate Root mean square error
from sklearn.metrics import mean_squared_error

mean = mean_squared_error(test_y, pred) 
rmse = np.sqrt(mean_squared_error(test_y,pred))

#--Calculate Root mean square error on training set
trainMean = mean_squared_error(train_y, trainingPred) 
trainRmse = np.sqrt(mean_squared_error(train_y,trainingPred)) 

print("Mean SqError: ", mean)
#print("RMSE: ", rmse)

print("TrainSet Mean SqError: ", trainMean)
#print("TrainSet RMSE: ", trainRmse)

Mean SqError:  0.020834836013656967
TrainSet Mean SqError:  0.0210725553381249


In [148]:
from sklearn.metrics import r2_score
r2_score(test_y, pred)

0.471695525042935

In [149]:
r2_score(train_y, trainingPred)

0.45863254605246373

In [150]:
recursive_feat_estimate(lassoReg)

Features sorted by their rank:
[(1, 'AGE_ENTRY'), (1, 'CONTROL_Private for-profit'), (1, 'CONTROL_Private_nonprofit'), (1, 'CONTROL_Public'), (1, 'HIGHDEG_Associate_degree'), (1, 'HIGHDEG_Associate_degree'), (1, 'HIGHDEG_Associate_degree'), (1, 'HIGHDEG_Certificate_degree'), (1, 'MD_EARN_WNE_P10'), (1, 'PCTPELL'), (2, 'HIGHDEG_Non-degree-granting'), (3, 'COMP_ORIG_YR4_RT'), (4, 'DEATH_YR4_RT'), (5, 'LOAN_COMP_ORIG_YR3_RT'), (6, 'LOAN_DEATH_YR3_RT'), (7, 'COMP_ORIG_YR3_RT'), (8, 'DEATH_YR3_RT'), (9, 'COUNT_NWNE_P10'), (10, 'MN_EARN_WNE_P10'), (11, 'COUNT_WNE_P10'), (12, 'AVGFACSAL'), (13, 'INEXPFTE'), (14, 'TUITFTE'), (15, 'TUITIONFEE_PROG'), (16, 'TUITIONFEE_OUT'), (17, 'TUITIONFEE_IN'), (18, 'COSTT4_P'), (19, 'COSTT4_A'), (20, 'PPTUG_EF'), (21, 'UGDS_UNKN'), (22, 'UGDS_NRA'), (23, 'UGDS_2MOR'), (24, 'UGDS_NHPI'), (25, 'UGDS_AIAN'), (26, 'UGDS_ASIAN'), (27, 'UGDS_HISP'), (28, 'UGDS_BLACK'), (29, 'UGDS_WHITE'), (30, 'COMP_ORIG_YR2_RT'), (31, 'PCTFLOAN'), (32, 'COSTT4_A'), (33, 'COSTT4_P

RFE(estimator=Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
  n_features_to_select=10, step=1, verbose=0)

In [151]:
lassoReg.coef_

array([-0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -1.78693066e-01, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -4.00732680e-03, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  1.26318951e-06, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -

# Explore Ridge regression with built-in cross-validation.

In [152]:
#--Trying different alphas with CV = 10
a = [1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10, 100, 1000, 10000]
lassoCV = LassoCV(alphas=a, normalize=True, cv=10)

In [153]:
lassoCV.fit(train_X, train_y)
print("Best alpha using built-in RidgeCV: %f" % lassoCV.alpha_)

Best alpha using built-in RidgeCV: 0.000010


In [154]:
#--using the best alpha to predict the test data 
alpha = lassoCV.alpha_
lassoCV = Lasso(alpha=alpha, normalize=True)
lassoCV.fit(train_X, train_y)

Lasso(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [155]:
predCV = lassoCV.predict(test_X)
trainingPredCV = lassoCV.predict(train_X)

In [156]:
#--Calculate Root mean square error
mean = mean_squared_error(test_y, predCV) 
rmse = np.sqrt(mean_squared_error(test_y,predCV))

#--Calculate Root mean square error on training set
trainMean = mean_squared_error(train_y, trainingPredCV) 
trainRmse = np.sqrt(mean_squared_error(train_y,trainingPredCV)) 

print("Mean SqError: ", mean)
#print("RMSE: ", rmse)

print("TrainSet Mean SqError: ", trainMean)
#print("TrainSet RMSE: ", trainRmse)

Mean SqError:  0.00975163827557336
TrainSet Mean SqError:  0.010115004201836787


In [157]:
from sklearn.metrics import r2_score
r2_score(test_y, predCV)

0.7527297965882217

In [158]:
r2_score(train_y, trainingPredCV)

0.7401390584316161

In [159]:
lassoCV.coef_

array([ 1.11608936e-02,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -5.80668470e-03,  0.00000000e+00,  1.41032619e-04,
       -7.13735440e-05, -8.63602862e-07, -0.00000000e+00,  2.71384111e-07,
       -1.19007646e-06, -2.04499923e-07,  4.64353744e-02, -3.68500783e-02,
        9.11834323e-02, -1.29784830e-01, -0.00000000e+00,  1.52330363e-01,
        1.10612630e-03, -0.00000000e+00, -5.15043754e-02,  1.31976880e-01,
       -1.84804455e-02,  6.39361420e-02, -5.17603396e-08, -5.49336093e-07,
        0.00000000e+00, -5.19051898e-07,  0.00000000e+00, -7.85461159e-08,
        0.00000000e+00, -6.55543605e-07, -1.67568805e-01, -0.00000000e+00,
        4.66916500e-02, -3.14510145e+02, -4.03164238e-02,  2.16875223e+00,
        6.79137802e-02, -1.22173653e-02, -4.25324631e-06, -0.00000000e+00,
        8.03180426e-07,  3.37210624e-06, -3.83123630e-02,  1.65427386e-02,
       -1.96372180e-02, -0.00000000e+00,  2.19776282e-02,  0.00000000e+00,
        4.94449202e-04, -

In [160]:
recursive_feat_estimate(lassoCV)

Features sorted by their rank:
[(1, 'COMP_ORIG_YR2_RT'), (1, 'COMP_ORIG_YR4_RT'), (1, 'CONTROL_Private for-profit'), (1, 'DEATH_YR4_RT'), (1, 'LOAN_DEATH_YR3_RT'), (1, 'PCTFLOAN'), (1, 'PCTPELL'), (1, 'UGDS_ASIAN'), (1, 'UGDS_BLACK'), (1, 'UGDS_NRA'), (2, 'UGDS_WHITE'), (3, 'COMP_ORIG_YR3_RT'), (4, 'HIGHDEG_Non-degree-granting'), (5, 'LOAN_COMP_ORIG_YR3_RT'), (6, 'HIGHDEG_Associate_degree'), (7, 'PPTUG_EF'), (8, 'HIGHDEG_Associate_degree'), (9, 'UGDS_2MOR'), (10, 'AGE_ENTRY'), (11, 'UGDS_UNKN'), (12, 'HIGHDEG_Certificate_degree'), (13, 'ACTWRMID'), (14, 'ADM_RATE'), (15, 'SAT_AVG_ALL'), (16, 'SATMTMID'), (17, 'UGDS_AIAN'), (18, 'MD_EARN_WNE_P10'), (19, 'COUNT_NWNE_P10'), (20, 'COSTT4_P'), (21, 'MN_EARN_WNE_P10'), (22, 'UGDS'), (23, 'TUITIONFEE_OUT'), (24, 'COSTT4_P'), (25, 'AVGFACSAL'), (26, 'COSTT4_A'), (27, 'CONTROL_Private_nonprofit'), (28, 'INEXPFTE'), (29, 'TUITFTE'), (30, 'TUITFTE'), (31, 'COSTT4_A'), (32, 'UGDS_HISP'), (33, 'INEXPFTE'), (34, 'SAT_AVG'), (35, 'ACTMTMID'), (36, 'A

RFE(estimator=Lasso(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
  n_features_to_select=10, step=1, verbose=0)