# Pair Problem

Practice Lasso regularization technique in four steps on the given data set:

1) Use the KFold function from sklearn to divide the data into 5 training/test sets.

2) Tune the lambda parameter in the lasso model by looping over a grid of possible lambdas (sklearn: ridge)

For each candidate lambda, loop over the 5 training/test sets.  
On each training/test set run the lasso model on the training set and then compute and record the prediction error in the test set.  
Finally total the prediction error for the 5 training/test sets.
3) Set lambda to be the value that minimizes prediction error.

4) Run the lasso model again with the optimal lambda determined in step 3. Which variables would you consider excluding on the basis of these results?

In [156]:
import pandas as pd
from sklearn import cross_validation
import statsmodels.api as sm
from sklearn.cross_validation import KFold
from sklearn.linear_model import Lasso


In [157]:

# >>> import numpy as np
# >>> from sklearn.cross_validation import KFold

# >>> kf = KFold(20, n_folds=4)
# >>> for train, test in kf:
#     print("%s %s" % (train, test))


In [114]:
data = pd.read_csv("Lasso_practice_data.csv")

In [115]:
data.shape

(2000, 21)

In [116]:
print data.describe()

                x1           x2           x3           x4           x5  \
count  2000.000000  2000.000000  2000.000000  2000.000000  2000.000000   
mean     -0.011640     0.027039    -0.016687    -0.017795     0.007121   
std       1.012714     0.995109     1.006237     1.004680     1.015677   
min      -3.488603    -3.900697    -4.326393    -3.091918    -3.182018   
25%      -0.699153    -0.639319    -0.708722    -0.677776    -0.662261   
50%      -0.016876     0.013312    -0.019923    -0.015773     0.006291   
75%       0.677067     0.714541     0.672212     0.659879     0.699014   
max       3.025285     4.354047     2.956734     4.352949     3.312204   

                x6           x7           x8           x9          x10  \
count  2000.000000  2000.000000  2000.000000  2000.000000  2000.000000   
mean      0.020547     0.012657    -0.018621     0.014379     0.001625   
std       0.998204     0.987151     0.984812     0.985713     1.034154   
min      -3.161528    -3.552433    -3

In [117]:
# Note!!! It is very important that you convert it into a numpy array and not enter it as a pandas array
y = np.array(data["y"])
X = np.array(data.drop("y", 1))

In [118]:
# kf_scrap = KFold(15, n_folds=5)
# for train_index, test_index in kf_scrap:
#     print("TRAIN:", train_index, "TEST:", test_index)  
#     #print("%s %s" % (train_index, test_index))

In [119]:

# # This function will take a sample with a certain number of elements (the first parameter), and
# # and break it into the number of folds specified, with that many subsets as well.

# kf_scrap = KFold(20, n_folds=5)
# #, shuffle=False,random_state=None)
# print len(kf_scrap)   # This is the number of folds and iterations

# for train_index, test_index in kf_scrap:
#     print("%s %s" % (train_index, test_index))
#     #print("TRAIN:", train_index, "TEST:", test_index)
#     #print("%s %s" % (train_index.shape, test_index.shape))
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
    
#     # After this step is where we do the fitting and etc...
#     # For each fold, train on the training sets (X_train & y_train)
    
    
#     # For each fold, get a cost for the test sets (X_test & y_test)

    
#     # Store the cost in a separate cost list


In [193]:
kf = KFold(n = 2000, n_folds=5)
#, shuffle=True,random_state=0)
print X.shape
print len(kf)

#lambda_range = np.arange(0,2,0.001)    # Ideal to start with a course grid and then make it finer. E.g. (0,1,0.05) -> (0,0.05, 0.001)
lambda_range = np.arange(0,0.033,0.0001)    # Ideal to start with a course grid and then make it finer. E.g. (0,1,0.05) -> (0,0.05, 0.001)

error_per_lambda = []
#print lambda_range

for lambda_value in lambda_range:
    errors_per_fold_list = []
    for train_index, test_index in kf:
        #print("TRAIN:", train_index, "TEST:", test_index)
        #print("%s %s" % (train_index.shape, test_index.shape))

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # After this step is where we do the fitting and etc...
        # For each fold, train on the training sets (X_train & y_train)
        model = Lasso(lambda_value)    # The number entered here is the lambda /alpha variable
        model.fit(X_train,y_train)

        # For each fold, get a cost for the test sets (X_test & y_test)
        y_hat = model.predict(X_test)      # this will generate the predictions
        errors = y_test - y_hat
        squared_errors = errors * errors
        sum_squared_errors = sum(squared_errors)
   
    #   Returns the coefficient of determination R^2 of the prediction.
    #   The coefficient R^2 is defined as (1 - u/v), where u is the regression sum of squares ((y_true - y_pred) ** 2).sum() and v is the residual sum of squares ((y_true - y_true.mean()) ** 2).sum(). Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

        #r_squared = model.score(X_test,y_test)    # R_squared = 1 - (SSE/SST)
        #sse_divied_sst = 1 - r_squared

        # Store the cost in a separate cost list

        errors_per_fold_list.append(sum_squared_errors)
        # print errors_per_fold_list
        # Compute the average cost across all folds for the given lambda

    final_lambda_cost = np.mean(errors_per_fold_list)

    # Store the cost of the lambda

    error_per_lambda.append(final_lambda_cost)

(2000, 20)
5




In [194]:
lambda_error_df = pd.DataFrame({"Alpha/Lambda Value":lambda_range,"Mean Error per Fold": error_per_lambda}).sort(columns = "Mean Error per Fold")
print lambda_error_df.head(20)

     Alpha/Lambda Value  Mean Error per Fold
151              0.0151           404.992063
150              0.0150           404.992282
149              0.0149           404.992611
148              0.0148           404.993050
152              0.0152           404.993363
147              0.0147           404.993600
146              0.0146           404.994260
153              0.0153           404.994771
145              0.0145           404.995037
144              0.0144           404.995924
154              0.0154           404.996284
143              0.0143           404.996639
142              0.0142           404.997147
141              0.0141           404.997766
155              0.0155           404.997905
140              0.0140           404.998497
139              0.0139           404.999340
156              0.0156           404.999635
138              0.0138           405.000294
157              0.0157           405.000714


In [186]:
best_lambda =  0.0243
model_final = Lasso(best_lambda)
model_final.fit(X,y)
print model_final.coef_

[-0.         -1.78806882 -0.11457123  0.         -0.          1.83363665
  0.          0.         -0.18350592  0.15521782 -0.         -0.         -0.
 -2.19425204  0.          0.99649116  0.03126941 -0.0025321   0.         -0.3317956 ]
