# Cross-Validation & Regularization

In [None]:
from sklearn.datasets import load_diabetes
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the scikit-learn diabetes data into pandas, with the target stored
# as an extra column named 'y'.
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(y=diabetes.target)

# NOTE(review): `spark` is not created in this notebook; presumably the
# kernel provides a ready-made SparkSession -- confirm before running elsewhere.
sdf = spark.createDataFrame(df)

# Pack every column except the label into a single vector column 'features'.
feature_cols = [name for name in sdf.columns if name != 'y']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
dataset = assembler.transform(sdf)
dataset.show(5)

## We haven't tested models on unseen data!

In [None]:
# Hold out half of the rows so the model is also scored on data that was
# not used for fitting.
train, test = dataset.randomSplit([0.5, 0.5])
lr = LinearRegression(labelCol='y', featuresCol='features')
model = lr.fit(train)

# Score the fitted model on the rows it was trained on ...
summary_train = model.evaluate(train)
# ... and on the held-out test rows.
summary_test = model.evaluate(test)

print('R2 (training): ', summary_train.r2)
print('R2 (testing): ', summary_test.r2)

## Increase training fraction?

In [None]:
# Refit the estimator on progressively larger training fractions and record
# R2 on the training and test parts of each split.
# `lr` is the LinearRegression estimator created in an earlier cell.
p_train = [0.6, 0.7, 0.8, 0.9]
train_r2 = []
test_r2 = []

for p in p_train:
    # A new random split is drawn for every fraction.
    train, test = dataset.randomSplit([p, 1 - p])
    model = lr.fit(train)

    # Evaluate on training data
    summary_train = model.evaluate(train)
    train_r2.append(summary_train.r2)

    # Evaluate on test (held-out) data
    summary_test = model.evaluate(test)
    test_r2.append(summary_test.r2)

plt.plot(p_train, train_r2, 'ro-', label='train')
# 'go-' (line + markers) matches the other charts in this notebook and keeps
# the four individual measurements visible.
plt.plot(p_train, test_r2, 'go-', label='test')
plt.xlabel('Training Size')
plt.ylabel('R2')
plt.legend()

## Question: How do we interpret this chart?

# Regularization for solving Ill-conditioned problems

## L2 Regularization:

$$ \mathbf{E(\mathbf{\beta})} = SSE(\mathbf{\beta}) + \frac{\lambda}{2}\|\mathbf{\beta}\|_{2}^{2}$$


### The _L2_ norm of a vector $\mathbf{x}$ is denoted as $\|\mathbf{x}\|_{2}$ (aka the _Euclidean_ norm; its matrix analogue is the _Frobenius_ norm). The penalty above uses its square, $\|\mathbf{x}\|^{2}_{2}$

## L1 Regularization (aka Lasso):

$$ \mathbf{E(\mathbf{\beta})} = SSE(\mathbf{\beta}) + \lambda\|\mathbf{\beta}\|_{1}$$


### The _L1_ norm of a vector $\mathbf{x}$ is denoted as $\|\mathbf{x}\|_{1}$ (aka the _Taxicab_ or _Manhattan_ norm)

---

## ElasticNet Regularization (Combining L1 and L2)

$$ \mathbf{E(\mathbf{\beta})} = SSE(\mathbf{\beta}) + \lambda\left[\frac{1}{2}(1 - \alpha)\|\mathbf{\beta}\|_{2}^{2} + \alpha\|\mathbf{\beta}\|_{1}\right]$$


**Note**: In MLlib models, `elasticNetParam` corresponds to $\alpha$ and `regParam` corresponds to $\lambda$, therefore:
  * For using L2 regularization set `elasticNetParam = 0` and `regParam > 0`
  * For using L1 regularization set `elasticNetParam = 1` and `regParam > 0`
  * For using ElasticNet, set `0 < elasticNetParam < 1` and `regParam > 0`

### Using L2 regularization on the Diabetes dataset

In [None]:
# Fit a single L2-regularized model (elasticNetParam=0, regParam=0.3) on a
# 70/30 split and compare R2 on both parts.
train, test = dataset.randomSplit([0.7, 0.3])
lr = LinearRegression(
    regParam=0.3,
    elasticNetParam=0,
    labelCol='y',
    featuresCol='features',
)
model = lr.fit(train)

# Score on the training rows, then on the held-out test rows.
summary_train = model.evaluate(train)
summary_test = model.evaluate(test)

print('R2 (training): ', summary_train.r2)
print('R2 (testing): ', summary_test.r2)

## Choosing the right value of $\lambda$

In [None]:
# 20 candidate values for lambda, log-spaced from 10**-1 to 10**1.
valid_lambdas = np.logspace(-1, 1, 20)

In [None]:
# Sweep the L2 penalty strength over `valid_lambdas` and record R2 on the
# training and test parts. The split is drawn once, before the loop, so all
# models are fitted and scored on the same rows.
train_r2 = []
test_r2 = []
train, test = dataset.randomSplit([0.7, 0.3])

for reg_param in valid_lambdas:
    # elasticNetParam=0 selects the pure L2 penalty; regParam is lambda.
    lr = LinearRegression(featuresCol='features', labelCol='y',
                          elasticNetParam=0, regParam=reg_param)
    model = lr.fit(train)

    # Evaluate on training data
    summary_train = model.evaluate(train)
    train_r2.append(summary_train.r2)

    # Evaluate on test (held-out) data
    summary_test = model.evaluate(test)
    test_r2.append(summary_test.r2)


plt.plot(valid_lambdas, train_r2, 'ro-', label='train')
plt.plot(valid_lambdas, test_r2, 'go-', label='test')
# Raw string: '\l' is not a valid escape sequence in a normal string literal.
plt.xlabel(r'$\lambda$')
plt.ylabel('R2')
# The candidate values are log-spaced, so use a log x-axis.
plt.xscale('log')
plt.legend()

### Exercise: Extend the search for $\lambda$ by inspecting a wider range of values (the new range is already provided in the next cell).

In [None]:
# 20 candidate values for lambda, log-spaced from 10**0 to 10**2.
valid_lambdas = np.logspace(0, 2, 20)

In [None]:
# enter code here

## Using L1 Regularization

In [None]:
# Fit a single L1-regularized model (elasticNetParam=1, regParam=0.3) on a
# 70/30 split and compare R2 on both parts.
train, test = dataset.randomSplit([0.7, 0.3])
lr = LinearRegression(
    regParam=0.3,
    elasticNetParam=1,
    labelCol='y',
    featuresCol='features',
)
model = lr.fit(train)

# Score on the training rows, then on the held-out test rows.
summary_train = model.evaluate(train)
summary_test = model.evaluate(test)

print('R2 (training): ', summary_train.r2)
print('R2 (testing): ', summary_test.r2)

### Choosing the right value of $\lambda$ for L1

In [None]:
# 20 candidate values for lambda, evenly spaced from 0 to 4 (both ends included).
valid_lambdas = np.linspace(0, 4, 20)

In [None]:
# Sweep the L1 penalty strength over `valid_lambdas` and record R2 on the
# training and test parts. The split is drawn once, before the loop, so all
# models are fitted and scored on the same rows.
train_r2 = []
test_r2 = []
train, test = dataset.randomSplit([0.7, 0.3])

for reg_param in valid_lambdas:
    # elasticNetParam=1 selects the pure L1 penalty; regParam is lambda.
    lr = LinearRegression(featuresCol='features', labelCol='y',
                          elasticNetParam=1, regParam=reg_param)
    model = lr.fit(train)

    # Evaluate on training data
    summary_train = model.evaluate(train)
    train_r2.append(summary_train.r2)

    # Evaluate on test (held-out) data
    summary_test = model.evaluate(test)
    test_r2.append(summary_test.r2)


plt.plot(valid_lambdas, train_r2, 'ro-', label='train')
plt.plot(valid_lambdas, test_r2, 'go-', label='test')
# Raw string: '\l' is not a valid escape sequence in a normal string literal.
plt.xlabel(r'$\lambda$')
plt.ylabel('R2')
# The x-axis is deliberately left linear: the candidate values are evenly
# spaced and start at 0, which a log axis cannot display.
plt.legend()

### Sparsity from the L1 Norm

In [None]:
# Fit one L1-regularized model (elasticNetParam=1) with lambda = 3.
# This fit uses the full `dataset`, not a train/test split.
l = 3
lr = LinearRegression(
    regParam=l,
    elasticNetParam=1,
    labelCol='y',
    featuresCol='features',
)
model = lr.fit(dataset)

In [None]:
# Bar chart of the fitted coefficients, one bar per input feature, labelled
# with the assembler's input column names.
coefficient_values = np.array(model.coefficients)
feature_names = assembler.getInputCols()
coefficient_series = pd.Series(coefficient_values, index=feature_names)
coefficient_series.plot(kind='bar')