In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# Load the dataset: a comma-separated text file that is parsed entirely as numbers.
filename = "CoEPrA.csv"
# Open through a context manager so the file handle is closed as soon as
# parsing finishes (previously the handle was left open for the whole session).
with open(filename, 'rt') as raw_data:
    data = np.loadtxt(raw_data, delimiter=',')

In [3]:
# Dimensions of the loaded array as (rows, columns)
np.shape(data)

(89, 5788)

In [4]:
# Every column except the last is a feature; the last column is the target.
# Slicing with :-1 instead of a hard-coded column count keeps this cell correct
# if the file gains or loses feature columns.
X = data[:, :-1]
y = data[:, -1]

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Show the dimensions of each piece of the split, one per line:
# train features, train target, test features, test target.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(71, 5787)
(71,)
(18, 5787)
(18,)


## Linear Regression Without Using Regularization

In [7]:
# Ordinary least squares with no penalty term, trained on the training split only.
model = linear_model.LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [8]:
# Predict the training targets from all 5787 feature columns
y_pred = model.predict(X=X_train)

In [9]:
# Training-set MSE of the unregularized model. A value that rounds to zero means the
# model reproduces the training targets almost exactly, which points to overfitting.
print(
    "Mean Squared Error: %.2f"
    % mean_squared_error(y_true=y_train, y_pred=y_pred)
)

Mean Squared Error: 0.00


## Validating Overfitting with Cross Validation

In [10]:
# 5-fold cross-validation on the training split, scored by negated mean squared error
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [11]:
# Average of the per-fold scores (negated MSE, so closer to zero is better)
print(scores.mean())

-4.321758759171588e+16


## Very High Mean Squared Error With Test Dataset

In [12]:
# Predict the held-out test targets with the currently fitted model
y_pred_test = model.predict(X=X_test)

In [13]:
# Test-set MSE for the predictions computed in the previous cell
print(
    "Mean Squared Error: %.2f"
    % mean_squared_error(y_true=y_test, y_pred=y_pred_test)
)

Mean Squared Error: 7973557088888348.00


## L1 / Lasso Regression

In [14]:
# L1-penalized regression; `model` now refers to the Lasso estimator.
# The large iteration cap gives the solver room to converge.
model = linear_model.Lasso(max_iter=1_000_000, alpha=0.3)
model.fit(X=X_train, y=y_train)

Lasso(alpha=0.3, max_iter=1000000)

## Checking the Weights

In [15]:
# Show the fitted Lasso weights, one per feature column (NumPy abbreviates long arrays)
print(model.coef_)

[-0.  0. -0. ... -0. -0.  0.]


## Many coefficients have become zero

In [16]:
# Locate the features that Lasso kept, i.e. those whose coefficient is non-zero.
# The result is a tuple of index arrays; element 0 holds the column positions.
index = model.coef_.nonzero()
print(index[0])

[  64  136  445  451  653  715  760  787  858 1236 1358 1422 1430 1732
 1737 1874 1879 2065 2247 2374 2380 2581 2644 2689 2708 2890 3224 3351
 3666 3931 3994 4002 4221 4303 4510 4573 4574 4637 4645 4819 4952 5153
 5154 5280 5589 5595 5648 5732]


In [17]:
# Create a new feature matrix with only selected features

In [18]:
# Keep only the training columns whose Lasso coefficient was non-zero
X_train_filter = X_train[:, index[0]]

In [19]:
# Dimensions of the reduced training matrix as (rows, selected columns)
np.shape(X_train_filter)

(71, 48)

In [20]:
# Predict on the training set (not the test set) with the fitted Lasso model
y_pred = model.predict(X=X_train)

In [21]:
# Training-set MSE for the predictions computed in the previous cell
print(
    "Mean Squared Error: %.2f"
    % mean_squared_error(y_true=y_train, y_pred=y_pred)
)

Mean Squared Error: 0.05


In [22]:
# 5-fold cross-validation of the Lasso model on the full-width training split,
# scored by negated mean squared error
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [23]:
# Average of the per-fold scores (negated MSE, so closer to zero is better)
print(scores.mean())

-1.1615211159922427


In [24]:
# Evaluate the Lasso model on the held-out test split
y_pred_test = model.predict(X=X_test)
print(
    "Mean Squared Error: %.2f"
    % mean_squared_error(y_true=y_test, y_pred=y_pred_test)
)

Mean Squared Error: 0.69


#### Conclusion: Overfitting has been reduced

## L2 Ridge Regularization

In [25]:
# L2-penalized regression trained on only the columns that Lasso kept;
# `model` now refers to the Ridge estimator.
model = linear_model.Ridge(max_iter=1_000_000, alpha=0.8)
model.fit(X=X_train_filter, y=y_train)
# Predictions on the same reduced training matrix, used for the training error below
y_pred = model.predict(X=X_train_filter)

In [26]:
# Training-set MSE of the Ridge model on the selected features
print(
    "Mean Squared Error: %.2f"
    % mean_squared_error(y_true=y_train, y_pred=y_pred)
)

Mean Squared Error: 0.03


In [27]:
# 5-fold cross-validation of the Ridge model on the Lasso-selected columns,
# scored by negated mean squared error.
# NOTE(review): the columns in X_train_filter were chosen by a Lasso fitted on all of
# X_train, so every validation fold already influenced the feature selection; this
# score may therefore be optimistic. Consider doing the selection inside each fold.
scores = cross_val_score(
    estimator=model,
    X=X_train_filter,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print(scores.mean())

-1.2017669016787922


#### Conclusion: The cross-validation score does not change much

In [28]:
# Apply the same column selection to the test split so it matches the Ridge model's input
X_test_filter = X_test[:, index[0]]

In [29]:
# Predict the held-out test targets from the reduced test matrix
y_pred_test = model.predict(X=X_test_filter)

In [30]:
# Test-set MSE of the Ridge model on the selected features
print(
    "Mean Squared Error: %.2f"
    % mean_squared_error(y_true=y_test, y_pred=y_pred_test)
)

Mean Squared Error: 1.80
