# Add features

In [1]:
import pandas as pd
import os  

os.getcwd()
data = pd.read_csv('dataProcessed.csv') # 11 terms

for i in range(2, 12):
    for j in range(i + 1, 12):
        name = data.columns.values.tolist()[i] + " * " + data.columns.values.tolist()[j]
        data[name] = data.iloc[:, i] * data.iloc[:, j]

data.to_csv('result.csv') # 50 terms

# Lasso regression

In [2]:
import numpy as np
import pandas as pd
import os

from sklearn.linear_model import Lasso,LassoCV,LassoLarsCV

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

os.getcwd()
data = pd.read_csv('result.csv')

X = data.iloc[:,2:]
Y = data.iloc[:,1]

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,Y,random_state=0)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

linlasso = Lasso(alpha=2.0).fit(X_train_scaled, y_train)

print('lasso regression linear model intercept: {}'.format(linlasso.intercept_))

print('lasso regression linear model coeff:\n{}'.format(linlasso.coef_))

print('Non-zero features: {}'.format(np.sum(linlasso.coef_ != 0)))

print('R-squared score (training): {:.3f}'.format(linlasso.score(X_train, y_train)))

print('R_squared score (test): {:.3f}\n'.format(linlasso.score(X_test, y_test)))

print('Features with non-zero weight (sorted by absolute magnitude):')

for e in sorted (list(zip(list(X),linlasso.coef_)), key = lambda e: -abs(e[1])):
    if e[1] != 0:
        print('\t{}, {:.3f}'.format(e[0], e[1]))

lasso regression linear model intercept: 0.9265930000000001
lasso regression linear model coeff:
[ 0.  0.  0.  0.  0. -0.  0.  0.  0. -0.  0.  0.  0.  0.  0. -0. -0.  0.
  0.  0. -0. -0.  0.  0.  0.  0.  0. -0. -0.  0.  0.  0.  0.  0.  0.  0.
 -0.  0. -0. -0.  0. -0.  0.  0. -0.  0.  0.  0. -0.  0.]
Non-zero features: 0
R-squared score (training): 0.000
R_squared score (test): -0.013

Features with non-zero weight (sorted by absolute magnitude):


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [5]:
print('Lasso regression: effect of alpha regularization\n\
      parameter on number of features kept in final model\n')

for alpha in [0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 5, 10]:
    linlasso = Lasso(alpha).fit(X_train_scaled, y_train)
    r2_train = linlasso.score(X_train_scaled, y_train)
    r2_test = linlasso.score(X_test_scaled, y_test)
    
    print('Alpha = {:.4f}\nFeatures kept: {}, r-squared training: {:.2f}, \
          r_squared test: {:.2f}'
          .format(alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test))

Lasso regression: effect of alpha regularization
      parameter on number of features kept in final model

Alpha = 0.0001
Features kept: 49, r-squared training: 0.76,           r_squared test: 0.26
Alpha = 0.0005
Features kept: 44, r-squared training: 0.75,           r_squared test: 0.36
Alpha = 0.0010
Features kept: 40, r-squared training: 0.74,           r_squared test: 0.40
Alpha = 0.0100
Features kept: 13, r-squared training: 0.58,           r_squared test: 0.34
Alpha = 0.1000
Features kept: 1, r-squared training: 0.09,           r_squared test: 0.08
Alpha = 1.0000
Features kept: 0, r-squared training: 0.00,           r_squared test: -0.01
Alpha = 5.0000
Features kept: 0, r-squared training: 0.00,           r_squared test: -0.01
Alpha = 10.0000
Features kept: 0, r-squared training: 0.00,           r_squared test: -0.01


In [7]:
linlasso = Lasso(0.001).fit(X_train_scaled, y_train)
r2_train = linlasso.score(X_train_scaled, y_train)
r2_test = linlasso.score(X_test_scaled, y_test)
print('Alpha = {:.4f}\nFeatures kept: {}, r-squared training: {:.2f}, \
     r_squared test: {:.2f}'
     .format(alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test))
print('Features with non-zero weight (sorted by absolute magnitude):')
for e in sorted (list(zip(list(X),linlasso.coef_)), key = lambda e: -abs(e[1])):
    if e[1] != 0:
        print('\t{}, {:.4f}'.format(e[0], e[1]))

Alpha = 10.0000
Features kept: 40, r-squared training: 0.74,      r_squared test: 0.40
Features with non-zero weight (sorted by absolute magnitude):
	X * XY, 1.7916
	x, 1.5078
	x * x2, -1.3596
	Y2 * y, 1.3125
	xy * y2, -1.2889
	Y * y2, 1.2299
	xy, 1.0902
	x2, -0.9315
	x2 * y2, 0.8358
	X2 * x2, 0.6444
	X2 * y, 0.6291
	X, 0.5765
	Y * x2, 0.5570
	Y2 * x, -0.5277
	XY * x, 0.4734
	Y * y, -0.4575
	Y2, -0.3927
	X2, -0.3113
	X * y2, 0.3033
	X * x2, 0.3003
	XY * y2, -0.2953
	Y, -0.2858
	y2, 0.2148
	X2 * xy, 0.1843
	X * x, 0.1720
	XY * x2, 0.1594
	x * y2, 0.1583
	XY * X2, 0.1536
	y, 0.1445
	Y * x, 0.1414
	XY * y, -0.1397
	Y * xy, 0.1379
	XY * xy, -0.1339
	Y2 * x2, 0.0843
	X * y, -0.0830
	X2 * x, 0.0532
	X2 * y2, 0.0522
	X2 * Y2, -0.0479
	y * y2, -0.0322
	X * X2, -0.0322
