In [11]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
import seaborn as sns
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

# Set plot style
sns.set_style('dark')

In [12]:
cal = pd.read_excel('table_8_offenses_known_to_law_enforcement_california_by_city_2013.xls')

cal.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,Adelanto,31165.0,198.0,2.0,,15.0,52.0,129.0,886.0,381.0,372.0,133.0,17.0
1,Agoura Hills,20762.0,19.0,0.0,,2.0,10.0,7.0,306.0,109.0,185.0,12.0,7.0
2,Alameda,76206.0,158.0,0.0,,10.0,85.0,63.0,1902.0,287.0,1285.0,330.0,17.0
3,Albany,19104.0,29.0,0.0,,1.0,24.0,4.0,557.0,94.0,388.0,75.0,7.0
4,Alhambra,84710.0,163.0,1.0,,9.0,81.0,72.0,1774.0,344.0,1196.0,234.0,7.0


In [13]:
cal.columns = ['city', 'population', 'violent', 'murder', 'rape1', 'rape', 'robbery', 
              'ag_aslt', 'property', 'burglary', 'larce_th', 'motor_th', 'arson']
cal = cal.drop(columns='rape1')

cal.head()

Unnamed: 0,city,population,violent,murder,rape,robbery,ag_aslt,property,burglary,larce_th,motor_th,arson
0,Adelanto,31165.0,198.0,2.0,15.0,52.0,129.0,886.0,381.0,372.0,133.0,17.0
1,Agoura Hills,20762.0,19.0,0.0,2.0,10.0,7.0,306.0,109.0,185.0,12.0,7.0
2,Alameda,76206.0,158.0,0.0,10.0,85.0,63.0,1902.0,287.0,1285.0,330.0,17.0
3,Albany,19104.0,29.0,0.0,1.0,24.0,4.0,557.0,94.0,388.0,75.0,7.0
4,Alhambra,84710.0,163.0,1.0,9.0,81.0,72.0,1774.0,344.0,1196.0,234.0,7.0


In [14]:
# A bit of feature engineering:

# Making a Binary Violent Crime Column:
cal['bin_violent'] = np.where(cal['violent']>0, 1, 0)

#Same for Murder, Rape, Property_Crime and Larceny:
cal['bin_arson'] = np.where(cal['arson']>0, 1, 0)

cal['bin_murder'] = np.where(cal['murder']>0, 1, 0)

cal['bin_rape'] = np.where(cal['rape']>0, 1, 0)

cal['bin_property'] = np.where(cal['property']>0, 1, 0)

In [16]:
# some non-binary features:

cal['larce/violent'] = cal.larce_th / cal.violent
cal['property/rape'] = cal.property / cal.rape

In [17]:
cal.loc[cal['larce/violent'] == np.inf] = np.nan
cal.loc[cal['property/rape'] == np.inf] = np.nan
cal = cal.dropna(axis = 0, how = 'any')

In [22]:
cal.describe()

Unnamed: 0,population,violent,murder,rape,robbery,ag_aslt,property,burglary,larce_th,motor_th,arson,bin_violent,bin_arson,bin_murder,bin_rape,bin_property,larce/violent,property/rape
count,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0,402.0
mean,77454.279,307.704,3.463,15.085,118.776,170.381,2137.749,466.453,1326.607,344.689,15.294,1.0,0.858,0.517,1.0,1.0,6.964,194.824
std,220742.282,1071.676,14.65,46.078,527.361,507.072,5775.551,1089.909,3740.146,1046.619,75.316,0.0,0.349,0.5,0.0,0.0,5.388,210.161
min,223.0,1.0,0.0,1.0,0.0,0.0,6.0,2.0,4.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.4,6.0
25%,16925.5,41.0,0.0,3.0,8.0,27.0,400.5,101.5,228.75,36.25,1.0,1.0,1.0,0.0,1.0,1.0,3.329,84.895
50%,40057.5,103.0,1.0,7.0,26.0,63.0,893.5,212.5,560.5,100.0,4.0,1.0,1.0,1.0,1.0,1.0,5.73,141.058
75%,79614.25,221.75,2.0,13.0,77.0,133.75,2064.25,467.0,1320.0,296.0,12.0,1.0,1.0,1.0,1.0,1.0,8.858,222.792
max,3878725.0,16524.0,251.0,764.0,7885.0,7624.0,85844.0,15728.0,55734.0,14382.0,1430.0,1.0,1.0,1.0,1.0,1.0,47.6,2038.0


In [23]:
cal.columns

Index(['city', 'population', 'violent', 'murder', 'rape', 'robbery', 'ag_aslt',
       'property', 'burglary', 'larce_th', 'motor_th', 'arson', 'bin_violent',
       'bin_arson', 'bin_murder', 'bin_rape', 'bin_property', 'larce/violent',
       'property/rape'],
      dtype='object')

In [24]:
features = ['population', 'violent', 'murder', 'rape', 'robbery', 'ag_aslt',
       'property', 'burglary', 'larce_th', 'motor_th', 'bin_violent',
       'bin_murder', 'bin_rape', 'bin_property', 'larce/violent',
       'property/rape']

from sklearn.preprocessing import StandardScaler

# Separating out the features
x = cal.loc[:, features].values
# Separating out the target
y = cal.loc[:,['bin_arson']].values
# Standardizing the features
x = StandardScaler().fit_transform(x)

In [25]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['pc1','pc2', 'pc3'])

In [26]:
cal_m = pd.concat([principalDf, cal], axis = 1).dropna()
cal_m.head()

Unnamed: 0,pc1,pc2,pc3,city,population,violent,murder,rape,robbery,ag_aslt,...,larce_th,motor_th,arson,bin_violent,bin_arson,bin_murder,bin_rape,bin_property,larce/violent,property/rape
0,-0.305,-1.497,0.316,Adelanto,31165.0,198.0,2.0,15.0,52.0,129.0,...,372.0,133.0,17.0,1.0,1.0,1.0,1.0,1.0,1.879,59.067
1,-1.007,0.656,-0.86,Agoura Hills,20762.0,19.0,0.0,2.0,10.0,7.0,...,185.0,12.0,7.0,1.0,1.0,0.0,1.0,1.0,9.737,153.0
2,-0.413,0.6,-0.733,Alameda,76206.0,158.0,0.0,10.0,85.0,63.0,...,1285.0,330.0,17.0,1.0,1.0,0.0,1.0,1.0,8.133,190.2
3,-1.002,2.176,0.432,Albany,19104.0,29.0,0.0,1.0,24.0,4.0,...,388.0,75.0,7.0,1.0,1.0,0.0,1.0,1.0,13.379,557.0
4,-0.233,-0.386,0.693,Alhambra,84710.0,163.0,1.0,9.0,81.0,72.0,...,1196.0,234.0,7.0,1.0,1.0,1.0,1.0,1.0,7.337,197.111


In [28]:
features = ['population', 'violent', 'murder', 'rape', 'robbery', 'ag_aslt',
       'property', 'burglary', 'larce_th', 'motor_th', 'bin_violent',
       'bin_murder', 'bin_rape', 'bin_property', 'larce/violent',
       'property/rape', 'pc1','pc2', 'pc3']

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectKBest, f_classif

X = cal_m[features]
y = cal_m['bin_arson']

# place results into a dataframe
selector=SelectKBest(score_func=f_classif,k=19)
selector.fit(X,y)
scores = pd.DataFrame()
scores["Attribute"] = features
scores["F Score"] = selector.scores_
scores["P Value"] = selector.pvalues_
scores["Support"] = selector.get_support()

scores.sort_values(by=['F Score'], ascending=False)

  f = msb / msw


Unnamed: 0,Attribute,F Score,P Value,Support
11,bin_murder,22.883,0.0,True
7,burglary,4.981,0.026,True
6,property,4.563,0.033,True
15,property/rape,4.526,0.034,True
8,larce_th,4.302,0.039,True
9,motor_th,4.278,0.039,True
0,population,3.709,0.055,True
5,ag_aslt,3.232,0.073,True
1,violent,2.722,0.1,True
18,pc3,2.636,0.105,True


In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

In [31]:
# Separating out the features
X = cal_m[features]
# Separating out the target
Y = cal_m['bin_arson']
# Standardizing the features
X = StandardScaler().fit_transform(X)

#split out a training and test set
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

### Logistic Regression

We'll start off with a standard logistic regression. When you run and fit the model, you see that it performs decently well, with probably a slight bit of overfitting. The main issues here are the large standard deviation among the coefficients, let's see if we can reduce some of that through Lasso and Ridge regressions.

In [80]:
warnings.filterwarnings(action="ignore", module="sklearn")

lr = LogisticRegression(C=1e9, max_iter = 150)
lr.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(lr.coef_)

print('\nCoefficients Mean:')
print(lr.coef_.mean())

print('\nCoefficients Standard Deviation:')
print(lr.coef_.std())

print('\nIntercept:')
print(lr.intercept_)

print('\nTrain Percentage accuracy:')
print(lr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lr.score(x_test, y_test))

print('\nCross Validation Score:')
print(cross_val_score(lr, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(lr, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(lr, x_train, y_train, cv = 4).std())

Coefficients:
[[  1.92469855  -2.56136224  -5.06633118 -12.09712425  -5.90780109
    2.06900561  13.6344707   -8.16466569  17.75518878  19.96859363
    0.           1.01158762   0.           0.           0.03642959
   -0.72296135  -0.29683064  -0.33159003   0.2159957 ]]

Coefficients Mean:
1.1298580901126074

Coefficients Standard Deviation:
7.8322716363220195

Intercept:
[9.14608741]

Train Percentage accuracy:
0.8977272727272727

Test Percentage accuracy:
0.8089887640449438

Cross Validation Score:
[0.85074627 0.86363636 0.86363636 0.83076923]

Cross Validation Mean:
0.8521970566746687

Cross Validation Standard Deviation:
0.013444069291276301


### Ridge Regression

To start the ridge regressions, we will set the alpha to 5 and see how it plays out. Immediately you can see a drop in the variance among the coefficients, as well as the mean of coefficients approaching 0. Still though, it would be ideal if we could bring some of the coefficients even farther down.

In [60]:
# Lower the C and see if it changes the coefficients
ridgeregr = LogisticRegression(penalty = 'l2', C = 5)
ridgeregr.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(ridgeregr.coef_)

print('\nCoefficients Mean:')
print(ridgeregr.coef_.mean())

print('\nCoefficients Standard Deviation:')
print(ridgeregr.coef_.std())

print('\nIntercept:')
print(ridgeregr.intercept_)

origparams = ridgeregr.coef_[0]
print('\nParameter estimates:')
print(origparams)

print('\nTrain Percentage accuracy:')
print(ridgeregr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(ridgeregr.score(x_test, y_test))

print('\nCross Validation Score:')
print(cross_val_score(ridgeregr, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(ridgeregr, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(ridgeregr, x_train, y_train, cv = 4).std())

Coefficients:
[[ 2.18446684  0.19419907 -1.15396196 -1.54254879  0.47822391  0.08481588
   1.99347019  0.45576246  2.40510195  1.91685103  0.          0.65124456
   0.          0.          0.21609317  0.15810681 -0.04837251 -0.2650119
   0.08127158]]

Coefficients Mean:
0.4110374882048433

Coefficients Standard Deviation:
1.0198546540280176

Intercept:
[3.71117164]

Parameter estimates:
[ 2.18446684  0.19419907 -1.15396196 -1.54254879  0.47822391  0.08481588
  1.99347019  0.45576246  2.40510195  1.91685103  0.          0.65124456
  0.          0.          0.21609317  0.15810681 -0.04837251 -0.2650119
  0.08127158]

Train Percentage accuracy:
0.8787878787878788

Test Percentage accuracy:
0.8426966292134831

Cross Validation Score:
[0.86567164 0.86363636 0.89393939 0.87692308]

Cross Validation Mean:
0.8750426190724698

Cross Validation Standard Deviation:
0.01202645105987019


#### Lower the Alpha
Lowering the alpha to .25 puts brings the mean coefficient even lower, as well as greatly reduces the standard deviation. Our accuracy has been improved a small amount as well.

In [58]:
#ridgeregr = linear_model.Ridge(alpha=25, fit_intercept=False) 
ridgeregr2 = LogisticRegression(penalty = 'l2', C = .25)
ridgeregr2.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(ridgeregr2.coef_)

print('\nCoefficients Mean:')
print(ridgeregr2.coef_.mean())

print('\nCoefficients Standard Deviation:')
print(ridgeregr2.coef_.std())

print('\nIntercept:')
print(ridgeregr2.intercept_)

origparams = ridgeregr2.coef_[0]
print('\nParameter estimates:')
print(origparams)

print('\nTrain Percentage accuracy:')
print(ridgeregr2.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(ridgeregr2.score(x_test, y_test))

print('\nCross Validation Score:')
print(cross_val_score(ridgeregr2, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(ridgeregr2, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(ridgeregr2, x_train, y_train, cv = 4).std())

Coefficients:
[[ 0.296665    0.05089034 -0.11156334  0.02160474  0.02178413  0.08717233
   0.25267216  0.20751747  0.27643275  0.19152254  0.          0.55573187
   0.          0.          0.19087628  0.29016742 -0.01208969 -0.20274429
   0.05080321]]

Coefficients Mean:
0.11407594318883658

Coefficients Standard Deviation:
0.1713164809581057

Intercept:
[2.01780506]

Parameter estimates:
[ 0.296665    0.05089034 -0.11156334  0.02160474  0.02178413  0.08717233
  0.25267216  0.20751747  0.27643275  0.19152254  0.          0.55573187
  0.          0.          0.19087628  0.29016742 -0.01208969 -0.20274429
  0.05080321]

Train Percentage accuracy:
0.875

Test Percentage accuracy:
0.8426966292134831

Cross Validation Score:
[0.86567164 0.87878788 0.87878788 0.87692308]

Cross Validation Mean:
0.8750426190724698

Cross Validation Standard Deviation:
0.005463636097156949


#### Even More

Lowering the alpha again helps even more, the mean coeefficient, intercept, and standard deviation among coefficents all lowered.

In [74]:
#ridgeregr = linear_model.Ridge(alpha=25, fit_intercept=False) 
ridgeregr3 = LogisticRegression(penalty = 'l2', C = .05)
ridgeregr3.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(ridgeregr3.coef_)

print('\nCoefficients Mean:')
print(ridgeregr3.coef_.mean())

print('\nCoefficients Standard Deviation:')
print(ridgeregr3.coef_.std())

print('\nIntercept:')
print(ridgeregr3.intercept_)

origparams = ridgeregr3.coef_[0]
print('\nParameter estimates:')
print(origparams)

print('\nTrain Percentage accuracy:')
print(ridgeregr3.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(ridgeregr3.score(x_test, y_test))

print('\nCross Validation Score:')
print(cross_val_score(ridgeregr3, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(ridgeregr3, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(ridgeregr3, x_train, y_train, cv = 4).std())

Coefficients:
[[ 0.06102135  0.01279617 -0.02064798  0.01646353  0.0021074   0.02426108
   0.05993288  0.06164304  0.05908914  0.05588521  0.          0.34145264
   0.          0.          0.09522114  0.16145362 -0.00354075 -0.10856677
   0.03608011]]

Coefficients Mean:
0.04498167371404938

Coefficients Standard Deviation:
0.08752921410115516

Intercept:
[1.30468925]

Parameter estimates:
[ 0.06102135  0.01279617 -0.02064798  0.01646353  0.0021074   0.02426108
  0.05993288  0.06164304  0.05908914  0.05588521  0.          0.34145264
  0.          0.          0.09522114  0.16145362 -0.00354075 -0.10856677
  0.03608011]

Train Percentage accuracy:
0.875

Test Percentage accuracy:
0.8426966292134831

Cross Validation Score:
[0.86567164 0.87878788 0.87878788 0.87692308]

Cross Validation Mean:
0.8750426190724698

Cross Validation Standard Deviation:
0.005463636097156949


### Lasso

We'll start off the lasso with the same parameters as the ridge. With the high alpha on the lasso, it performs similarly to the logistic and the ridge. The coefficients are not super consistent but the model performs similarly.

In [77]:
# C at 5 to match the ridge
lass = LogisticRegression(penalty = 'l1', C = 5)
lass.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(lass.coef_)

print('\nCoefficients Mean:')
print(lass.coef_.mean())

print('\nCoefficients Standard Deviation:')
print(lass.coef_.std())

print('\nIntercept:')
print(lass.intercept_)

print('\nTrain Percentage accuracy:')
print(lass.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lass.score(x_test, y_test))

origparams = np.append(lass.coef_, lass.intercept_)
print('\nParameter estimates:')
print(origparams)

print('\nCross Validation Score:')
print(cross_val_score(lass, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(lass, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(lass, x_train, y_train, cv = 4).std())

Coefficients:
[[ 1.99165129  0.         -1.6744339  -4.69149268  0.          0.
   0.          0.         10.40843402  5.22983725  0.          0.70398575
   0.          0.          0.12560287 -0.12607902 -0.07784994 -0.26789971
   0.07453674]]

Coefficients Mean:
0.6155943507752297

Coefficients Standard Deviation:
2.882594650803918

Intercept:
[5.03902692]

Train Percentage accuracy:
0.8977272727272727

Test Percentage accuracy:
0.8202247191011236

Parameter estimates:
[ 1.99165129  0.         -1.6744339  -4.69149268  0.          0.
  0.          0.         10.40843402  5.22983725  0.          0.70398575
  0.          0.          0.12560287 -0.12607902 -0.07784994 -0.26789971
  0.07453674  5.03902692]

Cross Validation Score:
[0.86567164 0.86363636 0.89393939 0.87692308]

Cross Validation Mean:
0.8750426190724698

Cross Validation Standard Deviation:
0.01202645105987019


#### Lower the Alpha

Lowering the alpha here improves the model similarly as it did in the ridge model but even more so. The coefficients for the most part are lower but the deviation is a bit higher. The mean is of course thrown off a bit as more features get eliminated but fortunately we haven't lost any accuracy here.

In [78]:
# C at 5 to match the ridge
lass2 = LogisticRegression(penalty = 'l1', C = .25)
lass2.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(lass2.coef_)

print('\nCoefficients Mean:')
print(lass2.coef_.mean())

print('\nCoefficients Standard Deviation:')
print(lass2.coef_.std())

print('\nIntercept:')
print(lass2.intercept_)

print('\nTrain Percentage accuracy:')
print(lass2.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lass2.score(x_test, y_test))

origparams = np.append(lass2.coef_, lass2.intercept_)
print('\nParameter estimates:')
print(origparams)

print('\nCross Validation Score:')
print(cross_val_score(lass2, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(lass2, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(lass2, x_train, y_train, cv = 4).std())

Coefficients:
[[ 0.          0.          0.          0.          0.          0.
   0.11210265  0.          0.          0.          0.          0.64003438
   0.          0.          0.08391891  0.22736844  0.         -0.10750313
   0.        ]]

Coefficients Mean:
0.050311644664460746

Coefficients Standard Deviation:
0.15293836907254643

Intercept:
[1.95979343]

Train Percentage accuracy:
0.875

Test Percentage accuracy:
0.8426966292134831

Parameter estimates:
[ 0.          0.          0.          0.          0.          0.
  0.11210265  0.          0.          0.          0.          0.64003438
  0.          0.          0.08391891  0.22736844  0.         -0.10750313
  0.          1.95979343]

Cross Validation Score:
[0.86567164 0.87878788 0.87878788 0.87692308]

Cross Validation Mean:
0.8750426190724698

Cross Validation Standard Deviation:
0.005463636097156949


#### Even More

Moving the alpha down doesn't seem to affect our accuracy but does lower the intercept and coefficients even further.

In [71]:
lass3 = LogisticRegression(penalty = 'l1', C = .05)
lass3.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(lass3.coef_)

print('\nCoefficients Mean:')
print(lass3.coef_.mean())

print('\nCoefficients Standard Deviation:')
print(lass3.coef_.std())

print('\nIntercept:')
print(lass3.intercept_)

print('\nTrain Percentage accuracy:')
print(lass3.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lass3.score(x_test, y_test))

origparams = np.append(lass3.coef_, lass3.intercept_)
print('\nParameter estimates:')
print(origparams)

print('\nCross Validation Score:')
print(cross_val_score(lass3, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(lass3, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(lass3, x_train, y_train, cv = 4).std())

Coefficients:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.06439847
  0.         0.         0.         0.         0.         0.
  0.        ]]

Coefficients Mean:
0.003389393410066565

Coefficients Standard Deviation:
0.014379978386202388

Intercept:
[1.3812554]

Train Percentage accuracy:
0.875

Test Percentage accuracy:
0.8426966292134831

Parameter estimates:
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.06439847
 0.         0.         0.         0.         0.         0.
 0.         1.3812554 ]

Cross Validation Score:
[0.86567164 0.87878788 0.87878788 0.87692308]

Cross Validation Mean:
0.8750426190724698

Cross Validation Standard Deviation:
0.005463636097156949


### Conclusion

For the most effective route, the lass or ridge model with an alpha of .05 works with the highest accuracy at what seems to be the lowest cost. For a dataset that only has 17 features in it, I would recommend sticking with the ridge model. With so few features you would most likely be able to extract the most effective features manually instead of setting up the lasso.