In [1]:
# import dependencies and global settings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate, KFold
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error


from scipy.io import arff

In [2]:
# load linear features (X) and target variable (y) dataframes
X = pd.read_csv('./data_frames/housing_X_features.csv', index_col=0)
y = pd.read_csv('./data_frames/housing_y_target.csv', index_col=0)

In [3]:
# categorical and numeric feature count and feature list
num_count = X.dtypes.value_counts().iloc[0] + X.dtypes.value_counts().iloc[2].sum() # numeric feature count
num_cols = X.select_dtypes(exclude='object').columns
#num_cols

In [4]:
cat_count = X.dtypes.value_counts().iloc[1] # categorical feature count
cat_cols = X.columns[X.dtypes==object]
#cat_cols

In [5]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6, test_size=0.2)

In [6]:
#pipeline

cat_vals = Pipeline([("imputer", SimpleImputer(strategy='most_frequent')), 
                     ("ohe", OneHotEncoder(handle_unknown='ignore'))])

# 7. Create numerical preprocessing pipeline
# Using mean to fill in missing values and standard scaling of features
num_vals = Pipeline([("imputer", SimpleImputer(strategy='mean'))])#, 
                     #("scale", StandardScaler())])

# 8. Create column transformer that will preprocess the numerical and categorical features separately
preprocess = ColumnTransformer(transformers=[("cat_process", cat_vals, cat_cols), ("num_process", num_vals, num_cols)])

## Linear Regression Baseline

In [7]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
lr_base = Pipeline([("preprocess",preprocess), 
                    ("MLR_baseline", LinearRegression())])

In [8]:
# 10. Fit the pipeline on the training data
lr_base.fit(X_train, y_train)

# cross validation
k=5
cv = KFold(n_splits=k, shuffle=True, random_state=1) # shuffle=true due to many classes involved in data 
train_scores = cross_val_score(lr_base, X_train, y_train, cv=cv)

for score in train_scores:
    print(score)

print('Mean=' + str(train_scores.mean()) + '    ' + 'Standard Divation=' + str(train_scores.std()))

0.7999510746847209
0.9055261436149149
0.9273785025019143
0.8989053749894982
0.7833014848274479
Mean=0.8630125161236993    Standard Divation=0.059277786647506055


In [9]:
#Predict the pipeline on the train data
print('Pipeline Accuracy Train Set:')
print(lr_base.score(X_train, y_train))

y_pred_train = lr_base.predict(X_train)
#Predict the pipeline on the test data
print('Pipeline Mean Absolute Error Train Set:')
print(mean_absolute_error(y_train, y_pred_train))

Pipeline Accuracy Train Set:
0.9282235722877771
Pipeline Mean Absolute Error Train Set:
13407.92170738989


In [10]:
#Predict the pipeline on the test data
print('Pipeline Accuracy Test Set:')
print(lr_base.score(X_test, y_test))

y_pred_test = lr_base.predict(X_test)
#Predict the pipeline on the test data
print('Pipeline Mean Absolute Error Train Set:')
print(mean_absolute_error(y_test, y_pred_test))

Pipeline Accuracy Test Set:
-271714.3998848176
Pipeline Mean Absolute Error Train Set:
2565713.925852694


In [11]:
# lr_base fitted model
#lr_base.get_params()
model = lr_base['MLR_baseline']

## Penalized (coarse search)
### ElasticNet Linear Regression

In [12]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
# alpha = 1000
# l1_ratio = 0.5
lr_elastic_net = Pipeline([("preprocess", preprocess),
                           ("lr_elastic_net", ElasticNet(max_iter=100000))])

In [13]:
# set GridSearch parameters
parameters = {'lr_elastic_net__alpha': [100, 1000, 10000], 
              'lr_elastic_net__l1_ratio': [0, 0.5, 1]} 

In [14]:
# define clf GridSearch object and inputs 
clf = GridSearchCV(lr_elastic_net, parameters, scoring='r2') #defaults to cv=5 fold cross-validation

In [15]:
clf.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


KeyboardInterrupt: 

In [None]:
#Predict the pipeline on the train data
print('CLF Accuracy Train Set:')
print(clf.score(X_train, y_train))

y_pred_train = clf.predict(X_train)
#Predict the pipeline on the test data
print('CLF Mean Absolute Error Train Set:')
print(mean_absolute_error(y_train, y_pred_train))

In [None]:
#Predict the pipeline on the train data
print('CLF Accuracy Test Set:')
print(clf.score(X_test, y_test))

y_pred_test = clf.predict(X_test)
#Predict the pipeline on the test data
print('CLF Mean Absolute Error Test Set:')
print(mean_absolute_error(y_test, y_pred_test))

In [None]:
# print best estimator from GridSearch
print(clf.best_estimator_)

In [None]:
# print search list of parameter configurations
print(clf.cv_results_['params'])

# print mean_test_score from cross validations
print(clf.cv_results_['mean_test_score'])

In [None]:
# create dataframe of results and pivot table
df = pd.concat([pd.DataFrame(clf.cv_results_['params']), pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['Score'])] ,axis=1)
cv_table = df.pivot(index='lr_elastic_net__alpha', columns='lr_elastic_net__l1_ratio')
print(cv_table)

## Penalized (fine search)
### Lasso Linear Regression

In [18]:
alpha_array = np.logspace(-3, 3, 100)
alpha_array

array([1.00000000e-03, 1.14975700e-03, 1.32194115e-03, 1.51991108e-03,
       1.74752840e-03, 2.00923300e-03, 2.31012970e-03, 2.65608778e-03,
       3.05385551e-03, 3.51119173e-03, 4.03701726e-03, 4.64158883e-03,
       5.33669923e-03, 6.13590727e-03, 7.05480231e-03, 8.11130831e-03,
       9.32603347e-03, 1.07226722e-02, 1.23284674e-02, 1.41747416e-02,
       1.62975083e-02, 1.87381742e-02, 2.15443469e-02, 2.47707636e-02,
       2.84803587e-02, 3.27454916e-02, 3.76493581e-02, 4.32876128e-02,
       4.97702356e-02, 5.72236766e-02, 6.57933225e-02, 7.56463328e-02,
       8.69749003e-02, 1.00000000e-01, 1.14975700e-01, 1.32194115e-01,
       1.51991108e-01, 1.74752840e-01, 2.00923300e-01, 2.31012970e-01,
       2.65608778e-01, 3.05385551e-01, 3.51119173e-01, 4.03701726e-01,
       4.64158883e-01, 5.33669923e-01, 6.13590727e-01, 7.05480231e-01,
       8.11130831e-01, 9.32603347e-01, 1.07226722e+00, 1.23284674e+00,
       1.41747416e+00, 1.62975083e+00, 1.87381742e+00, 2.15443469e+00,
      

In [None]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
# alpha = 1000
# l1_ratio = 0.5
lr_lasso = Pipeline([("preprocess", preprocess),
                     ("lr_lasso", Lasso(max_iter=100000))])

In [None]:
# set GridSearch parameters
parameters_lasso = {'lr_lasso__alpha': alpha_array}#, 
              #'lr_lasso__l1_ratio': [0, 0.5, 1]} 

In [None]:
# define clf GridSearch object and inputs 
clf_lasso = GridSearchCV(lr_lasso, parameters_lasso, scoring='r2') #defaults to cv=5 fold cross-validation

In [None]:
clf_lasso.fit(X_train, y_train)

In [None]:
#Predict the pipeline on the train data
print('CLF Accuracy Train Set:')
print(clf_lasso.score(X_train, y_train))

y_pred_train = clf_lasso.predict(X_train)
#Predict the pipeline on the test data
print('CLF Mean Absolute Error Train Set:')
print(mean_absolute_error(y_train, y_pred_train))

In [None]:
#Predict the pipeline on the train data
print('CLF Accuracy Test Set:')
print(clf_lasso.score(X_test, y_test))

y_pred_test = clf_lasso.predict(X_test)
#Predict the pipeline on the test data
print('CLF Mean Absolute Error Test Set:')
print(mean_absolute_error(y_test, y_pred_test))

## SFR Regression

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

In [None]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
# alpha = 1000
# l1_ratio = 0.5
lr_elastic_net = Pipeline([("preprocess", preprocess),
                           ("lr_elastic_net", ElasticNet(max_iter=100000))])

In [None]:
# define instance of LinearRegression()
lr = LinearRegression()
k_features=20
sfs = SFS(lr, #instance of model to be used
          k_features=k_features, # number of features to select
          forward=False,
          floating=False,
          scoring='r2',
          cv=0)

In [None]:
# Fit the sequential forward selection model
sfs.fit(X_train, y_train)

In [None]:
# Plot model accuracy
plot_sfs(sfs.get_metric_dict())
plt.show()

In [None]:
sfs_features = sfs.get_metric_dict()[k_features]['feature_names']
sfs_features

In [None]:
# train score
sfs.score(X_train, y_train)

In [None]:
# test score
sfs.score(X_test, y_test)