## Below here is working on PCA and Lasso

In [1]:
%matplotlib inline
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/My Drive/Econ 484/project'

Mounted at /content/gdrive
/content/gdrive/My Drive/Econ 484/project


In [2]:
from sklearn import preprocessing as pp
from pandas import DataFrame

def PolynomialFeatures_labeled(input_df,power):
    '''Basically this is a cover for the sklearn preprocessing function. 
    The problem with that function is if you give it a labeled dataframe, it ouputs an unlabeled dataframe with potentially
    a whole bunch of unlabeled columns. 
    Inputs:
    input_df = Your labeled pandas dataframe (list of x's not raised to any power) 
    power = what order polynomial you want variables up to. (use the same power as you want entered into pp.PolynomialFeatures(power) directly)
    Ouput:
    Output: This function relies on the powers_ matrix which is one of the preprocessing function's outputs to create logical labels and 
    outputs a labeled pandas dataframe   
    '''
    poly = pp.PolynomialFeatures(power, include_bias = False)
    output_nparray = poly.fit_transform(input_df)
    powers_nparray = poly.powers_

    input_feature_names = list(input_df.columns)
    target_feature_names = ["Constant Term"]
    for feature_distillation in powers_nparray[1:]:
        intermediary_label = ""
        final_label = ""
        for i in range(len(input_feature_names)):
            if feature_distillation[i] == 0:
                continue
            else:
                variable = input_feature_names[i]
                power = feature_distillation[i]
                intermediary_label = "%s^%d" % (variable,power)
                if final_label == "":         #If the final label isn't yet specified
                    final_label = intermediary_label
                else:
                    final_label = final_label + " x " + intermediary_label
        target_feature_names.append(final_label)
    output_df = pd.DataFrame(output_nparray, columns = target_feature_names)
    return output_df

In [3]:
#Import the correct packages 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import pandas as pd
from sklearn.linear_model import LassoCV
import numpy as np


#Data cleaning from Hannah's code
shooter = pd.read_csv('Firearms_cleaned_Apr7.csv')
shooter = shooter.drop(['Unnamed: 0'], axis = 1)
casualties = pd.read_csv('numInj_with_index_Apr7.csv', index_col = 'Unnamed: 0')
casualties = casualties.drop(['Case #'], axis = 1)
casualties.rename(columns={'0': 'Casualties'}, inplace=True)

# Turn the caliber and classification categorical variables into dummy variables
shooter = pd.get_dummies(shooter, prefix = ['Caliber', 'Classification'], columns = ['Caliber', 'Classification'])

# First, we create three new dataframes with our variables of interest. 
class1 = pd.DataFrame(shooter.loc[:, 'Classification_1.0'])
class2 = pd.DataFrame(shooter.loc[:, 'Classification_2.0'])
class3 = pd.DataFrame(shooter.loc[:, 'Classification_3.0'])

# Next, we drop all of the classification variables from our dataframe - everything will be in reference to handguns which is Classification 0.
data = shooter.drop(['Classification_0.0', 'Classification_1.0', 'Classification_2.0', 'Classification_3.0'], axis = 1)

In [4]:
#data = PolynomialFeatures_labeled(data, 2)

In [5]:
#Make train/test split on all the datasets so they match up
train, test, casualTrain, casualTest, class1Train, class1Test, class2Train, class2Test, class3Train, class3Test = train_test_split(data, casualties, class1, class2, class3, random_state = 0)
print(train.shape,test.shape)

#Standardize before doing PCA
scaler = StandardScaler()
trainScaled = scaler.fit_transform(train)
testScaled = scaler.transform(test)

pca = PCA(n_components=3)
pcaDataTrain = pca.fit_transform(trainScaled)

print(pcaDataTrain.shape)

#Transform the test set
pcaDataTest = pca.transform(testScaled)
print(pcaDataTest.shape)

#Now we only have the principle components to run on Lasso 2SLS

(141, 422) (47, 422)
(141, 3)
(47, 3)


In [6]:
# Run Lasso on a random training set and test set to check the score so that we can compare it to the score we'd get on ridge.
lassocv = LassoCV(cv = 5, max_iter=100000).fit(pcaDataTrain, casualTrain)
print('Lasso score on training set: {:.4f}'.format(lassocv.score(pcaDataTrain, casualTrain)))
print('Lasso score on test set: {:.4f}'.format(lassocv.score(pcaDataTest, casualTest)))

# do lasso on a randomly chosen training set and lasso.score to the test set, preliminary step what method we chose and how we chose them. test

Lasso score on training set: 0.0150
Lasso score on test set: -0.0697


  y = column_or_1d(y, warn=True)


In [7]:
alpha = lassocv.alpha_

# Print selected alpha value
print("Selected alpha value: {:.4f}".format(alpha))

Selected alpha value: 30.4721


If it had worked, this is what we would have done. Unfortunately, it did not.

In [8]:
#Run Lasso on all of the models
lassoy1 = LassoCV(max_iter=1000).fit(pcaDataTrain, casualTrain)

lassod1 = LassoCV(max_iter=1000).fit(pcaDataTrain, class1Train)
lassod2 = LassoCV(max_iter=1000).fit(pcaDataTrain, class2Train)
lassod3 = LassoCV(max_iter=1000).fit(pcaDataTrain, class3Train)

#These seem relatively reasonable
lassoy1.alpha_,lassod1.alpha_,lassod2.alpha_,lassod3.alpha_

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(30.47213259004944,
 0.04662503362812325,
 0.00014581871868651514,
 0.004223342318581071)

In [9]:
#Print coefficients from Lasso 
lassod1.coef_,lassod2.coef_,lassod3.coef_

(array([ 0.00000000e+00, -0.00000000e+00, -1.16982136e-18]),
 array([-0.00992772,  0.0051752 ,  0.01352656]),
 array([ 0.04129261, -0.0071584 , -0.03649744]))

In [10]:
#Find which coefficients are nonzero
nonZero = (lassod1.coef_ != 0) + (lassod2.coef_ != 0) + (lassod3.coef_ != 0)
Xun = pcaDataTrain[:,nonZero]

assert Xun.shape[1] == np.sum(nonZero)  #Make sure it is getting the right columns

In [11]:
#Add constant and run regression
Xun = sm.add_constant(Xun)

rhs = np.hstack([class1Train,class2Train,class3Train,Xun])
model = sm.OLS(casualTrain,rhs)
res = model.fit()

print(res.summary())


                            OLS Regression Results                            
Dep. Variable:             Casualties   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     1.170
Date:                Thu, 20 Apr 2023   Prob (F-statistic):              0.326
Time:                        03:47:55   Log-Likelihood:                -810.38
No. Observations:                 141   AIC:                             1635.
Df Residuals:                     134   BIC:                             1655.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -5.4468     22.201     -0.245      0.8

### Note that x1, x2, x3 are the classifications of guns and then x4-x10 are the pca components that were selected by lasso

## Same thing as above but using lasso on entire dataset

In [12]:
from sklearn.linear_model import LassoCV

fullScaled = scaler.transform(data)
pcaDataFull = pca.transform(fullScaled)

#Run Lasso on all of the models
lassoy1 = LassoCV(max_iter=1000).fit(pcaDataFull, casualties)

lassod1 = LassoCV(max_iter=1000).fit(pcaDataFull, class1)
lassod2 = LassoCV(max_iter=1000).fit(pcaDataFull, class2)
lassod3 = LassoCV(max_iter=1000).fit(pcaDataFull, class3)


#Find which coefficients are nonzero
nonZero = (lassod1.coef_ != 0) + (lassod2.coef_ != 0) + (lassod3.coef_ != 0)
Xun = pcaDataTrain[:,nonZero]

assert Xun.shape[1] == np.sum(nonZero)  #Make sure it is getting the right columns

#Add constant and run regression
Xun = sm.add_constant(Xun)

rhs = np.hstack([class1Train,class2Train,class3Train,Xun])
model = sm.OLS(casualTrain,rhs)
res = model.fit()

print(res.summary())


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                            OLS Regression Results                            
Dep. Variable:             Casualties   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     1.170
Date:                Thu, 20 Apr 2023   Prob (F-statistic):              0.326
Time:                        03:47:55   Log-Likelihood:                -810.38
No. Observations:                 141   AIC:                             1635.
Df Residuals:                     134   BIC:                             1655.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -5.4468     22.201     -0.245      0.8

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
