In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from __future__ import division
import random
import itertools
import math as math
import sys
import time
from pprint import pprint
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

### 보스턴 주택 가격 데이터셋(toy data)

#### 데이터 설명 (14 columns)

* CRIM : 자치시(town)별 1인당 범죄율
* ZN : 25,000 평방피트를 초과하는 거주지역의 비율
* INDUS : 비소매상업지역이 점유하고 있는 토지의 비율
* CHAS : 찰스강에 대한 더미변수(강의 겨예에 위치한 경우는 1, 아니면 0)
* NOX : 10ppm 당 농축 일산화질소
* RM : 주택 1가구당 평균 방의 개수
* AGE : 1940년 이전에 건축된 소유주택의 비율
* DIS : 5개의 보스턴 직업센터까지의 접근성 지수
* RAD : 방사형 도로까지의 접근성 지수
* TAX : 10,000 달러 당 재산세율
* PTRATIO : 자치시(town)별 학생/교사 비율
* B : 1000(Bk-0.63)^2, 여기서 Bk는 자치시별 흑인의 비율을 말함
* LSTAT : 모집단의 하위계층의 비율(%)
* MEDV : 본인 소유의 주택가격(중앙값) (단위: $1,000)

In [17]:
from sklearn.datasets import load_boston
boston = load_boston()

In [18]:
df = pd.DataFrame(boston.data, columns= boston.feature_names)
df['target']= boston.target

In [31]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [46]:
y = df['target'].values

In [43]:
X = df.iloc[:, : -1]
X = sm.add_constant(X)
X = X.values

In [47]:
print(X.shape)
print(y.shape)

(506, 14)
(506,)


### statsmodels QuantReg

In [53]:
qr_reg = sm.QuantReg(y, X).fit(q = 0.75)
print(qr_reg.summary())

                         QuantReg Regression Results                          
Dep. Variable:                      y   Pseudo R-squared:               0.5348
Model:                       QuantReg   Bandwidth:                       2.015
Method:                 Least Squares   Sparsity:                        11.52
Date:                Tue, 24 May 2022   No. Observations:                  506
Time:                        15:12:28   Df Residuals:                      492
                                        Df Model:                           13
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         21.0912      6.034      3.495      0.001       9.235      32.947
x1            -0.0799      0.031     -2.619      0.009      -0.140      -0.020
x2             0.0668      0.016      4.296      0.000       0.036       0.097
x3            -0.0385      0.066     -0.588      0.5



In [58]:
import statsmodels.regression.quantile_regression as Q_reg

In [79]:
import asgl

In [83]:
asgl_model = asgl.ASGL(model='qr', penalization='lasso', intercept=True, tol=1e-5,
                        lambda1=[0.01, 0.1, 1, 10], tau=0.75)

In [84]:
asgl_model.fit(X, y)
lasso_fit_qr = asgl_model.coef_

In [85]:
lasso_fit_qr

[array([ 9.59703931,  0.        , -0.01814777,  0.06144131, -0.10134443,
         0.        ,  0.        ,  5.80151339, -0.00972714, -1.03533483,
         0.22034986, -0.01074466, -0.88371527,  0.01657517, -0.30888975]),
 array([ 4.66297445e+01,  0.00000000e+00, -1.00076957e-02,  7.74206183e-02,
        -6.53785695e-03,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         3.75619337e-02, -3.99256100e-01,  2.53530034e-01, -1.73314961e-02,
        -7.13691289e-01,  8.46071462e-03, -7.44849794e-01]),
 array([ 2.82340513e+01,  0.00000000e+00,  0.00000000e+00,  6.28655701e-02,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -1.47302175e-02,
         0.00000000e+00,  1.52676103e-02, -2.81365314e-01]),
 array([ 3.26500000e+01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.0000