In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
#
from sklearn import model_selection
from sklearn import metrics

dataset = datasets.load_boston()
print(dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

  * **CRIM**: 자치 시(town) 별 1인당 범죄율
  * **ZN**: 25,000 평방피트를 초과하는 거주지역의 비율
  * **INDUS**: 비소매상업지역이 점유하고 있는 토지의 비율
  * **CHAS**: 찰스강의 경계에 위치해 있으면 1, 그렇지 않으면 0
  * **NOX**: 10ppm당 농축 일산화질소
  * **RM**: 주택 1가구당 평균 방의 개수
  * **AGE**: 1940년 이전에 건축된 소유주택의 비율
  * **DIS**: 5개의 보스턴 직업센터까지의 접근성 지수
  * **RAD**: 방사형 도로까지의 접근성 지수
  * **TAX**: 10,000 달러 당 재산세율
  * **PTRATIO**: 자치 시(town)별 학생/교사 비율
  * **B**: 1000(Bk-0.63)^2, 여기서 Bk는 자치시별 흑인의 비율을 말함.
  * **LSTAT**: 모집단의 하위계층 비율(%)
  * **MEDV**: 본인 소유의 주택가격(중앙값) (단위: $1,000)

In [2]:
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target
print(df.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  target  
0     15.3  396.90   4.98    24.0  
1     17.8  396.90   9.14    21.6  
2     17.8  392.83   4.03    34.7  
3     18.7  394.63   2.94    33.4  
4     18.7  396.90   5.33    36.2  


In [3]:
print(df.shape) #(506, 14)
print(df.describe())

(506, 14)
             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.677083   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.574901    3.795043    9.549407  408.237154   18.455534  356.674032   
std     28.148861

### 가격

In [61]:
df.iloc[:,-1]

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: target, Length: 506, dtype: float64

In [69]:
x_data = dataset.data
y_data = dataset.target

# Machine Learning

In [96]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [97]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = model_selection.train_test_split(x_data, y_data, test_size=0.3)

In [98]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(x_train, y_train)
y_predict = linreg.predict(x_test) 
acc_linreg = round(metrics.r2_score(y_test, y_predict) * 100, 2)
print(score)

0.6499606516776741


In [99]:
pd.concat([pd.DataFrame(y_test), pd.DataFrame(y_predict)], axis=1)

Unnamed: 0,0,0.1
0,13.9,13.153856
1,32.2,32.101350
2,16.1,18.572036
3,48.3,37.763396
4,12.7,11.756500
...,...,...
147,35.4,31.084919
148,28.6,28.890914
149,8.5,16.332489
150,20.4,20.615294


In [100]:
from sklearn.neighbors import KNeighborsRegressor

knr = KNeighborsRegressor(n_neighbors=5, metric='minkowski', weights='uniform')
knr.fit(x_train, y_train)
y_predict = knr.predict(x_test)
acc_knr = metrics.r2_score(y_test, y_predict)
print(acc_knr)

0.4634716708811871


In [101]:
pd.concat([pd.DataFrame(y_test), pd.DataFrame(y_predict)], axis=1)

Unnamed: 0,0,0.1
0,13.9,17.20
1,32.2,30.38
2,16.1,16.18
3,48.3,33.30
4,12.7,17.56
...,...,...
147,35.4,40.98
148,28.6,24.64
149,8.5,13.32
150,20.4,19.60


In [113]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=1.0)
lasso.fit(x_train, y_train)
y_predict = lasso.predict(x_test) 
acc_lasso = metrics.r2_score(y_test, y_predict)
print(acc_lasso)

0.6900366572379633


In [117]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(x_train, y_train)
y_predict = ridge.predict(x_test) 
acc_ridge = metrics.r2_score(y_test, y_predict)
print(acc_ridge)

0.7117354789822578


In [118]:
from sklearn.ensemble import RandomForestRegressor

random_forest = RandomForestRegressor()
random_forest.fit(x_train, y_train)
y_predict = random_forest.predict(x_test) 
acc_random_forest = metrics.r2_score(y_test, y_predict)
print(acc_random_forest)

0.8743419431490416


In [119]:
pd.concat([pd.DataFrame(y_test), pd.DataFrame(y_predict)], axis=1)

Unnamed: 0,0,0.1
0,13.9,15.744
1,32.2,30.701
2,16.1,16.601
3,48.3,43.821
4,12.7,15.333
...,...,...
147,35.4,34.718
148,28.6,26.045
149,8.5,13.028
150,20.4,18.704


In [120]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
poly.fit(x_data)
x_data = poly.transform(x_data)

x_train, x_test, y_train, y_test = model_selection.train_test_split(x_data, y_data, test_size=0.3)

random_forest_poly = RandomForestRegressor()
random_forest_poly.fit(x_train, y_train)
y_predict = random_forest_poly.predict(x_test) 
acc_random_forest_poly = metrics.r2_score(y_test, y_predict)
print(acc_random_forest_poly)

0.886849431774893


In [121]:
pd.concat([pd.DataFrame(y_test), pd.DataFrame(y_predict)], axis=1)

Unnamed: 0,0,0.1
0,20.8,21.507
1,17.4,19.730
2,23.8,24.403
3,30.3,28.491
4,22.8,24.763
...,...,...
147,25.0,24.410
148,13.4,14.722
149,31.2,30.703
150,37.2,35.529
