In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

import sklearn

%precision 3

'%.3f'

In [2]:
import requests, zipfile
import io

In [3]:
# 自動車価格データを取得
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
res = requests.get(url).content

In [4]:
# 取得したデータをDataFrameで読み込み
auto = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)

In [5]:
auto.columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors',
               'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height',
               'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 
               'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'
               ]

In [6]:
auto = auto[['price', 'horsepower', 'width', 'height']]
auto = auto.replace('?', np.nan).dropna()

In [11]:
# リッジ回帰用のクラス
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [12]:
# 訓練データとテストデータに分割
X = auto.drop('price', axis=1)
y = auto['price']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=0)

In [13]:
# モデルの構築と評価
linear = LinearRegression()
ridge = Ridge(random_state=0)

In [14]:
for model in [linear, ridge]:
        model.fit(X_train, y_train)
        print(model.score(X_train, y_train))
        print(model.score(X_test, y_test))

0.7333575683901375
0.7370688738125756
0.7333547383511861
0.7377676885006825


In [33]:
from sklearn.linear_model import Lasso

In [34]:
# 訓練データとテストデータに分割
X = auto.drop('price', axis=1)
y = auto['price']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=0)

In [35]:
models = {
    'linear':LinearRegression(),
    'lasso1':Lasso(alpha=1.0, random_state=0),
    'lasso2':Lasso(alpha=200.0, random_state=0)
}

In [39]:
scores={}
for model_name, model in models.items():
        model.fit(X_train, y_train)
        print((model_name),model.score(X_train, y_train))
        print((model_name),model.score(X_test, y_test))
        scores[(model_name, 'train')] = model.score(X_train, y_train)
        scores[(model_name, 'test')] = model.score(X_test, y_test)

linear 0.7333575683901375
linear 0.7370688738125756
lasso1 0.7333575605856103
lasso1 0.7371065049203236
lasso2 0.7330815028189964
lasso2 0.7432353918372592


In [40]:
pd.Series(scores).unstack()

Unnamed: 0,test,train
lasso1,0.737107,0.733358
lasso2,0.743235,0.733082
linear,0.737069,0.733358


In [41]:
# きのこデータを取得
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
res = requests.get(url).content

In [42]:
# 取得したデータをDataFrameで読み込み
mushroom = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)

In [44]:
# データの列にラベルを設定
mushroom.columns = ['classes', 'cap_shape', 'cap_surface', 'cap_color', 'odor', 'bruises',
                   'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color', 'stalk_shape', 
                   'stalk_root', 'stalk_surface_above_ring', 'stalk_surface_below_ring',
                   'stalk_color_above_ring', 'stalk_color_below_ring', 'veil_type', 'veil_color',
                   'ring_number', 'ring_type', 'spore_print_color', 'population', 'habitat']

In [45]:
mushroom.head()

Unnamed: 0,classes,cap_shape,cap_surface,cap_color,odor,bruises,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [49]:
mushroom_dummy = pd.get_dummies(mushroom[['gill_color', 'gill_attachment', 'odor', 'cap_color']])
mushroom_dummy.head()

Unnamed: 0,gill_color_b,gill_color_e,gill_color_g,gill_color_h,gill_color_k,gill_color_n,gill_color_o,gill_color_p,gill_color_r,gill_color_u,...,cap_color_b,cap_color_c,cap_color_e,cap_color_g,cap_color_n,cap_color_p,cap_color_r,cap_color_u,cap_color_w,cap_color_y
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [50]:
mushroom_dummy['flg'] = mushroom['classes'].map(lambda x: 1 if x =='p' else 0)

In [51]:
mushroom_dummy.groupby(['cap_color_c', 'flg'])['flg'].count().unstack()

flg,0,1
cap_color_c,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4176,3904
1,32,12


In [55]:
mushroom_dummy.groupby(['gill_color_b', 'flg'])['flg'].count().unstack()

flg,0,1
gill_color_b,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4208.0,2188.0
1,,1728.0


In [57]:
print(mushroom_dummy.groupby('flg')['flg'].count())
entropy_init = - (0.518 * np.log2(0.518) + 0.482 * np.log2(0.482))
print(entropy_init)

flg
0    4208
1    3916
Name: flg, dtype: int64
0.9990649315776107


In [58]:
# cap_colorがcでない場合のエントロピー
p1 = 4176 / (4176 + 3904)
p2 = 1 - p1
entropy_c0 = -(p1*np.log2(p1)+p2*np.log2(p2))
print(entropy_c0)

0.9991823984904757


In [62]:
# cap_colorがcである場合のエントロピー
p1 = 32 / (32+12)
p2 = 1 - p1
entropy_c1 = -(p1*np.log2(p1) + p2*np.log2(p2))
print(entropy_c1)

0.8453509366224364


In [64]:
entropy_after = (4176 + 3904)/8124 * entropy_c0 + (32+12)/8124*entropy_c1
# データ分割後の平均エントロピー
print(entropy_after)

0.9983492394158581


In [65]:
#cap_colorの分割によって得られる情報利得　
print(entropy_init - entropy_after)

0.0007156921617526013


In [66]:
# gill_colorがbでない場合のエントロピー
p1 = 4208/(4208+2188)
p2 = 1-p1
entropy_b0 = - (p1*np.log2(p1) + p2*np.log2(p2))
print(entropy_b0)

# gill_colorがbである場合のエントロピー
p1 = 0/(0+1728)
p2 = 1 - p1
entropy_b1 = - (p2*np.log2(p2))
print(entropy_b1)

0.926803536674184
-0.0


In [68]:
entropy_after = (4208+2188)/8124*entropy_b0 +(0+1728)/8124*entropy_b1
print(entropy_init - entropy_after)

0.26939538202467106


In [69]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [70]:
X = mushroom_dummy.drop('flg', axis=1)
y = mushroom_dummy['flg']

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [72]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
model.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)

In [73]:
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.8829804693911045
0.8941408173313639
