In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
import os

In [None]:
# Load the "adult" census file that ships with mglearn. The file has no header
# row, so the column names are supplied explicitly.
column_names=[
    'age','workclass','fnlwgt','education','education_num',
    'marital-status','occupation','relationship','race','gender',
    'capital-gain','capital-loss','hours-per-week','native-country','income',
]
adult_path=os.path.join(mglearn.datasets.DATA_PATH,'adult.data')
raw_data=pd.read_csv(adult_path,header=None,index_col=False,names=column_names)

# Keep only the columns used by the following cells.
selected_columns=['age','workclass','education','gender','hours-per-week','occupation','income']
data=raw_data[selected_columns]
data.head()

In [None]:
# Frequency of each distinct value in the 'gender' column.
data.gender.value_counts()

In [None]:
# Expand the categorical columns into one indicator column per category.
data_dammies=pd.get_dummies(data=data)
data_dammies.columns

In [None]:
# Features are every column except the one-hot encoded target ('income_*').
# Selecting by name avoids relying on the column order and on a hard-coded
# "last occupation" label.
income_columns=[c for c in data_dammies.columns if c.startswith('income_')]
features=data_dammies.drop(columns=income_columns)
# The frame mixes integer columns with indicator columns (bool on pandas>=2);
# request float explicitly so X is a plain numeric array, not object dtype.
X=features.to_numpy(dtype=float)
# Binary target: True/1 where income is '>50K'.
y=data_dammies['income_ >50K'].to_numpy()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out a test split, then standardise with statistics from the training split only.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled,X_test_scaled=(scaler.transform(part) for part in (X_train,X_test))

lgr=LogisticRegression(max_iter=2000)
lgr.fit(X_train_scaled,y_train)
train_score=lgr.score(X_train_scaled,y_train)
test_score=lgr.score(X_test_scaled,y_test)
print(f'accurancy on train : {train_score:.2f}')
print(f'accurancy on test : {test_score:.2f}')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Same split and model as the scaled run, but fitted on the raw (unscaled) features.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

lgr=LogisticRegression(max_iter=2000)
lgr.fit(X_train,y_train)
train_score=lgr.score(X_train,y_train)
test_score=lgr.score(X_test,y_test)
print(f'accurancy on train : {train_score:.2f}')
print(f'accurancy on test : {test_score:.2f}')

In [None]:
# 100-sample "wave" regression set, plus a dense grid over [-3, 3) used to draw model curves.
X,y=mglearn.datasets.make_wave(n_samples=100)
X=X.reshape(-1,1)
line=np.linspace(-3,3,1000,endpoint=False)[:,np.newaxis]


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Fit a linear model and a random forest on the raw 1-D feature and draw both
# prediction curves over the dense grid `line`.
reg=LinearRegression().fit(X,y)
# Fixed seed: without it the forest (and therefore the plotted curve) differs on every run.
rfr=RandomForestRegressor(min_samples_split=3,random_state=0).fit(X,y)
plt.scatter(X,y)
plt.plot(line,reg.predict(line),label='linear regression')
plt.plot(line,rfr.predict(line),label='random forest')
plt.legend(loc='best')

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Discretise the feature into 10 equal-width bins over [-3, 3] and one-hot encode the bin index.
bins=np.linspace(-3,3,11)
X_which_bin=np.digitize(X,bins)

encoder=OneHotEncoder(sparse_output=False).fit(X_which_bin)
X_binned=encoder.transform(X_which_bin)

# Apply the same binning/encoding to the plotting grid.
line_which_bin=np.digitize(line,bins)
line_binned=encoder.transform(line_which_bin)

reg=LinearRegression().fit(X_binned,y)
# Fixed seed so the forest curve is reproducible between runs.
rfr=RandomForestRegressor(min_samples_split=3,random_state=0).fit(X_binned,y)
# Dashed vertical lines mark the bin edges.
plt.vlines(bins,-3,3,linestyles='--',linewidth=1,color='gray',alpha=.2)
plt.scatter(X,y)
plt.plot(line,reg.predict(line_binned),label='linear regression (binned)')
plt.plot(line,rfr.predict(line_binned),label='random forest (binned)')
plt.legend(loc='best')

In [None]:
# Put the raw value next to the bin indicators: one shared slope plus a per-bin offset.
X_combined=np.concatenate([X,X_binned],axis=1)
line_combined=np.concatenate([line,line_binned],axis=1)
reg=LinearRegression()
reg.fit(X_combined,y)
plt.vlines(bins,-3,3,linestyles='--',linewidth=1,color='gray',alpha=.2)
plt.scatter(X,y)

combined_prediction=reg.predict(line_combined)
plt.plot(line,combined_prediction)

In [None]:
# Bin indicators plus (value x indicator) products: a separate offset and slope for every bin.
X_product=np.concatenate([X_binned,X*X_binned],axis=1)
line_product=np.concatenate([line_binned,line*line_binned],axis=1)
reg=LinearRegression()
reg.fit(X_product,y)
plt.vlines(bins,-3,3,linestyles='--',linewidth=1,color='gray',alpha=.2)
plt.scatter(X,y)

product_prediction=reg.predict(line_product)
plt.plot(line,product_prediction)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the single feature into polynomial terms up to degree 10, without the constant column.
poly=PolynomialFeatures(degree=10,include_bias=False)
X_poly=poly.fit_transform(X)
X_poly.shape

In [None]:
# Linear regression on the polynomial expansion, drawn over the dense grid.
line_poly=poly.transform(line)
reg=LinearRegression()
reg.fit(X_poly,y)
plt.scatter(X,y)

poly_prediction=reg.predict(line_poly)
plt.plot(line,poly_prediction)

In [None]:
from sklearn.svm import SVR
# Overlay the polynomial regression with SVR fits for several gamma values on the raw feature.
plt.scatter(X,y,color='gray')
plt.plot(line,reg.predict(line_poly),label='poly')

for gamma in (1,5,10):
    svr=SVR(gamma=gamma)
    svr.fit(X,y)
    plt.plot(line,svr.predict(line),label=f'svr gamma={gamma}')
plt.legend(loc='best')

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split the California housing data, then min-max scale using the training split's ranges.
cal_h = fetch_california_housing()
X_train,X_test,y_train,y_test=train_test_split(cal_h.data,cal_h.target,random_state=0)
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_scaled,X_test_scaled=(scaler.transform(part) for part in (X_train,X_test))


In [None]:
# Tabular view of the raw feature matrix with its feature names as column headers.
pd.DataFrame(data=cal_h.data,columns=cal_h.feature_names)

In [None]:
# Degree-2 expansion (squares and pairwise products) of the scaled features;
# the expansion is fitted on the training split and reused for the test split.
poly=PolynomialFeatures(degree=2)
X_train_poly=poly.fit_transform(X_train_scaled)
X_test_poly=poly.transform(X_test_scaled)

In [None]:
from sklearn.linear_model import Ridge

# Ridge on the plain scaled features.
ridge=Ridge()
ridge.fit(X_train_scaled,y_train)
plain_train,plain_test=ridge.score(X_train_scaled,y_train),ridge.score(X_test_scaled,y_test)
print(f'accuracy on train without interaction :{plain_train:.2f}')
print(f'accuracy on test without interaction  :{plain_test:.2f}')

# Ridge on the degree-2 polynomial expansion.
ridge=Ridge()
ridge.fit(X_train_poly,y_train)
poly_train,poly_test=ridge.score(X_train_poly,y_train),ridge.score(X_test_poly,y_test)
print(f'accuracy on train with interaction    :{poly_train:.2f}')
print(f'accuracy on test with interaction     :{poly_test:.2f}')

In [None]:
# Random forest with and without the polynomial expansion.
# Both forests are seeded so the printed scores are reproducible between runs.
rf=RandomForestRegressor(n_estimators=100,random_state=0).fit(X_train_scaled,y_train)
print(f'accuracy on train without interaction :{rf.score(X_train_scaled,y_train):.2f}')
print(f'accuracy on test without interaction  :{rf.score(X_test_scaled,y_test):.2f}')
rf=RandomForestRegressor(n_estimators=100,random_state=0).fit(X_train_poly,y_train)
print(f'accuracy on train with interaction    :{rf.score(X_train_poly,y_train):.2f}')
print(f'accuracy on test with interaction     :{rf.score(X_test_poly,y_test):.2f}')

In [None]:
# Synthetic count data: draw Gaussian latent features, use them (exponentiated, x10)
# as Poisson rates for the observed X, and build y as a linear function of the latents.
rnd=np.random.RandomState(0)
n_samples,n_features=1000,3
X_org=rnd.normal(size=(n_samples,n_features))
w=rnd.normal(size=n_features)
X=rnd.poisson(np.exp(X_org)*10)
y=X_org.dot(w)

In [None]:
# Occurrence count of every integer value, separately for each of the three feature columns.
bins_0,bins_1,bins_2=(np.bincount(X[:,col]) for col in range(3))

In [None]:
# One bar chart per feature showing how often each count value occurs.
fig,axes=plt.subplots(3,1,figsize=(15,10))
# The loop variable is deliberately not called `bins`: that name holds the bin
# edges used by the binning cells and must not be overwritten here.
for i,(ax,counts) in enumerate(zip(axes,[bins_0,bins_1,bins_2])):
    ax.bar(x=range(len(counts)),height=counts)
    ax.set_title(f'feature {i}')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

# Baseline: Ridge fitted directly on the untransformed count features.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

ridge=Ridge()
ridge.fit(X_train,y_train)
raw_train_score=ridge.score(X_train,y_train)
raw_test_score=ridge.score(X_test,y_test)
print(f'accuracy on train without interaction :{raw_train_score:.2f}')
print(f'accuracy on test without interaction :{raw_test_score:.2f}')

In [None]:
# log(x + 1) transform of both splits; the +1 keeps zero counts finite.
X_train_log,X_test_log=(np.log(part+1) for part in (X_train,X_test))

In [None]:
# Distribution of the first feature after the log transform.
first_feature_log=X_train_log[:,0]
plt.hist(first_feature_log,bins=25,color='gray')

In [None]:
# Ridge on the log-transformed features. The labels name the transform so this
# output can be told apart from the run on the raw counts.
ridge=Ridge().fit(X_train_log,y_train)
print(f'accuracy on train with log transform :{ridge.score(X_train_log,y_train):.2f}')
print(f'accuracy on test with log transform :{ridge.score(X_test_log,y_test):.2f}')

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile,f_classif

cancer=load_breast_cancer()

# Append 50 pure-noise columns to the real features.
rng=np.random.RandomState(42)
n_rows=len(cancer.data)
noise=rng.normal(size=(n_rows,50))
X_w_noise=np.concatenate([cancer.data,noise],axis=1)

X_train,X_test,y_train,y_test=train_test_split(
    X_w_noise,cancer.target,random_state=0,test_size=.5
)

# Univariate selection: keep the top 50% of features ranked by the f_classif score.
select=SelectPercentile(score_func=f_classif,percentile=50)
select.fit(X_train,y_train)
X_train_selected,X_test_selected=(select.transform(part) for part in (X_train,X_test))

print(f'X train shape : {X_train.shape}')
print(f'selected X train shape : {X_train_selected.shape}')

In [None]:
# Boolean mask of the kept features, printed and drawn as a single-row image.
mask=select.get_support()
print(mask)
mask_row=mask.reshape(1,-1)
plt.matshow(mask_row,cmap='gray')

In [None]:
from sklearn.linear_model import LogisticRegression
m=3000

# All features (real + noise).
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train,y_train)
score_all=logreg.score(X_test,y_test)
print(f'accuracy on test : {score_all:.3f}')

# Only the features kept by the univariate selector.
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train_selected,y_train)
score_selected=logreg.score(X_test_selected,y_test)
print(f'selected accuracy on test : {score_selected:.3f}')

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
# Model-based selection: keep the features whose forest importance is at or
# above the median importance. y_train holds class labels, so the importances
# come from a classifier rather than a regressor.
select=SelectFromModel(
    RandomForestClassifier(n_estimators=100,random_state=42),
    threshold='median'
)
select.fit(X_train,y_train)

In [None]:
# Reduce both splits to the features kept by the model-based selector.
X_train_l1,X_test_l1=(select.transform(part) for part in (X_train,X_test))
print(f'X train shape : {X_train.shape}')
print(f'selected X train shape : {X_train_l1.shape}')

# Show which columns survived as a single-row image.
mask=select.get_support()
print(mask)
mask_row=mask.reshape(1,-1)
plt.matshow(mask_row,cmap='gray')

In [None]:
m=3000

# All features.
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train,y_train)
score_all=logreg.score(X_test,y_test)
print(f'accuracy on test : {score_all:.5f}')

# Univariate selection.
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train_selected,y_train)
score_selected=logreg.score(X_test_selected,y_test)
print(f'selected accuracy on test : {score_selected:.5f}')

# Model-based selection.
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train_l1,y_train)
score_l1=logreg.score(X_test_l1,y_test)
print(f'l1 accuracy on test : {score_l1:.5f}')

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
# Recursive feature elimination down to 40 features. y_train holds class
# labels, so the ranking model is a classifier rather than a regressor.
select=RFE(
    RandomForestClassifier(n_estimators=100,random_state=42),
    n_features_to_select=40
)
select.fit(X_train,y_train)
# Boolean mask marking the features RFE kept.
mask=select.get_support()


In [None]:
# Display the selection mask as a single row (the stray trailing '.' was a SyntaxError).
mask.reshape(1,-1)

In [None]:
# Draw the RFE selection mask as a single-row image.
mask_row=mask.reshape(1,-1)
plt.matshow(mask_row,cmap='gray')

In [None]:
# Reduce both splits to the features kept by RFE and compare the shapes.
X_train_rfe,X_test_rfe=(select.transform(part) for part in (X_train,X_test))
print(f'X train shape : {X_train.shape}')
print(f'selected X train shape : {X_train_rfe.shape}')

In [None]:
m=3000

# All features.
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train,y_train)
score_all=logreg.score(X_test,y_test)
print(f'accuracy on test : {score_all:.5f}')

# Univariate selection.
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train_selected,y_train)
score_selected=logreg.score(X_test_selected,y_test)
print(f'selected accuracy on test : {score_selected:.5f}')

# Model-based selection.
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train_l1,y_train)
score_l1=logreg.score(X_test_l1,y_test)
print(f'l1 accuracy on test : {score_l1:.5f}')

# Recursive feature elimination.
logreg=LogisticRegression(max_iter=m)
logreg.fit(X_train_rfe,y_train)
score_rfe=logreg.score(X_test_rfe,y_test)
print(f'rfe accuracy on test : {score_rfe:.5f}')

In [None]:
# Load the Citi Bike dataset bundled with mglearn. Later cells read its
# datetime-like index (.index.hour, .index.dayofweek) and its .values.
citibike=mglearn.datasets.load_citibike()
# Bare expression: display the loaded object.
citibike

In [None]:
# One tick per calendar day between the first and last timestamp (daily is date_range's default frequency).
first_stamp,last_stamp=citibike.index.min(),citibike.index.max()
xticks=pd.date_range(start=first_stamp,end=last_stamp,freq='D')


In [None]:
# Plot the full series with one weekday/date label per day.
plt.figure(figsize=(15,5))
tick_labels=xticks.strftime('%a %m-%d')
plt.xticks(xticks,tick_labels,rotation=90,ha='left')
plt.plot(citibike)

In [None]:
# Target: the observed values of the series.
y=citibike.values
# Single feature: the timestamp as whole seconds since the epoch.
# 'int64' is requested explicitly because plain `int` is platform-dependent;
# the integer division then converts nanoseconds to seconds.
X=citibike.index.astype('int64').to_numpy().reshape(-1,1)//10**9

In [None]:
# Chronological split: the first 23*8 samples are used for training
# (8 samples per day matches the one-tick-per-8-samples spacing used when plotting).
sample_size=len(citibike)
train_size=8*23
# Fraction of the series used for training.
train_rate=train_size/sample_size

In [None]:
def eval_on_feature(features,targets,regressor):
    """Fit ``regressor`` on the leading part of a time-ordered series and plot the result.

    The first ``train_size`` rows form the training set and the remaining rows
    the test set (no shuffling). The train/test scores are printed, and the
    true values and the predictions are plotted against the sample position.

    Parameters
    ----------
    features : array-like
        One row per sample, in time order; sliced along the first axis.
    targets : array-like
        One target per row of ``features``.
    regressor : estimator
        Object exposing ``fit``, ``score`` and ``predict``; it is fitted in place.

    Notes
    -----
    Reads the module-level names ``train_size`` (number of training rows) and
    ``xticks`` (tick labels; one entry is expected for every 8 samples).
    """
    # Use the length of the data actually passed in, not the notebook-level `X`.
    n_samples=len(features)
    X_train,X_test=features[:train_size],features[train_size:]
    y_train,y_test=targets[:train_size],targets[train_size:]
    n_train=len(X_train)

    regressor.fit(X_train,y_train)
    print(f'Accurancy on train : {regressor.score(X_train,y_train):.2f}')
    print(f'Accurancy on test : {regressor.score(X_test,y_test):.2f}')

    plt.figure(figsize=(15,5))
    # One tick every 8 samples, labelled with weekday and date.
    plt.xticks(
        range(0,n_samples,8),
        xticks.strftime('%a %m-%d'),
        rotation=90,
        ha='left'
    )
    # Solid lines: observed values. Dashed lines: model predictions.
    plt.plot(range(0,n_train),y_train,label='train')
    plt.plot(range(n_train,n_samples),y_test,label='test')
    plt.plot(range(0,n_train),regressor.predict(X_train),'--',label='predict_on_train')
    plt.plot(range(n_train,n_samples),regressor.predict(X_test),'--',label='predict_on_test')
    plt.legend(loc=(1.01,0))

* 決定木ベースのランダムフォレストは外挿性がない = 訓練データの変数の範囲内しか予測することができない

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seeded forest evaluated on the POSIX-time feature alone.
rfr=RandomForestRegressor(random_state=42,n_estimators=100)
eval_on_feature(features=X,targets=y,regressor=rfr)

- 変数を日付にすると、テスト変数が訓練変数の外側になってしまうので、現在と未来での共通の因子を変数に設定することでランダムフォレストでも未来の日付の予測を実装する  
- まず具体的に、時刻による予測モデルを試す。

In [None]:
# Hour of day as a single integer feature column.
X_hour=citibike.index.hour.astype(int).to_numpy()[:,np.newaxis]
eval_on_feature(features=X_hour,targets=y,regressor=rfr)

- 時刻を学習させることで未来予測の精度は上がったが、まだまだ
- 時刻に加えて曜日情報を学習させる

In [None]:
# Two feature columns: day of week first, then hour of day.
day_of_week=citibike.index.dayofweek.to_numpy().reshape(-1,1)
X_hour_week=np.concatenate([day_of_week,X_hour],axis=1)
eval_on_feature(features=X_hour_week,targets=y,regressor=rfr)

- ここまでランダムフォレストで実装したが、時刻と曜日による予測にランダムフォレストはコスパよくない。
- このレベルであれば線型回帰で予測することができるので実装。
- まずは何も考えずに

In [None]:
from sklearn.linear_model import LinearRegression
# Plain linear regression on the integer-coded weekday/hour columns.
eval_on_feature(features=X_hour_week,targets=y,regressor=LinearRegression())

- 全然ダメダメ
- 線型回帰は変数を線型(=比例)でしか学習しないので、曜日(0~6で入力されている)や時刻を連続値として線型に学習し、それぞれ単純な比例関係に解釈してしまっている。
- 線型回帰でカテゴリ変数を使用する場合はone hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
# The encoder returns a sparse matrix here, so convert it to a dense array with toarray().
hour_week_encoder=OneHotEncoder()
X_hour_week_encoded=hour_week_encoder.fit_transform(X_hour_week).toarray()
eval_on_feature(features=X_hour_week_encoded,targets=y,regressor=LinearRegression())

In [None]:
from sklearn.linear_model import Ridge
# X_hour_week_encoded was already converted to a dense array when it was built,
# so it is passed as-is; calling .toarray() on it again raises AttributeError.
eval_on_feature(X_hour_week_encoded,y,Ridge())


- 精度は上がってきた
- 現状のモデルは時刻、曜日それぞれを独立に学習しているので、例えば日曜日の夕方、みたいな交互作用は学習していない
- polynomial変数を作成して交互作用を学習させる

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# degree=2            -> pairwise (weekday x hour) products
# interaction_only    -> the columns are categorical indicators, so their squares are not needed
# include_bias=False  -> no constant column
poly=PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
X_hour_week_encoded_poly=poly.fit_transform(X_hour_week_encoded)

In [None]:
# Linear regression on the one-hot columns plus their pairwise interaction terms.
lr=LinearRegression()
eval_on_feature(features=X_hour_week_encoded_poly,targets=y,regressor=lr)

- ランダムフォレストと同じくらいいい感じ！
- 特徴量エンジニアリングをして線型回帰するメリットは、どんな項目を学習することで精度が上がったかの解釈がしやすい（曜日✖️時刻の特徴量に係数がつくから）
- 係数を可視化する。

In [None]:
# Human-readable names for the encoded columns:
#   - the first 7 are the weekday one-hot columns, Monday first
#   - the next 8 are the three-hour time slots starting at 00:00
#   - everything after that is an interaction (product) term

weekday = ['mon','tue','wed','thu','fri','sat','sun']
hour = [f'{h:02}:00' for h in range(0,24,3)]
features = [*weekday,*hour]
features_poly = poly.get_feature_names_out(features)

In [None]:
# Plot only the coefficients that are non-zero, labelled with their feature names.
nonzero=lr.coef_!=0
features_poly_nonzero=np.array(features_poly)[nonzero]
coef_nonzero=lr.coef_[nonzero]
n_nonzero=len(coef_nonzero)
plt.figure(figsize=(15,5))
plt.plot(coef_nonzero,'o')
plt.xticks(range(n_nonzero),features_poly_nonzero,rotation=90)
plt.xlim(-1,n_nonzero+1)

In [None]:
# Display the names of the features whose coefficient is non-zero.
features_poly_nonzero

In [None]:
# Display the one-hot encoded weekday/hour matrix.
X_hour_week_encoded