# ライブラリのインポート

In [30]:
%reload_ext autoreload
%autoreload 2
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm

from my_regression import my_linear_regression
from my_statistics import my_check_data

# データの読み込み
目的変数である`u`を，各行およそ30%の確率でランダムに欠損させる．

In [31]:
# Load the sample data; the first spreadsheet column is used as the row index.
X = pd.read_excel("data.xlsx", index_col = 0)

# Blank out the target column "u" in roughly 30% of the rows to simulate
# missing observations.
# - The generator is seeded so the same rows go missing on every
#   Restart & Run All, keeping the regression outputs below reproducible.
# - The boolean mask is applied by position, so it works whatever the index
#   labels are (they do not have to be 0..n-1).
MISSING_SEED = 0
MISSING_RATE = 0.3
rng = np.random.default_rng(MISSING_SEED)
u_missing = rng.random(len(X)) < MISSING_RATE
X.loc[u_missing, "u"] = np.nan
display(X)

Unnamed: 0,x,y,z,u,v,dummy1,dummy2
0,1.880670,5.119414,-3.171900,0.639825,-4.229670,D,d
1,-0.522854,-6.839106,-0.301561,,2.455704,J,a
2,2.022402,-0.413710,-0.918374,-3.410037,0.870053,I,a
3,0.426285,1.731233,-2.399366,-0.715397,-0.749180,B,a
4,3.117020,-3.601585,2.562953,-5.160408,0.209371,C,b
...,...,...,...,...,...,...,...
995,-0.563727,-2.526431,0.489600,-4.106001,4.380324,H,a
996,1.749643,-1.628650,-1.726381,-0.474737,-1.261273,F,d
997,3.272143,1.122671,-0.930049,-0.786890,-4.958666,E,d
998,1.648471,-0.658811,-0.561886,,-2.546939,F,b


# 欠損値の削除とダミー変数処理
ユニバースを確定する段階でモデルを作成する．その際に欠損値を取り除く処理も同時に実行できる．また回帰結果を出力するファイルを指定することもできる．

`get_dummies`を使用することで，指定した列をダミー変数に置き換えることができる．その際に，並び順の基準となるリスト(これはダミー変数の取りうる値をすべて含んでいなければならない)と，dropする要素(`dummy_drop`)を指定する．

In [32]:
# Ordered lists of every value each dummy column may take. The data may
# contain only some of these values.
dummy1_full_list = [
    "A", "B", "C", "D", "E", "F",
    "G", "H", "I", "J", "K", "AB",
]
dummy2_full_list = ["a", "b", "c", "d"]

# Build the model from X. Per the accompanying text, missing values are
# removed at this step and a file for the regression results can be named.
model = my_linear_regression(X, subset=["u"], file_name="hoge")

# Replace each categorical column with dummy variables, dropping one
# element per column.
model.get_dummies("dummy1", dummy1_full_list, dummy_drop="A")
model.get_dummies("dummy2", dummy2_full_list, dummy_drop="a")

# 回帰の例1
`dummy1`のみをダミー変数として使用して回帰する．`fit`の第一引数にはターゲット(被説明変数)，第二引数には量的変数のリスト，第三引数`dummy_list`には使用するダミー変数のリストを指定する．

`save`では，回帰結果を指定した名前でシートに保存することができる．

`predict`では，`fit`の結果をもとに推計をする．

In [33]:
# Regress the target "u" on the quantitative variables x and y, using only
# the dummy1 dummies.
model.fit("u", ["x", "y"], dummy_list = ["dummy1"])
# Save this regression result to a sheet named "with_dummy1".
model.save("with_dummy1")
# Estimate from the fit above; as the cell's last expression, the returned
# value is rendered as the cell output.
model.predict()

0,1,2,3
Dep. Variable:,u,R-squared:,0.513
Model:,OLS,Adj. R-squared:,0.505
Method:,Least Squares,F-statistic:,64.47
Date:,"Sat, 24 Jul 2021",Prob (F-statistic):,1.5600000000000001e-97
Time:,14:54:33,Log-Likelihood:,-1500.3
No. Observations:,685,AIC:,3025.0
Df Residuals:,673,BIC:,3079.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.8244,0.825,-0.999,0.318,-2.444,0.795
x,-0.7057,0.034,-20.602,0.000,-0.773,-0.638
y,0.3476,0.025,13.706,0.000,0.298,0.397
B,0.4861,0.933,0.521,0.603,-1.346,2.318
C,-0.1241,0.846,-0.147,0.883,-1.785,1.536
D,-0.0905,0.845,-0.107,0.915,-1.750,1.569
E,0.0699,0.893,0.078,0.938,-1.684,1.824
F,0.0605,0.872,0.069,0.945,-1.652,1.773
G,-0.1394,0.874,-0.160,0.873,-1.855,1.576

0,1,2,3
Omnibus:,2.943,Durbin-Watson:,2.053
Prob(Omnibus):,0.23,Jarque-Bera (JB):,2.93
Skew:,-0.16,Prob(JB):,0.231
Kurtosis:,2.978,Cond. No.,106.0


0     -0.462224
2     -2.382982
3     -0.037231
4     -4.400143
7     -3.203445
         ...   
991    0.762651
995   -1.400199
996   -2.564775
997   -2.673204
999    1.501514
Length: 685, dtype: float64

# 回帰の例2
ダミー2のみを使用して回帰する．`fit`の際に，使用する説明変数を変更するだけでよい．

In [34]:
# Same target and quantitative variables as the previous regression; only the
# dummy set changes, to dummy2.
model.fit("u", ["x", "y"], dummy_list = ["dummy2"])
# Save this regression result to a sheet named "with_dummy2".
model.save("with_dummy2")

0,1,2,3
Dep. Variable:,u,R-squared:,0.52
Model:,OLS,Adj. R-squared:,0.517
Method:,Least Squares,F-statistic:,147.2
Date:,"Sat, 24 Jul 2021",Prob (F-statistic):,9.3e-106
Time:,14:54:37,Log-Likelihood:,-1495.2
No. Observations:,685,AIC:,3002.0
Df Residuals:,679,BIC:,3030.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.6940,0.138,-5.016,0.000,-0.966,-0.422
x,-0.7038,0.033,-21.014,0.000,-0.770,-0.638
y,0.3478,0.025,13.933,0.000,0.299,0.397
b,-0.6487,0.215,-3.011,0.003,-1.072,-0.226
c,-0.1320,0.268,-0.492,0.623,-0.659,0.395
d,0.0869,0.211,0.412,0.681,-0.327,0.501

0,1,2,3
Omnibus:,3.018,Durbin-Watson:,2.052
Prob(Omnibus):,0.221,Jarque-Bera (JB):,3.057
Skew:,-0.161,Prob(JB):,0.217
Kurtosis:,2.939,Cond. No.,13.4
