# ライブラリのインポート

In [2]:
%reload_ext autoreload
%autoreload
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm

from my_regression import my_linear_regression
from my_statistics import my_check_data

# データの読み込み
目的変数である`u`を，各行につき確率0.3でランダムに欠損値(NaN)にする．

In [3]:
# Load the sample data and blank out part of the target column "u" so the
# later cells can demonstrate missing-value handling.
#
# The seed is fixed so that the set of masked rows -- and therefore every
# regression output further down -- is identical after
# Restart Kernel -> Run All.
MISSING_SEED = 0
MISSING_RATE = 0.3  # probability that a given row loses its "u" value

X = pd.read_excel("data.xlsx", index_col = 0)

# One uniform draw per row; rows whose draw falls below MISSING_RATE are masked.
# A boolean mask is positional, so it is correct for any index labels.
# (Looping over range(len(X)) and writing through X.at[i, "u"] treats the
# positions as labels and can silently add rows when the index read from the
# first Excel column is not exactly 0..len(X)-1.)
rng = np.random.default_rng(MISSING_SEED)
missing_mask = rng.random(len(X)) < MISSING_RATE
X.loc[missing_mask, "u"] = np.nan
display(X)

Unnamed: 0,x,y,z,u,v,dummy1,dummy2
0,1.880670,5.119414,-3.171900,,-4.229670,D,d
1,-0.522854,-6.839106,-0.301561,,2.455704,J,a
2,2.022402,-0.413710,-0.918374,-3.410037,0.870053,I,a
3,0.426285,1.731233,-2.399366,-0.715397,-0.749180,B,a
4,3.117020,-3.601585,2.562953,-5.160408,0.209371,C,b
...,...,...,...,...,...,...,...
995,-0.563727,-2.526431,0.489600,,4.380324,H,a
996,1.749643,-1.628650,-1.726381,-0.474737,-1.261273,F,d
997,3.272143,1.122671,-0.930049,,-4.958666,E,d
998,1.648471,-0.658811,-0.561886,-1.318114,-2.546939,F,b


# 欠損値の削除とダミー変数処理
ユニバースを確定する段階でモデルを作成する．その際に欠損値を取り除く処理も同時に実行できる．また回帰結果を出力するファイルを指定することもできる．

`get_dummies`を使用することで，指定した列をダミー変数に置き換えることができる．その際に，カテゴリの並び順を定めたリスト(これはその列に現れる要素をすべて含んでいなければならない)と，ダミー変数から除外(drop)する要素を指定する．

In [6]:
# Ordered category lists for the two dummy columns. Each list fixes the order
# of the levels; the data may contain only some of the listed values.
dummy1_full_list = list("ABCDEFGHIJK") + ["AB"]
dummy2_full_list = list("abcd")

# Build the regression helper on X. According to the notebook text, rows with
# missing values are removed at this stage and `file` names the workbook that
# receives the regression results.
# NOTE(review): presumably `subset` lists the columns checked for missing
# values -- confirm against my_linear_regression.
model = my_linear_regression(
    X,
    subset = ["u"],
    file = "hoge.xlsx",
)

# Replace each categorical column with dummy variables, dropping one level
# per column ("A" for dummy1, "a" for dummy2).
model.get_dummies("dummy1", dummy1_full_list, dummy_drop = "A")
model.get_dummies("dummy2", dummy2_full_list, dummy_drop = "a")

# 回帰の例1
ダミー1のみを使用して回帰する．`fit`の第一引数にはターゲット(被説明変数)，第二引数には量的変数のリスト，第三引数(`dummy_list`)には使用するダミー変数のリストを指定する．

`save`では，回帰結果を指定した名前でシートに保存することができる．

`predict`では，`fit`の結果をもとに推計をする．

In [7]:
# Example 1: regress "u" on the quantitative columns x and y, using only the
# dummy1 dummies as categorical regressors.
model.fit(
    "u",
    ["x", "y"],
    dummy_list = ["dummy1"],
)

# Store this regression result under the sheet name "with_dummy1".
model.save("with_dummy1")

# Predict from the fit above; kept as the last expression so the notebook
# renders the returned values.
model.predict()

0,1,2,3
Dep. Variable:,u,R-squared:,0.527
Model:,OLS,Adj. R-squared:,0.519
Method:,Least Squares,F-statistic:,68.41
Date:,"Wed, 21 Jul 2021",Prob (F-statistic):,3.96e-102
Time:,08:13:29,Log-Likelihood:,-1502.8
No. Observations:,688,AIC:,3030.0
Df Residuals:,676,BIC:,3084.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.9884,0.820,-1.205,0.229,-2.599,0.622
x,-0.7101,0.034,-20.669,0.000,-0.778,-0.643
y,0.3410,0.025,13.377,0.000,0.291,0.391
B,0.8021,0.937,0.856,0.392,-1.037,2.642
C,0.0742,0.842,0.088,0.930,-1.578,1.726
D,0.2461,0.841,0.293,0.770,-1.405,1.898
E,0.5290,0.905,0.584,0.559,-1.249,2.306
F,-0.0276,0.862,-0.032,0.974,-1.721,1.665
G,0.2803,0.873,0.321,0.748,-1.433,1.994

0,1,2,3
Omnibus:,7.654,Durbin-Watson:,1.935
Prob(Omnibus):,0.022,Jarque-Bera (JB):,7.567
Skew:,-0.251,Prob(JB):,0.0227
Kurtosis:,3.107,Cond. No.,107.0


2     -2.464513
3      0.101320
4     -4.355546
5     -2.638049
7     -3.265618
         ...   
993   -0.899818
994   -0.818520
996   -2.813666
998   -2.411132
999    1.647804
Length: 688, dtype: float64

# 回帰の例2
ダミー2のみを使用して回帰する．`fit`の際に，使用する説明変数を変更するだけでよい．

In [130]:
# Example 2: same target and quantitative columns as before, but with the
# dummy2 dummies instead of dummy1 -- only the fit arguments change.
model.fit(
    "u",
    ["x", "y"],
    dummy_list = ["dummy2"],
)

# Store this regression result under the sheet name "with_dummy2".
model.save("with_dummy2")

0,1,2,3
Dep. Variable:,u,R-squared:,0.52
Model:,OLS,Adj. R-squared:,0.517
Method:,Least Squares,F-statistic:,147.4
Date:,"Tue, 20 Jul 2021",Prob (F-statistic):,6.76e-106
Time:,23:15:22,Log-Likelihood:,-1501.2
No. Observations:,686,AIC:,3014.0
Df Residuals:,680,BIC:,3042.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.8285,0.141,-5.856,0.000,-1.106,-0.551
x,-0.6861,0.034,-20.307,0.000,-0.752,-0.620
y,0.3403,0.025,13.704,0.000,0.292,0.389
b,-0.6371,0.215,-2.969,0.003,-1.058,-0.216
c,0.3083,0.268,1.151,0.250,-0.218,0.834
d,0.0443,0.215,0.206,0.837,-0.379,0.467

0,1,2,3
Omnibus:,3.749,Durbin-Watson:,2.073
Prob(Omnibus):,0.153,Jarque-Bera (JB):,3.822
Skew:,-0.168,Prob(JB):,0.148
Kurtosis:,2.857,Cond. No.,13.8
