## 1.データについて

In [38]:
import pandas as pd
import numpy as np

# Load the prefecture attractiveness dataset (path is relative to the working directory).
df = pd.read_csv('./都道府県魅力度.csv')

# Print four plain-text sanity reports, one after another:
#   1. number of missing values per column
#   2. descriptive statistics
#   3. column dtypes
#   4. pairwise correlations, computed after moving the prefecture-name
#      column into the index
print(
    df.isna().sum(),
    df.describe(),
    df.dtypes,
    df.set_index('都道府県').corr(),
    sep='\n',
)

都道府県      0
人口        0
ホテル客室数    0
旅館客室数     0
魅力度       0
dtype: int64
                 人口        ホテル客室数         旅館客室数        魅力度
count  4.700000e+01     47.000000     47.000000  47.000000
mean   2.683960e+06  17600.234043  15644.063830  26.400000
std    2.796582e+06  17671.324681  10879.597536  12.335615
min    5.534070e+05   2567.000000   4540.000000  11.600000
25%    1.068802e+06   7837.500000   8113.000000  18.700000
50%    1.588256e+06  10918.000000  12565.000000  22.300000
75%    2.688894e+06  22213.000000  19305.500000  30.200000
max    1.404759e+07  97879.000000  47355.000000  73.400000
都道府県       object
人口        float64
ホテル客室数      int64
旅館客室数       int64
魅力度       float64
dtype: object
              人口    ホテル客室数     旅館客室数       魅力度
人口      1.000000  0.843940  0.541923  0.456821
ホテル客室数  0.843940  1.000000  0.666313  0.718636
旅館客室数   0.541923  0.666313  1.000000  0.471949
魅力度     0.456821  0.718636  0.471949  1.000000


In [39]:
# Display the table keyed by prefecture name; set_index returns a new frame, so df is left as-is.
df.set_index(keys='都道府県')

Unnamed: 0_level_0,人口,ホテル客室数,旅館客室数,魅力度
都道府県,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
北海道,5224614.0,63650,47355,73.4
京都府,2578087.0,22820,10208,56.4
沖縄県,1467480.0,33926,8890,54.4
東京都,14047594.0,97879,44186,47.5
大阪府,8837685.0,56992,19319,42.0
神奈川県,9237337.0,30631,25040,40.0
福岡県,5135214.0,38867,11234,37.5
長崎県,1312317.0,6353,14067,33.9
奈良県,1324473.0,3528,5527,33.4
石川県,1132526.0,10816,14142,32.5


In [44]:
import statsmodels.api as sm

# Response variable: the attractiveness score.
y = df["魅力度"]

# Explanatory variables: every remaining column that has a numeric dtype
# (this excludes the prefecture-name column).
X = df.drop(columns="魅力度").select_dtypes(include="number")

# Add the constant (intercept) term to the design matrix.
X_ = sm.add_constant(X)

# Set up the ordinary least squares model and fit it.
model = sm.OLS(y, X_)
resulut = model.fit()

# Build the summary table and show it as the cell output.
summary = resulut.summary()
summary

0,1,2,3
Dep. Variable:,魅力度,R-squared:,0.595
Model:,OLS,Adj. R-squared:,0.567
Method:,Least Squares,F-statistic:,21.04
Date:,"Tue, 12 Nov 2024",Prob (F-statistic):,1.53e-08
Time:,10:44:08,Log-Likelihood:,-163.04
No. Observations:,47,AIC:,334.1
Df Residuals:,43,BIC:,341.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,18.6402,2.117,8.806,0.000,14.371,22.909
人口,-2.304e-06,7.99e-07,-2.883,0.006,-3.92e-06,-6.92e-07
ホテル客室数,0.0008,0.000,5.781,0.000,0.001,0.001
旅館客室数,-3.578e-05,0.000,-0.242,0.810,-0.000,0.000

0,1,2,3
Omnibus:,14.24,Durbin-Watson:,1.048
Prob(Omnibus):,0.001,Jarque-Bera (JB):,16.469
Skew:,1.102,Prob(JB):,0.000265
Kurtosis:,4.885,Cond. No.,6890000.0


In [21]:
# Write the regression summary to an HTML file.
# The summary contains Japanese variable names, so the encoding is pinned to
# UTF-8 rather than left to the platform default, which on non-UTF-8 locales
# can raise UnicodeEncodeError or produce a file that is decoded incorrectly.
html = "<div>"+summary.as_html()+"</div>"
with open('result.html', 'w', encoding='utf-8') as f:
    f.write(html)

# Save the fitted results object to disk so it can be reloaded later.
# NOTE: a pickle file should only ever be loaded from a trusted source.
import pickle
with open('model.pickle', 'wb') as f:
    pickle.dump(resulut, f)
    

In [23]:
# Restore the fitted results object that was pickled to disk.
# NOTE(review): the saved output of this cell lists a 都道府県 coefficient that
# the fit cell's summary does not have, so model.pickle may predate the
# current model — re-run the save cell before trusting this output.
with open('model.pickle', mode='rb') as model_file:
    resulut_re = pickle.load(model_file)

# Show the summary of the restored results as the cell output.
resulut_re.summary()

0,1,2,3
Dep. Variable:,魅力度,R-squared:,0.595
Model:,OLS,Adj. R-squared:,0.567
Method:,Least Squares,F-statistic:,21.04
Date:,"Tue, 12 Nov 2024",Prob (F-statistic):,1.53e-08
Time:,10:34:50,Log-Likelihood:,-163.04
No. Observations:,47,AIC:,334.1
Df Residuals:,43,BIC:,341.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,18.6402,2.117,8.806,0.000,14.371,22.909
都道府県,-2.828e-09,3.21e-10,-8.806,0.000,-3.48e-09,-2.18e-09
人口,-2.304e-06,7.99e-07,-2.883,0.006,-3.92e-06,-6.92e-07
ホテル客室数,0.0008,0.000,5.781,0.000,0.001,0.001
旅館客室数,-3.578e-05,0.000,-0.242,0.810,-0.000,0.000

0,1,2,3
Omnibus:,14.24,Durbin-Watson:,1.048
Prob(Omnibus):,0.001,Jarque-Bera (JB):,16.469
Skew:,1.102,Prob(JB):,0.000265
Kurtosis:,4.885,Cond. No.,6.05e+18
