In [None]:
# 一元线性回归模型的应用
# import statsmodels.api as sm
# sm.formula.ols(formula, data, subset=None, drop_cols=None)
# formula：以字符串的形式指定线性回归模型的公式，如'y~x'就表示简单线性回归模型
# data：指定建模的数据集
# subset：通过bool类型的数组对象，获取data的子集用于建模
# drop_cols：指定需要从data中删除的变量

In [1]:
# 导入第三方模块
import pandas as pd
import statsmodels.api as sm

# Load the salary dataset.
income = pd.read_csv('Salary_Data.csv')
# Fit a simple linear regression of Salary on YearsExperience.
fit = sm.formula.ols(
    formula='Salary ~ YearsExperience',
    data=income,
).fit()
# Display the estimated intercept and slope.
fit.params

Intercept          25792.200199
YearsExperience     9449.962321
dtype: float64

In [3]:
# 导入模块
from sklearn import model_selection

# Load the profit dataset.
Profit = pd.read_excel(r'Predict to Profit.xlsx')
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
train, test = model_selection.train_test_split(Profit, test_size=0.2, random_state=1234)
# Fit the multiple regression on the training rows; C(State) marks State as categorical.
model = sm.formula.ols(
    formula='Profit ~ RD_Spend+Administration+Marketing_Spend+C(State)',
    data=train,
).fit()

print('模型的偏回归系数分别为：\n', model.params)
# Predict on the test rows using only the explanatory columns (Profit removed).
test_X = test.drop(columns='Profit')
pred = model.predict(exog=test_X)

print('对比预测值和实际值的差异：\n', pd.DataFrame({'Prediction': pred, 'Real': test['Profit']}))
# The fitted terms cover only Florida and New York, i.e. California is the
# reference level chosen by default for the categorical variable State.

模型的偏回归系数分别为：
 Intercept               58581.516503
C(State)[T.Florida]       927.394424
C(State)[T.New York]     -513.468310
RD_Spend                    0.803487
Administration             -0.057792
Marketing_Spend             0.013779
dtype: float64
对比预测值和实际值的差异：
        Prediction       Real
8   150621.345801  152211.77
48   55513.218079   35673.41
14  150369.022458  132602.65
42   74057.015562   71498.49
29  103413.378282  101004.64
44   67844.850378   65200.33
4   173454.059691  166187.94
31   99580.888894   97483.56
13  128147.138396  134307.35
18  130693.433835  124266.90


In [4]:
# Derive one 0/1 indicator column per level of State.
# dtype=int keeps the indicators numeric: pandas >= 2.0 returns bool columns by
# default, and the formula interface codes bool columns as categorical terms
# (e.g. 'Florida[T.True]') instead of the plain 'Florida' / 'California' names.
dummies = pd.get_dummies(Profit.State, dtype=int)
# Append the indicators column-wise, then drop State (now encoded by the
# indicators) and New York, which thereby becomes the reference level.
# The drop returns a new frame rather than mutating in place.
Profit_New = pd.concat([Profit, dummies], axis=1).drop(columns=['State', 'New York'])
# Split the new dataset 80/20; the fixed seed makes the split reproducible.
train, test = model_selection.train_test_split(Profit_New, test_size=0.2, random_state=1234)
# Fit the model with explicit Florida and California indicator variables.
model2 = sm.formula.ols('Profit~RD_Spend+Administration+Marketing_Spend+Florida+California', data=train).fit()
print('模型的偏回归系数分别为：\n', model2.params)

模型的偏回归系数分别为：
 Intercept          58068.048193
RD_Spend               0.803487
Administration        -0.057792
Marketing_Spend        0.013779
Florida             1440.862734
California           513.468310
dtype: float64


In [37]:
# Show the column labels of the salary dataset.
income.columns

Index(['YearsExperience', 'Salary'], dtype='object')

In [38]:
# Correlation coefficient between salary and years of experience.
income['Salary'].corr(income['YearsExperience'])

0.9782416184887598

In [42]:
# Correlation of every remaining column (State removed) with Profit.
Profit.drop(columns='State').corrwith(Profit['Profit'])

RD_Spend           0.978437
Administration     0.205841
Marketing_Spend    0.739307
Profit             1.000000
dtype: float64

In [43]:
# Pairwise correlation matrix of the remaining columns (State removed).
Profit.drop(columns='State').corr()

Unnamed: 0,RD_Spend,Administration,Marketing_Spend,Profit
RD_Spend,1.0,0.243438,0.711654,0.978437
Administration,0.243438,1.0,-0.03728,0.205841
Marketing_Spend,0.711654,-0.03728,1.0,0.739307
Profit,0.978437,0.205841,0.739307,1.0


In [5]:
# 导入第三方模块
import numpy as np
# Mean of the response variable over the training rows.
ybar = train['Profit'].mean()
# p: model degrees of freedom reported by the fitted model; n: number of training rows.
p = model2.df_model
n = len(train)
# Regression sum of squares: squared deviations of the fitted values from the mean.
# Note the naming used here: RSS = regression SS, ESS = error SS.
RSS = ((model2.fittedvalues - ybar) ** 2).sum()
# Error sum of squares: sum of the squared residuals.
ESS = (model2.resid ** 2).sum()
# F statistic: regression mean square divided by error mean square.
F = (RSS / p) / (ESS / (n - p - 1))
print('F统计量的值：', F)

F统计量的值： 174.63721716844674


In [6]:
# 导入模块
from scipy.stats import f
# Critical value: 95th percentile of the F distribution with (p, n-p-1) degrees of freedom.
F_Theroy = f(dfn=p, dfd=n - p - 1).ppf(0.95)
print('F分布的理论值为：', F_Theroy)
# The computed F statistic (about 174.64) is far larger than this critical value
# (about 2.50), so the null hypothesis is rejected: the multiple regression model
# is significant, i.e. the partial regression coefficients are not all zero.

F分布的理论值为： 2.502635007415366


In [7]:
# Overview of the fitted model.
model2.summary()
# In the coefficient table only the Intercept and RD_Spend have p-values below 0.05;
# the remaining variables do not pass the coefficient significance test, i.e. they
# are not statistically significant predictors of profit in this model.

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.964
Model:,OLS,Adj. R-squared:,0.958
Method:,Least Squares,F-statistic:,174.6
Date:,"Tue, 14 Jul 2020",Prob (F-statistic):,9.739999999999999e-23
Time:,14:26:08,Log-Likelihood:,-401.2
No. Observations:,39,AIC:,814.4
Df Residuals:,33,BIC:,824.4
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.807e+04,6846.305,8.482,0.000,4.41e+04,7.2e+04
RD_Spend,0.8035,0.040,19.988,0.000,0.722,0.885
Administration,-0.0578,0.051,-1.133,0.265,-0.162,0.046
Marketing_Spend,0.0138,0.015,0.930,0.359,-0.016,0.044
Florida,1440.8627,3059.931,0.471,0.641,-4784.615,7666.340
California,513.4683,3043.160,0.169,0.867,-5677.887,6704.824

0,1,2,3
Omnibus:,1.721,Durbin-Watson:,1.896
Prob(Omnibus):,0.423,Jarque-Bera (JB):,1.148
Skew:,0.096,Prob(JB):,0.563
Kurtosis:,2.182,Cond. No.,1600000.0
