# Прогнозирование

In [330]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_params # вывод результатов тестирования
from statsmodels.iolib.summary2 import summary_col # вывод результатов тестирования
import re
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor # VIF
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

## Задание 1 sleep equation #1

In [331]:
data1 = pd.read_csv('https://raw.githubusercontent.com/artamonoff/Econometrica/master/python-notebooks/data-csv/sleep75.csv')
data1.shape

(706, 34)

## Модель 1
## Создаем спецификацию модели через формулу и подгоняем модель

In [332]:
Sleep_eq1 = smf.ols(formula = 'sleep~totwrk+age+south+male+smsa+yngkid+marr', data = data1).fit()

In [333]:
Sleep_eq1.summary()

0,1,2,3
Dep. Variable:,sleep,R-squared:,0.131
Model:,OLS,Adj. R-squared:,0.123
Method:,Least Squares,F-statistic:,15.06
Date:,"Tue, 18 Apr 2023",Prob (F-statistic):,2.14e-18
Time:,23:14:07,Log-Likelihood:,-5255.9
No. Observations:,706,AIC:,10530.0
Df Residuals:,698,BIC:,10560.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3450.9128,80.726,42.748,0.000,3292.418,3609.408
totwrk,-0.1692,0.018,-9.372,0.000,-0.205,-0.134
age,2.6891,1.469,1.830,0.068,-0.195,5.574
south,101.5685,41.837,2.428,0.015,19.427,183.710
male,87.6690,35.104,2.497,0.013,18.747,156.591
smsa,-54.7476,33.123,-1.653,0.099,-119.780,10.285
yngkid,-13.9624,50.341,-0.277,0.782,-112.801,84.876
marr,31.2106,42.233,0.739,0.460,-51.709,114.130

0,1,2,3
Omnibus:,62.368,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,169.049
Skew:,-0.445,Prob(JB):,1.96e-37
Kurtosis:,5.226,Cond. No.,12600.0


In [334]:
summary_col(Sleep_eq1, stars=None)

0,1
,sleep
Intercept,3450.9128
,(80.7262)
totwrk,-0.1692
,(0.0181)
age,2.6891
,(1.4692)
south,101.5685
,(41.8368)
male,87.6690


In [335]:
print(summary_col(Sleep_eq1, stars=True))


                  sleep    
---------------------------
Intercept      3450.9128***
               (80.7262)   
totwrk         -0.1692***  
               (0.0181)    
age            2.6891*     
               (1.4692)    
south          101.5685**  
               (41.8368)   
male           87.6690**   
               (35.1041)   
smsa           -54.7476*   
               (33.1230)   
yngkid         -13.9624    
               (50.3412)   
marr           31.2106     
               (42.2331)   
R-squared      0.1312      
R-squared Adj. 0.1225      
Standard errors in
parentheses.
* p<.1, ** p<.05, ***p<.01


In [336]:
data_string = '''№  totwrk age south male smsa yngkid marr
1  2150  37    0    1    1     0     1  
2  1950  28    1    1    0     1     0  
3  2240  26    0    0    1     0     0'''
wn_df = pd.read_csv(io.StringIO(data_string), sep='\s+')
wn_df.drop(columns='№', inplace=True)
wn_df

Unnamed: 0,totwrk,age,south,male,smsa,yngkid,marr
0,2150,37,0,1,1,0,1
1,1950,28,1,1,0,1,0
2,2240,26,0,0,1,0,0


In [337]:
#create new DataFrame
sleep_new = pd.DataFrame({'totwrk': [2150, 1950, 2240],
                       'age': [37, 28, 26],
                       'south': [0, 1, 0],
                       'male': [1, 1, 0],
                       'smsa': [1, 0, 1],
                       'yngkid': [0, 1, 0],
                       'marr': [1, 0, 0]})
#view new DataFrame
sleep_new

Unnamed: 0,totwrk,age,south,male,smsa,yngkid,marr
0,2150,37,0,1,1,0,1
1,1950,28,1,1,0,1,0
2,2240,26,0,0,1,0,0


In [338]:
#predict sleep
Sleep_eq1.predict(sleep_new).round(2).to_frame(name='Прогноз')

Unnamed: 0,Прогноз
0,3250.68
1,3371.46
2,3086.98


## Задание 2 sleep equation #2

In [339]:
data2 = pd.read_csv('https://raw.githubusercontent.com/artamonoff/Econometrica/master/python-notebooks/data-csv/sleep75.csv')
data2.shape

(706, 34)

In [340]:
Sleep_eq2 = smf.ols(formula = 'sleep~totwrk+age+age*age+south+male', data = data2).fit()

In [341]:
Sleep_eq2.summary()

0,1,2,3
Dep. Variable:,sleep,R-squared:,0.127
Model:,OLS,Adj. R-squared:,0.122
Method:,Least Squares,F-statistic:,25.46
Date:,"Tue, 18 Apr 2023",Prob (F-statistic):,1.03e-19
Time:,23:14:12,Log-Likelihood:,-5257.7
No. Observations:,706,AIC:,10530.0
Df Residuals:,701,BIC:,10550.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3444.9975,67.882,50.750,0.000,3311.721,3578.273
totwrk,-0.1694,0.018,-9.437,0.000,-0.205,-0.134
age,2.7848,1.387,2.008,0.045,0.062,5.507
south,116.4434,40.561,2.871,0.004,36.809,196.078
male,90.7432,34.247,2.650,0.008,23.504,157.982

0,1,2,3
Omnibus:,60.057,Durbin-Watson:,1.948
Prob(Omnibus):,0.0,Jarque-Bera (JB):,160.194
Skew:,-0.431,Prob(JB):,1.64e-35
Kurtosis:,5.168,Cond. No.,10100.0


In [342]:
summary_col(Sleep_eq2, stars=None)

0,1
,sleep
Intercept,3444.9975
,(67.8818)
totwrk,-0.1694
,(0.0179)
age,2.7848
,(1.3867)
south,116.4434
,(40.5606)
male,90.7432


In [343]:
print(summary_col(Sleep_eq2, stars=True))


                  sleep    
---------------------------
Intercept      3444.9975***
               (67.8818)   
totwrk         -0.1694***  
               (0.0179)    
age            2.7848**    
               (1.3867)    
south          116.4434*** 
               (40.5606)   
male           90.7432***  
               (34.2471)   
R-squared      0.1268      
R-squared Adj. 0.1219      
Standard errors in
parentheses.
* p<.1, ** p<.05, ***p<.01


In [344]:
data_string='''  totwrk age south male
1  2160  32    1    0  
2  1720  24    0    1  
3  2390  44    0    1 '''
wn_df = pd.read_csv(io.StringIO(data_string), sep='\s+')
wn_df

Unnamed: 0,totwrk,age,south,male
1,2160,32,1,0
2,1720,24,0,1
3,2390,44,0,1


In [345]:
sleep2_new = pd.DataFrame({'totwrk': [2160, 1720, 2390],
                       'age': [32, 24, 44],
                       'south': [1, 0, 0],
                       'male': [0, 1, 1]})
sleep2_new

Unnamed: 0,totwrk,age,south,male
0,2160,32,1,0
1,1720,24,0,1
2,2390,44,0,1


In [346]:
Sleep_eq2.predict(sleep_new).round(2).to_frame(name='Прогноз')

Unnamed: 0,Прогноз
0,3274.6
1,3399.86
2,3137.98


In [347]:
# не совпадает с ответами 

## Задание 3 wage equation #1

In [348]:
data3 = pd.read_csv('https://raw.githubusercontent.com/artamonoff/Econometrica/master/python-notebooks/data-csv/wage2.csv')
data3.shape

(935, 17)

In [349]:
Wage_eq2=smf.ols(formula='np.log(wage)~age+IQ+south+married+urban', data = data3).fit()

In [350]:
summary_col(Wage_eq2, stars=True)

0,1
,np.log(wage)
Intercept,4.9740***
,(0.1654)
age,0.0213***
,(0.0040)
IQ,0.0082***
,(0.0008)
south,-0.0990***
,(0.0268)
married,0.2010***


In [351]:
print(summary_col(Wage_eq2, stars=True))


               np.log(wage)
---------------------------
Intercept      4.9740***   
               (0.1654)    
age            0.0213***   
               (0.0040)    
IQ             0.0082***   
               (0.0008)    
south          -0.0990***  
               (0.0268)    
married        0.2010***   
               (0.0402)    
urban          0.1750***   
               (0.0276)    
R-squared      0.1996      
R-squared Adj. 0.1953      
Standard errors in
parentheses.
* p<.1, ** p<.05, ***p<.01


In [386]:
data_wage= '''№ age IQ  south married urban
1 36  105   1      1      1  
2 29  123   0      1      0  
3 25  112   1      0      1'''
wg_nw = pd.read_csv(io.StringIO(data_wage), sep='\s+')
wg_nw.drop(columns='№',inplace=True)
wg_nw

Unnamed: 0,age,IQ,south,married,urban
0,36,105,1,1,1
1,29,123,0,1,0
2,25,112,1,0,1


In [387]:
Wage_new = pd.DataFrame({'age': [36,29,25],
                        'IQ': [105,123,112],
                        'south': [1,0,1],
                        'married': [1,1,0],
                        'urban': [1,0,1]})
Wage_new

Unnamed: 0,age,IQ,south,married,urban
0,36,105,1,1,1
1,29,123,0,1,0
2,25,112,1,0,1


In [388]:
LOG_wage = Wage_eq2.predict(Wage_new)
LOG_wage

0    6.877240
1    6.800093
2    6.499888
dtype: float64

In [389]:
type(LOG_wage)

pandas.core.series.Series

In [390]:
pd.DataFrame({'np.log(wage)': LOG_wage.values})

Unnamed: 0,np.log(wage)
0,6.87724
1,6.800093
2,6.499888


In [391]:
LOG_wage.to_frame(name='np.log(wage)')

Unnamed: 0,np.log(wage)
0,6.87724
1,6.800093
2,6.499888


In [392]:
np.exp(LOG_wage).round(2)

0    969.95
1    897.93
2    665.07
dtype: float64

In [393]:
np.exp(LOG_wage).round(2).to_frame(name='Прогноз')

Unnamed: 0,Прогноз
0,969.95
1,897.93
2,665.07


## Задание 4 wage equation #2

In [394]:
data4 = pd.read_csv('https://raw.githubusercontent.com/artamonoff/Econometrica/master/python-notebooks/data-csv/wage1.csv')
data4.shape

(526, 24)

In [395]:
Wage_eq1=smf.ols(formula='np.log(wage)~exper+exper*exper+female+married+smsa', data = data4).fit()

In [396]:
summary_col(Wage_eq1, stars=True)

0,1
,np.log(wage)
Intercept,1.4137***
,(0.0578)
exper,0.0022
,(0.0016)
female,-0.3601***
,(0.0412)
married,0.2390***
,(0.0445)
smsa,0.2769***


In [397]:
print(summary_col(Wage_eq1, stars=True))


               np.log(wage)
---------------------------
Intercept      1.4137***   
               (0.0578)    
exper          0.0022      
               (0.0016)    
female         -0.3601***  
               (0.0412)    
married        0.2390***   
               (0.0445)    
smsa           0.2769***   
               (0.0457)    
R-squared      0.2389      
R-squared Adj. 0.2331      
Standard errors in
parentheses.
* p<.1, ** p<.05, ***p<.01


In [401]:
data_wage= '''№  exper female married smsa
1   5     1       1     1  
2  26     0       0     1  
3  38     1       1     0  '''
wg_nw = pd.read_csv(io.StringIO(data_wage), sep='\s+')
wg_nw.drop(columns='№',inplace=True)
wg_nw

Unnamed: 0,exper,female,married,smsa
0,5,1,1,1
1,26,0,0,1
2,38,1,1,0


In [402]:
Wage_new = pd.DataFrame({'exper': [5,26,38],
                        'female': [1,0,1],
                        'married': [1,0,1],
                        'smsa': [1,1,0]})
Wage_new

Unnamed: 0,exper,female,married,smsa
0,5,1,1,1
1,26,0,0,1
2,38,1,1,0


In [403]:
LOG_wage = Wage_eq1.predict(Wage_new)
LOG_wage

0    1.580249
1    1.746639
2    1.374421
dtype: float64

In [404]:
type(LOG_wage)

pandas.core.series.Series

In [405]:
pd.DataFrame({'np.log(wage)': LOG_wage.values})

Unnamed: 0,np.log(wage)
0,1.580249
1,1.746639
2,1.374421


In [406]:
LOG_wage.to_frame(name='np.log(wage)')

Unnamed: 0,np.log(wage)
0,1.580249
1,1.746639
2,1.374421


In [408]:
np.exp(LOG_wage).round(2)

0    4.86
1    5.74
2    3.95
dtype: float64

In [409]:
np.exp(LOG_wage).round(2).to_frame(name='Прогноз')

Unnamed: 0,Прогноз
0,4.86
1,5.74
2,3.95


## Задание 5 output equation #1

In [412]:
data5 = pd.read_csv('https://raw.githubusercontent.com/artamonoff/Econometrica/master/python-notebooks/data-csv/Labour.csv')
data5.shape

(569, 4)

In [414]:
output1=smf.ols(formula='np.log(output)~np.log(capital)+np.log(labour)', data = data5).fit()

In [433]:
summary_col(output1, stars=True)

0,1
,np.log(output)
Intercept,-1.7115***
,(0.0967)
np.log(capital),0.2076***
,(0.0172)
np.log(labour),0.7148***
,(0.0231)
R-squared,0.8378
R-squared Adj.,0.8373


In [434]:
data_output= '''№   capital labour
1  2.970    85  
2 10.450    60  
3  3.850   105   '''
wg_nw = pd.read_csv(io.StringIO(data_output), sep='\s+')
wg_nw.drop(columns='№',inplace=True)
wg_nw

Unnamed: 0,capital,labour
0,2.97,85
1,10.45,60
2,3.85,105


In [435]:
output_new = pd.DataFrame({'capital': [2.97,10.45,3.85],
                        'labour': [85,60,105]})
output_new

Unnamed: 0,capital,labour
0,2.97,85
1,10.45,60
2,3.85,105


In [436]:
LOG_output= output1.predict(output_new)
LOG_output

0    1.690309
1    1.702455
2    1.895229
dtype: float64

In [437]:
type(LOG_output)

pandas.core.series.Series

In [438]:
pd.DataFrame({'np.log(output)': LOG_output.values})

Unnamed: 0,np.log(output)
0,1.690309
1,1.702455
2,1.895229


In [439]:
LOG_output.to_frame(name='np.log(output)')

Unnamed: 0,np.log(output)
0,1.690309
1,1.702455
2,1.895229


In [426]:
np.exp(LOG_output).round(2)

0    5.42
1    5.49
2    6.65
dtype: float64

In [427]:
np.exp(LOG_output).round(2).to_frame(name='Прогноз')

Unnamed: 0,Прогноз
0,5.42
1,5.49
2,6.65


## Задание 6 output equation #2

In [428]:
data6 = pd.read_csv('https://raw.githubusercontent.com/artamonoff/Econometrica/master/python-notebooks/data-csv/Labour.csv')
data6.shape

(569, 4)

In [429]:
output2=smf.ols(formula='np.log(output)~np.log(capital)+np.log(labour)+np.log(capital)*np.log(capital)+np.log(labour)*np.log(labour)', data = data6).fit()
summary_col(output2, stars=True)

0,1
,np.log(output)
Intercept,-1.7115***
,(0.0967)
np.log(capital),0.2076***
,(0.0172)
np.log(labour),0.7148***
,(0.0231)
R-squared,0.8378
R-squared Adj.,0.8373


In [430]:
summary_col(output2, stars=True)

0,1
,np.log(output)
Intercept,-1.7115***
,(0.0967)
np.log(capital),0.2076***
,(0.0172)
np.log(labour),0.7148***
,(0.0231)
R-squared,0.8378
R-squared Adj.,0.8373


In [441]:
data_output= '''№ capital labour
1 22.140   407  
2  7.320   197  
3  0.670    31   '''
wg_nw = pd.read_csv(io.StringIO(data_output), sep='\s+')
wg_nw.drop(columns='№',inplace=True)
wg_nw

Unnamed: 0,capital,labour
0,22.14,407
1,7.32,197
2,0.67,31


In [442]:
output_new = pd.DataFrame({'capital': [22.14,7.32,0.67],
                        'labour': [407,197,31]})
output_new

Unnamed: 0,capital,labour
0,22.14,407
1,7.32,197
2,0.67,31


In [443]:
LOG_output= output2.predict(output_new)
LOG_output

0    3.226847
1    2.478413
2    0.660188
dtype: float64

In [444]:
type(LOG_output)

pandas.core.series.Series

In [445]:
pd.DataFrame({'np.log(output)': LOG_output.values})

Unnamed: 0,np.log(output)
0,3.226847
1,2.478413
2,0.660188


In [446]:
LOG_output.to_frame(name='np.log(output)')

Unnamed: 0,np.log(output)
0,3.226847
1,2.478413
2,0.660188


In [447]:
np.exp(LOG_output).round(2)

0    25.20
1    11.92
2     1.94
dtype: float64

In [448]:
np.exp(LOG_output).round(2).to_frame(name='Прогноз')

Unnamed: 0,Прогноз
0,25.2
1,11.92
2,1.94
