# Least Squares

In [1]:
%%capture
import stata_setup, os
if os.name == 'nt':
    stata_setup.config('C:/Program Files/Stata17/','mp')
else:
    stata_setup.config('/usr/local/stata17','mp')

We load the data, rename the outcome variable, generate the indicator variables for ```year``` and ```cluster```, and define two Stata local macros, ```journals``` and ```jel_imp```, which collect the relevant indicator variables.

In [2]:
%%stata -qui

* Load the data set and give the outcome variable the short name FKG.
use "../data/data", clear
rename log_flesch_kincaid_grade_level FKG

* Expand year and cluster into indicator sets with the stubs y_ and c_.
quietly tabulate year, generate(y_)
quietly tabulate cluster, generate(c_)

* Control-variable lists that the regression cells expand as `journals' and `jel_imp'.
local journals ecm jpe qje res   // AER based category

local jel_imp a_imp b_imp c_imp e_imp f_imp g_imp h_imp i_imp j_imp k_imp ///
    l_imp m_imp n_imp o_imp p_imp q_imp r_imp y_imp z_imp   // D JEL based case




Performing the OLS regression of $\mathbf{Y}$ on $\mathbf{X}$ using ```Stata```:

In [3]:
%%stata -qui

* OLS of FKG on the author, page and gender regressors plus the journal, JEL,
* year and cluster indicators; the VCE is clustered on the variable -cluster-.
regress FKG log_num_authors log_num_pages both_genders prop_women ///
    `journals' `jel_imp' y_2-y_20 c_2-c_215 jel_flag, ///
    vce(cluster cluster)

* Collect the four named slope coefficients and the constant from e(b)
* into the 1 x 5 row vector b_selected.
matrix b_selected = e(b)[1,"log_num_authors"],e(b)[1,"log_num_pages"], ///
    e(b)[1,"both_genders"],e(b)[1,"prop_women"],e(b)[1,"_cons"]




Printing a subset of
the OLS estimate $\widehat{\beta}$ (originally a $262\times 1$ vector)

In [4]:
%%stata
matrix list b_selected


b_selected[1,5]
    log_num_au~s  log_num_pa~s  both_genders    prop_women         _cons
y1    -.00397377     .01915903     .00059809    -.01889331     2.7023992


Predicting the _fitted values_ and the _residuals_, then formatting them, along with some of the other variables, to be displayed with 4 decimal places.

In [5]:
%%stata -qui
* Fitted values (linear prediction) and residuals from the most recent
* regression; the residuals are stored in double precision.
predict FKG_hat, xb
predict double e_hat, residuals

* Display format only: fixed notation with four decimals for the listed variables.
format %5.4f FKG FKG_hat e_hat log_num_authors log_num_pages




These commands randomly sort the rows of the data set in memory.

In [6]:
%%stata -qui
* Shuffle the observations: sort on a temporary variable of uniform draws.
* The seed makes the draws themselves reproducible.
set seed 42
tempvar sortorder

* Store the draws as double. With the default float storage the draws are
* rounded to far fewer distinct values, so ties between observations become
* plausible, and the relative order of tied observations after -sort- is not
* determined by -set seed-. Double precision makes ties practically impossible.
generate double `sortorder' = runiform()
sort `sortorder'




In [7]:
# List the outcome, its fitted value, the residual and four regressors for
# observations 1-20 of the data in its current sort order (in 1/20).
%stata list FKG FKG_hat e_hat log_num_authors log_num_pages both_genders prop_women in 1/20, table separator(20)


     +------------------------------------------------------------------------+
     |    FKG   FKG_hat     e_hat   log_n~rs   log_~ges   both_g~s   prop_w~n |
     |------------------------------------------------------------------------|
  1. | 2.8015    2.7041    0.0973     0.6931     3.5835          1         .5 |
  2. | 2.7776    2.7294    0.0482     0.6931     3.7136          0          0 |
  3. | 2.7829    2.7017    0.0812     1.0986     3.1781          0          0 |
  4. | 2.7027    2.7289   -0.0262     0.0000     3.5553          0          0 |
  5. | 2.8278    2.7167    0.1111     0.6931     3.3322          0          0 |
  6. | 2.3858    2.7210   -0.3352     0.6931     3.2958          0          0 |
  7. | 2.1718    2.7226   -0.5509     0.0000     3.8286          0          0 |
  8. | 2.5743    2.7851   -0.2108     0.0000     3.4657          0          0 |
  9. | 2.8177    2.8233   -0.0056     1.0986     4.1589          0          0 |
 10. | 2.7264    2.7048    0.0216     1

Printing the $TSS$

In [8]:
# TSS computed as the sum of the e(mss) and e(rss) results of the last regression.
%stata display e(mss)+e(rss)

140.32277


Printing the $ESS$

In [9]:
# ESS: the e(mss) result of the last regression.
%stata display e(mss)

12.920871


Printing the $RSS$

In [10]:
# RSS: the e(rss) result of the last regression.
%stata display e(rss)

127.4019


Printing the $R^2$

In [11]:
# R^2 computed by hand as ESS / TSS = e(mss) / (e(mss) + e(rss)).
%stata display e(mss)/(e(mss)+e(rss))

.09207965


## Leverage Values & LOO Regression

Extracting the leverage values

In [12]:
%%stata -qui
* Re-estimate the same specification without the vce(cluster ...) option
* (presumably because -predict, leverage- needs the default VCE -- TODO confirm),
* then store the leverage values in h.
regress FKG log_num_authors log_num_pages both_genders prop_women ///
    `journals' `jel_imp' y_2-y_20 c_2-c_215 jel_flag
predict h, leverage

* Leaves r(N), r(min) and r(max) behind; later cells read these results.
summarize h




Checking that $0\le h_{ii} \le 1$

In [13]:
%%stata
* Recompute the summary of h here so that r(min) and r(max) are guaranteed
* to refer to h, even if another r-class command ran since the last summarize
* or the cells are executed out of order.
quietly summarize h
display "Min: " r(min)
display "Max: " r(max)


. display "Min: " r(min)
Min: .00481838

. display "Max: " r(max)
Max: 1

. 


Checking that $h_{ii}\ge 1/n$ by checking that $\min_{i=1,\ldots,n}(h_{ii})\ge 1/n$

In [14]:
%%stata
* Recompute the summary of h so that r(N) and r(min) are known to describe h.
* With stale (missing) r() results the comparison below would evaluate
* missing >= missing, which Stata treats as true, and the check would pass
* without testing anything.
quietly summarize h
local sample_size = r(N)
local reciprocal = 1 / `sample_size'
local min_leverage = r(min)
* Prints 1 if the smallest leverage value is at least 1/N, and 0 otherwise.
display "Min leverage >= 1/N: " (`min_leverage' >= `reciprocal')


. local sample_size = r(N)

. local reciprocal = 1 / `sample_size'

. local min_leverage = r(min)

. display "Min leverage >= 1/N: " (`min_leverage' >= `reciprocal')
Min leverage >= 1/N: 1

. 


Checking that $\sum_{i=1}^n h_{ii}=k$

In [15]:
%%stata
* Sum of the leverage values, stored in every row of total_h.
* Drop any earlier copy first so that the cell can be re-run.
capture drop total_h
* Accumulate in double precision to keep rounding error in the total small.
egen double total_h = total(h)
* Round to the nearest integer. ceil() would report 262 for any total in
* (261, 262] and would jump to 263 on a tiny positive rounding error, so it
* cannot confirm that the total equals k.
local rounded_total_h = round(total_h)
display "Rounded total of h: " `rounded_total_h'


. egen total_h = total(h)

. local rounded_total_h = ceil(total_h)

. display "Rounded total of h: " `rounded_total_h'
Rounded total of h: 262

. 


In [16]:
%%stata -qui
* DFBETA influence measure for the regressor prop_women, stored in the new variable dfbeta.
predict dfbeta, dfbeta(prop_women)
* NOTE(review): cv_regress is presumably a user-written leave-one-out
* cross-validation command; generr(e_tilde) appears to store the resulting
* prediction errors in e_tilde -- confirm against the command's help file.
cv_regress, generr(e_tilde)




In [18]:
from sfi import Data

# Fetch the two Stata variables created in the previous cell, dfbeta first
# and e_tilde second, and bind them to their Python-side names.
dfbeta_py, e_tilde_py = (
    Data.get(stata_var) for stata_var in ("dfbeta", "e_tilde")
)