# Class 2: Bivariate regression

In [23]:
library(tidyverse)
library(lfe)
library(readxl)
library(httr)

# Download the Excel dataset into a temporary .xlsx file.
url <- "https://github.com/tsvoronos/API202-students/raw/main/data/caschool.xlsx?raw=TRUE"
tf <- tempfile(fileext = ".xlsx")
response <- GET(url, write_disk(tf))
# Fail right here on an HTTP error (404, 5xx, ...) rather than letting
# read_excel() trip over a missing or partial file further down.
stop_for_status(response)
response

# Read the first sheet of the downloaded workbook.
caschool <- read_excel(tf, sheet = 1)

Response [https://raw.githubusercontent.com/tsvoronos/API202-students/main/data/caschool.xlsx]
  Date: 2023-01-09 15:45
  Status: 200
  Content-Type: application/octet-stream
  Size: 91.4 kB
<ON DISK>  /var/folders/23/wnrvfh7x5mjf3_zz99xxypxc0000gn/T//RtmpekIPf4/file13ba0643bbdd0.xlsxNULL

In [9]:
# Scatter plot of test scores (testscr) against the student-teacher
# ratio (str), with enlarged text.
main <- ggplot(data = caschool, mapping = aes(x = str, y = testscr)) +
  geom_point() +
  scale_x_continuous(name = "Student-teacher ratio") +
  scale_y_continuous(name = "Test scores") +
  theme(text = element_text(size = 20))

In [10]:
# Flag rows with a student-teacher ratio below 20 as "small class", then
# report the mean, standard deviation and row count of test scores per group.
caschool %>%
  mutate(smallclass = str < 20) %>%
  group_by(smallclass) %>%
  summarise(
    mean_test = mean(testscr),
    sd_test = sd(testscr),
    n = n()
  )

smallclass,mean_test,sd_test,n
<lgl>,<dbl>,<dbl>,<int>
False,649.9788,17.85336,182
True,657.3513,19.35801,238


In [11]:
# Add the small-class indicator (student-teacher ratio below 20).
caschool_bin <- caschool %>%
  mutate(smallclass = str < 20)

# Per-group mean, standard deviation and count of test scores.
caschool_stats <- caschool_bin %>%
  group_by(smallclass) %>%
  summarize(mean_test = mean(testscr), sd_test = sd(testscr), n = n())
caschool_stats

# Pull the group statistics straight from the summary table instead of
# retyping rounded numbers from the printed output, so they stay correct
# if the data or the cutoff changes.
stats_large <- caschool_stats %>% filter(!smallclass)
stats_small <- caschool_stats %>% filter(smallclass)

mean_large <- stats_large$mean_test
mean_small <- stats_small$mean_test
sd_large <- stats_large$sd_test
sd_small <- stats_small$sd_test
n_large <- stats_large$n
n_small <- stats_small$n

# Standard error of the difference in means (unequal variances).
se <- sqrt(sd_small^2 / n_small + sd_large^2 / n_large)
se
# Test statistic for the difference in mean test scores, small minus large.
z <- (mean_small - mean_large) / se
z

smallclass,mean_test,sd_test,n
<lgl>,<dbl>,<dbl>,<int>
False,649.9788,17.85336,182
True,657.3513,19.35801,238


In [12]:
# Same scatter plot as before, now with the fitted linear regression line
# (no confidence band) and fixed axis limits.
mainfit <- ggplot(data = caschool, mapping = aes(x = str, y = testscr)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_x_continuous(name = "Student-teacher ratio", limits = c(12, 27)) +
  scale_y_continuous(name = "Test scores", limits = c(610, 710)) +
  theme(text = element_text(size = 20))

In [25]:
# Regress test scores on the student-teacher ratio using lfe::felm and
# print the coefficient table.
bivariate <- felm(formula = testscr ~ str, data = caschool)
summary(bivariate)


Call:
   felm(formula = testscr ~ str, data = caschool) 

Residuals:
    Min      1Q  Median      3Q     Max 
-47.727 -14.251   0.483  12.822  48.540 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 698.9330     9.4675  73.825  < 2e-16 ***
str          -2.2798     0.4798  -4.751 2.78e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.58 on 418 degrees of freedom
Multiple R-squared(full model): 0.05124   Adjusted R-squared: 0.04897 
Multiple R-squared(proj model): 0.05124   Adjusted R-squared: 0.04897 
F-statistic(full model):22.58 on 1 and 418 DF, p-value: 2.783e-06 
F-statistic(proj model): 22.58 on 1 and 418 DF, p-value: 2.783e-06 



In [18]:
# Fix the RNG seed so the five random subsamples (and every coefficient and
# plot derived from them) can be reproduced on a re-run.
set.seed(202)

# Five independent random subsamples of 50 rows each. slice_sample() is the
# current dplyr replacement for the superseded sample_n().
sample1 <- slice_sample(caschool, n = 50)
sample2 <- slice_sample(caschool, n = 50)
sample3 <- slice_sample(caschool, n = 50)
sample4 <- slice_sample(caschool, n = 50)
sample5 <- slice_sample(caschool, n = 50)

In [19]:
# Fit the same bivariate regression on the full data set ...
lm(formula = testscr ~ str, data = caschool)

# ... and on each of the five random subsamples, to show how much the
# estimated intercept and slope vary from sample to sample.
lm(formula = testscr ~ str, data = sample1)
lm(formula = testscr ~ str, data = sample2)
lm(formula = testscr ~ str, data = sample3)
lm(formula = testscr ~ str, data = sample4)
lm(formula = testscr ~ str, data = sample5)


Call:
lm(formula = testscr ~ str, data = caschool)

Coefficients:
(Intercept)          str  
     698.93        -2.28  



Call:
lm(formula = testscr ~ str, data = sample1)

Coefficients:
(Intercept)          str  
    676.404       -1.168  



Call:
lm(formula = testscr ~ str, data = sample2)

Coefficients:
(Intercept)          str  
    719.116       -3.169  



Call:
lm(formula = testscr ~ str, data = sample3)

Coefficients:
(Intercept)          str  
  651.14309      0.04477  



Call:
lm(formula = testscr ~ str, data = sample4)

Coefficients:
(Intercept)          str  
    672.817       -0.906  



Call:
lm(formula = testscr ~ str, data = sample5)

Coefficients:
(Intercept)          str  
     733.71        -3.89  


In [20]:
# Scatter plot and fitted regression line for subsample 1, drawn on the
# same fixed axis limits as the other sample plots.
plot1 <- ggplot(data = sample1, mapping = aes(x = str, y = testscr)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_x_continuous(name = "Student-teacher ratio", limits = c(12, 27)) +
  scale_y_continuous(name = "Test scores", limits = c(610, 710)) +
  theme(text = element_text(size = 20))

In [21]:
# Scatter plot and fitted regression line for subsample 2, drawn on the
# same fixed axis limits as the other sample plots.
plot2 <- ggplot(data = sample2, mapping = aes(x = str, y = testscr)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_x_continuous(name = "Student-teacher ratio", limits = c(12, 27)) +
  scale_y_continuous(name = "Test scores", limits = c(610, 710)) +
  theme(text = element_text(size = 20))

In [22]:
# Scatter plot and fitted regression line for subsample 3, drawn on the
# same fixed axis limits as the other sample plots.
plot3 <- ggplot(data = sample3, mapping = aes(x = str, y = testscr)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_x_continuous(name = "Student-teacher ratio", limits = c(12, 27)) +
  scale_y_continuous(name = "Test scores", limits = c(610, 710)) +
  theme(text = element_text(size = 20))

In [23]:
# Scatter plot and fitted regression line for subsample 4, drawn on the
# same fixed axis limits as the other sample plots.
plot4 <- ggplot(data = sample4, mapping = aes(x = str, y = testscr)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_x_continuous(name = "Student-teacher ratio", limits = c(12, 27)) +
  scale_y_continuous(name = "Test scores", limits = c(610, 710)) +
  theme(text = element_text(size = 20))

In [24]:
# Scatter plot and fitted regression line for subsample 5, drawn on the
# same fixed axis limits as the other sample plots.
plot5 <- ggplot(sample5, aes(x = str, y = testscr)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_y_continuous("Test scores", limits = c(610, 710)) +
  scale_x_continuous("Student-teacher ratio", limits = c(12, 27)) +
  # The stray trailing comma that used to follow this argument passed an
  # empty argument to theme(); it is removed so the call matches plot1-plot4.
  theme(text = element_text(size = 20))

In [25]:
# Plot built without any data: the point and smooth layers have nothing to
# draw, leaving the labelled axes at the same fixed limits as the sample
# plots. NOTE(review): presumably used as a backdrop under the transparent
# sample plots -- confirm.
plotbackground <- ggplot() +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  scale_x_continuous(name = "Student-teacher ratio", limits = c(12, 27)) +
  scale_y_continuous(name = "Test scores", limits = c(610, 710)) +
  theme(text = element_text(size = 20))

# Theme overrides shared by all five sample plots. Defined once instead of
# being copy-pasted per plot, so the variants cannot drift apart.
clear_theme <- theme(
  panel.background = element_rect(fill = "transparent"), # transparent panel bg
  plot.background = element_rect(fill = "transparent", color = NA), # transparent plot bg
  panel.grid.major = element_blank(), # remove major gridlines
  panel.grid.minor = element_blank(), # remove minor gridlines
  legend.background = element_rect(fill = "transparent"), # transparent legend bg
  legend.box.background = element_rect(fill = "transparent") # transparent legend panel
)

# Transparent-background versions of the five sample plots.
plot1clear <- plot1 + clear_theme
plot2clear <- plot2 + clear_theme
plot3clear <- plot3 + clear_theme
plot4clear <- plot4 + clear_theme
plot5clear <- plot5 + clear_theme



In [26]:
# Output file name -> plot object. Every figure is written as a PNG with a
# transparent background, in the same order as before.
plots_to_save <- list(
  "main.png" = main,
  "mainfit.png" = mainfit,
  "plotbackground.png" = plotbackground,
  "plot1.png" = plot1clear,
  "plot2.png" = plot2clear,
  "plot3.png" = plot3clear,
  "plot4.png" = plot4clear,
  "plot5.png" = plot5clear
)

for (png_name in names(plots_to_save)) {
  ggsave(
    plot = plots_to_save[[png_name]],
    filename = png_name,
    bg = "transparent"
  )
}

[1m[22mSaving 7 x 7 in image
[1m[22mSaving 7 x 7 in image
“[1m[22mRemoved 3 rows containing non-finite values (`stat_smooth()`).”
“[1m[22mRemoved 3 rows containing missing values (`geom_point()`).”
[1m[22mSaving 7 x 7 in image
[1m[22mSaving 7 x 7 in image
[1m[22mSaving 7 x 7 in image
[1m[22mSaving 7 x 7 in image
“[1m[22mRemoved 1 rows containing non-finite values (`stat_smooth()`).”
“[1m[22mRemoved 1 rows containing missing values (`geom_point()`).”
[1m[22mSaving 7 x 7 in image
[1m[22mSaving 7 x 7 in image
