In [410]:
# Might need to install arm - if so, uncomment the following line
#install.packages("arm")
library(arm)
library(lme4)
library(blme)
library(ggplot2)
library(stringr)
library(psycho)
library(glmnet)
library(car)
library("multcomp")

In [411]:
# Set ggplot theme
base_size <- 20
theme_set(theme_bw(base_size=base_size) +
            theme(#panel.grid.major=element_blank(),
              panel.grid.minor=element_blank(),
              axis.title.y=element_text(angle=90,vjust=0.5),
              axis.text.x=element_text(angle=0, hjust=0.5),
              axis.title.x=element_blank()))

In [412]:
correct_vs_length_and_prob <- function(df, include_output_chars=TRUE, include_output_tokens=TRUE){
 
    
    if (include_output_chars & include_output_tokens){
        model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars + output_nchars, 
               data=df, family=binomial)
    } else if (include_output_chars & !include_output_tokens){
        model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars + output_nchars, 
               data=df, family=binomial)
    } else if (!include_output_chars & include_output_tokens){
        model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=df, family=binomial)
    } else if (!include_output_chars & !include_output_tokens){
        model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
               data=df, family=binomial)
    }
  
    
  return(model)
}

In [413]:
# For Z-scoring datasets

scale_df <- function(df) {
    new_df <- data.frame(scale(df[2:7]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    
    return(new_df)
}

scale_taskpair_df <- function(df) {
    new_df <- data.frame(scale(df[3:8]))
    new_df$index <- as.factor(df$index)
    new_df$task <- as.factor(df$task)
    new_df$correct <- df$correct
    
    return(new_df)
}

scale_taskpair_prob_df <- function(df) {
    new_df <- data.frame(scale(df[2:8]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    
    return(new_df)
}



scale_df_with_index <- function(df) {
    new_df <- data.frame(scale(df[1:7]))
    new_df$correct <- df$correct
    
    return(new_df)
}

# Shift ciphers

In [414]:
# Not looking at Llama and PaLM because their accuracy was 0.0 across the board

In [415]:
# Read in data
rot13enc_gpt4_df <- read.table(file = 'table_rot13enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13enc_gpt35_df <- read.table(file = 'table_rot13enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)

rot13dec_gpt4_df <- read.table(file = 'table_rot13dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13dec_gpt35_df <- read.table(file = 'table_rot13dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)


In [416]:
# Z-score data
scaled_rot13enc_gpt4_df <- scale_df(rot13enc_gpt4_df)
scaled_rot13enc_gpt35_df <- scale_df(rot13enc_gpt35_df)


scaled_rot13dec_gpt4_df <- scale_df(rot13dec_gpt4_df)
scaled_rot13dec_gpt35_df <- scale_df(rot13dec_gpt35_df)




In [417]:

rot13enc_gpt4_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt4_df, include_output_chars=FALSE)



In [418]:
rot13enc_gpt35_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt35_df, include_output_chars=FALSE)


“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [419]:
rot13dec_gpt4_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt4_df, include_output_chars=FALSE)


In [420]:
rot13dec_gpt35_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt35_df, include_output_chars=FALSE)

In [421]:
summary(rot13enc_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.99812  -0.44168  -0.14960  -0.01557   2.95372  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.1263     0.5728  -7.204 5.84e-13 ***
input_logprob    1.2287     0.4840   2.539  0.01112 *  
output_logprob   7.8371     3.2790   2.390  0.01684 *  
input_ntokens    2.7704     0.9766   2.837  0.00456 ** 
output_ntokens  -0.9145     2.5316  -0.361  0.71793    
input_nchars     3.5012     3.3936   1.032  0.30221    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 242.98  on 299  degrees of freedom
Residual deviance: 155.44  on 294  degrees of freedom
AIC: 167.44

Number of Fisher Scoring iterations: 7


In [422]:
summary(rot13enc_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5360  -0.0774  -0.0159  -0.0002   3.1706  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -11.2249     2.6091  -4.302 1.69e-05 ***
input_logprob   -0.5781     1.1487  -0.503   0.6147    
output_logprob  20.4597    10.2690   1.992   0.0463 *  
input_ntokens    4.4254     2.7395   1.615   0.1062    
output_ntokens   6.4896     7.5227   0.863   0.3883    
input_nchars     1.4265     8.9552   0.159   0.8734    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 87.687  on 299  degrees of freedom
Residual deviance: 37.361  on 294  degrees of freedom
AIC: 49.361

Number of Fisher Scoring iterations: 10


In [423]:
summary(rot13dec_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5419  -0.8313  -0.4035   0.9128   2.6598  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.4549     0.2014  -7.224 5.04e-13 ***
input_logprob    0.8315     1.7469   0.476    0.634    
output_logprob   1.9277     0.3396   5.676 1.38e-08 ***
input_ntokens   -2.5684     1.6014  -1.604    0.109    
output_ntokens   0.8680     0.6047   1.436    0.151    
input_nchars     2.8939     2.0645   1.402    0.161    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 359.48  on 299  degrees of freedom
Residual deviance: 286.22  on 294  degrees of freedom
AIC: 298.22

Number of Fisher Scoring iterations: 6


In [424]:
summary(rot13dec_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.36411  -0.35121  -0.10535  -0.01285   2.73780  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.9690     0.7601  -6.537 6.26e-11 ***
input_logprob    1.8431     3.4909   0.528  0.59752    
output_logprob   3.3646     0.8453   3.980 6.88e-05 ***
input_ntokens   -5.4321     2.8906  -1.879  0.06021 .  
output_ntokens   3.6594     1.1220   3.262  0.00111 ** 
input_nchars     2.8000     3.6591   0.765  0.44414    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 203.69  on 299  degrees of freedom
Residual deviance: 125.67  on 294  degrees of freedom
AIC: 137.67

Number of Fisher Scoring iterations: 8


In [425]:
vif(rot13enc_gpt4_model)

In [426]:
vif(rot13enc_gpt35_model)

In [427]:
vif(rot13dec_gpt4_model)

In [428]:
vif(rot13dec_gpt35_model)

### Distance as output

In [429]:
# Not looking at Llama and PaLM because it was difficult to extract
# just their response (they often included explanations etc.)

In [430]:
# Read in data
rot13encdist_gpt4_df <- read.table(file = 'table_rot13enc_gpt-4-0613_dist.tsv', sep = '\t', header = TRUE)
rot13encdist_gpt35_df <- read.table(file = 'table_rot13enc_gpt-3.5-turbo-0613_dist.tsv', sep = '\t', header = TRUE)

In [431]:
# Get rid of "correct"
rot13encdist_gpt4_df <- rot13encdist_gpt4_df[,-c(1,8)]
rot13encdist_gpt35_df <- rot13encdist_gpt35_df[,-c(1,8)]

# Scale
rot13encdist_gpt4_df <- data.frame(scale(rot13encdist_gpt4_df))
rot13encdist_gpt35_df <- data.frame(scale(rot13encdist_gpt35_df))

In [432]:
model_dist4 <- glm(distance ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=rot13encdist_gpt4_df)
model_dist35 <- glm(distance ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=rot13encdist_gpt35_df)

In [433]:
summary(model_dist4)


Call:
glm(formula = distance ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, data = rot13encdist_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.33914  -0.34787  -0.03202   0.29298   2.81865  

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -2.046e-16  3.254e-02   0.000  1.00000    
input_logprob  -1.339e-01  4.522e-02  -2.960  0.00333 ** 
output_logprob -1.418e-01  3.300e-01  -0.430  0.66777    
input_ntokens  -1.138e+00  1.219e-01  -9.337  < 2e-16 ***
output_ntokens  1.029e+00  3.292e-01   3.126  0.00195 ** 
input_nchars    5.795e-01  4.344e-01   1.334  0.18315    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.3177005)

    Null deviance: 299.000  on 299  degrees of freedom
Residual deviance:  93.404  on 294  degrees of freedom
AIC: 515.31

Number of Fisher Scoring iterations: 2


In [434]:
summary(model_dist35)


Call:
glm(formula = distance ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, data = rot13encdist_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1784  -0.2853  -0.0955   0.0689  10.0305  

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)   
(Intercept)    -9.139e-16  5.090e-02   0.000  1.00000   
input_logprob  -2.170e-01  7.074e-02  -3.067  0.00236 **
output_logprob  9.036e-01  5.162e-01   1.751  0.08107 . 
input_ntokens  -5.331e-01  1.906e-01  -2.797  0.00550 **
output_ntokens  8.194e-01  5.150e-01   1.591  0.11267   
input_nchars    8.817e-01  6.794e-01   1.298  0.19540   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.7773369)

    Null deviance: 299.00  on 299  degrees of freedom
Residual deviance: 228.54  on 294  degrees of freedom
AIC: 783.74

Number of Fisher Scoring iterations: 2


In [435]:
vif(model_dist4)

In [436]:
vif(model_dist35)

### Comparing rot-2 to rot-13

In [437]:
# Not looking at Llama and PaLM because their accuracy was 0.0 across the board

In [438]:
# Read in data
rot13and2enc_gpt4_df <- read.table(file = 'table_rot13and2enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and2enc_gpt35_df <- read.table(file = 'table_rot13and2enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)

rot13and2dec_gpt4_df <- read.table(file = 'table_rot13and2dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and2dec_gpt35_df <- read.table(file = 'table_rot13and2dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)

In [439]:
# Z-score data
scaled_rot13and2enc_gpt4_df <- scale_taskpair_df(rot13and2enc_gpt4_df)
scaled_rot13and2enc_gpt35_df <- scale_taskpair_df(rot13and2enc_gpt35_df)

scaled_rot13and2dec_gpt4_df <- scale_taskpair_df(rot13and2dec_gpt4_df)
scaled_rot13and2dec_gpt35_df <- scale_taskpair_df(rot13and2dec_gpt35_df)


In [440]:
model_taskenc4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2enc_gpt4_df, family=binomial)


In [441]:
model_taskenc35 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2enc_gpt35_df, family=binomial)

In [442]:
model_taskdec4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2dec_gpt4_df, family=binomial)

In [443]:
model_taskdec35 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2dec_gpt35_df, family=binomial)

In [444]:
summary(model_taskenc4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2enc_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.27805  -0.22008  -0.07750  -0.01265   2.79444  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -2.874195   0.601776  -4.776 1.79e-06 ***
taskrot2enc_highprob -4.321099   1.546874  -2.793  0.00522 ** 
input_nchars         -0.005755   1.059544  -0.005  0.99567    
input_ntokens         2.000505   0.883266   2.265  0.02352 *  
output_ntokens       -1.134385   1.281776  -0.885  0.37615    
input_logprob         0.889370   0.735783   1.209  0.22676    
output_logprob        2.584170   1.456150   1.775  0.07595 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 134.373  on 199  degrees

In [445]:
summary(model_taskenc35)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2enc_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.43256  -0.15092  -0.08247  -0.03489   2.66220  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -5.2207     1.4545  -3.589 0.000332 ***
taskrot2enc_highprob  -1.5693     1.4745  -1.064 0.287186    
input_nchars          -0.5423     1.1370  -0.477 0.633374    
input_ntokens          0.2442     1.0284   0.237 0.812345    
output_ntokens        -0.3968     1.1269  -0.352 0.724741    
input_logprob          0.8289     1.1217   0.739 0.459945    
output_logprob         0.5643     1.1385   0.496 0.620125    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 22.401  on 199  degrees of free

In [446]:
summary(model_taskdec4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2dec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4359  -0.4123  -0.2432   0.7928   2.6442  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.1211     0.2446  -0.495   0.6205    
taskrot2dec_highprob  -3.3529     0.6169  -5.435 5.49e-08 ***
input_nchars           0.3561     0.8967   0.397   0.6913    
input_ntokens         -0.7574     0.9265  -0.818   0.4136    
output_ntokens         0.9942     0.5949   1.671   0.0947 .  
input_logprob          0.1798     0.8221   0.219   0.8268    
output_logprob         1.0907     0.5100   2.139   0.0325 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 233.30  on 199  degrees of freedom
Residua

In [447]:
summary(model_taskdec35)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2dec_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28128  -0.30361  -0.09751  -0.02439   2.29539  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -2.3690     0.4916  -4.819 1.44e-06 ***
taskrot2dec_highprob  -4.3845     1.5528  -2.824  0.00475 ** 
input_nchars          -0.1466     0.9843  -0.149  0.88156    
input_ntokens         -0.8970     1.1125  -0.806  0.42011    
output_ntokens         2.1699     0.8523   2.546  0.01090 *  
input_logprob          0.7821     1.0182   0.768  0.44244    
output_logprob         2.3148     0.8666   2.671  0.00756 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 134.373  on 199  degrees of fre

In [448]:
vif(model_taskenc4)

In [449]:
vif(model_taskenc35)

In [450]:
vif(model_taskdec4)

In [451]:
vif(model_taskdec35)

### Input and output logprob for different prompt styles

In [452]:
# Read in data
rot13encbasic_gpt4_df <- read.table(file = 'table_rot13enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13decbasic_gpt4_df <- read.table(file = 'table_rot13dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)

rot13encstep_gpt4_df <- read.table(file = 'table_rot13encstep_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13decstep_gpt4_df <- read.table(file = 'table_rot13decstep_gpt-4-0613.tsv', sep = '\t', header = TRUE)

rot13enccot_gpt4_df <- read.table(file = 'table_rot13enccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13deccot_gpt4_df <- read.table(file = 'table_rot13deccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)


In [453]:
# Z-score data
scaled_rot13encbasic_gpt4_df <- scale_df(rot13encbasic_gpt4_df)
scaled_rot13decbasic_gpt4_df <- scale_df(rot13decbasic_gpt4_df)

scaled_rot13encstep_gpt4_df <- scale_df(rot13encstep_gpt4_df)
scaled_rot13decstep_gpt4_df <- scale_df(rot13decstep_gpt4_df)

scaled_rot13enccot_gpt4_df <- scale_df(rot13enccot_gpt4_df)
scaled_rot13deccot_gpt4_df <- scale_df(rot13deccot_gpt4_df)


In [454]:
rot13encbasic_gpt4_model <- correct_vs_length_and_prob(scaled_rot13encbasic_gpt4_df, include_output_chars=FALSE)
rot13decbasic_gpt4_model <- correct_vs_length_and_prob(scaled_rot13decbasic_gpt4_df, include_output_chars=FALSE)

rot13encstep_gpt4_model <- correct_vs_length_and_prob(scaled_rot13encstep_gpt4_df, include_output_chars=FALSE)
rot13decstep_gpt4_model <- correct_vs_length_and_prob(scaled_rot13decstep_gpt4_df, include_output_chars=FALSE)

rot13enccot_gpt4_model <- correct_vs_length_and_prob(scaled_rot13enccot_gpt4_df, include_output_chars=FALSE)
rot13deccot_gpt4_model <- correct_vs_length_and_prob(scaled_rot13deccot_gpt4_df, include_output_chars=FALSE)



In [455]:
summary(rot13encbasic_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.99812  -0.44168  -0.14960  -0.01557   2.95372  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.1263     0.5728  -7.204 5.84e-13 ***
input_logprob    1.2287     0.4840   2.539  0.01112 *  
output_logprob   7.8371     3.2790   2.390  0.01684 *  
input_ntokens    2.7704     0.9766   2.837  0.00456 ** 
output_ntokens  -0.9145     2.5316  -0.361  0.71793    
input_nchars     3.5012     3.3936   1.032  0.30221    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 242.98  on 299  degrees of freedom
Residual deviance: 155.44  on 294  degrees of freedom
AIC: 167.44

Number of Fisher Scoring iterations: 7


In [456]:
summary(rot13decbasic_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5419  -0.8313  -0.4035   0.9128   2.6598  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.4549     0.2014  -7.224 5.04e-13 ***
input_logprob    0.8315     1.7469   0.476    0.634    
output_logprob   1.9277     0.3396   5.676 1.38e-08 ***
input_ntokens   -2.5684     1.6014  -1.604    0.109    
output_ntokens   0.8680     0.6047   1.436    0.151    
input_nchars     2.8939     2.0645   1.402    0.161    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 359.48  on 299  degrees of freedom
Residual deviance: 286.22  on 294  degrees of freedom
AIC: 298.22

Number of Fisher Scoring iterations: 6


In [457]:
summary(rot13encstep_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.73379  -0.51483  -0.20383  -0.03238   2.75493  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.5963     0.4842  -7.427 1.11e-13 ***
input_logprob    0.8294     0.4394   1.888   0.0591 .  
output_logprob   6.8981     3.0990   2.226   0.0260 *  
input_ntokens    2.1459     0.9113   2.355   0.0185 *  
output_ntokens   0.4738     2.4126   0.196   0.8443    
input_nchars     2.1449     3.1837   0.674   0.5005    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 235.60  on 299  degrees of freedom
Residual deviance: 166.53  on 294  degrees of freedom
AIC: 178.53

Number of Fisher Scoring iterations: 7


In [458]:
summary(rot13decstep_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6857  -0.8601  -0.4367   0.9461   2.8363  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.9332     0.1641  -5.686 1.30e-08 ***
input_logprob    1.6491     1.6681   0.989   0.3229    
output_logprob   1.3987     0.2705   5.172 2.32e-07 ***
input_ntokens   -1.4723     1.5050  -0.978   0.3279    
output_ntokens   1.4870     0.5932   2.507   0.0122 *  
input_nchars     1.5574     1.9448   0.801   0.4232    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 392.05  on 299  degrees of freedom
Residual deviance: 314.33  on 294  degrees of freedom
AIC: 326.33

Number of Fisher Scoring iterations: 5


In [459]:
summary(rot13enccot_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28774  -0.20523  -0.04222  -0.00230   2.98322  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -7.5672     1.3560  -5.581  2.4e-08 ***
input_logprob    1.8554     0.9072   2.045   0.0408 *  
output_logprob   7.4052     5.2162   1.420   0.1557    
input_ntokens    4.2542     1.8051   2.357   0.0184 *  
output_ntokens  -6.2785     4.9176  -1.277   0.2017    
input_nchars     5.2889     5.8233   0.908   0.3638    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 141.63  on 299  degrees of freedom
Residual deviance:  77.40  on 294  degrees of freedom
AIC: 89.4

Number of Fisher Scoring iterations: 9


In [460]:
summary(rot13deccot_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9243  -1.0111   0.6140   0.8735   2.4145  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.16840    0.13331   1.263    0.207    
input_logprob   0.83460    1.42938   0.584    0.559    
output_logprob  1.29187    0.21888   5.902 3.59e-09 ***
input_ntokens  -2.08775    1.36528  -1.529    0.126    
output_ntokens  0.08215    0.50933   0.161    0.872    
input_nchars    2.97586    1.83157   1.625    0.104    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 412.47  on 299  degrees of freedom
Residual deviance: 339.83  on 294  degrees of freedom
AIC: 351.83

Number of Fisher Scoring iterations: 4


In [461]:
vif(rot13encbasic_gpt4_model)

In [462]:
vif(rot13decbasic_gpt4_model)

In [463]:
vif(rot13encstep_gpt4_model)

In [464]:
vif(rot13decstep_gpt4_model)

In [465]:
vif(rot13enccot_gpt4_model)

In [466]:
vif(rot13deccot_gpt4_model)

### Input logprob for different prompt styles with distance as output

In [467]:
# Read in data
rot13encbasicdist_gpt4_df <- read.table(file = 'table_rot13enc_gpt-4-0613_dist.tsv', sep = '\t', header = TRUE)
rot13encstepdist_gpt4_df <- read.table(file = 'table_rot13encstep_gpt-4-0613_dist.tsv', sep = '\t', header = TRUE)
rot13enccotdist_gpt4_df <- read.table(file = 'table_rot13enccot_gpt-4-0613_dist.tsv', sep = '\t', header = TRUE)



In [468]:
# Get rid of "correct"
rot13encbasicdist_gpt4_df <- rot13encbasicdist_gpt4_df[,-c(1,8)]
rot13encstepdist_gpt4_df <- rot13encstepdist_gpt4_df[,-c(1,8)]
rot13enccotdist_gpt4_df <- rot13enccotdist_gpt4_df[,-c(1,8)]


# Scale
scaled_rot13encbasicdist_gpt4_df <- data.frame(scale(rot13encbasicdist_gpt4_df))
scaled_rot13encstepdist_gpt4_df <- data.frame(scale(rot13encstepdist_gpt4_df))
scaled_rot13enccotdist_gpt4_df <- data.frame(scale(rot13enccotdist_gpt4_df))



In [469]:
modelbasic_dist4 <- glm(distance ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=rot13encbasicdist_gpt4_df)
modelstep_dist4 <- glm(distance ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=rot13encstepdist_gpt4_df)
modelcot_dist4 <- glm(distance ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=rot13enccotdist_gpt4_df)

In [470]:
summary(modelbasic_dist4)


Call:
glm(formula = distance ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, data = rot13encbasicdist_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-11.9233   -3.0973   -0.2851    2.6086   25.0964  

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -2.171012   0.977895  -2.220  0.02718 *  
input_logprob  -0.021256   0.007181  -2.960  0.00333 ** 
output_logprob -0.009503   0.022119  -0.430  0.66777    
input_ntokens  -1.236568   0.132434  -9.337  < 2e-16 ***
output_ntokens  0.427956   0.136915   3.126  0.00195 ** 
input_nchars    0.111892   0.083861   1.334  0.18315    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 25.18589)

    Null deviance: 23703.4  on 299  degrees of freedom
Residual deviance:  7404.7  on 294  degrees of freedom
AIC: 1827.2

Number of Fisher Scoring iterations: 2


In [471]:
summary(modelstep_dist4)


Call:
glm(formula = distance ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, data = rot13encstepdist_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-12.8705   -2.8071   -0.2109    2.5441   19.1410  

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.594581   0.964343  -1.654   0.0993 .  
input_logprob  -0.018180   0.007081  -2.567   0.0107 *  
output_logprob -0.002303   0.021813  -0.106   0.9160    
input_ntokens  -1.147949   0.130598  -8.790   <2e-16 ***
output_ntokens  0.312296   0.135018   2.313   0.0214 *  
input_nchars    0.164673   0.082699   1.991   0.0474 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 24.4927)

    Null deviance: 21311.9  on 299  degrees of freedom
Residual deviance:  7200.9  on 294  degrees of freedom
AIC: 1818.8

Number of Fisher Scoring iterations: 2


In [472]:
summary(modelcot_dist4)


Call:
glm(formula = distance ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, data = rot13enccotdist_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-14.799   -4.138   -1.006    3.792   21.962  

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -2.06029    1.24189  -1.659   0.0982 .  
input_logprob  -0.01683    0.00912  -1.846   0.0659 .  
output_logprob -0.04055    0.02809  -1.444   0.1499    
input_ntokens  -1.11568    0.16819  -6.634 1.57e-10 ***
output_ntokens  0.40731    0.17388   2.343   0.0198 *  
input_nchars    0.04840    0.10650   0.454   0.6498    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 40.62025)

    Null deviance: 34779  on 299  degrees of freedom
Residual deviance: 11942  on 294  degrees of freedom
AIC: 1970.6

Number of Fisher Scoring iterations: 2


In [473]:
vif(modelbasic_dist4)

In [474]:
vif(modelstep_dist4)

In [475]:
vif(modelcot_dist4)

### Comparing rot-13 and rot-2 for different prompt styles

In [476]:
# Read in data
rot13and2encbasic_gpt4_df <- read.table(file = 'table_rot13and2enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and2encstep_gpt4_df <- read.table(file = 'table_rot13and2encstep_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and2enccot_gpt4_df <- read.table(file = 'table_rot13and2enccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)

rot13and2decbasic_gpt4_df <- read.table(file = 'table_rot13and2dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and2decstep_gpt4_df <- read.table(file = 'table_rot13and2decstep_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and2deccot_gpt4_df <- read.table(file = 'table_rot13and2deccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)


In [477]:
# Z-score data
scaled_rot13and2encbasic_gpt4_df <- scale_taskpair_df(rot13and2encbasic_gpt4_df)
scaled_rot13and2encstep_gpt4_df <- scale_taskpair_df(rot13and2encstep_gpt4_df)
scaled_rot13and2enccot_gpt4_df <- scale_taskpair_df(rot13and2enccot_gpt4_df)


scaled_rot13and2decbasic_gpt4_df <- scale_taskpair_df(rot13and2decbasic_gpt4_df)
scaled_rot13and2decstep_gpt4_df <- scale_taskpair_df(rot13and2decstep_gpt4_df)
scaled_rot13and2deccot_gpt4_df <- scale_taskpair_df(rot13and2deccot_gpt4_df)



In [478]:
model_taskencbasic4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2encbasic_gpt4_df, family=binomial)
model_taskencstep4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2encstep_gpt4_df, family=binomial)
model_taskenccot4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2enccot_gpt4_df, family=binomial)

model_taskdecbasic4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2decbasic_gpt4_df, family=binomial)
model_taskdecstep4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2decstep_gpt4_df, family=binomial)
model_taskdeccot4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and2deccot_gpt4_df, family=binomial)

In [479]:
summary(model_taskencbasic4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2encbasic_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.27805  -0.22008  -0.07750  -0.01265   2.79444  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -2.874195   0.601776  -4.776 1.79e-06 ***
taskrot2enc_highprob -4.321099   1.546874  -2.793  0.00522 ** 
input_nchars         -0.005755   1.059544  -0.005  0.99567    
input_ntokens         2.000505   0.883266   2.265  0.02352 *  
output_ntokens       -1.134385   1.281776  -0.885  0.37615    
input_logprob         0.889370   0.735783   1.209  0.22676    
output_logprob        2.584170   1.456150   1.775  0.07595 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 134.373  on 199  de

In [480]:
summary(model_taskencstep4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2encstep_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.17351  -0.27674  -0.09821  -0.02221   2.59970  

Coefficients:
                         Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -2.46595    0.51133  -4.823 1.42e-06 ***
taskrot2encstep_highprob -4.20757    1.53336  -2.744  0.00607 ** 
input_nchars             -0.02859    1.02087  -0.028  0.97766    
input_ntokens             1.36581    0.80535   1.696  0.08990 .  
output_ntokens           -0.59194    1.08706  -0.545  0.58607    
input_logprob             0.79898    0.70636   1.131  0.25800    
output_logprob            1.99277    1.27315   1.565  0.11753    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null devian

In [481]:
summary(model_taskenccot4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2enccot_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.09833  -0.21603  -0.05857  -0.00944   1.86068  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)             -4.62384    1.00878  -4.584 4.57e-06 ***
taskrot2enccot_highprob -3.45087    1.48371  -2.326   0.0200 *  
input_nchars            -0.05547    1.11507  -0.050   0.9603    
input_ntokens            2.04535    1.10870   1.845   0.0651 .  
output_ntokens          -0.69142    1.23346  -0.561   0.5751    
input_logprob            1.22494    0.97408   1.258   0.2086    
output_logprob           3.60140    1.80598   1.994   0.0461 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 85.19

In [482]:
summary(model_taskdecbasic4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2decbasic_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4359  -0.4123  -0.2432   0.7928   2.6442  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.1211     0.2446  -0.495   0.6205    
taskrot2dec_highprob  -3.3529     0.6169  -5.435 5.49e-08 ***
input_nchars           0.3561     0.8967   0.397   0.6913    
input_ntokens         -0.7574     0.9265  -0.818   0.4136    
output_ntokens         0.9942     0.5949   1.671   0.0947 .  
input_logprob          0.1798     0.8221   0.219   0.8268    
output_logprob         1.0907     0.5100   2.139   0.0325 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 233.30  on 199  degrees of freedom
Re

In [483]:
summary(model_taskdecstep4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2decstep_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6640  -0.8047  -0.3517   0.7893   2.4030  

Coefficients:
                         Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -0.05560    0.25395  -0.219   0.8267    
taskrot2decstep_highprob -1.68691    0.39733  -4.246 2.18e-05 ***
input_nchars              0.04472    0.85979   0.052   0.9585    
input_ntokens            -0.64643    0.85972  -0.752   0.4521    
output_ntokens            0.39650    0.52454   0.756   0.4497    
input_logprob             0.11562    0.78154   0.148   0.8824    
output_logprob            0.95721    0.46558   2.056   0.0398 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 261.37

In [484]:
summary(model_taskdeccot4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2deccot_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1577  -0.8867   0.5736   0.7239   1.8240  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              1.41000    0.28275   4.987 6.14e-07 ***
taskrot2deccot_highprob -0.49165    0.40436  -1.216  0.22404    
input_nchars             0.62867    0.89690   0.701  0.48334    
input_ntokens            0.58590    0.77800   0.753  0.45140    
output_ntokens           0.05053    0.48148   0.105  0.91641    
input_logprob            0.84259    0.79638   1.058  0.29004    
output_logprob           1.10417    0.41744   2.645  0.00817 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 229.22  on 199 

In [485]:
vif(model_taskencbasic4)

In [486]:
vif(model_taskencstep4)

In [487]:
vif(model_taskenccot4)

In [488]:
vif(model_taskdecbasic4)

In [489]:
vif(model_taskdecstep4)

In [490]:
vif(model_taskdeccot4)

In [491]:
# Comparing rot-13 and rot-12
rot13and12deccot_gpt4_df <- read.table(file = 'table_rot13and12deccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)

scaled_rot13and12deccot_gpt4_df <- scale_taskpair_df(rot13and12deccot_gpt4_df)

model_rot13and12deccot_gpt4_df <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12deccot_gpt4_df, family=binomial)

summary(model_rot13and12deccot_gpt4_df)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12deccot_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.15101  -0.08213  -0.04706   0.53706   1.39744  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.39453    0.27887   5.001 5.72e-07 ***
taskshiftcot_12 -7.60737    1.91115  -3.981 6.88e-05 ***
input_nchars     0.46638    0.92162   0.506    0.613    
input_ntokens    0.08506    0.88289   0.096    0.923    
output_ntokens  -0.55992    0.60273  -0.929    0.353    
input_logprob   -0.03679    0.84415  -0.044    0.965    
output_logprob   0.73405    0.53124   1.382    0.167    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 267.499  on 199  degrees of freedom
Residual deviance:  94.683  on 1

In [492]:
vif(model_rot13and12deccot_gpt4_df)

### Comparing prompt techniques

In [493]:
# Read in data
rot13encprompts_gpt4_df <- read.table(file = 'table_rot13enc_prompt_comparison.tsv', sep = '\t', header = TRUE)
rot13decprompts_gpt4_df <- read.table(file = 'table_rot13dec_prompt_comparison.tsv', sep = '\t', header = TRUE)

In [494]:
head(rot13encprompts_gpt4_df)

Unnamed: 0_level_0,index,prompt,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,correct
Unnamed: 0_level_1,<int>,<chr>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>
1,0,,48,11,-27.93722,48,22,-140.6093,1
2,1,,52,11,-28.95337,52,26,-163.6525,0
3,2,,52,11,-26.48786,52,25,-171.8435,1
4,3,,65,15,-37.63794,65,31,-206.2903,1
5,4,,37,10,-24.1028,37,18,-123.9919,1
6,5,,46,11,-31.4136,46,21,-147.9776,1


In [495]:
scale_prompt_df <- function(df) {
    new_df <- data.frame(scale(df[3:8]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    new_df$prompt <- df$prompt
    
    return(new_df)
}

scaled_rot13encprompts_gpt4_df <- scale_prompt_df(rot13encprompts_gpt4_df)
scaled_rot13decprompts_gpt4_df <- scale_prompt_df(rot13decprompts_gpt4_df)


In [496]:
head(scaled_rot13encprompts_gpt4_df)

Unnamed: 0_level_0,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,index,correct,prompt
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<int>,<chr>
1,-1.0989416,-1.066837,1.2245026,-1.0989416,-1.1392848,1.226839,0,1,
2,-1.0122059,-1.066837,1.1836319,-1.0122059,-0.9522869,1.0522985,1,0,
3,-1.0122059,-1.066837,1.2827981,-1.0122059,-0.9990364,0.9902564,2,1,
4,-0.7303148,-0.578582,0.8343259,-0.7303148,-0.7185395,0.7293406,3,1,
5,-1.3374649,-1.188901,1.3787283,-1.3374649,-1.3262827,1.3527068,4,1,
6,-1.1423095,-1.066837,1.0846777,-1.1423095,-1.1860343,1.1710282,5,1,


In [497]:
scaled_rot13encprompts_gpt4_df$prompt <- factor(scaled_rot13encprompts_gpt4_df$prompt)
scaled_rot13decprompts_gpt4_df$prompt <- factor(scaled_rot13decprompts_gpt4_df$prompt)

contrasts(scaled_rot13encprompts_gpt4_df$prompt) <- contr.sum(3)
contrasts(scaled_rot13decprompts_gpt4_df$prompt) <- contr.sum(3)


In [498]:
rot13encprompts_gpt4_model <- glmer(correct ~ prompt + (1|index), 
               data=scaled_rot13encprompts_gpt4_df, family=binomial)
rot13decprompts_gpt4_model <- glmer(correct ~ prompt + (1|index), 
               data=scaled_rot13decprompts_gpt4_df, family=binomial)

In [499]:
rot13encprompts_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=scaled_rot13encprompts_gpt4_df, family=binomial)
rot13decprompts_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=scaled_rot13decprompts_gpt4_df, family=binomial)

In [500]:
anova(rot13encprompts_gpt4_model,rot13encprompts_gpt4_null_model,prompt="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rot13encprompts_gpt4_null_model,2,168.669,176.0766,-82.3345,164.669,,,
rot13encprompts_gpt4_model,4,146.33,161.1451,-69.16499,138.33,26.33901,2.0,1.907901e-06


In [501]:
anova(rot13decprompts_gpt4_model,rot13decprompts_gpt4_null_model,prompt="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rot13decprompts_gpt4_null_model,2,360.7307,368.1382,-178.3653,356.7307,,,
rot13decprompts_gpt4_model,4,325.0535,339.8686,-158.5267,317.0535,39.67717,2.0,2.42221e-09


In [502]:
gpt4enc_rot13_multcomp <- glht(rot13encprompts_gpt4_model, linfct=mcp(prompt="Tukey"))
gpt4dec_rot13_multcomp <- glht(rot13decprompts_gpt4_model, linfct=mcp(prompt="Tukey"))


In [503]:
summary(gpt4enc_rot13_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ prompt + (1 | index), data = scaled_rot13encprompts_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                Estimate Std. Error z value Pr(>|z|)   
cot -  == 0      -6.3926     1.9508  -3.277  0.00253 **
step -  == 0     -0.4289     0.9364  -0.458  0.88606   
step - cot == 0   5.9637     1.8963   3.145  0.00413 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- single-step method)


In [504]:
summary(gpt4dec_rot13_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ prompt + (1 | index), data = scaled_rot13decprompts_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                Estimate Std. Error z value Pr(>|z|)    
cot -  == 0       2.8054     0.5948   4.717   <1e-04 ***
step -  == 0      0.1839     0.4291   0.429    0.902    
step - cot == 0  -2.6214     0.5818  -4.506   <1e-04 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- single-step method)


# Reversal

In [None]:
# No Llama for encoding because it got 0% across the board

In [100]:
# Read in data
revenc_gpt4_df <- read.table(file = 'table_revenc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
revenc_gpt35_df <- read.table(file = 'table_revenc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
revenc_palm_df <- read.table(file = 'table_revenc_text-bison-001.tsv', sep = '\t', header = TRUE)

revdec_gpt4_df <- read.table(file = 'table_revdec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
revdec_gpt35_df <- read.table(file = 'table_revdec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
revdec_llama_df <- read.table(file = 'table_revdec_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
revdec_palm_df <- read.table(file = 'table_revdec_text-bison-001.tsv', sep = '\t', header = TRUE)

In [101]:
# Z-score data
scaled_revenc_gpt4_df <- scale_df(revenc_gpt4_df)
scaled_revenc_gpt35_df <- scale_df(revenc_gpt35_df)
scaled_revenc_palm_df <- scale_df(revenc_palm_df)

scaled_revdec_gpt4_df <- scale_df(revdec_gpt4_df)
scaled_revdec_gpt35_df <- scale_df(revdec_gpt35_df)
scaled_revdec_llama_df <- scale_df(revdec_llama_df)
scaled_revdec_palm_df <- scale_df(revdec_palm_df)

In [102]:

revenc_gpt4_model <- correct_vs_length_and_prob(scaled_revenc_gpt4_df, include_output_chars=FALSE)
revenc_gpt35_model <- correct_vs_length_and_prob(scaled_revenc_gpt35_df, include_output_chars=FALSE)
revenc_palm_model <- correct_vs_length_and_prob(scaled_revenc_palm_df, include_output_chars=FALSE, include_output_tokens=FALSE)

revdec_gpt4_model <- correct_vs_length_and_prob(scaled_revdec_gpt4_df, include_output_chars=FALSE)
revdec_gpt35_model <- correct_vs_length_and_prob(scaled_revdec_gpt35_df, include_output_chars=FALSE)
revdec_llama_model <- correct_vs_length_and_prob(scaled_revdec_llama_df, include_output_chars=FALSE, include_output_tokens=FALSE)
revdec_palm_model <- correct_vs_length_and_prob(scaled_revdec_palm_df, include_output_chars=FALSE, include_output_tokens=FALSE)


In [103]:
summary(revenc_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4668   0.3250   0.4665   0.5949   1.7972  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.71991    0.17799   9.663   <2e-16 ***
input_logprob  -0.03883    0.20693  -0.188   0.8511    
output_logprob -0.30283    0.77852  -0.389   0.6973    
input_ntokens   3.06200    1.90858   1.604   0.1086    
output_ntokens -3.65279    2.03329  -1.796   0.0724 .  
input_nchars   -0.67044    0.65079  -1.030   0.3029    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 288.81  on 299  degrees of freedom
Residual deviance: 245.57  on 294  degrees of freedom
AIC: 257.57

Number of Fisher Scoring iterations: 5


In [104]:
summary(revenc_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5268  -0.9842  -0.4943   1.0049   2.1479  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.64275    0.14184  -4.532 5.85e-06 ***
input_logprob   0.27657    0.22757   1.215    0.224    
output_logprob -0.11149    0.86103  -0.129    0.897    
input_ntokens  -1.68717    1.72867  -0.976    0.329    
output_ntokens  0.01594    1.73855   0.009    0.993    
input_nchars    0.64733    0.68799   0.941    0.347    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 400.34  on 299  degrees of freedom
Residual deviance: 341.51  on 294  degrees of freedom
AIC: 353.51

Number of Fisher Scoring iterations: 5


In [105]:
summary(revenc_palm_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.30133  -0.13232  -0.07462  -0.03341   3.05123  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -6.2851     1.6857  -3.728 0.000193 ***
input_logprob    1.2169     2.0228   0.602 0.547454    
output_logprob   3.5007     4.7387   0.739 0.460061    
input_ntokens   -0.2146     3.1247  -0.069 0.945246    
input_nchars     2.8534     3.5914   0.795 0.426899    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 24.029  on 299  degrees of freedom
Residual deviance: 21.065  on 295  degrees of freedom
AIC: 31.065

Number of Fisher Scoring iterations: 9


In [106]:
summary(revdec_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8716   0.1950   0.3063   0.4420   1.9434  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.1370     0.2382   8.972  < 2e-16 ***
input_logprob   -2.9227     0.9709  -3.010  0.00261 ** 
output_logprob   2.2588     0.3278   6.891 5.55e-12 ***
input_ntokens   -0.1544     2.3128  -0.067  0.94677    
output_ntokens  -0.9864     2.2231  -0.444  0.65727    
input_nchars    -0.4750     0.7494  -0.634  0.52616    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 282.84  on 299  degrees of freedom
Residual deviance: 193.97  on 294  degrees of freedom
AIC: 205.97

Number of Fisher Scoring iterations: 6


In [107]:
summary(revdec_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9183  -0.9433   0.5918   0.8400   2.5700  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.0747     0.1386   0.539  0.58981    
input_logprob   -2.2254     0.8629  -2.579  0.00991 ** 
output_logprob   1.3848     0.2447   5.659 1.52e-08 ***
input_ntokens   -3.0624     1.8011  -1.700  0.08908 .  
output_ntokens   1.2449     1.7323   0.719  0.47235    
input_nchars    -0.5018     0.7103  -0.706  0.47989    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 413.63  on 299  degrees of freedom
Residual deviance: 328.84  on 294  degrees of freedom
AIC: 340.84

Number of Fisher Scoring iterations: 5


In [108]:
summary(revdec_llama_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.07524  -0.31169  -0.10751  -0.02153   2.61657  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -5.4775     0.9790  -5.595 2.21e-08 ***
input_logprob    0.4676     2.7739   0.169  0.86614    
output_logprob   3.2479     1.1626   2.794  0.00521 ** 
input_ntokens   -1.1908     1.7790  -0.669  0.50326    
input_nchars     0.8046     2.0456   0.393  0.69407    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 130.617  on 299  degrees of freedom
Residual deviance:  92.752  on 295  degrees of freedom
AIC: 102.75

Number of Fisher Scoring iterations: 8


In [109]:
summary(revdec_palm_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1592  -0.5847  -0.3060  -0.0740   2.4680  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.9932     0.3941  -7.596 3.06e-14 ***
input_logprob   -1.5138     1.5093  -1.003    0.316    
output_logprob   2.5183     0.6152   4.094 4.25e-05 ***
input_ntokens   -1.0319     0.9002  -1.146    0.252    
input_nchars    -0.3107     1.1772  -0.264    0.792    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 231.83  on 299  degrees of freedom
Residual deviance: 187.11  on 295  degrees of freedom
AIC: 197.11

Number of Fisher Scoring iterations: 7


In [110]:
vif(revenc_gpt4_model)

In [111]:
vif(revenc_gpt35_model)

In [112]:
vif(revenc_palm_model)

In [113]:
vif(revdec_gpt4_model)

In [114]:
vif(revdec_gpt35_model)

In [115]:
vif(revdec_llama_model)

In [116]:
vif(revdec_palm_model)

# Swap

In [117]:
# Read in data
swap_next_base_gpt4_df <- read.table(file = 'table_swap_next_base_gpt-4-0613.tsv', sep = '\t', header = TRUE)
swap_next_base_gpt35_df <- read.table(file = 'table_swap_next_base_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
swap_next_base_llama_df <- read.table(file = 'table_swap_next_base_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
swap_next_base_palm_df <- read.table(file = 'table_swap_next_base_text-bison-001.tsv', sep = '\t', header = TRUE)



In [118]:
# Z-score data
scaled_swap_next_base_gpt4_df <- scale_df(swap_next_base_gpt4_df)
scaled_swap_next_base_gpt35_df <- scale_df(swap_next_base_gpt35_df)
scaled_swap_next_base_llama_df <- scale_df(swap_next_base_llama_df)
scaled_swap_next_base_palm_df <- scale_df(swap_next_base_palm_df)



In [119]:
swap_next_base_gpt4_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gpt4_df, family=binomial)
swap_next_base_gpt35_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gpt35_df, family=binomial)
swap_next_base_llama_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_llama_df, family=binomial)
swap_next_base_palm_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_palm_df, family=binomial)


In [120]:
vif(swap_next_base_gpt4_model)

In [121]:
vif(swap_next_base_gpt35_model)

In [122]:
vif(swap_next_base_llama_model)

In [123]:
vif(swap_next_base_palm_model)

### Rerunning with just output logprob

In [124]:
swap_next_base_gpt4_model <- glm(correct ~ output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gpt4_df, family=binomial)
swap_next_base_gpt35_model <- glm(correct ~ output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gpt35_df, family=binomial)
swap_next_base_llama_model <- glm(correct ~ output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_llama_df, family=binomial)
swap_next_base_palm_model <- glm(correct ~ output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_palm_df, family=binomial)


In [125]:
summary(swap_next_base_gpt4_model)


Call:
glm(formula = correct ~ output_logprob + input_ntokens + input_nchars, 
    family = binomial, data = scaled_swap_next_base_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.23047  -0.54883  -0.07368   0.65085   2.53538  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.6818     0.1924  -3.543 0.000396 ***
output_logprob   3.5219     0.4123   8.541  < 2e-16 ***
input_ntokens    2.2323     0.7371   3.029 0.002457 ** 
input_nchars    -0.5648     0.6971  -0.810 0.417840    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 413.97  on 299  degrees of freedom
Residual deviance: 245.89  on 296  degrees of freedom
AIC: 253.89

Number of Fisher Scoring iterations: 6


In [126]:
summary(swap_next_base_gpt35_model)


Call:
glm(formula = correct ~ output_logprob + input_ntokens + input_nchars, 
    family = binomial, data = scaled_swap_next_base_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6911  -0.9031  -0.3101   0.9510   2.2064  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.8507     0.1628  -5.224 1.75e-07 ***
output_logprob   1.9801     0.2885   6.864 6.69e-12 ***
input_ntokens    0.2248     0.5931   0.379    0.705    
input_nchars     0.4359     0.5867   0.743    0.457    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 397.45  on 299  degrees of freedom
Residual deviance: 316.07  on 296  degrees of freedom
AIC: 324.07

Number of Fisher Scoring iterations: 5


In [127]:
summary(swap_next_base_llama_model)


Call:
glm(formula = correct ~ output_logprob + input_ntokens + input_nchars, 
    family = binomial, data = scaled_swap_next_base_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6454  -0.7201  -0.2520   0.7269   2.7394  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.0358     0.2744  -7.418 1.19e-13 ***
output_logprob   3.1366     0.4763   6.586 4.52e-11 ***
input_ntokens    1.7278     0.6991   2.471   0.0135 *  
input_nchars    -0.7706     0.6257  -1.232   0.2181    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 345.91  on 299  degrees of freedom
Residual deviance: 248.15  on 296  degrees of freedom
AIC: 256.15

Number of Fisher Scoring iterations: 6


In [128]:
summary(swap_next_base_palm_model)


Call:
glm(formula = correct ~ output_logprob + input_ntokens + input_nchars, 
    family = binomial, data = scaled_swap_next_base_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.5928  -0.1994  -0.1384  -0.0814   3.1921  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.7869     0.7764  -6.166 7.01e-10 ***
output_logprob   1.5992     1.0698   1.495   0.1350    
input_ntokens    2.7465     1.5691   1.750   0.0801 .  
input_nchars    -2.0134     1.6493  -1.221   0.2222    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 50.860  on 299  degrees of freedom
Residual deviance: 45.019  on 296  degrees of freedom
AIC: 53.019

Number of Fisher Scoring iterations: 8


In [129]:
vif(swap_next_base_gpt4_model)

In [130]:
vif(swap_next_base_gpt35_model)

In [131]:
vif(swap_next_base_llama_model)

In [132]:
vif(swap_next_base_palm_model)

# Pig Latin

In [133]:
# No encoding for Llama or PaLM because both get 0% accuracy across the board

In [134]:
# Read in data
pigenc_gpt4_df <- read.table(file = 'table_pig_ay_enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigenc_gpt35_df <- read.table(file = 'table_pig_ay_enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)

pigdec_gpt4_df <- read.table(file = 'table_pig_ay_dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigdec_gpt35_df <- read.table(file = 'table_pig_ay_dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigdec_llama_df <- read.table(file = 'table_pig_ay_dec_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
pigdec_palm_df <- read.table(file = 'table_pig_ay_dec_text-bison-001.tsv', sep = '\t', header = TRUE)

In [135]:
# Z-score data
scaled_pigenc_gpt4_df <- scale_df(pigenc_gpt4_df)
scaled_pigenc_gpt35_df <- scale_df(pigenc_gpt35_df)

scaled_pigdec_gpt4_df <- scale_df(pigdec_gpt4_df)
scaled_pigdec_gpt35_df <- scale_df(pigdec_gpt35_df)
scaled_pigdec_llama_df <- scale_df(pigdec_llama_df)
scaled_pigdec_palm_df <- scale_df(pigdec_palm_df)

In [136]:

pigenc_gpt4_model <- correct_vs_length_and_prob(scaled_pigenc_gpt4_df, include_output_chars=TRUE)
pigenc_gpt35_model <- correct_vs_length_and_prob(scaled_pigenc_gpt35_df, include_output_chars=TRUE)

pigdec_gpt4_model <- correct_vs_length_and_prob(scaled_pigdec_gpt4_df, include_output_chars=TRUE)
pigdec_gpt35_model <- correct_vs_length_and_prob(scaled_pigdec_gpt35_df, include_output_chars=TRUE)
pigdec_llama_model <- model <- bayesglm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars + output_nchars, 
               data=scaled_pigdec_llama_df, family=binomial)
pigdec_palm_model <- correct_vs_length_and_prob(scaled_pigdec_palm_df, include_output_chars=TRUE)

In [137]:
summary(pigenc_gpt4_model) 


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7494  -1.0129  -0.6483   1.1876   2.0373  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.52418    0.12731  -4.117 3.83e-05 ***
input_logprob  -0.02844    0.18989  -0.150  0.88097    
output_logprob  1.93966    0.70569   2.749  0.00598 ** 
input_ntokens   1.89304    0.79457   2.382  0.01720 *  
output_ntokens  1.90174    0.99917   1.903  0.05700 .  
input_nchars    5.93802    2.95840   2.007  0.04473 *  
output_nchars  -8.30880    3.67071  -2.264  0.02360 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 400.34  on 299  degrees of freedom
Residual deviance: 368.70  on 293  degrees of freedom
AIC: 382.7

Number of 

In [138]:
summary(pigenc_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3523  -0.8287  -0.6536   1.1613   2.2372  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.16906    0.14519  -8.052 8.15e-16 ***
input_logprob   0.08184    0.21390   0.383   0.7020    
output_logprob  0.91205    0.74120   1.230   0.2185    
input_ntokens   1.72350    0.86877   1.984   0.0473 *  
output_ntokens -0.86339    1.09634  -0.788   0.4310    
input_nchars   -1.56653    3.29903  -0.475   0.6349    
output_nchars   1.17719    4.02761   0.292   0.7701    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 341.72  on 299  degrees of freedom
Residual deviance: 321.69  on 293  degrees of freedom
AIC: 335.69

Number of

In [139]:
summary(pigdec_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8736  -0.7337   0.4224   0.7328   2.1890  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.6525     0.1543   4.229 2.34e-05 ***
input_logprob    0.8155     0.7951   1.026   0.3050    
output_logprob   2.0507     0.2562   8.004 1.21e-15 ***
input_ntokens    0.9519     1.1533   0.825   0.4091    
output_ntokens  -1.5573     0.9316  -1.672   0.0946 .  
input_nchars     5.8314     4.3050   1.355   0.1756    
output_nchars   -2.8202     3.4841  -0.809   0.4183    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 401.25  on 299  degrees of freedom
Residual deviance: 292.16  on 293  degrees of freedom
AIC: 306.16

Number of

In [140]:
summary(pigdec_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6092  -0.6466  -0.1985   0.6533   3.3620  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.0072     0.2743  -7.317 2.54e-13 ***
input_logprob    1.4048     0.8884   1.581   0.1138    
output_logprob   3.1852     0.4590   6.940 3.93e-12 ***
input_ntokens    0.3708     1.2622   0.294   0.7689    
output_ntokens  -2.3121     1.0911  -2.119   0.0341 *  
input_nchars     4.7209     4.8613   0.971   0.3315    
output_nchars   -0.2726     3.9111  -0.070   0.9444    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 357.64  on 299  degrees of freedom
Residual deviance: 236.09  on 293  degrees of freedom
AIC: 250.09

Number of

In [141]:
summary(pigdec_llama_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_ntokens + output_ntokens + input_nchars + output_nchars, 
    family = binomial, data = scaled_pigdec_llama_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.31704  -0.16940  -0.12168  -0.06982   2.84774  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -5.1986     0.8839  -5.882 4.06e-09 ***
input_logprob   -0.2883     1.0002  -0.288    0.773    
output_logprob   1.2232     0.9715   1.259    0.208    
input_ntokens   -0.2496     1.0204  -0.245    0.807    
output_ntokens  -0.6614     1.0346  -0.639    0.523    
input_nchars     0.1261     1.0219   0.123    0.902    
output_nchars    0.2872     1.0172   0.282    0.778    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 33.601  on 299  degrees of freedom
Residual deviance: 28.537  on 293  degree

In [142]:
summary(pigdec_palm_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.21677  -0.35169  -0.14802  -0.03697   2.88598  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.6178     0.7242  -6.376 1.82e-10 ***
input_logprob    1.3150     1.4637   0.898   0.3690    
output_logprob   3.8554     0.9496   4.060 4.91e-05 ***
input_ntokens    4.3825     1.8366   2.386   0.0170 *  
output_ntokens   0.8430     1.3294   0.634   0.5260    
input_nchars   -16.3625     7.2277  -2.264   0.0236 *  
output_nchars   13.9849     5.9001   2.370   0.0178 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 157.31  on 299  degrees of freedom
Residual deviance: 110.47  on 293  degrees of freedom
AIC: 124.47


In [143]:
vif(pigenc_gpt4_model)

In [144]:
vif(pigenc_gpt35_model)

In [145]:
vif(pigdec_gpt4_model)

In [146]:
vif(pigdec_gpt35_model)

In [147]:
vif(pigdec_llama_model)

In [148]:
vif(pigdec_palm_model)

### Comparing Pig Latin and Boar Etruscan

In [149]:
# Omitting Llama encoding (which had 0% across the board) 
# and PaLM encoding (which has 0% across the board)

In [150]:
# Read in data
pigboarenc_gpt4_df <- read.table(file = 'table_pig_boar_enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigboarenc_gpt35_df <- read.table(file = 'table_pig_boar_enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)

pigboardec_gpt4_df <- read.table(file = 'table_pig_boar_dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigboardec_gpt35_df <- read.table(file = 'table_pig_boar_dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigboardec_llama_df <- read.table(file = 'table_pig_boar_dec_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
pigboardec_palm_df <- read.table(file = 'table_pig_boar_dec_text-bison-001.tsv', sep = '\t', header = TRUE)

In [151]:
# Z-score data
scaled_pigboarenc_gpt4_df <- scale_taskpair_df(pigboarenc_gpt4_df)
scaled_pigboarenc_gpt35_df <- scale_taskpair_df(pigboarenc_gpt35_df)

scaled_pigboardec_gpt4_df <- scale_taskpair_df(pigboardec_gpt4_df)
scaled_pigboardec_gpt35_df <- scale_taskpair_df(pigboardec_gpt35_df)
scaled_pigboardec_llama_df <- scale_taskpair_df(pigboardec_llama_df)
scaled_pigboardec_palm_df <- scale_taskpair_df(pigboardec_palm_df)

In [152]:
model_pigtaskenc4 <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboarenc_gpt4_df, family=binomial)


In [153]:
model_pigtaskenc35 <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboarenc_gpt35_df, family=binomial)

In [154]:
model_pigtaskdec4 <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_gpt4_df, family=binomial)

In [155]:
model_pigtaskdec35 <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_gpt35_df, family=binomial)

In [156]:
model_pigtaskdecllama <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_llama_df, family=binomial)

In [157]:
model_pigtaskdecpalm <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_palm_df, family=binomial)

In [158]:
summary(model_pigtaskenc4)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboarenc_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2647  -0.7784  -0.5503   1.0503   2.3496  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.56003    0.23023  -2.432 0.014998 *  
taskuv         -1.32615    0.38979  -3.402 0.000668 ***
input_nchars    0.22833    0.77500   0.295 0.768286    
output_nchars   0.17641    0.87201   0.202 0.839682    
input_ntokens  -0.06654    0.61653  -0.108 0.914051    
output_ntokens  0.16345    0.72926   0.224 0.822658    
input_logprob  -0.05815    0.31163  -0.187 0.851964    
output_logprob  1.02052    0.70345   1.451 0.146853    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 229.22  on 199  degr

In [159]:
summary(model_pigtaskenc35)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboarenc_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.37642  -0.50706  -0.09424  -0.03471   2.18253  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.2589     0.3004  -4.191 2.78e-05 ***
taskuv          -4.7283     1.5892  -2.975  0.00293 ** 
input_nchars    -0.1590     0.8572  -0.186  0.85283    
output_nchars   -0.3834     0.9585  -0.400  0.68912    
input_ntokens    1.6958     0.9195   1.844  0.06516 .  
output_ntokens  -0.9901     0.9850  -1.005  0.31484    
input_logprob    0.5392     0.4245   1.270  0.20403    
output_logprob   0.6935     0.8150   0.851  0.39483    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 169.08  o

In [160]:
summary(model_pigtaskdec4)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2592   0.2391   0.4923   0.6698   1.7133  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.89288    0.31533   6.003 1.94e-09 ***
taskuv         -0.91583    0.41160  -2.225   0.0261 *  
input_nchars    0.40501    1.04353   0.388   0.6979    
output_nchars   2.20902    1.05734   2.089   0.0367 *  
input_ntokens  -0.23040    0.73565  -0.313   0.7541    
output_ntokens -1.01859    0.68521  -1.487   0.1371    
input_logprob   0.07216    0.66647   0.108   0.9138    
output_logprob  1.49702    0.34215   4.375 1.21e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 218.10  on 199  degr

In [161]:
summary(model_pigtaskdec35)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9060  -0.8991  -0.3178   0.9207   2.4139  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.02765    0.24673   0.112   0.9108    
taskuv         -0.51736    0.36100  -1.433   0.1518    
input_nchars    0.77663    1.12490   0.690   0.4899    
output_nchars   2.27877    1.07207   2.126   0.0335 *  
input_ntokens  -0.27435    0.72280  -0.380   0.7043    
output_ntokens -0.99061    0.63655  -1.556   0.1197    
input_logprob   0.78394    0.68171   1.150   0.2502    
output_logprob  1.81746    0.37748   4.815 1.47e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 276.54  on 199  deg

In [162]:
mean(scaled_pigboardec_llama_df$correct)

In [163]:
summary(model_pigtaskdecllama)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3620  -0.2075  -0.1179  -0.0743   2.6069  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.88497    0.77534  -5.011 5.42e-07 ***
taskuv         -1.98884    1.44261  -1.379    0.168    
input_nchars    0.06181    1.01785   0.061    0.952    
output_nchars   0.20777    1.00700   0.206    0.837    
input_ntokens  -0.40238    1.04743  -0.384    0.701    
output_ntokens -0.71066    1.04661  -0.679    0.497    
input_logprob  -0.08822    0.98639  -0.089    0.929    
output_logprob  0.10475    0.77607   0.135    0.893    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 31.153  on 199  deg

In [164]:
summary(model_pigtaskdecpalm)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.0009  -0.5836  -0.3695  -0.2330   2.4156  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.46840    0.30339  -4.840  1.3e-06 ***
taskuv         -1.68945    0.57272  -2.950  0.00318 ** 
input_nchars    0.23628    0.91598   0.258  0.79644    
output_nchars   0.59211    0.84886   0.698  0.48547    
input_ntokens   0.51326    0.80900   0.634  0.52580    
output_ntokens -0.46717    0.67581  -0.691  0.48940    
input_logprob   0.04689    0.74676   0.063  0.94993    
output_logprob  1.18563    0.48785   2.430  0.01508 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 150.71  on 199  degr

In [165]:
vif(model_pigtaskenc4)

In [166]:
vif(model_pigtaskenc35)

In [167]:
vif(model_pigtaskdec4)

In [168]:
vif(model_pigtaskdec35)

In [169]:
vif(model_pigtaskdecpalm)

### Comparing Pig Latin variants

In [170]:
# Omitting encoding for Llama and PaLM, which are both 0%
# across the board 

In [171]:
# Read in data
pigprobenc_gpt4_df <- read.table(file = 'table_pig_prob_enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigprobenc_gpt35_df <- read.table(file = 'table_pig_prob_enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)

pigprobdec_gpt4_df <- read.table(file = 'table_pig_prob_dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigprobdec_gpt35_df <- read.table(file = 'table_pig_prob_dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigprobdec_llama_df <- read.table(file = 'table_pig_prob_dec_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
pigprobdec_palm_df <- read.table(file = 'table_pig_prob_dec_text-bison-001.tsv', sep = '\t', header = TRUE)

In [172]:
# Z-score data
scaled_pigprobenc_gpt4_df <- scale_taskpair_prob_df(pigprobenc_gpt4_df)
scaled_pigprobenc_gpt35_df <- scale_taskpair_prob_df(pigprobenc_gpt35_df)

scaled_pigprobdec_gpt4_df <- scale_taskpair_prob_df(pigprobdec_gpt4_df)
scaled_pigprobdec_gpt35_df <- scale_taskpair_prob_df(pigprobdec_gpt35_df)
scaled_pigprobdec_llama_df <- scale_taskpair_prob_df(pigprobdec_llama_df)
scaled_pigprobdec_palm_df <- scale_taskpair_prob_df(pigprobdec_palm_df)

In [173]:
model_pigprobenc4 <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobenc_gpt4_df, family=binomial)


In [174]:
model_pigprobenc35 <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobenc_gpt35_df, family=binomial)


In [175]:
model_pigprobdec4 <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobdec_gpt4_df, family=binomial)

In [176]:
model_pigprobdec35 <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobdec_gpt35_df, family=binomial)

In [177]:
model_pigprobdecllama <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobdec_llama_df, family=binomial)

In [178]:
model_pigprobdecpalm <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobdec_palm_df, family=binomial)

In [179]:
summary(model_pigprobenc4)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobenc_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6702  -0.7804  -0.4988   0.9214   2.5315  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.26069    0.12461 -10.117  < 2e-16 ***
task            0.74354    0.12438   5.978 2.26e-09 ***
input_nchars    5.28312    1.50249   3.516 0.000438 ***
output_nchars  -6.55400    1.90123  -3.447 0.000566 ***
input_ntokens   0.82896    0.60496   1.370 0.170602    
output_ntokens  1.82726    0.71174   2.567 0.010249 *  
input_logprob   0.01677    0.22750   0.074 0.941239    
output_logprob  1.90058    0.58417   3.253 0.001140 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 581.26  on 499  degrees o

In [180]:
summary(model_pigprobenc35)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobenc_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5192  -0.6000  -0.3587  -0.1589   2.6930  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.1511     0.1796 -11.979  < 2e-16 ***
task             0.7805     0.1537   5.079 3.79e-07 ***
input_nchars     4.3925     1.8793   2.337  0.01943 *  
output_nchars   -6.2121     2.3870  -2.602  0.00926 ** 
input_ntokens    2.1263     0.7775   2.735  0.00624 ** 
output_ntokens   1.7178     0.9240   1.859  0.06302 .  
input_logprob    0.4887     0.2990   1.634  0.10217    
output_logprob   2.4526     0.7408   3.311  0.00093 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 452.70  on 499  degrees 

In [181]:
summary(model_pigprobdec4)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7178   0.2276   0.4047   0.5621   2.2107  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.9096     0.1555  12.281  < 2e-16 ***
task             0.2296     0.1350   1.700  0.08911 .  
input_nchars    -3.5093     2.4045  -1.459  0.14443    
output_nchars    6.7374     2.0216   3.333  0.00086 ***
input_ntokens    0.1912     0.7882   0.243  0.80833    
output_ntokens   0.1227     0.7785   0.158  0.87477    
input_logprob    1.6197     0.6849   2.365  0.01803 *  
output_logprob   1.7782     0.2850   6.240 4.39e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 483.31  on 499  degrees o

In [182]:
summary(model_pigprobdec35)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.44809  -0.93313   0.04602   0.97348   2.74208  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.05707    0.10349  -0.551 0.581348    
task            0.11682    0.10485   1.114 0.265176    
input_nchars    1.12060    1.64945   0.679 0.496899    
output_nchars   2.39500    1.37888   1.737 0.082402 .  
input_ntokens   0.65655    0.59733   1.099 0.271708    
output_ntokens -2.08243    0.55620  -3.744 0.000181 ***
input_logprob   0.99006    0.50218   1.972 0.048665 *  
output_logprob  1.59434    0.24255   6.573 4.92e-11 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 693.15  on 499

In [183]:
summary(model_pigprobdecllama)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.6262  -0.2222  -0.1482  -0.0805   3.4275  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.7632     0.6089  -7.823 5.17e-15 ***
task             0.1329     0.2996   0.444   0.6572    
input_nchars     5.5197     6.3443   0.870   0.3843    
output_nchars   -5.8920     5.3659  -1.098   0.2722    
input_ntokens   -0.5693     2.2518  -0.253   0.8004    
output_ntokens  -3.1654     2.1365  -1.482   0.1385    
input_logprob   -3.5764     2.1390  -1.672   0.0945 .  
output_logprob   0.8872     0.9963   0.890   0.3732    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 105.724  on 499  degrees

In [184]:
summary(model_pigprobdecpalm)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4565  -0.6720  -0.4721  -0.2370   2.5287  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.75217    0.14586 -12.013  < 2e-16 ***
task           -0.03319    0.12532  -0.265  0.79111    
input_nchars   -4.02359    2.03429  -1.978  0.04794 *  
output_nchars   3.37585    1.68293   2.006  0.04486 *  
input_ntokens   2.49690    0.80635   3.097  0.00196 ** 
output_ntokens -1.31741    0.68136  -1.933  0.05317 .  
input_logprob  -0.57858    0.69279  -0.835  0.40363    
output_logprob  1.58445    0.31423   5.042  4.6e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 486.22  on 499  degrees o

In [185]:
vif(model_pigprobenc4)

In [186]:
vif(model_pigprobenc35)

In [187]:
vif(model_pigprobdec4)

In [188]:
vif(model_pigprobdec35)

In [189]:
vif(model_pigprobdecpalm)

# Acronyms

In [190]:
# Read in data
acronym_gpt4_outp_df <- read.table(file = 'table_acronym_varyoutp_gpt-4-0613.tsv', sep = '\t', header = TRUE)
acronym_gpt35_outp_df <- read.table(file = 'table_acronym_varyoutp_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
acronym_llama_outp_df <- read.table(file = 'table_acronym_varyoutp_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
acronym_palm_outp_df <- read.table(file = 'table_acronym_varyoutp_text-bison-001.tsv', sep = '\t', header = TRUE)



acronym_gpt4_inp_df <- read.table(file = 'table_acronym_varyinp_gpt-4-0613.tsv', sep = '\t', header = TRUE)
acronym_gpt35_inp_df <- read.table(file = 'table_acronym_varyinp_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
acronym_llama_inp_df <- read.table(file = 'table_acronym_varyinp_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
acronym_palm_inp_df <- read.table(file = 'table_acronym_varyinp_text-bison-001.tsv', sep = '\t', header = TRUE)



In [191]:
# Z-score data
scaled_acronym_gpt4_outp_df <- scale_df(acronym_gpt4_outp_df)
scaled_acronym_gpt35_outp_df <- scale_df(acronym_gpt35_outp_df)
scaled_acronym_llama_outp_df <- scale_df(acronym_llama_outp_df)
scaled_acronym_palm_outp_df <- scale_df(acronym_palm_outp_df)


scaled_acronym_gpt4_inp_df <- scale_df(acronym_gpt4_inp_df)
scaled_acronym_gpt35_inp_df <- scale_df(acronym_gpt35_inp_df)
scaled_acronym_llama_inp_df <- scale_df(acronym_llama_inp_df)
scaled_acronym_palm_inp_df <- scale_df(acronym_palm_inp_df)

In [192]:
acronym_gpt4_outp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_gpt4_outp_df, family=binomial)
acronym_gpt35_outp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_gpt35_outp_df, family=binomial)
acronym_llama_outp_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_llama_outp_df, family=binomial)
acronym_palm_outp_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_palm_outp_df, family=binomial)

acronym_gpt4_inp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_gpt4_inp_df, family=binomial)
acronym_gpt35_inp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_gpt35_inp_df, family=binomial)
acronym_llama_inp_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_llama_inp_df, family=binomial)
acronym_palm_inp_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_palm_inp_df, family=binomial)

In [193]:
summary(acronym_gpt4_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt4_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8290  -1.4607   0.7400   0.8019   1.0212  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.02684    0.03230  31.795  < 2e-16 ***
input_logprob   0.01891    0.03233   0.585    0.559    
output_logprob  0.20402    0.03182   6.412 1.44e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5788.3  on 4999  degrees of freedom
Residual deviance: 5746.4  on 4997  degrees of freedom
AIC: 5752.4

Number of Fisher Scoring iterations: 4


In [194]:
summary(acronym_gpt35_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt35_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1969  -0.8597  -0.7189   1.3686   2.1452  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.95746    0.03234 -29.605   <2e-16 ***
input_logprob   0.04300    0.03167   1.357    0.175    
output_logprob  0.39609    0.03396  11.664   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5970.6  on 4999  degrees of freedom
Residual deviance: 5821.2  on 4997  degrees of freedom
AIC: 5827.2

Number of Fisher Scoring iterations: 4


In [195]:
summary(acronym_llama_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens, family = binomial, data = scaled_acronym_llama_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.0928  -0.8173  -0.7321   1.4325   1.9610  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.03151    0.03247 -31.763  < 2e-16 ***
input_logprob   0.04547    0.03211   1.416 0.156721    
output_logprob  0.24590    0.03368   7.302 2.84e-13 ***
input_ntokens   0.02709    0.03217   0.842 0.399686    
output_ntokens  0.12574    0.03272   3.842 0.000122 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5792.4  on 4999  degrees of freedom
Residual deviance: 5725.5  on 4995  degrees of freedom
AIC: 5735.5

Number of Fisher Scoring iterations: 4


In [196]:
summary(acronym_palm_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens, family = binomial, data = scaled_acronym_palm_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.4393  -0.3364  -0.3071  -0.2782   2.7104  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.02200    0.06823 -44.294  < 2e-16 ***
input_logprob  -0.06063    0.07040  -0.861  0.38907    
output_logprob  0.22987    0.07021   3.274  0.00106 ** 
input_ntokens   0.03851    0.06909   0.557  0.57726    
output_ntokens -0.08832    0.06891  -1.282  0.19998    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1919.9  on 4999  degrees of freedom
Residual deviance: 1905.0  on 4995  degrees of freedom
AIC: 1915

Number of Fisher Scoring iterations: 6


In [197]:
summary(acronym_gpt4_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt4_inp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8250  -1.4850   0.7384   0.7858   0.9456  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.07649    0.03261  33.008  < 2e-16 ***
input_logprob   0.05493    0.03259   1.685   0.0919 .  
output_logprob  0.15358    0.03244   4.735 2.19e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5681.9  on 4999  degrees of freedom
Residual deviance: 5656.6  on 4997  degrees of freedom
AIC: 5662.6

Number of Fisher Scoring iterations: 4


In [198]:
summary(acronym_gpt35_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt35_inp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1457  -0.9082  -0.8348   1.4117   1.7552  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.72733    0.03035 -23.961  < 2e-16 ***
input_logprob   0.14452    0.03027   4.775 1.80e-06 ***
output_logprob  0.14814    0.03056   4.847 1.25e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6323.2  on 4999  degrees of freedom
Residual deviance: 6276.6  on 4997  degrees of freedom
AIC: 6282.6

Number of Fisher Scoring iterations: 4


In [199]:
summary(acronym_llama_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens, family = binomial, data = scaled_acronym_llama_inp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1888  -0.8390  -0.7396   1.4110   1.9519  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.95847    0.03198 -29.974  < 2e-16 ***
input_logprob   0.14519    0.03194   4.546 5.46e-06 ***
output_logprob  0.07204    0.03174   2.270   0.0232 *  
input_ntokens   0.00531    0.03206   0.166   0.8684    
output_ntokens  0.24748    0.03228   7.666 1.77e-14 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5937.1  on 4999  degrees of freedom
Residual deviance: 5854.9  on 4995  degrees of freedom
AIC: 5864.9

Number of Fisher Scoring iterations: 4


In [200]:
summary(acronym_palm_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens, family = binomial, data = scaled_acronym_palm_inp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.6329  -0.4220  -0.3761  -0.3257   2.6250  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.590322   0.056920 -45.508   <2e-16 ***
input_logprob  -0.055296   0.088775  -0.623    0.533    
output_logprob  0.341394   0.056922   5.998    2e-09 ***
input_ntokens   0.037945   0.090231   0.421    0.674    
output_ntokens  0.005228   0.055497   0.094    0.925    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2618.4  on 4999  degrees of freedom
Residual deviance: 2578.3  on 4995  degrees of freedom
AIC: 2588.3

Number of Fisher Scoring iterations: 5


In [201]:
vif(acronym_gpt4_outp_model)

In [202]:
vif(acronym_gpt35_outp_model)

In [203]:
vif(acronym_llama_outp_model)

In [204]:
vif(acronym_palm_outp_model)

In [205]:
vif(acronym_gpt4_inp_model)

In [206]:
vif(acronym_gpt35_inp_model)

In [207]:
vif(acronym_llama_inp_model)

In [208]:
vif(acronym_palm_inp_model)

### Vary task

In [397]:
# Read in data
acronym_gpt4_1and2_df <- read.table(file = 'table_acronym_varytask_gpt-4-0613.tsv', sep = '\t', header = TRUE)
acronym_gpt35_1and2_df <- read.table(file = 'table_acronym_varytask_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
acronym_llama_1and2_df <- read.table(file = 'table_acronym_varytask_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
acronym_palm_1and2_df <- read.table(file = 'table_acronym_varytask_text-bison-001.tsv', sep = '\t', header = TRUE)



In [398]:
# Z-score data
scaled_acronym_gpt4_1and2_df <- scale_taskpair_df(acronym_gpt4_1and2_df)
scaled_acronym_gpt35_1and2_df <- scale_taskpair_df(acronym_gpt35_1and2_df)
scaled_acronym_llama_1and2_df <- scale_taskpair_df(acronym_llama_1and2_df)
scaled_acronym_palm_1and2_df <- scale_taskpair_df(acronym_palm_1and2_df)

In [399]:
acronym_gpt4_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_acronym_gpt4_1and2_df, family=binomial)
acronym_gpt35_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_acronym_gpt35_1and2_df, family=binomial)
acronym_llama_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_llama_1and2_df, family=binomial)
acronym_palm_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_palm_1and2_df, family=binomial)

In [400]:
summary(acronym_gpt4_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_acronym_gpt4_1and2_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9392  -0.2452  -0.2053   0.6980   2.8814  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.18066    0.07532  15.674  < 2e-16 ***
taskacronym2   -4.82454    0.21268 -22.684  < 2e-16 ***
input_logprob  -0.06966    0.06591  -1.057 0.290499    
output_logprob  0.26550    0.07002   3.792 0.000149 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2682.0  on 1999  degrees of freedom
Residual deviance: 1323.3  on 1996  degrees of freedom
AIC: 1331.3

Number of Fisher Scoring iterations: 7


In [401]:
summary(acronym_gpt35_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_acronym_gpt35_1and2_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.20491  -0.93012  -0.02338  -0.02057   1.57853  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.44672    0.06506  -6.866 6.58e-12 ***
taskacronym2   -7.80307    1.83266  -4.258 2.06e-05 ***
input_logprob   0.04556    0.06047   0.753   0.4512    
output_logprob  0.15608    0.06549   2.383   0.0172 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1976.4  on 1999  degrees of freedom
Residual deviance: 1332.3  on 1996  degrees of freedom
AIC: 1340.3

Number of Fisher Scoring iterations: 16


In [402]:
summary(acronym_llama_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob + 
    input_ntokens + output_ntokens, family = binomial, data = scaled_acronym_llama_1and2_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.03420  -0.83530  -0.02423  -0.02193   1.67417  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.76478    0.06829 -11.199  < 2e-16 ***
taskacronym2   -7.45042    1.79839  -4.143 3.43e-05 ***
input_logprob   0.04959    0.06316   0.785   0.4324    
output_logprob  0.02623    0.06825   0.384   0.7008    
input_ntokens   0.03593    0.07050   0.510   0.6103    
output_ntokens  0.13057    0.06852   1.906   0.0567 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1752.0  on 1999  degrees of freedom
Residual deviance: 1246.6  on 1994  degrees of freedom
AIC: 1258.6

Number of Fisher Scoring iterations: 16


In [403]:
summary(acronym_palm_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob + 
    input_ntokens + output_ntokens, family = binomial, data = scaled_acronym_palm_1and2_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.64323  -0.33824  -0.03407  -0.02388   2.64943  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.77007    0.14050 -19.716  < 2e-16 ***
taskacronym2   -5.36029    1.59775  -3.355 0.000794 ***
input_logprob   0.02488    0.12821   0.194 0.846131    
output_logprob  0.44863    0.13670   3.282 0.001031 ** 
input_ntokens   0.16924    0.13770   1.229 0.219077    
output_ntokens  0.01082    0.13274   0.082 0.935041    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 559.67  on 1999  degrees of freedom
Residual deviance: 457.90  on 1994  degrees of freedom
AIC: 469.9

Number of Fisher Scoring iterations: 18


In [404]:
vif(acronym_gpt4_1and2_model)

In [405]:
vif(acronym_gpt35_1and2_model)

In [406]:
vif(acronym_llama_1and2_model)

In [407]:
vif(acronym_palm_1and2_model)

# Multiplication

In [215]:
# Omitting Llama because it scored 0% across the board

In [217]:
# Read in data
mult_gpt4_df <- read.table(file = 'table_multiplication_gpt-4-0613.tsv', sep = '\t', header = TRUE)
mult_gpt35_df <- read.table(file = 'table_multiplication_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
mult_palm_df <- read.table(file = 'table_multiplication_text-bison-001.tsv', sep = '\t', header = TRUE)

In [218]:
mult_gpt4_df$method <- factor(mult_gpt4_df$method)
mult_gpt35_df$method <- factor(mult_gpt35_df$method)
mult_palm_df$method <- factor(mult_palm_df$method)

In [219]:
head(mult_gpt4_df)

Unnamed: 0_level_0,index,method,correct
Unnamed: 0_level_1,<int>,<fct>,<int>
1,0,multiplication_number,0
2,1,multiplication_number,1
3,2,multiplication_number,0
4,3,multiplication_number,0
5,4,multiplication_number,1
6,5,multiplication_number,0


In [220]:
contrasts(mult_gpt4_df$method) <- contr.sum(4)
contrasts(mult_gpt35_df$method) <- contr.sum(4)
contrasts(mult_palm_df$method) <- contr.sum(4)

In [221]:
mult_gpt4_model <- glmer(correct ~ method + (1|index), 
               data=mult_gpt4_df, family=binomial)
mult_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=mult_gpt4_df, family=binomial)   

In [222]:
mult_gpt35_model <- glmer(correct ~ method + (1|index), 
               data=mult_gpt35_df, family=binomial)
mult_gpt35_null_model <- glmer(correct ~ (1|index), 
               data=mult_gpt35_df, family=binomial)   

In [223]:
mult_palm_model <- bglmer(correct ~ method + (1|index), 
               data=mult_palm_df, family=binomial, fixef.prior = normal(cov = diag(9,4)))
mult_palm_null_model <- bglmer(correct ~ (1|index), 
               data=mult_palm_df, family=binomial)   

In [224]:
anova(mult_gpt4_model,mult_gpt4_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_gpt4_null_model,2,431.4605,439.4434,-213.7303,427.4605,,,
mult_gpt4_model,5,393.3867,413.344,-191.6933,383.3867,44.07383,3.0,1.455654e-09


In [225]:
anova(mult_gpt35_model,mult_gpt35_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_gpt35_null_model,2,395.8202,403.8031,-195.9101,391.8202,,,
mult_gpt35_model,5,324.5254,344.4827,-157.2627,314.5254,77.29482,3.0,1.16727e-16


In [226]:
anova(mult_palm_model,mult_palm_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_palm_null_model,2,155.574,163.5569,-75.78701,151.574,,,
mult_palm_model,5,132.222,152.1794,-61.11102,122.222,29.35197,3.0,1.888704e-06


In [227]:
gpt4_mult_multcomp <- glht(mult_gpt4_model, linfct=mcp(method="Tukey"))


In [228]:
summary(gpt4_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                                                             Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0  -2.1294
multiplication_number - multiplication_allcaps == 0            1.0515
multiplication_word - multiplication_allcaps == 0              0.3911
multiplication_number - multiplication_alternatingcaps == 0    3.1809
multiplication_word - multiplication_alternatingcaps == 0      2.5205
multiplication_word - multiplication_number == 0              -0.6604
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0     0.5432  -3.920
multiplication_number - multiplication_allcaps == 0              0.4521   2.326
multiplication_word - multiplication_allcaps == 0 

In [229]:
gpt35_mult_multcomp <- glht(mult_gpt35_model, linfct=mcp(method="Tukey"))


In [230]:
summary(gpt35_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_gpt35_df, 
    family = binomial)

Linear Hypotheses:
                                                             Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0  -2.8676
multiplication_number - multiplication_allcaps == 0            1.8968
multiplication_word - multiplication_allcaps == 0              0.1711
multiplication_number - multiplication_alternatingcaps == 0    4.7644
multiplication_word - multiplication_alternatingcaps == 0      3.0387
multiplication_word - multiplication_number == 0              -1.7257
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0     1.0538  -2.721
multiplication_number - multiplication_allcaps == 0              0.4042   4.693
multiplication_word - multiplication_allcaps == 0

In [231]:
palm_mult_multcomp <- glht(mult_palm_model, linfct=mcp(method="Tukey"))


In [232]:
summary(palm_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: bglmer(formula = correct ~ method + (1 | index), data = mult_palm_df, 
    family = binomial, fixef.prior = normal(cov = diag(9, 4)))

Linear Hypotheses:
                                                               Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0 -6.766e-06
multiplication_number - multiplication_allcaps == 0           4.236e+00
multiplication_word - multiplication_allcaps == 0             4.098e+00
multiplication_number - multiplication_alternatingcaps == 0   4.236e+00
multiplication_word - multiplication_alternatingcaps == 0     4.098e+00
multiplication_word - multiplication_number == 0             -1.378e-01
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0  2.092e+00   0.000
multiplication_number - multiplication_allcaps == 0           1.691e+00   2

# Linear function

In [235]:
# Read in data
linfwd_gpt4_df <- read.table(file = 'table_conversion_fwd_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linfwd_gpt35_df <- read.table(file = 'table_conversion_fwd_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
linfwd_llama_df <- read.table(file = 'table_conversion_fwd_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
linfwd_palm_df <- read.table(file = 'table_conversion_fwd_text-bison-001.tsv', sep = '\t', header = TRUE)


linrev_gpt4_df <- read.table(file = 'table_conversion_rev_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linrev_gpt35_df <- read.table(file = 'table_conversion_rev_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
linrev_llama_df <- read.table(file = 'table_conversion_rev_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
linrev_palm_df <- read.table(file = 'table_conversion_rev_text-bison-001.tsv', sep = '\t', header = TRUE)

In [236]:
head(linfwd_gpt4_df)

Unnamed: 0_level_0,index,task,input,output,correct
Unnamed: 0_level_1,<int>,<chr>,<int>,<dbl>,<int>
1,0,conversion_actual,328,622.4,1
2,1,conversion_actual,941,1725.8,0
3,2,conversion_actual,476,888.8,0
4,3,conversion_actual,230,446.0,1
5,4,conversion_actual,577,1070.6,0
6,5,conversion_actual,64,147.2,1


In [237]:
scale_lin_df <- function(df) {
    new_df <- data.frame(scale(df[3:5]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    new_df$task <- factor(df$task)
    
    return(new_df)
}

In [238]:
scaled_linfwd_gpt4_df <- scale_lin_df(linfwd_gpt4_df)
scaled_linfwd_gpt35_df <- scale_lin_df(linfwd_gpt35_df)
scaled_linfwd_llama_df <- scale_lin_df(linfwd_llama_df)
scaled_linfwd_palm_df <- scale_lin_df(linfwd_palm_df)

scaled_linrev_gpt4_df <- scale_lin_df(linrev_gpt4_df)
scaled_linrev_gpt35_df <- scale_lin_df(linrev_gpt35_df)
scaled_linrev_llama_df <- scale_lin_df(linrev_llama_df)
scaled_linrev_palm_df <- scale_lin_df(linrev_palm_df)

In [239]:
linfwd_gpt4_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_gpt4_df, family=binomial)
linfwd_gpt35_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_gpt35_df, family=binomial)
linfwd_llama_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_llama_df, family=binomial)
linfwd_palm_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_palm_df, family=binomial)

linrev_gpt4_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_gpt4_df, family=binomial)
linrev_gpt35_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_gpt35_df, family=binomial)
linrev_llama_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_llama_df, family=binomial)
linrev_palm_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_palm_df, family=binomial)

In [240]:
summary(linfwd_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.38997  -0.40293  -0.08789  -0.03170   2.28552  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)   
(Intercept)          -0.8416     0.2924  -2.878  0.00400 **
taskconversion_fake  -5.4764     1.6658  -3.288  0.00101 **
input                -0.5816     0.8363  -0.695  0.48679   
output               -0.5344     0.7688  -0.695  0.48703   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 179.15  on 199  degrees of freedom
Residual deviance: 104.27  on 196  degrees of freedom
AIC: 112.27

Number of Fisher Scoring iterations: 18


In [241]:
summary(linfwd_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.93263  -0.26941  -0.04796  -0.01200   2.05953  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -0.1080     0.3119  -0.346 0.729269    
taskconversion_fake  -6.9788     1.8301  -3.813 0.000137 ***
input                -0.8874     0.9413  -0.943 0.345840    
output               -0.8883     0.8660  -1.026 0.305031    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 218.099  on 199  degrees of freedom
Residual deviance:  88.084  on 196  degrees of freedom
AIC: 96.084

Number of Fisher Scoring iterations: 16


In [242]:
summary(linfwd_llama_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.2497  -0.1925  -0.1720  -0.1468   3.0471  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -4.48544    0.86647  -5.177 2.26e-07 ***
taskconversion_fake  0.47910    1.02907   0.466    0.642    
input               -0.26588    0.77513  -0.343    0.732    
output              -0.07714    0.80190  -0.096    0.923    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 31.153  on 199  degrees of freedom
Residual deviance: 30.377  on 196  degrees of freedom
AIC: 38.377

Number of Fisher Scoring iterations: 13


In [243]:
summary(linfwd_palm_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_palm_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.27674  -0.11698  -0.06745  -0.03631   2.63745  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -5.5201     1.5091  -3.658 0.000254 ***
taskconversion_fake  -1.1932     1.5738  -0.758 0.448345    
input                -0.7045     1.0819  -0.651 0.514957    
output               -0.7189     1.0699  -0.672 0.501644    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 12.5916  on 199  degrees of freedom
Residual deviance:  9.0947  on 196  degrees of freedom
AIC: 17.095

Number of Fisher Scoring iterations: 28


In [244]:
summary(linrev_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.40818  -0.60031  -0.07781  -0.02974   1.97667  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -0.3915     0.2716  -1.442 0.149357    
taskconversion_fakeinverse  -5.6718     1.6886  -3.359 0.000782 ***
input                       -0.4566     0.8349  -0.547 0.584435    
output                      -0.4366     0.7182  -0.608 0.543210    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 210.76  on 199  degrees of freedom
Residual deviance: 122.78  on 196  degrees of freedom
AIC: 130.78

Number of Fisher Scoring iterations: 18


In [245]:
summary(linrev_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8763  -0.2377  -0.1307   0.7056   3.2600  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  0.4090     0.2653   1.541    0.123    
taskconversion_fakeinverse  -4.9606     0.9044  -5.485 4.13e-08 ***
input                       -0.1544     0.8135  -0.190    0.849    
output                      -0.6415     0.7081  -0.906    0.365    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 246.02  on 199  degrees of freedom
Residual deviance: 131.42  on 196  degrees of freedom
AIC: 139.42

Number of Fisher Scoring iterations: 11


In [246]:
summary(linrev_llama_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_llama_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.14645  -0.13044  -0.08449  -0.07382   3.08845  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                -4.83438    1.05788  -4.570 4.88e-06 ***
taskconversion_fakeinverse -1.09854    1.58680  -0.692    0.489    
input                      -0.14046    0.89732  -0.157    0.876    
output                     -0.06508    0.85226  -0.076    0.939    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 12.592  on 199  degrees of freedom
Residual deviance: 11.702  on 196  degrees of freedom
AIC: 19.702

Number of Fisher Scoring iterations: 19


In [247]:
summary(linrev_palm_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_palm_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.92077  -0.15152  -0.04256  -0.00754   2.10451  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -5.8778     1.6177  -3.633  0.00028 ***
taskconversion_fakeinverse  -2.6174     1.4338  -1.826  0.06792 .  
input                       -0.8414     1.2724  -0.661  0.50843    
output                      -2.6590     1.4757  -1.802  0.07156 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 46.763  on 199  degrees of freedom
Residual deviance: 24.172  on 196  degrees of freedom
AIC: 32.172

Number of Fisher Scoring iterations: 68


In [248]:
vif(linfwd_gpt4_model)

In [249]:
vif(linfwd_gpt35_model)

In [250]:
vif(linfwd_llama_model)

In [251]:
vif(linfwd_palm_model)

In [252]:
vif(linrev_gpt4_model)

In [253]:
vif(linrev_gpt35_model)

In [254]:
vif(linrev_llama_model)

In [255]:
vif(linrev_palm_model)

### Comparing methods

In [256]:
# We ended up not using this part, but instead using the OOD part

In [257]:
linmethod_gpt4_df <- read.table(file = 'table_conversion_method_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linmethod_gpt35_df <- read.table(file = 'table_conversion_method_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)


In [258]:
scaled_linmethod_gpt4_df <- scale_lin_df(linmethod_gpt4_df)
scaled_linmethod_gpt35_df <- scale_lin_df(linmethod_gpt35_df)


In [259]:
contrasts(scaled_linmethod_gpt4_df$task) <- contr.sum(3)
contrasts(scaled_linmethod_gpt35_df$task) <- contr.sum(3)


In [260]:
linmethod_gpt4_model <- glmer(correct ~ task + (1|index), 
               data=scaled_linmethod_gpt4_df, family=binomial)
linmethod_gpt35_model <- glmer(correct ~ task + (1|index), 
               data=scaled_linmethod_gpt35_df , family=binomial)


In [261]:
linmethod_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=scaled_linmethod_gpt4_df, family=binomial)
linmethod_gpt35_null_model <- glmer(correct ~ (1|index), 
               data=scaled_linmethod_gpt35_df , family=binomial)


In [262]:
summary(linmethod_gpt4_model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethod_gpt4_df

     AIC      BIC   logLik deviance df.resid 
   311.7    326.5   -151.9    303.7      296 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.25915 -0.20466  0.04086  0.44265  2.02756 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 11.22    3.35    
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   0.9127     0.4387   2.081   0.0375 *  
task1        -2.6689     0.5598  -4.767 1.87e-06 ***
task2         2.5676     0.5447   4.713 2.44e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
      (Intr) task1 
task1 -0.354       
task2  0.350 -0.870

In [263]:
summary(linmethod_gpt35_model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethod_gpt35_df

     AIC      BIC   logLik deviance df.resid 
   322.8    337.6   -157.4    314.8      296 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.2141 -0.3941  0.2185  0.4516  2.5372 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 5.497    2.345   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   1.3924     0.3640   3.826  0.00013 ***
task1        -1.5561     0.3001  -5.185 2.16e-07 ***
task2         1.8958     0.3556   5.331 9.74e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
      (Intr) task1 
task1 -0.374       
task2  0.414 -0.735

In [264]:
anova(linmethod_gpt4_model,linmethod_gpt4_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethod_gpt4_null_model,2,392.7577,400.1652,-194.3788,388.7577,,,
linmethod_gpt4_model,4,311.7095,326.5246,-151.8547,303.7095,85.04822,2.0,3.404193e-19


In [265]:
anova(linmethod_gpt35_model,linmethod_gpt35_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethod_gpt35_null_model,2,374.2759,381.6835,-185.1379,370.2759,,,
linmethod_gpt35_model,4,322.8321,337.6472,-157.4161,314.8321,55.44378,2.0,9.131353e-13


In [266]:
gpt4_lin_multcomp <- glht(linmethod_gpt4_model, linfct=mcp(task="Tukey"))
gpt35_lin_multcomp <- glht(linmethod_gpt35_model, linfct=mcp(task="Tukey"))


In [267]:
summary(gpt4_lin_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ task + (1 | index), data = scaled_linmethod_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                                                              Estimate
conversion_actualprimed - conversion_actual == 0                5.2365
conversion_actualprimedcontrol - conversion_actual == 0         2.7703
conversion_actualprimedcontrol - conversion_actualprimed == 0  -2.4662
                                                              Std. Error
conversion_actualprimed - conversion_actual == 0                  1.0681
conversion_actualprimedcontrol - conversion_actual == 0           0.6992
conversion_actualprimedcontrol - conversion_actualprimed == 0     0.6625
                                                              z value Pr(>|z|)
conversion_actualprimed - conversion_actual == 0                4.903  < 1e-04
conversion_actualprimedcontrol - con

In [268]:
summary(gpt35_lin_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ task + (1 | index), data = scaled_linmethod_gpt35_df, 
    family = binomial)

Linear Hypotheses:
                                                              Estimate
conversion_actualprimed - conversion_actual == 0                3.4519
conversion_actualprimedcontrol - conversion_actual == 0         1.2163
conversion_actualprimedcontrol - conversion_actualprimed == 0  -2.2356
                                                              Std. Error
conversion_actualprimed - conversion_actual == 0                  0.6110
conversion_actualprimedcontrol - conversion_actual == 0           0.4161
conversion_actualprimedcontrol - conversion_actualprimed == 0     0.5313
                                                              z value Pr(>|z|)
conversion_actualprimed - conversion_actual == 0                5.650  < 0.001
conversion_actualprimedcontrol - co

### Basic test: OOD

In [269]:
# Read in data
linfwdood_gpt4_df <- read.table(file = 'table_conversion_ood_fwd_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linfwdood_gpt35_df <- read.table(file = 'table_conversion_ood_fwd_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)

linrevood_gpt4_df <- read.table(file = 'table_conversion_ood_rev_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linrevood_gpt35_df <- read.table(file = 'table_conversion_ood_rev_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)

In [270]:
scaled_linfwdood_gpt4_df <- scale_lin_df(linfwdood_gpt4_df)
scaled_linfwdood_gpt35_df <- scale_lin_df(linfwdood_gpt35_df)
scaled_linrevood_gpt4_df <- scale_lin_df(linrevood_gpt4_df)
scaled_linrevood_gpt35_df <- scale_lin_df(linrevood_gpt35_df)

In [271]:
linfwdood_gpt4_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwdood_gpt4_df, family=binomial)
linfwdood_gpt35_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwdood_gpt35_df, family=binomial)
linrevood_gpt4_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrevood_gpt4_df, family=binomial)
linrevood_gpt35_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrevood_gpt35_df, family=binomial)

In [272]:
summary(linfwdood_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.7808  -0.3932  -0.1228  -0.0655   2.4532  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -2.2336     0.5434  -4.110 3.95e-05 ***
taskconversion_ood_fake  -3.6606     1.6255  -2.252   0.0243 *  
input                    -0.5776     0.7349  -0.786   0.4319    
output                   -0.1299     0.7855  -0.165   0.8687    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 85.193  on 199  degrees of freedom
Residual deviance: 64.937  on 196  degrees of freedom
AIC: 72.937

Number of Fisher Scoring iterations: 21


In [273]:
summary(linfwdood_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.41999  -0.33987  -0.20057  -0.06683   2.69279  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)   
(Intercept)              -1.8413     0.5660  -3.253  0.00114 **
taskconversion_ood_fake  -3.3915     1.1394  -2.977  0.00292 **
input                    -1.1050     0.7961  -1.388  0.16512   
output                   -0.3337     0.8472  -0.394  0.69364   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 130.033  on 199  degrees of freedom
Residual deviance:  83.716  on 196  degrees of freedom
AIC: 91.716

Number of Fisher Scoring iterations: 15


In [274]:
summary(linrevood_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.85211  -0.69168  -0.09061  -0.07368   1.85504  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)   
(Intercept)                     -1.2347     0.4630  -2.666  0.00767 **
taskconversion_ood_fakeinverse  -4.4110     1.7146  -2.573  0.01009 * 
input                           -0.1056     0.8171  -0.129  0.89714   
output                           0.3170     0.6315   0.502  0.61571   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 146.77  on 199  degrees of freedom
Residual deviance: 109.74  on 196  degrees of freedom
AIC: 117.74

Number of Fisher Scoring iterations: 20


In [275]:
summary(linrevood_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.05965  -0.83455  -0.08504  -0.06645   1.60478  

Coefficients:
                                Estimate Std. Error z value Pr(>|z|)   
(Intercept)                    -0.792035   0.458688  -1.727  0.08421 . 
taskconversion_ood_fakeinverse -4.812195   1.725003  -2.790  0.00528 **
input                          -0.310161   0.823762  -0.377  0.70653   
output                          0.003018   0.631042   0.005  0.99618   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 185.49  on 199  degrees of freedom
Residual deviance: 128.92  on 196  degrees of freedom
AIC: 136.92

Number of Fisher Scoring iterations: 19


In [276]:
vif(linfwdood_gpt4_model)

In [277]:
vif(linfwdood_gpt35_model)

In [278]:
vif(linrevood_gpt4_model)

In [279]:
vif(linrevood_gpt35_model)

### Comparing methods: OOD

In [280]:
# Omit Llama: 0% accuracy throughout

In [281]:
linmethodood_gpt4_df <- read.table(file = 'table_conversion_ood_method_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linmethodood_gpt35_df <- read.table(file = 'table_conversion_ood_method_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
linmethodood_palm_df <- read.table(file = 'table_conversion_ood_method_text-bison-001.tsv', sep = '\t', header = TRUE)




In [282]:
scaled_linmethodood_gpt4_df <- scale_lin_df(linmethodood_gpt4_df)
scaled_linmethodood_gpt35_df <- scale_lin_df(linmethodood_gpt35_df)
scaled_linmethodood_palm_df <- scale_lin_df(linmethodood_palm_df)


In [283]:
contrasts(scaled_linmethodood_gpt4_df$task) <- contr.sum(3)
contrasts(scaled_linmethodood_gpt35_df$task) <- contr.sum(3)
contrasts(scaled_linmethodood_palm_df$task) <- contr.sum(3)

In [286]:
linmethodood_gpt4_model <- glmer(correct ~ task + (1|index), 
               data=scaled_linmethodood_gpt4_df, family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
linmethodood_gpt35_model <- glmer(correct ~ task + (1|index), 
               data=scaled_linmethodood_gpt35_df , family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
linmethodood_palm_model <- bglmer(correct ~ task + (1|index), fixef.prior = normal(cov = diag(9,3)),
               data=scaled_linmethodood_palm_df , family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))


In [287]:
linmethodood_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=scaled_linmethodood_gpt4_df, family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
linmethodood_gpt35_null_model <- glmer(correct ~ (1|index), 
               data=scaled_linmethodood_gpt35_df , family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
linmethodood_palm_null_model <- bglmer(correct ~ (1|index), 
               data=scaled_linmethodood_palm_df , family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))



In [288]:
summary(linmethodood_gpt4_model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethodood_gpt4_df
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))

     AIC      BIC   logLik deviance df.resid 
   293.2    308.0   -142.6    285.2      296 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.9160 -0.2553 -0.0708  0.1482  3.9168 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 14.51    3.809   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.7345     0.5903  -2.939   0.0033 ** 
task1        -3.5266     0.8475  -4.161 3.17e-05 ***
task2         3.0222     0.7353   4.110 3.96e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
      (Intr) task1 
task1  0.575       
task2 -0.577 -0.927

In [289]:
summary(linmethodood_gpt35_model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethodood_gpt35_df
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))

     AIC      BIC   logLik deviance df.resid 
   309.1    323.9   -150.5    301.1      296 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-6.9842 -0.4462 -0.0373  0.2255  2.1213 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 12.06    3.472   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.0616     0.4753  -2.233   0.0255 *  
task1        -2.5275     0.5756  -4.391 1.13e-05 ***
task2         2.8639     0.6580   4.352 1.35e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
      (Intr) task1 
task1  0.431       
task2 -0.447 -0.886

In [290]:
summary(linmethodood_palm_model)

Cov prior  : index ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)
Fixef prior: normal(sd = c(3, 3, ...), corr = c(0 ...), common.scale = FALSE)
Prior dev  : 25.3343

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [bglmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethodood_palm_df
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))

     AIC      BIC   logLik deviance df.resid 
    66.4     81.2    -29.2     58.4      296 

Scaled residuals: 
      Min        1Q    Median        3Q       Max 
-0.078937 -0.025076 -0.000616 -0.000508  0.311345 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 123.5    11.11   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -12.3647     1.6158  -7.653 1.97e-14 ***
task1        -2.3413     0.8961  -2.613  0.00898 ** 
task2 

In [291]:
anova(linmethodood_gpt4_model,linmethodood_gpt4_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethodood_gpt4_null_model,2,387.6053,395.0128,-191.8026,383.6053,,,
linmethodood_gpt4_model,4,293.2305,308.0456,-142.6152,285.2305,98.37479,2.0,4.346951e-22


In [292]:
anova(linmethodood_gpt35_model,linmethodood_gpt35_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethodood_gpt35_null_model,2,391.3457,398.7533,-193.6729,387.3457,,,
linmethodood_gpt35_model,4,309.0817,323.8968,-150.5409,301.0817,86.26404,2.0,1.8535379999999998e-19


In [293]:
anova(linmethodood_palm_model,linmethodood_palm_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethodood_palm_null_model,2,124.34909,131.75665,-60.17454,120.34909,,,
linmethodood_palm_model,4,66.36689,81.18202,-29.18344,58.36689,61.9822,2.0,3.473252e-14


In [294]:
gpt4_linood_multcomp <- glht(linmethodood_gpt4_model, linfct=mcp(task="Tukey"))

In [295]:
gpt35_linood_multcomp <- glht(linmethodood_gpt35_model, linfct=mcp(task="Tukey"))

In [296]:
palm_linood_multcomp <- glht(linmethodood_palm_model, linfct=mcp(task="Tukey"))

In [297]:
summary(gpt4_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ task + (1 | index), data = scaled_linmethodood_gpt4_df, 
    family = binomial, control = glmerControl(optimizer = "bobyqa", 
        optCtrl = list(maxfun = 2e+05)))

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                6.5488
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0         4.0310
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  -2.5178
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                  1.5538
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0           1.0505
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0     0.7555
                                        

In [298]:
summary(gpt35_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ task + (1 | index), data = scaled_linmethodood_gpt35_df, 
    family = binomial, control = glmerControl(optimizer = "bobyqa", 
        optCtrl = list(maxfun = 2e+05)))

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                5.3914
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0         2.1912
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  -3.2002
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                  1.1982
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0           0.6444
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0     0.8488
                                       

In [299]:
summary(palm_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: bglmer(formula = correct ~ task + (1 | index), data = scaled_linmethodood_palm_df, 
    family = binomial, control = glmerControl(optimizer = "bobyqa", 
        optCtrl = list(maxfun = 2e+05)), fixef.prior = normal(cov = diag(9, 
        3)))

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                7.4119
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0        -0.3881
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  -7.8000
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                  1.4592
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0           1.6844
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  

# Counting

In [301]:
# Read in data
countwords_gpt4_df <- read.table(file = 'table_counting_words_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_gpt35_df <- read.table(file = 'table_counting_words_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_llama_df <- read.table(file = 'table_counting_words_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
countwords_palm_df <- read.table(file = 'table_counting_words_text-bison-001.tsv', sep = '\t', header = TRUE)


countchars_gpt4_df <- read.table(file = 'table_counting_chars_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countchars_gpt35_df <- read.table(file = 'table_counting_chars_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countchars_llama_df <- read.table(file = 'table_counting_chars_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
countchars_palm_df <- read.table(file = 'table_counting_chars_text-bison-001.tsv', sep = '\t', header = TRUE)



In [302]:
# Z-score data
scaled_countwords_gpt4_df <- scale_df_with_index(countwords_gpt4_df)
scaled_countwords_gpt35_df <- scale_df_with_index(countwords_gpt35_df)
scaled_countwords_llama_df <- scale_df_with_index(countwords_llama_df)
scaled_countwords_palm_df <- scale_df_with_index(countwords_palm_df)

scaled_countchars_gpt4_df <- scale_df_with_index(countchars_gpt4_df)
scaled_countchars_gpt35_df <- scale_df_with_index(countchars_gpt35_df)
scaled_countchars_llama_df <- scale_df_with_index(countchars_llama_df)
scaled_countchars_palm_df <- scale_df_with_index(countchars_palm_df)

In [303]:
head(scaled_countwords_gpt4_df)

Unnamed: 0_level_0,index,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,correct
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,-1.71453,-1.741827,-1.739786,-4.4424536,-3.006613,,2.316207,1
2,-1.71453,-1.738059,-1.771344,8.7549067,-3.006613,,2.316207,1
3,-1.71453,-1.726757,-1.755565,-1.9608672,-3.006613,,2.316207,1
4,-1.71453,-1.738059,-1.739786,-4.623797,-3.006613,,2.316207,1
5,-1.71453,-1.726757,-1.739786,-3.510291,-3.006613,,2.316207,1
6,-1.71453,-1.726757,-1.739786,-0.5541231,-3.006613,,2.316207,1


In [304]:
countwords_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt4_df, family=binomial)
countwords_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt35_df, family=binomial)
countwords_llama_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_llama_df, family=binomial)
countwords_palm_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_palm_df, family=binomial)


countchars_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_gpt4_df, family=binomial)
countchars_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_gpt35_df, family=binomial)
countchars_llama_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_llama_df, family=binomial)
countchars_palm_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_palm_df, family=binomial)


In [305]:
summary(countwords_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.57541  -0.16382  -0.04986  -0.02723   2.91722  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.35701    0.15836 -21.199   <2e-16 ***
index          -0.19417    0.10797  -1.798   0.0721 .  
input_logprob  -0.10037    0.08225  -1.220   0.2224    
output_logprob  3.59077    0.17453  20.574   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3264.4  on 2999  degrees of freedom
Residual deviance: 1137.6  on 2996  degrees of freedom
AIC: 1145.6

Number of Fisher Scoring iterations: 7


In [306]:
summary(countwords_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0485  -0.2589  -0.0675  -0.0346   3.4371  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -4.31424    0.19524 -22.098   <2e-16 ***
index           0.06015    0.10323   0.583    0.560    
input_logprob  -0.01359    0.05816  -0.234    0.815    
output_logprob  2.95668    0.15685  18.851   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2283.9  on 2999  degrees of freedom
Residual deviance: 1074.0  on 2996  degrees of freedom
AIC: 1082

Number of Fisher Scoring iterations: 7


In [307]:
summary(countwords_llama_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_llama_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.41362  -0.15480  -0.02343  -0.00392   2.76378  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -7.072627   0.550403 -12.850  < 2e-16 ***
index          -4.056163   0.633850  -6.399 1.56e-10 ***
input_logprob   0.005911   0.046164   0.128    0.898    
output_logprob  0.258873   0.305385   0.848    0.397    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1394.61  on 2999  degrees of freedom
Residual deviance:  759.17  on 2996  degrees of freedom
AIC: 767.17

Number of Fisher Scoring iterations: 9


In [308]:
summary(countwords_palm_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4636  -0.4250  -0.2061  -0.1219   2.9356  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.01357    0.10955 -27.508  < 2e-16 ***
index          -0.62386    0.13046  -4.782 1.73e-06 ***
input_logprob   0.02943    0.04433   0.664    0.507    
output_logprob  1.12174    0.11582   9.685  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2132.9  on 2999  degrees of freedom
Residual deviance: 1512.3  on 2996  degrees of freedom
AIC: 1520.3

Number of Fisher Scoring iterations: 6


In [309]:
summary(countchars_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.46780  -0.15937  -0.03976   0.07145   3.08559  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.08292    0.15416 -19.998  < 2e-16 ***
index          -0.82241    0.10574  -7.778  7.4e-15 ***
input_logprob  -0.05048    0.07837  -0.644     0.52    
output_logprob  3.55316    0.17447  20.365  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3505.5  on 2999  degrees of freedom
Residual deviance: 1070.3  on 2996  degrees of freedom
AIC: 1078.3

Number of Fisher Scoring iterations: 7


In [310]:
summary(countchars_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0301  -0.5265  -0.2664   0.4923   2.3786  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.08394    0.05786 -18.733   <2e-16 ***
index          -0.71066    0.07709  -9.218   <2e-16 ***
input_logprob  -0.06039    0.05344  -1.130    0.258    
output_logprob  1.44736    0.08107  17.852   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3836.9  on 2999  degrees of freedom
Residual deviance: 2254.9  on 2996  degrees of freedom
AIC: 2262.9

Number of Fisher Scoring iterations: 5


In [311]:
summary(countchars_llama_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4595  -0.0948  -0.0066  -0.0004   3.7637  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -9.82994    0.89754 -10.952  < 2e-16 ***
index          -7.05727    0.95977  -7.353 1.94e-13 ***
input_logprob  -0.19112    0.09955  -1.920   0.0549 .  
output_logprob -0.87847    0.39234  -2.239   0.0252 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1249.29  on 2999  degrees of freedom
Residual deviance:  647.68  on 2996  degrees of freedom
AIC: 655.68

Number of Fisher Scoring iterations: 10


In [312]:
summary(countchars_palm_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6167  -0.1608  -0.0312  -0.0162   4.0666  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -5.81871    0.30797 -18.894  < 2e-16 ***
index           0.68475    0.12378   5.532 3.17e-08 ***
input_logprob  -0.09322    0.09276  -1.005    0.315    
output_logprob  3.72132    0.21930  16.969  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1657.92  on 2999  degrees of freedom
Residual deviance:  741.69  on 2996  degrees of freedom
AIC: 749.69

Number of Fisher Scoring iterations: 8


In [313]:
vif(countwords_gpt4_model)

In [314]:
vif(countwords_gpt35_model)

In [315]:
vif(countwords_llama_model)

In [316]:
vif(countwords_palm_model)

In [317]:
vif(countchars_gpt4_model)

In [318]:
vif(countchars_gpt35_model)

In [319]:
vif(countchars_llama_model)

In [320]:
vif(countchars_palm_model)

### Varying input

In [321]:
# Read in data
countwords_both_gpt4_df <- read.table(file = 'table_counting_words_both_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_both_gpt35_df <- read.table(file = 'table_counting_words_both_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_both_llama_df <- read.table(file = 'table_counting_words_both_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
countwords_both_palm_df <- read.table(file = 'table_counting_words_both_text-bison-001.tsv', sep = '\t', header = TRUE)

countchars_both_gpt4_df <- read.table(file = 'table_counting_chars_both_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countchars_both_gpt35_df <- read.table(file = 'table_counting_chars_both_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countchars_both_llama_df <- read.table(file = 'table_counting_chars_both_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
countchars_both_palm_df <- read.table(file = 'table_counting_chars_both_text-bison-001.tsv', sep = '\t', header = TRUE)


In [322]:
# Z-score data
scaled_countwords_both_gpt4_df <- scale_df_with_index(countwords_both_gpt4_df)
scaled_countwords_both_gpt35_df <- scale_df_with_index(countwords_both_gpt35_df)
scaled_countwords_both_llama_df <- scale_df_with_index(countwords_both_llama_df)
scaled_countwords_both_palm_df <- scale_df_with_index(countwords_both_palm_df)


scaled_countchars_both_gpt4_df <- scale_df_with_index(countchars_both_gpt4_df)
scaled_countchars_both_gpt35_df <- scale_df_with_index(countchars_both_gpt35_df)
scaled_countchars_both_llama_df <- scale_df_with_index(countchars_both_llama_df)
scaled_countchars_both_palm_df <- scale_df_with_index(countchars_both_palm_df)

In [323]:
countwords_both_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_gpt4_df, family=binomial)
countwords_both_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_gpt35_df, family=binomial)
countwords_both_llama_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_llama_df, family=binomial)
countwords_both_palm_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_palm_df, family=binomial)

countchars_both_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_gpt4_df, family=binomial)
countchars_both_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_gpt35_df, family=binomial)
countchars_both_llama_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_llama_df, family=binomial)
countchars_both_palm_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_palm_df, family=binomial)

“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [324]:
summary(countwords_both_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.85516  -0.14205  -0.04240  -0.02356   3.10665  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.597483   0.121388 -29.636   <2e-16 ***
index           0.006453   0.079863   0.081   0.9356    
input_logprob   0.117880   0.054903   2.147   0.0318 *  
output_logprob  3.927246   0.138109  28.436   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6427.5  on 5999  degrees of freedom
Residual deviance: 2162.5  on 5996  degrees of freedom
AIC: 2170.5

Number of Fisher Scoring iterations: 7


In [325]:
summary(countwords_both_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8573  -0.3448  -0.1093  -0.0759   3.2168  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.47416    0.09882 -35.157  < 2e-16 ***
index           0.43608    0.07043   6.192 5.95e-10 ***
input_logprob  -0.11203    0.05037  -2.224   0.0261 *  
output_logprob  2.63160    0.09325  28.222  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4749.4  on 5999  degrees of freedom
Residual deviance: 2581.4  on 5996  degrees of freedom
AIC: 2589.4

Number of Fisher Scoring iterations: 7


In [326]:
summary(countwords_both_llama_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3292  -0.2499  -0.0789  -0.0340   3.4913  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -4.96355    0.19941 -24.891  < 2e-16 ***
index          -1.48618    0.24633  -6.033 1.61e-09 ***
input_logprob   0.17337    0.06424   2.699  0.00696 ** 
output_logprob  1.05018    0.15995   6.566 5.18e-11 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2550.1  on 5999  degrees of freedom
Residual deviance: 1644.4  on 5996  degrees of freedom
AIC: 1652.4

Number of Fisher Scoring iterations: 8


In [327]:
summary(countwords_both_palm_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5044  -0.4178  -0.1894  -0.1248   3.1914  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.09837    0.07936 -39.041   <2e-16 ***
index          -0.15649    0.07286  -2.148   0.0317 *  
input_logprob   0.09857    0.04801   2.053   0.0401 *  
output_logprob  1.50143    0.07633  19.669   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4090.8  on 5999  degrees of freedom
Residual deviance: 2890.3  on 5996  degrees of freedom
AIC: 2898.3

Number of Fisher Scoring iterations: 6


In [328]:
summary(countchars_both_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.9362  -0.1098  -0.0262   0.0312   3.4657  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.1767     0.1621 -25.768  < 2e-16 ***
index           -0.6219     0.1174  -5.296 1.18e-07 ***
input_logprob   -0.6803     0.1281  -5.313 1.08e-07 ***
output_logprob   3.3781     0.1396  24.202  < 2e-16 ***
input_ntokens   -2.5456     0.3061  -8.315  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6819.6  on 5999  degrees of freedom
Residual deviance: 1846.0  on 5995  degrees of freedom
AIC: 1856

Number of Fisher Scoring iterations: 8


In [329]:
summary(countchars_both_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2093  -0.3962  -0.1922   0.2725   2.6262  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.86072    0.05794 -32.115  < 2e-16 ***
index          -0.38644    0.08268  -4.674 2.95e-06 ***
input_logprob   0.49788    0.07428   6.702 2.05e-11 ***
output_logprob  2.01289    0.07029  28.636  < 2e-16 ***
input_ntokens  -0.24959    0.12127  -2.058   0.0396 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 7098.4  on 5999  degrees of freedom
Residual deviance: 3590.7  on 5995  degrees of freedom
AIC: 3600.7

Number of Fisher Scoring iterations: 6


In [330]:
summary(countchars_both_llama_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3121  -0.0323  -0.0007   0.0000   5.4187  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -17.4791     1.3647 -12.808  < 2e-16 ***
index           -4.6999     0.8336  -5.638 1.72e-08 ***
input_logprob   -1.1518     0.1702  -6.767 1.32e-11 ***
output_logprob  -0.3352     0.3363  -0.997    0.319    
input_ntokens  -12.5236     1.6980  -7.376 1.64e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2434.90  on 5999  degrees of freedom
Residual deviance:  970.16  on 5995  degrees of freedom
AIC: 980.16

Number of Fisher Scoring iterations: 11


In [331]:
summary(countchars_both_palm_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8196  -0.1014  -0.0129  -0.0058   4.3669  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -7.41014    0.31438 -23.571  < 2e-16 ***
index           0.87012    0.12124   7.177 7.13e-13 ***
input_logprob  -0.05129    0.09177  -0.559 0.576214    
output_logprob  4.39207    0.19899  22.072  < 2e-16 ***
input_ntokens  -1.30804    0.37556  -3.483 0.000496 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3141.0  on 5999  degrees of freedom
Residual deviance: 1195.7  on 5995  degrees of freedom
AIC: 1205.7

Number of Fisher Scoring iterations: 9


In [332]:
vif(countwords_both_gpt4_model)

In [333]:
vif(countwords_both_gpt35_model)

In [334]:
vif(countwords_both_llama_model)

In [335]:
vif(countwords_both_palm_model)

In [336]:
vif(countchars_both_gpt4_model)

In [337]:
vif(countchars_both_gpt35_model)

In [338]:
vif(countchars_both_llama_model)

In [339]:
vif(countchars_both_palm_model)

### Varying output

In [341]:
# Read in data
countwords_binary_gpt4_df <- read.table(file = 'table_counting_words_binary_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_binary_gpt35_df <- read.table(file = 'table_counting_words_binary_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_binary_llama_df <- read.table(file = 'table_counting_words_binary_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
countwords_binary_palm_df <- read.table(file = 'table_counting_words_binary_text-bison-001.tsv', sep = '\t', header = TRUE)


countchars_binary_gpt4_df <- read.table(file = 'table_counting_chars_binary_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countchars_binary_gpt35_df <- read.table(file = 'table_counting_chars_binary_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countchars_binary_llama_df <- read.table(file = 'table_counting_chars_binary_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
countchars_binary_palm_df <- read.table(file = 'table_counting_chars_binary_text-bison-001.tsv', sep = '\t', header = TRUE)


In [342]:
# Z-score data
scaled_countwords_binary_gpt4_df <- scale_df_with_index(countwords_binary_gpt4_df)
scaled_countwords_binary_gpt35_df <- scale_df_with_index(countwords_binary_gpt35_df)
scaled_countwords_binary_llama_df <- scale_df_with_index(countwords_binary_llama_df)
scaled_countwords_binary_palm_df <- scale_df_with_index(countwords_binary_palm_df)


scaled_countchars_binary_gpt4_df <- scale_df_with_index(countchars_binary_gpt4_df)
scaled_countchars_binary_gpt35_df <- scale_df_with_index(countchars_binary_gpt35_df)
scaled_countchars_binary_llama_df <- scale_df_with_index(countchars_binary_llama_df)
scaled_countchars_binary_palm_df <- scale_df_with_index(countchars_binary_palm_df)

In [343]:
countwords_binary_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_gpt4_df, family=binomial)
countwords_binary_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_gpt35_df, family=binomial)
countwords_binary_llama_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_llama_df, family=binomial)
countwords_binary_palm_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_palm_df, family=binomial)

countchars_binary_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_gpt4_df, family=binomial)
countchars_binary_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_gpt35_df, family=binomial)
countchars_binary_llama_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_llama_df, family=binomial)
countchars_binary_palm_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_palm_df, family=binomial)

In [344]:
summary(countwords_binary_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.13331  -0.37089  -0.08181  -0.04723   2.96403  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.73270    0.12591 -21.704  < 2e-16 ***
index           0.52051    0.08623   6.036 1.58e-09 ***
input_logprob   0.12293    0.06573   1.870   0.0615 .  
output_logprob  3.22597    0.15411  20.933  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2633.6  on 2399  degrees of freedom
Residual deviance: 1259.9  on 2396  degrees of freedom
AIC: 1267.9

Number of Fisher Scoring iterations: 7


In [345]:
summary(countwords_binary_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6350  -0.3409  -0.1558  -0.0843   3.4431  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.8385     0.1616 -23.747   <2e-16 ***
index            1.0924     0.0941  11.609   <2e-16 ***
input_logprob   -0.2086     0.1051  -1.984   0.0473 *  
output_logprob   1.8626     0.1338  13.917   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1154.44  on 2399  degrees of freedom
Residual deviance:  764.18  on 2396  degrees of freedom
AIC: 772.18

Number of Fisher Scoring iterations: 7


In [346]:
summary(countwords_binary_llama_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.4809  -0.0747  -0.0458  -0.0279   3.6841  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -6.7785     0.6582 -10.299  < 2e-16 ***
index           -0.3322     0.4562  -0.728  0.46648    
input_logprob   -0.7650     0.2961  -2.584  0.00977 ** 
output_logprob   0.9642     0.5206   1.852  0.06404 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 107.23  on 2399  degrees of freedom
Residual deviance:  88.73  on 2396  degrees of freedom
AIC: 96.73

Number of Fisher Scoring iterations: 10


In [347]:
summary(countwords_binary_palm_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.9611  -0.3958  -0.2368  -0.1281   2.9216  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.33678    0.13469 -24.774  < 2e-16 ***
index          -0.35685    0.09266  -3.851 0.000118 ***
input_logprob   0.05451    0.07101   0.768 0.442684    
output_logprob  1.05121    0.11768   8.933  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1133.00  on 2399  degrees of freedom
Residual deviance:  954.16  on 2396  degrees of freedom
AIC: 962.16

Number of Fisher Scoring iterations: 6


In [348]:
summary(countchars_binary_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8510  -0.4543  -0.1344  -0.0681   2.2621  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.65994    0.11628 -22.875   <2e-16 ***
index           0.01801    0.06977   0.258    0.796    
input_logprob  -0.09741    0.06844  -1.423    0.155    
output_logprob  2.54242    0.11741  21.655   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2488.0  on 2399  degrees of freedom
Residual deviance: 1364.7  on 2396  degrees of freedom
AIC: 1372.7

Number of Fisher Scoring iterations: 6


In [349]:
summary(countchars_binary_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8611  -0.6616  -0.2926   0.5626   2.4637  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.48069    0.06776 -21.851   <2e-16 ***
index          -0.53780    0.05743  -9.365   <2e-16 ***
input_logprob  -0.03054    0.05630  -0.542    0.588    
output_logprob  1.37065    0.07117  19.259   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2821.3  on 2399  degrees of freedom
Residual deviance: 1974.3  on 2396  degrees of freedom
AIC: 1982.3

Number of Fisher Scoring iterations: 5


In [350]:
summary(countchars_binary_llama_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.5864  -0.0960  -0.0314  -0.0080   3.3782  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -7.6677     0.8833  -8.680  < 2e-16 ***
index           -0.8514     0.3367  -2.529   0.0115 *  
input_logprob   -0.1675     0.2048  -0.818   0.4135    
output_logprob   2.4472     0.5964   4.103 4.07e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 277.96  on 2399  degrees of freedom
Residual deviance: 192.88  on 2396  degrees of freedom
AIC: 200.88

Number of Fisher Scoring iterations: 10


In [351]:
summary(countchars_binary_palm_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1034  -0.2386  -0.0588  -0.0111   3.3192  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -5.93638    0.36003 -16.489   <2e-16 ***
index           1.32128    0.11851  11.149   <2e-16 ***
input_logprob  -0.09977    0.12218  -0.817    0.414    
output_logprob  3.75458    0.27605  13.601   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1149.10  on 2399  degrees of freedom
Residual deviance:  503.06  on 2396  degrees of freedom
AIC: 511.06

Number of Fisher Scoring iterations: 8


In [352]:
vif(countwords_binary_gpt4_model)

In [353]:
vif(countwords_binary_gpt35_model)

In [354]:
vif(countwords_binary_llama_model)

In [355]:
vif(countwords_binary_palm_model)

In [356]:
vif(countchars_binary_gpt4_model)

In [357]:
vif(countchars_binary_gpt35_model)

In [358]:
vif(countchars_binary_llama_model)

In [359]:
vif(countchars_binary_palm_model)

# Sorting

In [361]:
# Read in data
sortwords_gpt4_df <- read.table(file = 'table_sortwords_gpt-4-0613.tsv', sep = '\t', header = TRUE)
sortwords_gpt35_df <- read.table(file = 'table_sortwords_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
sortwords_llama_df <- read.table(file = 'table_sortwords_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
sortwords_palm_df <- read.table(file = 'table_sortwords_text-bison-001.tsv', sep = '\t', header = TRUE)


sortnumbers_gpt4_df <- read.table(file = 'table_sortnumbers_gpt-4-0613.tsv', sep = '\t', header = TRUE)
sortnumbers_gpt35_df <- read.table(file = 'table_sortnumbers_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
sortnumbers_llama_df <- read.table(file = 'table_sortnumbers_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
sortnumbers_palm_df <- read.table(file = 'table_sortnumbers_text-bison-001.tsv', sep = '\t', header = TRUE)


In [362]:
# Z-score data
scaled_sortwords_gpt4_df <- scale_taskpair_df(sortwords_gpt4_df)
scaled_sortwords_gpt35_df <- scale_taskpair_df(sortwords_gpt35_df)
scaled_sortwords_llama_df <- scale_taskpair_df(sortwords_llama_df)
scaled_sortwords_palm_df <- scale_taskpair_df(sortwords_palm_df)


scaled_sortnumbers_gpt4_df <- scale_taskpair_df(sortnumbers_gpt4_df)
scaled_sortnumbers_gpt35_df <- scale_taskpair_df(sortnumbers_gpt35_df)
scaled_sortnumbers_llama_df <- scale_taskpair_df(sortnumbers_llama_df)
scaled_sortnumbers_palm_df <- scale_taskpair_df(sortnumbers_palm_df)

In [363]:
head(scaled_sortnumbers_palm_df)

Unnamed: 0_level_0,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,index,task,correct
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<int>
1,-1.216674,-1.216674,1.1557494,-1.216674,-1.216674,1.0952577,0,ascending,1
2,-1.5466941,-1.5466941,1.7367319,-1.5466941,-1.5466941,1.432666,1,ascending,0
3,-1.8767142,-1.8767142,1.896608,-1.8767142,-1.8767142,1.7989276,2,ascending,0
4,0.2684163,0.2684163,-0.3247421,0.2684163,0.2684163,-0.177441,3,ascending,1
5,-0.2266138,-0.2266138,0.3877534,-0.2266138,-0.2266138,0.2251411,4,ascending,1
6,1.6985033,1.6985033,-1.6558483,1.6985033,1.6985033,-1.2426926,5,ascending,0


In [364]:
sort_gpt4_words_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt4_df, family=binomial)
sort_gpt35_words_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt35_df, family=binomial)
sort_llama_words_model <- bayesglm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_llama_df, family=binomial)
sort_palm_words_model <- bayesglm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_palm_df, family=binomial)


In [365]:
sort_gpt4_numbers_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortnumbers_gpt4_df, family=binomial)
sort_gpt35_numbers_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortnumbers_gpt35_df, family=binomial)
sort_llama_numbers_model <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_sortnumbers_llama_df, family=binomial)
sort_palm_numbers_model <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_sortnumbers_palm_df, family=binomial)

In [366]:
summary(sort_gpt4_words_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2232  -0.8916   0.4892   0.7003   1.9249  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.59101    0.31446   5.059 4.20e-07 ***
taskrev        -2.38837    0.45625  -5.235 1.65e-07 ***
input_nchars    0.38206    0.64249   0.595    0.552    
input_ntokens  -0.67072    0.87302  -0.768    0.442    
input_logprob   0.17060    0.91596   0.186    0.852    
output_logprob  0.02066    0.78236   0.026    0.979    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 273.33  on 199  degrees of freedom
Residual deviance: 211.06  on 194  degrees of freedom
AIC: 223.06

Number of Fisher Scoring iterations: 4


In [367]:
summary(sort_gpt35_words_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9744  -0.6160  -0.3827   0.7223   2.2874  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.0275     0.2797   3.674 0.000239 ***
taskrev         -2.6897     0.4560  -5.898 3.67e-09 ***
input_nchars    -0.2244     0.7079  -0.317 0.751250    
input_ntokens    0.2777     0.9500   0.292 0.770045    
input_logprob   -0.4364     0.9579  -0.456 0.648674    
output_logprob   0.9603     0.8631   1.113 0.265902    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 275.64  on 199  degrees of freedom
Residual deviance: 188.36  on 194  degrees of freedom
AIC: 200.36

Number of Fisher Scoring iterations: 4


In [368]:
summary(sort_llama_words_model)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_llama_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.78660  -0.55687  -0.09895  -0.07674   2.06574  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.7717     0.3177  -5.576 2.46e-08 ***
taskrev         -3.8418     1.5115  -2.542    0.011 *  
input_nchars    -0.2115     0.6940  -0.305    0.761    
input_ntokens    0.5152     0.7227   0.713    0.476    
input_logprob    0.2999     0.7205   0.416    0.677    
output_logprob   0.3221     0.7626   0.422    0.673    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 111.508  on 199  degrees of freedom
Residual deviance:  86.521  on 194  degrees of freedom
AIC: 98.521

Number of Fisher Scoring iterations: 20


In [369]:
summary(sort_palm_words_model)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3435  -0.1690  -0.1001  -0.0673   2.9432  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.4554     1.0051  -4.433 9.31e-06 ***
taskrev         -1.6181     1.4782  -1.095    0.274    
input_nchars    -0.5279     0.9794  -0.539    0.590    
input_ntokens    0.1313     0.9460   0.139    0.890    
input_logprob    0.3859     0.9747   0.396    0.692    
output_logprob   0.2236     0.9887   0.226    0.821    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 22.401  on 199  degrees of freedom
Residual deviance: 18.208  on 194  degrees of freedom
AIC: 30.208

Number of Fisher Scoring iterations: 21


In [370]:
summary(sort_gpt4_numbers_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortnumbers_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2265   0.4663   0.5864   0.6988   0.9245  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.52436    0.35167   4.335 1.46e-05 ***
taskdescending -0.03131    0.58923  -0.053    0.958    
input_nchars   -8.09348    6.40191  -1.264    0.206    
input_ntokens   6.68226    6.03090   1.108    0.268    
input_logprob  -1.26602    1.20055  -1.055    0.292    
output_logprob  0.19837    0.89414   0.222    0.824    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 194.49  on 199  degrees of freedom
Residual deviance: 188.87  on 194  degrees of freedom
AIC: 200.87

Number of Fisher Scoring iterations: 4


In [371]:
summary(sort_gpt35_numbers_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortnumbers_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9344  -1.0194   0.6502   0.8800   2.0047  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.7018     0.3059   2.294   0.0218 *  
taskdescending  -0.2664     0.5108  -0.522   0.6020    
input_nchars   -23.7603     6.0042  -3.957 7.58e-05 ***
input_ntokens   24.8100     5.7384   4.324 1.54e-05 ***
input_logprob    1.1925     1.0635   1.121   0.2621    
output_logprob   0.1633     0.7746   0.211   0.8330    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 264.63  on 199  degrees of freedom
Residual deviance: 233.73  on 194  degrees of freedom
AIC: 245.73

Number of Fisher Scoring iterations: 4


In [372]:
summary(sort_llama_numbers_model)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_sortnumbers_llama_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2860  -0.5447  -0.3703  -0.2424   2.4428  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.7188     0.4003  -4.293 1.76e-05 ***
taskdescending  -1.2593     0.6968  -1.807   0.0707 .  
input_nchars     2.2068     1.5263   1.446   0.1482    
input_logprob    3.8789     1.6085   2.412   0.0159 *  
output_logprob  -0.9442     1.1334  -0.833   0.4048    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 146.77  on 199  degrees of freedom
Residual deviance: 129.47  on 195  degrees of freedom
AIC: 139.47

Number of Fisher Scoring iterations: 5


In [373]:
summary(sort_palm_numbers_model)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_sortnumbers_palm_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0073  -0.7359  -0.4172   0.8284   2.3689  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.05316    0.31709  -0.168 0.866872    
taskdescending -2.14853    0.57669  -3.726 0.000195 ***
input_nchars   -0.48252    1.19707  -0.403 0.686888    
input_logprob   1.09231    1.18640   0.921 0.357207    
output_logprob -0.58512    0.89994  -0.650 0.515576    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 246.02  on 199  degrees of freedom
Residual deviance: 190.58  on 195  degrees of freedom
AIC: 200.58

Number of Fisher Scoring iterations: 5


In [374]:
vif(sort_gpt4_words_model)

In [375]:
vif(sort_gpt35_words_model)

In [376]:
vif(sort_llama_words_model)

In [377]:
vif(sort_palm_words_model)

In [378]:
vif(sort_gpt4_numbers_model)

In [379]:
vif(sort_gpt35_numbers_model)

In [380]:
vif(sort_llama_numbers_model)

In [381]:
vif(sort_palm_numbers_model)

# Birthdays

In [383]:
# Read in data
birthdays_gpt4_df <- read.table(file = 'table_birthdays_gpt-4-0613.tsv', sep = '\t', header = TRUE)
birthdays_gpt35_df <- read.table(file = 'table_birthdays_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
birthdays_llama_df <- read.table(file = 'table_birthdays_llama-2-70b-chat.tsv', sep = '\t', header = TRUE)
birthdays_palm_df <- read.table(file = 'table_birthdays_text-bison-001.tsv', sep = '\t', header = TRUE)



In [384]:
# Z-score data
scaled_birthdays_gpt4_df <- scale_df(birthdays_gpt4_df)
scaled_birthdays_gpt35_df <- scale_df(birthdays_gpt35_df)
scaled_birthdays_llama_df <- scale_df(birthdays_llama_df)
scaled_birthdays_palm_df <- scale_df(birthdays_palm_df)

In [385]:
birthdays_gpt4_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_gpt4_df, family=binomial)
birthdays_gpt35_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_gpt35_df, family=binomial)
birthdays_llama_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_llama_df, family=binomial)
birthdays_palm_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_palm_df, family=binomial)

In [386]:
summary(birthdays_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8769  -0.6683   0.1843   0.5264   2.0459  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.4605     0.1915   7.626 2.43e-14 ***
input_logprob    2.2209     0.2229   9.964  < 2e-16 ***
output_logprob   0.1372     0.1440   0.953    0.341    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 505.92  on 399  degrees of freedom
Residual deviance: 305.07  on 397  degrees of freedom
AIC: 311.07

Number of Fisher Scoring iterations: 6


In [387]:
summary(birthdays_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9662  -0.4500   0.1340   0.4589   2.3463  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    0.880415   0.178538   4.931 8.17e-07 ***
input_logprob  2.770293   0.258676  10.710  < 2e-16 ***
output_logprob 0.001702   0.156567   0.011    0.991    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 544.23  on 399  degrees of freedom
Residual deviance: 261.61  on 397  degrees of freedom
AIC: 267.61

Number of Fisher Scoring iterations: 6


In [388]:
summary(birthdays_llama_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_llama_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.99414  -0.53714  -0.19661  -0.09632   2.97083  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.1302     0.2308  -9.228   <2e-16 ***
input_logprob    2.0660     0.2283   9.050   <2e-16 ***
output_logprob   0.1883     0.1537   1.225    0.221    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 433.82  on 399  degrees of freedom
Residual deviance: 274.88  on 397  degrees of freedom
AIC: 280.88

Number of Fisher Scoring iterations: 6


In [389]:
summary(birthdays_palm_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_palm_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.17824  -0.43235  -0.11441  -0.04777   2.68036  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.7396     0.3135  -8.737   <2e-16 ***
input_logprob    2.6834     0.3020   8.884   <2e-16 ***
output_logprob   0.1545     0.1689   0.915     0.36    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 424.04  on 399  degrees of freedom
Residual deviance: 229.08  on 397  degrees of freedom
AIC: 235.08

Number of Fisher Scoring iterations: 6


In [390]:
vif(birthdays_gpt4_model)

In [391]:
vif(birthdays_gpt35_model)

In [392]:
vif(birthdays_llama_model)

In [393]:
vif(birthdays_palm_model)