In [1]:
# Might need to install arm - if so, uncomment the following line
#install.packages("arm")
library(arm)
library(lme4)
library(blme)
library(ggplot2)
library(stringr)
library(psycho)
library(glmnet)
library(car)
library("multcomp")

Loading required package: MASS

Loading required package: Matrix

Loading required package: lme4


arm (Version 1.13-1, built: 2022-8-25)


Working directory is /Users/tommccoy/Documents/tommccoy/Documents/GitHub/embers-of-autoregression/evaluation



Attaching package: ‘psycho’


The following object is masked from ‘package:lme4’:

    golden


Loaded glmnet 4.1-7

Loading required package: carData


Attaching package: ‘car’


The following object is masked from ‘package:arm’:

    logit


Loading required package: mvtnorm

Loading required package: survival

Loading required package: TH.data


Attaching package: ‘TH.data’


The following object is masked from ‘package:MASS’:

    geyser




In [2]:
# Set ggplot theme
base_size <- 20
theme_set(theme_bw(base_size=base_size) +
            theme(#panel.grid.major=element_blank(),
              panel.grid.minor=element_blank(),
              axis.title.y=element_text(angle=90,vjust=0.5),
              axis.text.x=element_text(angle=0, hjust=0.5),
              axis.title.x=element_blank()))

In [3]:
correct_vs_length_and_prob <- function(df, include_output_chars=TRUE, include_output_tokens=TRUE){
 
    
    if (include_output_chars & include_output_tokens){
        model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars + output_nchars, 
               data=df, family=binomial)
    } else if (include_output_chars & !include_output_tokens){
        model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars + output_nchars, 
               data=df, family=binomial)
    } else if (!include_output_chars & include_output_tokens){
        model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=df, family=binomial)
    } else if (!include_output_chars & !include_output_tokens){
        model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
               data=df, family=binomial)
    }
  
    
  return(model)
}

In [4]:
# For Z-scoring datasets

scale_df <- function(df) {
    new_df <- data.frame(scale(df[2:7]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    
    return(new_df)
}

scale_taskpair_df <- function(df) {
    new_df <- data.frame(scale(df[3:8]))
    new_df$index <- as.factor(df$index)
    new_df$task <- as.factor(df$task)
    new_df$correct <- df$correct
    
    return(new_df)
}

scale_taskpair_prob_df <- function(df) {
    new_df <- data.frame(scale(df[2:8]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    
    return(new_df)
}



scale_df_with_index <- function(df) {
    new_df <- data.frame(scale(df[1:7]))
    new_df$correct <- df$correct
    
    return(new_df)
}


scale_df_with_index_demos <- function(df) {
    new_df <- data.frame(scale(df[1:8]))
    new_df$correct <- df$correct
    
    return(new_df)
}

# Shift ciphers

In [55]:
# Read in data
rot13enc_gpt4_df <- read.table(file = 'table_rot13enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13enc_gpt35_df <- read.table(file = 'table_rot13enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
rot13enc_llama3_df <- read.table(file = 'table_rot13enc_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
rot13enc_claude3_df <- read.table(file = 'table_rot13enc_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
rot13enc_gemini1_df <- read.table(file = 'table_rot13enc_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)

rot13dec_gpt4_df <- read.table(file = 'table_rot13dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13dec_gpt35_df <- read.table(file = 'table_rot13dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
rot13dec_llama3_df <- read.table(file = 'table_rot13dec_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
rot13dec_claude3_df <- read.table(file = 'table_rot13dec_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
rot13dec_gemini1_df <- read.table(file = 'table_rot13dec_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [56]:
# Z-score data
scaled_rot13enc_gpt4_df <- scale_df(rot13enc_gpt4_df)
scaled_rot13enc_gpt35_df <- scale_df(rot13enc_gpt35_df)
scaled_rot13enc_llama3_df <- scale_df(rot13enc_llama3_df)
scaled_rot13enc_claude3_df <- scale_df(rot13enc_claude3_df)
scaled_rot13enc_gemini1_df <- scale_df(rot13enc_gemini1_df)


scaled_rot13dec_gpt4_df <- scale_df(rot13dec_gpt4_df)
scaled_rot13dec_gpt35_df <- scale_df(rot13dec_gpt35_df)
scaled_rot13dec_llama3_df <- scale_df(rot13dec_llama3_df)
scaled_rot13dec_claude3_df <- scale_df(rot13dec_claude3_df)
scaled_rot13dec_gemini1_df <- scale_df(rot13dec_gemini1_df)




In [57]:

rot13enc_gpt4_model <- bayesglm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=scaled_rot13enc_gpt4_df, family=binomial)



In [58]:
rot13enc_gpt35_model <- bayesglm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=scaled_rot13enc_gpt35_df, family=binomial)


In [59]:
rot13enc_claude3_notokens_model <- bayesglm(correct ~ input_logprob + output_logprob + input_nchars, 
               data=scaled_rot13enc_claude3_df, family=binomial)


In [61]:
rot13dec_gpt4_model <- bayesglm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=scaled_rot13dec_gpt4_df, family=binomial)



In [62]:
rot13dec_gpt35_model <- bayesglm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=scaled_rot13dec_gpt35_df, family=binomial)



In [63]:
rot13dec_llama3_model <- bayesglm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars, 
               data=scaled_rot13dec_llama3_df, family=binomial)


In [64]:
rot13dec_claude3_notokens_model <- bayesglm(correct ~ input_logprob + output_logprob + input_nchars, 
               data=scaled_rot13dec_claude3_df, family=binomial)


In [65]:
summary(rot13enc_gpt4_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_ntokens + output_ntokens + input_nchars, family = binomial, 
    data = scaled_rot13enc_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.92342  -0.49003  -0.17866  -0.02434   2.85555  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.7200     0.4851  -7.669 1.74e-14 ***
input_logprob    1.0373     0.4259   2.436 0.014869 *  
output_logprob   4.7741     1.4615   3.267 0.001088 ** 
input_ntokens    2.6572     0.7034   3.778 0.000158 ***
output_ntokens  -0.2733     1.0073  -0.271 0.786108    
input_nchars     0.1475     1.0598   0.139 0.889317    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 242.98  on 299  degrees of freedom
Residual deviance: 157.21  on 294  degrees of freedom
AIC: 169.21

Number of Fisher Scoring iterations: 19


In [66]:
summary(rot13enc_gpt35_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_ntokens + output_ntokens + input_nchars, family = binomial, 
    data = scaled_rot13enc_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.33707  -0.12623  -0.03419  -0.00108   2.90055  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -9.075410   1.885411  -4.813 1.48e-06 ***
input_logprob  -0.119177   0.739537  -0.161  0.87197    
output_logprob  7.712581   2.350568   3.281  0.00103 ** 
input_ntokens   1.361564   1.276409   1.067  0.28610    
output_ntokens  0.005853   1.170686   0.005  0.99601    
input_nchars   -0.121317   1.180869  -0.103  0.91817    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 87.687  on 299  degrees of freedom
Residual deviance: 41.666  on 294  degrees of freedom
AIC: 53.666

Number of Fisher Scoring iterations: 

In [69]:
summary(rot13enc_claude3_notokens_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_nchars, family = binomial, data = scaled_rot13enc_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6320  -1.3178   0.8353   0.9394   1.4932  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.5019     0.1213   4.139 3.49e-05 ***
input_logprob    0.3205     0.1622   1.976   0.0482 *  
output_logprob   1.4396     0.7533   1.911   0.0560 .  
input_nchars     1.3973     0.7477   1.869   0.0617 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 398.44  on 299  degrees of freedom
Residual deviance: 383.53  on 296  degrees of freedom
AIC: 391.53

Number of Fisher Scoring iterations: 24


In [74]:
summary(rot13dec_gpt4_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_ntokens + output_ntokens + input_nchars, family = binomial, 
    data = scaled_rot13dec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3901  -0.8476  -0.4254   0.9795   2.7135  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.4029     0.1929  -7.271 3.57e-13 ***
input_logprob    0.2163     0.7732   0.280   0.7796    
output_logprob   1.8513     0.3227   5.738 9.60e-09 ***
input_ntokens   -0.8139     0.8317  -0.979   0.3278    
output_ntokens   0.8600     0.4629   1.858   0.0632 .  
input_nchars     0.5246     0.8650   0.606   0.5442    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 359.48  on 299  degrees of freedom
Residual deviance: 288.31  on 294  degrees of freedom
AIC: 300.31

Number of Fisher Scoring iterations: 12


In [75]:
summary(rot13dec_gpt35_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_ntokens + output_ntokens + input_nchars, family = binomial, 
    data = scaled_rot13dec_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.34688  -0.40711  -0.14816  -0.02251   2.56717  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -4.34685    0.60444  -7.191 6.41e-13 ***
input_logprob   0.59053    1.01852   0.580 0.562054    
output_logprob  2.95504    0.72006   4.104 4.06e-05 ***
input_ntokens  -2.90217    1.39163  -2.085 0.037029 *  
output_ntokens  2.94379    0.78781   3.737 0.000186 ***
input_nchars   -0.08583    0.99556  -0.086 0.931299    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 203.69  on 299  degrees of freedom
Residual deviance: 127.75  on 294  degrees of freedom
AIC: 139.75

Number of Fisher Scoring iterations: 34


In [76]:
summary(rot13dec_llama3_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_ntokens + output_ntokens + input_nchars, family = binomial, 
    data = scaled_rot13dec_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.68915  -0.22357  -0.09065  -0.02453   2.55521  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -6.20600    1.27613  -4.863 1.16e-06 ***
input_logprob   0.37816    1.01170   0.374   0.7086    
output_logprob  3.25764    1.43725   2.267   0.0234 *  
input_ntokens  -0.60053    1.05447  -0.570   0.5690    
output_ntokens  0.61985    0.90132   0.688   0.4916    
input_nchars    0.02311    0.99816   0.023   0.9815    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 58.823  on 299  degrees of freedom
Residual deviance: 42.219  on 294  degrees of freedom
AIC: 54.219

Number of Fisher Scoring iterations: 32


In [78]:
summary(rot13dec_claude3_notokens_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_nchars, family = binomial, data = scaled_rot13dec_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9459   0.1536   0.3023   0.4555   0.8934  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      3.0428     0.3404   8.940  < 2e-16 ***
input_logprob   -2.0510     0.9312  -2.203 0.027627 *  
output_logprob   1.4972     0.3929   3.811 0.000139 ***
input_nchars    -0.1007     0.8806  -0.114 0.908975    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 167.26  on 299  degrees of freedom
Residual deviance: 141.57  on 296  degrees of freedom
AIC: 149.57

Number of Fisher Scoring iterations: 20


In [71]:
vif(rot13enc_gpt4_model)

In [72]:
vif(rot13enc_gpt35_model)

In [73]:
vif(rot13enc_claude3_notokens_model)

In [79]:
vif(rot13dec_gpt4_model)

In [80]:
vif(rot13dec_gpt35_model)

In [81]:
vif(rot13dec_llama3_model)

In [82]:
vif(rot13dec_claude3_notokens_model)

### Comparing rot-12 to rot-13

In [5]:
# Read in data
rot13and12enc_gpt4_df <- read.table(file = 'table_rot13and12enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and12enc_gpt35_df <- read.table(file = 'table_rot13and12enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
rot13and12enc_llama3_df <- read.table(file = 'table_rot13and12enc_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
rot13and12enc_claude3_df <- read.table(file = 'table_rot13and12enc_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
rot13and12enc_gemini1_df <- read.table(file = 'table_rot13and12enc_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)

rot13and12dec_gpt4_df <- read.table(file = 'table_rot13and12dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and12dec_gpt35_df <- read.table(file = 'table_rot13and12dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
rot13and12dec_llama3_df <- read.table(file = 'table_rot13and12dec_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
rot13and12dec_claude3_df <- read.table(file = 'table_rot13and12dec_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
rot13and12dec_gemini1_df <- read.table(file = 'table_rot13and12dec_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)



In [6]:
# Z-score data
scaled_rot13and12enc_gpt4_df <- scale_taskpair_df(rot13and12enc_gpt4_df)
scaled_rot13and12enc_gpt35_df <- scale_taskpair_df(rot13and12enc_gpt35_df)
scaled_rot13and12enc_llama3_df <- scale_taskpair_df(rot13and12enc_llama3_df)
scaled_rot13and12enc_claude3_df <- scale_taskpair_df(rot13and12enc_claude3_df)
scaled_rot13and12enc_gemini1_df <- scale_taskpair_df(rot13and12enc_gemini1_df)

scaled_rot13and12dec_gpt4_df <- scale_taskpair_df(rot13and12dec_gpt4_df)
scaled_rot13and12dec_gpt35_df <- scale_taskpair_df(rot13and12dec_gpt35_df)
scaled_rot13and12dec_llama3_df <- scale_taskpair_df(rot13and12dec_llama3_df)
scaled_rot13and12dec_claude3_df <- scale_taskpair_df(rot13and12dec_claude3_df)
scaled_rot13and12dec_gemini1_df <- scale_taskpair_df(rot13and12dec_gemini1_df)


In [7]:
model_taskenc4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt4_df, family=binomial)


In [8]:
model_taskenc35 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt35_df, family=binomial)

In [9]:
model_taskencllama <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_llama3_df, family=binomial)

In [10]:
model_taskencclaude_notokens <- bayesglm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_claude3_df, family=binomial)

In [11]:
model_taskencgemini <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gemini1_df, family=binomial)

In [12]:
model_taskdec4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt4_df, family=binomial)

In [13]:
model_taskdec35 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt35_df, family=binomial)

In [14]:
model_taskdecllama <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_llama3_df, family=binomial)

In [15]:
model_taskdecclaude_notokens <- bayesglm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_claude3_df, family=binomial)

In [16]:
model_taskdecgemini <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gemini1_df, family=binomial)

In [17]:
summary(model_taskenc4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28312  -0.21774  -0.08058  -0.01366   2.80135  

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -7.272566   1.675602  -4.340 1.42e-05 ***
taskrot13enc_highprob  4.432712   1.581314   2.803  0.00506 ** 
input_nchars           0.001381   1.062100   0.001  0.99896    
input_ntokens          2.016459   0.887241   2.273  0.02304 *  
output_ntokens        -1.066433   1.248852  -0.854  0.39314    
input_logprob          0.893642   0.737659   1.211  0.22572    
output_logprob         2.661051   1.479676   1.798  0.07211 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 134.373  on 199

In [18]:
summary(model_taskenc35)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.43464  -0.14896  -0.08135  -0.03616   2.66492  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)            -6.8305     1.8784  -3.636 0.000277 ***
taskrot13enc_highprob   1.5992     1.4802   1.080 0.279968    
input_nchars           -0.5488     1.1410  -0.481 0.630529    
input_ntokens           0.2414     1.0308   0.234 0.814841    
output_ntokens         -0.3848     1.1211  -0.343 0.731392    
input_logprob           0.8396     1.1280   0.744 0.456668    
output_logprob          0.5746     1.1457   0.501 0.616035    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 22.401  on 199  degree

In [19]:
summary(model_taskencllama)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.03007  -0.03007  -0.03007  -0.03007  -0.03007  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)  
(Intercept)           -7.701e+00  3.323e+00  -2.318   0.0205 *
taskrot13enc_highprob -7.049e-16  2.233e+00   0.000   1.0000  
input_nchars          -1.958e-16  1.153e+00   0.000   1.0000  
input_ntokens         -1.587e-16  1.150e+00   0.000   1.0000  
output_ntokens        -2.019e-16  1.153e+00   0.000   1.0000  
input_logprob          2.984e-16  1.149e+00   0.000   1.0000  
output_logprob         1.959e-16  1.152e+00   0.000   1.0000  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 0.00000  on 199  degr

In [20]:
summary(model_taskencclaude_notokens)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8686  -0.7500  -0.4167   0.7911   2.0890  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)            -1.4703     0.2671  -5.504 3.71e-08 ***
taskrot13enc_highprob   2.2867     0.3653   6.259 3.87e-10 ***
input_nchars            0.5065     0.7414   0.683    0.494    
input_logprob           0.3360     0.4022   0.835    0.404    
output_logprob          0.7875     0.7329   1.074    0.283    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 274.83  on 199  degrees of freedom
Residual deviance: 206.14  on 195  degrees of freedom
AIC: 216.14

Number of Fisher Scoring iterations: 11


In [21]:
summary(model_taskencgemini)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.03007  -0.03007  -0.03007  -0.03007  -0.03007  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)  
(Intercept)           -7.701e+00  3.323e+00  -2.318   0.0205 *
taskrot13enc_highprob -1.035e-16  2.233e+00   0.000   1.0000  
input_nchars          -9.108e-17  1.153e+00   0.000   1.0000  
input_ntokens         -9.870e-17  1.149e+00   0.000   1.0000  
output_ntokens        -8.006e-17  1.152e+00   0.000   1.0000  
input_logprob          1.062e-16  1.149e+00   0.000   1.0000  
output_logprob         6.396e-17  1.152e+00   0.000   1.0000  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 0.00000  on 199  deg

In [22]:
summary(model_taskdec4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.49950  -0.13260  -0.06963   0.63622   1.84601  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)            -6.0516     1.7385  -3.481 0.000500 ***
taskrot13dec_highprob   6.0198     1.7507   3.438 0.000585 ***
input_nchars            0.3016     0.8979   0.336 0.736918    
input_ntokens          -0.3545     0.8826  -0.402 0.687951    
output_ntokens          1.0815     0.6332   1.708 0.087645 .  
input_logprob           0.1191     0.8326   0.143 0.886289    
output_logprob          1.4528     0.5552   2.617 0.008877 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 227.10  on 199  degrees

In [23]:
summary(model_taskdec35)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28297  -0.30232  -0.09453  -0.02526   2.29894  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)            -6.8275     1.6299  -4.189  2.8e-05 ***
taskrot13dec_highprob   4.4997     1.5854   2.838  0.00454 ** 
input_nchars           -0.1492     0.9860  -0.151  0.87973    
input_ntokens          -0.8666     1.0905  -0.795  0.42678    
output_ntokens          2.1791     0.8559   2.546  0.01090 *  
input_logprob           0.7897     1.0254   0.770  0.44125    
output_logprob          2.3232     0.8689   2.674  0.00750 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 134.373  on 199  degre

In [24]:
summary(model_taskdecllama)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.67248  -0.24449  -0.10674  -0.05042   2.54849  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)            -6.4064     1.5754  -4.067 4.77e-05 ***
taskrot13dec_highprob   2.4704     1.4370   1.719   0.0856 .  
input_nchars           -0.0869     1.0201  -0.085   0.9321    
input_ntokens          -0.7023     1.1243  -0.625   0.5322    
output_ntokens          0.3266     0.9191   0.355   0.7223    
input_logprob           0.5646     1.0722   0.527   0.5985    
output_logprob          0.7525     0.9427   0.798   0.4247    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 46.763  on 199  degre

In [26]:
summary(model_taskdecclaude_notokens)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7354  -0.7998   0.2026   0.2381   1.6626  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.79018    0.22616  -3.494 0.000476 ***
taskrot13dec_highprob  4.54183    0.69554   6.530 6.58e-11 ***
input_nchars           0.49815    0.74170   0.672 0.501815    
input_logprob          0.07247    0.71469   0.101 0.919233    
output_logprob         0.19334    0.45542   0.425 0.671169    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 260.19  on 199  degrees of freedom
Residual deviance: 141.37  on 195  degrees of freedom
AIC: 151.37

Number of Fisher Scoring iterations: 11


In [27]:
summary(model_taskdecgemini)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.03007  -0.03007  -0.03007  -0.03007  -0.03007  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)  
(Intercept)           -7.701e+00  3.323e+00  -2.318   0.0205 *
taskrot13dec_highprob -1.035e-16  2.233e+00   0.000   1.0000  
input_nchars          -9.095e-17  1.153e+00   0.000   1.0000  
input_ntokens         -7.168e-17  1.152e+00   0.000   1.0000  
output_ntokens        -9.978e-17  1.149e+00   0.000   1.0000  
input_logprob          7.115e-17  1.152e+00   0.000   1.0000  
output_logprob         1.076e-16  1.149e+00   0.000   1.0000  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 0.00000  on 199  deg

In [28]:
vif(model_taskenc4)

In [29]:
vif(model_taskenc35)

In [30]:
vif(model_taskencllama)

In [32]:
vif(model_taskencclaude_notokens)

In [33]:
vif(model_taskencgemini)

In [34]:
vif(model_taskdec4)

In [35]:
vif(model_taskdec35)

In [36]:
vif(model_taskdecllama)

In [38]:
vif(model_taskdecclaude_notokens)

In [39]:
vif(model_taskdecgemini)

### Input and output logprob for different prompt styles

In [83]:
# Read in data
rot13encbasic_gpt4_df <- read.table(file = 'table_rot13enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13decbasic_gpt4_df <- read.table(file = 'table_rot13dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)

rot13encstep_gpt4_df <- read.table(file = 'table_rot13encstep_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13decstep_gpt4_df <- read.table(file = 'table_rot13decstep_gpt-4-0613.tsv', sep = '\t', header = TRUE)

rot13enccot_gpt4_df <- read.table(file = 'table_rot13enccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13deccot_gpt4_df <- read.table(file = 'table_rot13deccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)


In [84]:
# Z-score data
scaled_rot13encbasic_gpt4_df <- scale_df(rot13encbasic_gpt4_df)
scaled_rot13decbasic_gpt4_df <- scale_df(rot13decbasic_gpt4_df)

scaled_rot13encstep_gpt4_df <- scale_df(rot13encstep_gpt4_df)
scaled_rot13decstep_gpt4_df <- scale_df(rot13decstep_gpt4_df)

scaled_rot13enccot_gpt4_df <- scale_df(rot13enccot_gpt4_df)
scaled_rot13deccot_gpt4_df <- scale_df(rot13deccot_gpt4_df)


In [85]:
rot13encbasic_gpt4_model <- correct_vs_length_and_prob(scaled_rot13encbasic_gpt4_df, include_output_chars=FALSE)
rot13decbasic_gpt4_model <- correct_vs_length_and_prob(scaled_rot13decbasic_gpt4_df, include_output_chars=FALSE)

rot13encstep_gpt4_model <- correct_vs_length_and_prob(scaled_rot13encstep_gpt4_df, include_output_chars=FALSE)
rot13decstep_gpt4_model <- correct_vs_length_and_prob(scaled_rot13decstep_gpt4_df, include_output_chars=FALSE)

rot13enccot_gpt4_model <- correct_vs_length_and_prob(scaled_rot13enccot_gpt4_df, include_output_chars=FALSE)
rot13deccot_gpt4_model <- correct_vs_length_and_prob(scaled_rot13deccot_gpt4_df, include_output_chars=FALSE)



In [86]:
summary(rot13encbasic_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.99812  -0.44168  -0.14960  -0.01557   2.95372  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.1263     0.5728  -7.204 5.84e-13 ***
input_logprob    1.2287     0.4840   2.539  0.01112 *  
output_logprob   7.8371     3.2790   2.390  0.01684 *  
input_ntokens    2.7704     0.9766   2.837  0.00456 ** 
output_ntokens  -0.9145     2.5316  -0.361  0.71793    
input_nchars     3.5012     3.3936   1.032  0.30221    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 242.98  on 299  degrees of freedom
Residual deviance: 155.44  on 294  degrees of freedom
AIC: 167.44

Number of Fisher Scoring iterations: 7


In [87]:
summary(rot13decbasic_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5419  -0.8313  -0.4035   0.9128   2.6598  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.4549     0.2014  -7.224 5.04e-13 ***
input_logprob    0.8315     1.7469   0.476    0.634    
output_logprob   1.9277     0.3396   5.676 1.38e-08 ***
input_ntokens   -2.5684     1.6014  -1.604    0.109    
output_ntokens   0.8680     0.6047   1.436    0.151    
input_nchars     2.8939     2.0645   1.402    0.161    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 359.48  on 299  degrees of freedom
Residual deviance: 286.22  on 294  degrees of freedom
AIC: 298.22

Number of Fisher Scoring iterations: 6


In [88]:
summary(rot13encstep_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.73379  -0.51483  -0.20383  -0.03238   2.75493  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.5963     0.4842  -7.427 1.11e-13 ***
input_logprob    0.8294     0.4394   1.888   0.0591 .  
output_logprob   6.8981     3.0990   2.226   0.0260 *  
input_ntokens    2.1459     0.9113   2.355   0.0185 *  
output_ntokens   0.4738     2.4126   0.196   0.8443    
input_nchars     2.1449     3.1837   0.674   0.5005    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 235.60  on 299  degrees of freedom
Residual deviance: 166.53  on 294  degrees of freedom
AIC: 178.53

Number of Fisher Scoring iterations: 7


In [89]:
summary(rot13decstep_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6857  -0.8601  -0.4367   0.9461   2.8363  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.9332     0.1641  -5.686 1.30e-08 ***
input_logprob    1.6491     1.6681   0.989   0.3229    
output_logprob   1.3987     0.2705   5.172 2.32e-07 ***
input_ntokens   -1.4723     1.5050  -0.978   0.3279    
output_ntokens   1.4870     0.5932   2.507   0.0122 *  
input_nchars     1.5574     1.9448   0.801   0.4232    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 392.05  on 299  degrees of freedom
Residual deviance: 314.33  on 294  degrees of freedom
AIC: 326.33

Number of Fisher Scoring iterations: 5


In [90]:
summary(rot13enccot_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28774  -0.20523  -0.04222  -0.00230   2.98322  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -7.5672     1.3560  -5.581  2.4e-08 ***
input_logprob    1.8554     0.9072   2.045   0.0408 *  
output_logprob   7.4052     5.2162   1.420   0.1557    
input_ntokens    4.2542     1.8051   2.357   0.0184 *  
output_ntokens  -6.2785     4.9176  -1.277   0.2017    
input_nchars     5.2889     5.8233   0.908   0.3638    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 141.63  on 299  degrees of freedom
Residual deviance:  77.40  on 294  degrees of freedom
AIC: 89.4

Number of Fisher Scoring iterations: 9


In [91]:
summary(rot13deccot_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9243  -1.0111   0.6140   0.8735   2.4145  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.16840    0.13331   1.263    0.207    
input_logprob   0.83460    1.42938   0.584    0.559    
output_logprob  1.29187    0.21888   5.902 3.59e-09 ***
input_ntokens  -2.08775    1.36528  -1.529    0.126    
output_ntokens  0.08215    0.50933   0.161    0.872    
input_nchars    2.97586    1.83157   1.625    0.104    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 412.47  on 299  degrees of freedom
Residual deviance: 339.83  on 294  degrees of freedom
AIC: 351.83

Number of Fisher Scoring iterations: 4


In [92]:
vif(rot13encbasic_gpt4_model)

In [93]:
vif(rot13decbasic_gpt4_model)

In [94]:
vif(rot13encstep_gpt4_model)

In [95]:
vif(rot13decstep_gpt4_model)

In [96]:
vif(rot13enccot_gpt4_model)

In [97]:
vif(rot13deccot_gpt4_model)

### Comparing rot-13 and rot-12 for different prompt styles

In [104]:
# Read in data
rot13and12encbasic_gpt4_df <- read.table(file = 'table_rot13and12enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and12encstep_gpt4_df <- read.table(file = 'table_rot13and12encstep_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and12enccot_gpt4_df <- read.table(file = 'table_rot13and12enccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)

rot13and12decbasic_gpt4_df <- read.table(file = 'table_rot13and12dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and12decstep_gpt4_df <- read.table(file = 'table_rot13and12decstep_gpt-4-0613.tsv', sep = '\t', header = TRUE)
rot13and12deccot_gpt4_df <- read.table(file = 'table_rot13and12deccot_gpt-4-0613.tsv', sep = '\t', header = TRUE)


In [105]:
# Z-score data
scaled_rot13and12encbasic_gpt4_df <- scale_taskpair_df(rot13and12encbasic_gpt4_df)
scaled_rot13and12encstep_gpt4_df <- scale_taskpair_df(rot13and12encstep_gpt4_df)
scaled_rot13and12enccot_gpt4_df <- scale_taskpair_df(rot13and12enccot_gpt4_df)


scaled_rot13and12decbasic_gpt4_df <- scale_taskpair_df(rot13and12decbasic_gpt4_df)
scaled_rot13and12decstep_gpt4_df <- scale_taskpair_df(rot13and12decstep_gpt4_df)
scaled_rot13and12deccot_gpt4_df <- scale_taskpair_df(rot13and12deccot_gpt4_df)



In [106]:
model_taskencbasic4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12encbasic_gpt4_df, family=binomial)
model_taskencstep4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12encstep_gpt4_df, family=binomial)
model_taskenccot4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enccot_gpt4_df, family=binomial)

model_taskdecbasic4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12decbasic_gpt4_df, family=binomial)
model_taskdecstep4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12decstep_gpt4_df, family=binomial)
model_taskdeccot4 <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12deccot_gpt4_df, family=binomial)

In [107]:
summary(model_taskencbasic4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12encbasic_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28312  -0.21774  -0.08058  -0.01366   2.80135  

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -7.272566   1.675602  -4.340 1.42e-05 ***
taskrot13enc_highprob  4.432712   1.581314   2.803  0.00506 ** 
input_nchars           0.001381   1.062100   0.001  0.99896    
input_ntokens          2.016459   0.887241   2.273  0.02304 *  
output_ntokens        -1.066433   1.248852  -0.854  0.39314    
input_logprob          0.893642   0.737659   1.211  0.22572    
output_logprob         2.661051   1.479676   1.798  0.07211 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 134.373  o

In [108]:
summary(model_taskencstep4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12encstep_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.31219  -0.34216  -0.09744  -0.04783   2.48651  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)  
(Intercept)       -1.4424     1.3754  -1.049   0.2943  
taskshiftstep_12  -5.3878     2.3775  -2.266   0.0234 *
input_nchars      -0.2802     0.9068  -0.309   0.7573  
input_ntokens      0.3709     1.0397   0.357   0.7213  
output_ntokens    -0.6206     0.9262  -0.670   0.5028  
input_logprob      0.9663     1.2766   0.757   0.4491  
output_logprob     0.9442     1.0563   0.894   0.3714  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 130.033  on 199  degrees of freedom
Residual deviance:  78.851  on 193  deg

In [109]:
summary(model_taskenccot4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enccot_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.24051  -0.19955  -0.08461  -0.03583   2.15062  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)  
(Intercept)      -2.6646     1.5247  -1.748   0.0805 .
taskshiftcot_12  -4.9728     2.3010  -2.161   0.0307 *
input_nchars     -0.5043     1.0145  -0.497   0.6191  
input_ntokens     0.1931     1.0851   0.178   0.8588  
output_ntokens   -1.3570     1.1765  -1.153   0.2487  
input_logprob     0.9112     1.3067   0.697   0.4856  
output_logprob    0.9434     1.1740   0.804   0.4216  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 85.193  on 199  degrees of freedom
Residual deviance: 47.979  on 193  degrees of fre

In [110]:
summary(model_taskdecbasic4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12decbasic_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.49950  -0.13260  -0.06963   0.63622   1.84601  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)            -6.0516     1.7385  -3.481 0.000500 ***
taskrot13dec_highprob   6.0198     1.7507   3.438 0.000585 ***
input_nchars            0.3016     0.8979   0.336 0.736918    
input_ntokens          -0.3545     0.8826  -0.402 0.687951    
output_ntokens          1.0815     0.6332   1.708 0.087645 .  
input_logprob           0.1191     0.8326   0.143 0.886289    
output_logprob          1.4528     0.5552   2.617 0.008877 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 227.10  on 199  de

In [111]:
summary(model_taskdecstep4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12decstep_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.58037  -0.11766  -0.06859   0.68736   2.18400  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       0.02093    0.23739   0.088 0.929756    
taskshiftstep_12 -6.19736    1.74206  -3.557 0.000374 ***
input_nchars      0.09611    0.88863   0.108 0.913876    
input_ntokens    -0.43838    0.90770  -0.483 0.629124    
output_ntokens    0.79551    0.62444   1.274 0.202675    
input_logprob     0.19651    0.84033   0.234 0.815101    
output_logprob    1.22614    0.55345   2.215 0.026730 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 231.29  on 199  degrees of freedom
Residual deviance: 117.1

In [112]:
summary(model_taskdeccot4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12deccot_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.15101  -0.08213  -0.04706   0.53706   1.39744  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.39453    0.27887   5.001 5.72e-07 ***
taskshiftcot_12 -7.60737    1.91115  -3.981 6.88e-05 ***
input_nchars     0.46638    0.92162   0.506    0.613    
input_ntokens    0.08506    0.88289   0.096    0.923    
output_ntokens  -0.55992    0.60273  -0.929    0.353    
input_logprob   -0.03679    0.84415  -0.044    0.965    
output_logprob   0.73405    0.53124   1.382    0.167    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 267.499  on 199  degrees of freedom
Residual deviance:  94.683  on 1

In [113]:
vif(model_taskencbasic4)

In [114]:
vif(model_taskencstep4)

In [115]:
vif(model_taskenccot4)

In [116]:
vif(model_taskdecbasic4)

In [117]:
vif(model_taskdecstep4)

In [118]:
vif(model_taskdeccot4)

### Comparing prompt techniques

In [119]:
# Read in data
rot13encprompts_gpt4_df <- read.table(file = 'table_rot13enc_prompt_comparison.tsv', sep = '\t', header = TRUE)
rot13decprompts_gpt4_df <- read.table(file = 'table_rot13dec_prompt_comparison.tsv', sep = '\t', header = TRUE)

In [120]:
head(rot13encprompts_gpt4_df)

Unnamed: 0_level_0,index,prompt,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,correct
Unnamed: 0_level_1,<int>,<chr>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>
1,0,,48,11,-27.93722,48,22,-140.6093,1
2,1,,52,11,-28.95337,52,26,-163.6525,0
3,2,,52,11,-26.48786,52,25,-171.8435,1
4,3,,65,15,-37.63794,65,31,-206.2903,1
5,4,,37,10,-24.1028,37,18,-123.9919,1
6,5,,46,11,-31.4136,46,21,-147.9776,1


In [121]:
scale_prompt_df <- function(df) {
    new_df <- data.frame(scale(df[3:8]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    new_df$prompt <- df$prompt
    
    return(new_df)
}

scaled_rot13encprompts_gpt4_df <- scale_prompt_df(rot13encprompts_gpt4_df)
scaled_rot13decprompts_gpt4_df <- scale_prompt_df(rot13decprompts_gpt4_df)


In [122]:
head(scaled_rot13encprompts_gpt4_df)

Unnamed: 0_level_0,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,index,correct,prompt
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<int>,<chr>
1,-1.0989416,-1.066837,1.2245026,-1.0989416,-1.1392848,1.226839,0,1,
2,-1.0122059,-1.066837,1.1836319,-1.0122059,-0.9522869,1.0522985,1,0,
3,-1.0122059,-1.066837,1.2827981,-1.0122059,-0.9990364,0.9902564,2,1,
4,-0.7303148,-0.578582,0.8343259,-0.7303148,-0.7185395,0.7293406,3,1,
5,-1.3374649,-1.188901,1.3787283,-1.3374649,-1.3262827,1.3527068,4,1,
6,-1.1423095,-1.066837,1.0846777,-1.1423095,-1.1860343,1.1710282,5,1,


In [123]:
scaled_rot13encprompts_gpt4_df$prompt <- factor(scaled_rot13encprompts_gpt4_df$prompt)
scaled_rot13decprompts_gpt4_df$prompt <- factor(scaled_rot13decprompts_gpt4_df$prompt)

contrasts(scaled_rot13encprompts_gpt4_df$prompt) <- contr.sum(3)
contrasts(scaled_rot13decprompts_gpt4_df$prompt) <- contr.sum(3)


In [124]:
rot13encprompts_gpt4_model <- glmer(correct ~ prompt + (1|index), 
               data=scaled_rot13encprompts_gpt4_df, family=binomial)
rot13decprompts_gpt4_model <- glmer(correct ~ prompt + (1|index), 
               data=scaled_rot13decprompts_gpt4_df, family=binomial)

In [125]:
rot13encprompts_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=scaled_rot13encprompts_gpt4_df, family=binomial)
rot13decprompts_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=scaled_rot13decprompts_gpt4_df, family=binomial)

In [126]:
anova(rot13encprompts_gpt4_model,rot13encprompts_gpt4_null_model,prompt="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rot13encprompts_gpt4_null_model,2,168.669,176.0766,-82.3345,164.669,,,
rot13encprompts_gpt4_model,4,146.33,161.1451,-69.16499,138.33,26.33901,2.0,1.907901e-06


In [127]:
anova(rot13decprompts_gpt4_model,rot13decprompts_gpt4_null_model,prompt="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rot13decprompts_gpt4_null_model,2,360.7307,368.1382,-178.3653,356.7307,,,
rot13decprompts_gpt4_model,4,325.0535,339.8686,-158.5267,317.0535,39.67717,2.0,2.42221e-09


In [128]:
gpt4enc_rot13_multcomp <- glht(rot13encprompts_gpt4_model, linfct=mcp(prompt="Tukey"))
gpt4dec_rot13_multcomp <- glht(rot13decprompts_gpt4_model, linfct=mcp(prompt="Tukey"))


In [129]:
summary(gpt4enc_rot13_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ prompt + (1 | index), data = scaled_rot13encprompts_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                Estimate Std. Error z value Pr(>|z|)   
cot -  == 0      -6.3926     1.9508  -3.277  0.00290 **
step -  == 0     -0.4289     0.9364  -0.458  0.88607   
step - cot == 0   5.9637     1.8963   3.145  0.00405 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- single-step method)


In [130]:
summary(gpt4dec_rot13_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ prompt + (1 | index), data = scaled_rot13decprompts_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                Estimate Std. Error z value Pr(>|z|)    
cot -  == 0       2.8054     0.5948   4.717  < 1e-05 ***
step -  == 0      0.1839     0.4291   0.429    0.902    
step - cot == 0  -2.6214     0.5818  -4.506 1.52e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- single-step method)


## Shift ciphers: Few-shot

In [131]:
rot13enc_gpt4_0shot_df <- read.table(file = 'table_few_rot13enc_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13enc_gpt4_5shot_df <- read.table(file = 'table_few_rot13enc_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13enc_gpt4_10shot_df <- read.table(file = 'table_few_rot13enc_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13enc_gpt35_0shot_df <- read.table(file = 'table_few_rot13enc_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13enc_gpt35_5shot_df <- read.table(file = 'table_few_rot13enc_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13enc_gpt35_10shot_df <- read.table(file = 'table_few_rot13enc_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13enc_claude3_0shot_df <- read.table(file = 'table_few_rot13enc_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
rot13enc_claude3_5shot_df <- read.table(file = 'table_few_rot13enc_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
rot13enc_claude3_10shot_df <- read.table(file = 'table_few_rot13enc_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)

rot13enc_gpt35ft_0shot_df <- read.table(file = 'table_few_rot13enc_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13enc_gpt35ft_10shot_df <- read.table(file = 'table_few_rot13enc_ft_gpt-3.5_10shot_0shot.tsv', sep = '\t', header = TRUE)
rot13enc_gpt35ft_100shot_df <- read.table(file = 'table_few_rot13enc_ft_gpt-3.5_100shot_0shot.tsv', sep = '\t', header = TRUE)



rot13dec_gpt4_0shot_df <- read.table(file = 'table_few_rot13dec_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13dec_gpt4_5shot_df <- read.table(file = 'table_few_rot13dec_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13dec_gpt4_10shot_df <- read.table(file = 'table_few_rot13dec_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13dec_gpt35_0shot_df <- read.table(file = 'table_few_rot13dec_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13dec_gpt35_5shot_df <- read.table(file = 'table_few_rot13dec_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13dec_gpt35_10shot_df <- read.table(file = 'table_few_rot13dec_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13dec_claude3_0shot_df <- read.table(file = 'table_few_rot13dec_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
rot13dec_claude3_5shot_df <- read.table(file = 'table_few_rot13dec_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
rot13dec_claude3_10shot_df <- read.table(file = 'table_few_rot13dec_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)

rot13dec_gpt35ft_0shot_df <- read.table(file = 'table_few_rot13dec_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13dec_gpt35ft_10shot_df <- read.table(file = 'table_few_rot13dec_ft_gpt-3.5_10shot_0shot.tsv', sep = '\t', header = TRUE)
rot13dec_gpt35ft_100shot_df <- read.table(file = 'table_few_rot13dec_ft_gpt-3.5_100shot_0shot.tsv', sep = '\t', header = TRUE)



In [132]:
scaled_rot13enc_gpt4_0shot_df <- scale_df(rot13enc_gpt4_0shot_df)
scaled_rot13enc_gpt4_5shot_df <- scale_df(rot13enc_gpt4_5shot_df)
scaled_rot13enc_gpt4_10shot_df <- scale_df(rot13enc_gpt4_10shot_df)

scaled_rot13enc_gpt35_0shot_df <- scale_df(rot13enc_gpt35_0shot_df)
scaled_rot13enc_gpt35_5shot_df <- scale_df(rot13enc_gpt35_5shot_df)
scaled_rot13enc_gpt35_10shot_df <- scale_df(rot13enc_gpt35_10shot_df)

scaled_rot13enc_claude3_0shot_df <- scale_df(rot13enc_claude3_0shot_df)
scaled_rot13enc_claude3_5shot_df <- scale_df(rot13enc_claude3_5shot_df)
scaled_rot13enc_claude3_10shot_df <- scale_df(rot13enc_claude3_10shot_df)

scaled_rot13enc_gpt35ft_0shot_df <- scale_df(rot13enc_gpt35ft_0shot_df)
scaled_rot13enc_gpt35ft_10shot_df <- scale_df(rot13enc_gpt35ft_10shot_df)
scaled_rot13enc_gpt35ft_100shot_df <- scale_df(rot13enc_gpt35ft_100shot_df)



scaled_rot13dec_gpt4_0shot_df <- scale_df(rot13dec_gpt4_0shot_df)
scaled_rot13dec_gpt4_5shot_df <- scale_df(rot13dec_gpt4_5shot_df)
scaled_rot13dec_gpt4_10shot_df <- scale_df(rot13dec_gpt4_10shot_df)

scaled_rot13dec_gpt35_0shot_df <- scale_df(rot13dec_gpt35_0shot_df)
scaled_rot13dec_gpt35_5shot_df <- scale_df(rot13dec_gpt35_5shot_df)
scaled_rot13dec_gpt35_10shot_df <- scale_df(rot13dec_gpt35_10shot_df)

scaled_rot13dec_claude3_0shot_df <- scale_df(rot13dec_claude3_0shot_df)
scaled_rot13dec_claude3_5shot_df <- scale_df(rot13dec_claude3_5shot_df)
scaled_rot13dec_claude3_10shot_df <- scale_df(rot13dec_claude3_10shot_df)

scaled_rot13dec_gpt35ft_0shot_df <- scale_df(rot13dec_gpt35ft_0shot_df)
scaled_rot13dec_gpt35ft_10shot_df <- scale_df(rot13dec_gpt35ft_10shot_df)
scaled_rot13dec_gpt35ft_100shot_df <- scale_df(rot13dec_gpt35ft_100shot_df)



In [142]:
rot13enc_gpt4_0shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt4_0shot_df, include_output_chars=FALSE)
rot13enc_gpt4_5shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt4_5shot_df, include_output_chars=FALSE)
rot13enc_gpt4_10shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt4_10shot_df, include_output_chars=FALSE)

rot13enc_gpt35_0shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt35_0shot_df, include_output_chars=FALSE)
rot13enc_gpt35_5shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt35_5shot_df, include_output_chars=FALSE)
rot13enc_gpt35_10shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt35_10shot_df, include_output_chars=FALSE)

rot13enc_claude3_0shot_model <- correct_vs_length_and_prob(scaled_rot13enc_claude3_0shot_df, include_output_chars=FALSE)
rot13enc_claude3_5shot_model <- correct_vs_length_and_prob(scaled_rot13enc_claude3_5shot_df, include_output_chars=FALSE)
rot13enc_claude3_10shot_model <- correct_vs_length_and_prob(scaled_rot13enc_claude3_10shot_df, include_output_chars=FALSE)

rot13enc_claude3_0shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                           data=scaled_rot13enc_claude3_0shot_df, family=binomial)
rot13enc_claude3_5shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                           data=scaled_rot13enc_claude3_5shot_df, family=binomial)
rot13enc_claude3_10shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                           data=scaled_rot13enc_claude3_10shot_df, family=binomial)

rot13enc_gpt35ft_0shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt35ft_0shot_df, include_output_chars=FALSE)
rot13enc_gpt35ft_10shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt35ft_10shot_df, include_output_chars=FALSE)
rot13enc_gpt35ft_100shot_model <- correct_vs_length_and_prob(scaled_rot13enc_gpt35ft_100shot_df, include_output_chars=FALSE)







rot13dec_gpt4_0shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt4_0shot_df, include_output_chars=FALSE)
rot13dec_gpt4_5shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt4_5shot_df, include_output_chars=FALSE)
rot13dec_gpt4_10shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt4_10shot_df, include_output_chars=FALSE)

rot13dec_gpt35_0shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt35_0shot_df, include_output_chars=FALSE)
rot13dec_gpt35_5shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt35_5shot_df, include_output_chars=FALSE)
rot13dec_gpt35_10shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt35_10shot_df, include_output_chars=FALSE)

rot13dec_claude3_0shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                            data=scaled_rot13dec_claude3_0shot_df, family=binomial)
rot13dec_claude3_5shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                           data=scaled_rot13dec_claude3_5shot_df, family=binomial)
rot13dec_claude3_10shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                           data=scaled_rot13dec_claude3_10shot_df, family=binomial)

rot13dec_gpt35ft_0shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt35ft_0shot_df, include_output_chars=FALSE)
rot13dec_gpt35ft_10shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt35ft_10shot_df, include_output_chars=FALSE)
rot13dec_gpt35ft_100shot_model <- correct_vs_length_and_prob(scaled_rot13dec_gpt35ft_100shot_df, include_output_chars=FALSE)



“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [134]:
summary(rot13enc_gpt4_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.00362  -0.36524  -0.09560  -0.01837   3.06018  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.7026     0.6856  -6.860 6.91e-12 ***
input_logprob    0.6138     0.5802   1.058  0.29006    
output_logprob   9.5675     3.5331   2.708  0.00677 ** 
input_ntokens    2.8680     1.2482   2.298  0.02157 *  
output_ntokens  -4.4521     2.8445  -1.565  0.11755    
input_nchars     8.5451     3.9293   2.175  0.02965 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 199.41  on 299  degrees of freedom
Residual deviance: 118.67  on 294  degrees of freedom
AIC: 130.67

Number of Fisher Scoring iterations: 8


In [135]:
summary(rot13enc_gpt4_5shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.72335  -0.41917  -0.10822  -0.02394   3.01113  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.3756     0.6362  -6.877  6.1e-12 ***
input_logprob    0.9920     0.5684   1.745  0.08094 .  
output_logprob   2.7323     3.1778   0.860  0.38989    
input_ntokens    3.4030     1.1815   2.880  0.00398 ** 
output_ntokens  -4.9931     2.6991  -1.850  0.06433 .  
input_nchars     2.0966     3.5615   0.589  0.55607    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 212.06  on 299  degrees of freedom
Residual deviance: 132.72  on 294  degrees of freedom
AIC: 144.72

Number of Fisher Scoring iterations: 7


In [136]:
summary(rot13enc_gpt4_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.62070  -0.47604  -0.15884  -0.04965   2.72257  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.6809     0.4943  -7.447 9.56e-14 ***
input_logprob    0.9121     0.5280   1.728   0.0841 .  
output_logprob   3.4352     2.9377   1.169   0.2423    
input_ntokens    2.3214     1.0367   2.239   0.0251 *  
output_ntokens  -2.9121     2.4821  -1.173   0.2407    
input_nchars     2.3019     3.3504   0.687   0.4921    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 216.14  on 299  degrees of freedom
Residual deviance: 148.47  on 294  degrees of freedom
AIC: 160.47

Number of Fisher Scoring iterations: 7


In [137]:
summary(rot13enc_gpt35_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1093  -0.1526  -0.0237  -0.0031   2.6951  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -8.0090     2.0673  -3.874 0.000107 ***
input_logprob   -0.4841     0.9724  -0.498 0.618575    
output_logprob   7.5290     6.0334   1.248 0.212070    
input_ntokens    6.8037     3.0151   2.257 0.024034 *  
output_ntokens  -6.8577     5.4911  -1.249 0.211709    
input_nchars     3.2967     6.0794   0.542 0.587634    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 80.845  on 299  degrees of freedom
Residual deviance: 50.967  on 294  degrees of freedom
AIC: 62.967

Number of Fisher Scoring iterations: 9


In [138]:
summary(rot13enc_gpt35_5shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.22269  -0.09025  -0.01284  -0.00107   2.48707  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -9.393      2.473  -3.799 0.000145 ***
input_logprob     1.520      1.147   1.325 0.185305    
output_logprob    7.533      5.959   1.264 0.206186    
input_ntokens     8.558      3.393   2.522 0.011659 *  
output_ntokens  -17.849      6.568  -2.718 0.006573 ** 
input_nchars     13.533      7.110   1.903 0.056989 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 87.687  on 299  degrees of freedom
Residual deviance: 46.263  on 294  degrees of freedom
AIC: 58.263

Number of Fisher Scoring iterations: 10


In [139]:
summary(rot13enc_gpt35_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.15280  -0.08938  -0.00952  -0.00096   2.57899  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -9.603      2.640  -3.637 0.000276 ***
input_logprob     1.124      1.171   0.960 0.337066    
output_logprob    5.465      6.202   0.881 0.378251    
input_ntokens     8.655      3.754   2.306 0.021120 *  
output_ntokens  -16.258      6.685  -2.432 0.015017 *  
input_nchars      9.166      7.216   1.270 0.204050    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 80.845  on 299  degrees of freedom
Residual deviance: 44.657  on 294  degrees of freedom
AIC: 56.657

Number of Fisher Scoring iterations: 10


In [143]:
summary(rot13enc_claude3_0shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_rot13enc_claude3_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8109  -1.1962   0.6951   0.9515   1.9129  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.2845     0.1244   2.287 0.022175 *  
input_logprob    0.0584     0.1831   0.319 0.749793    
output_logprob   4.7466     1.2070   3.932 8.41e-05 ***
input_nchars     4.1884     1.1818   3.544 0.000394 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 410.54  on 299  degrees of freedom
Residual deviance: 372.32  on 296  degrees of freedom
AIC: 380.32

Number of Fisher Scoring iterations: 4


In [144]:
summary(rot13enc_claude3_5shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_rot13enc_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8394  -1.2174   0.7019   0.8671   1.8207  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.6526     0.1281   5.095 3.48e-07 ***
input_logprob    0.3190     0.1825   1.748   0.0805 .  
output_logprob   1.5144     1.1326   1.337   0.1812    
input_nchars     1.1225     1.1172   1.005   0.3150    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 389.69  on 299  degrees of freedom
Residual deviance: 360.45  on 296  degrees of freedom
AIC: 368.45

Number of Fisher Scoring iterations: 4


In [145]:
summary(rot13enc_claude3_10shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_rot13enc_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9143  -1.1221   0.6643   0.8739   1.6733  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.6291     0.1298   4.847 1.26e-06 ***
input_logprob    0.5594     0.1895   2.953  0.00315 ** 
output_logprob   1.2675     1.1518   1.100  0.27113    
input_nchars     0.9719     1.1365   0.855  0.39244    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 392.05  on 299  degrees of freedom
Residual deviance: 352.34  on 296  degrees of freedom
AIC: 360.34

Number of Fisher Scoring iterations: 4


In [146]:
summary(rot13enc_gpt35ft_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1093  -0.1526  -0.0237  -0.0031   2.6951  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -8.0090     2.0673  -3.874 0.000107 ***
input_logprob   -0.4841     0.9724  -0.498 0.618575    
output_logprob   7.5290     6.0334   1.248 0.212070    
input_ntokens    6.8037     3.0151   2.257 0.024034 *  
output_ntokens  -6.8577     5.4911  -1.249 0.211709    
input_nchars     3.2967     6.0794   0.542 0.587634    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 80.845  on 299  degrees of freedom
Residual deviance: 50.967  on 294  degrees of freedom
AIC: 62.967

Number of Fisher Scoring iterations: 9


In [147]:
summary(rot13enc_gpt35ft_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28990  -0.07266  -0.00829  -0.00048   2.48159  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -10.7402     3.1658  -3.393 0.000692 ***
input_logprob   -0.5928     1.7836  -0.332 0.739620    
output_logprob   5.7823    10.0214   0.577 0.563942    
input_ntokens    4.8158     5.7030   0.844 0.398427    
output_ntokens   2.0018     8.9335   0.224 0.822695    
input_nchars    -7.6824     8.5383  -0.900 0.368250    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 73.774  on 299  degrees of freedom
Residual deviance: 34.959  on 294  degrees of freedom
AIC: 46.959

Number of Fisher Scoring iterations: 10


In [148]:
summary(rot13enc_gpt35ft_100shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.41608  -0.15216  -0.05196  -0.01152   2.80604  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -6.6858     1.3084  -5.110 3.23e-07 ***
input_logprob   -0.6641     0.8715  -0.762   0.4461    
output_logprob  14.5041     5.9631   2.432   0.0150 *  
input_ntokens    1.8742     2.9341   0.639   0.5230    
output_ntokens -14.2708     5.8067  -2.458   0.0140 *  
input_nchars    23.5789     9.5195   2.477   0.0133 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 94.321  on 299  degrees of freedom
Residual deviance: 55.174  on 294  degrees of freedom
AIC: 67.174

Number of Fisher Scoring iterations: 9


In [149]:
summary(rot13dec_gpt4_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.43519  -0.67174  -0.33480  -0.04397   2.83287  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.0910     0.2606  -8.023 1.03e-15 ***
input_logprob    1.3058     1.7555   0.744 0.456962    
output_logprob   1.9958     0.4111   4.855 1.21e-06 ***
input_ntokens   -1.4740     1.7739  -0.831 0.405996    
output_ntokens   2.6066     0.7573   3.442 0.000577 ***
input_nchars     0.2345     2.2362   0.105 0.916499    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 321.13  on 299  degrees of freedom
Residual deviance: 237.35  on 294  degrees of freedom
AIC: 249.35

Number of Fisher Scoring iterations: 6


In [150]:
summary(rot13dec_gpt4_5shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.42210  -0.69411  -0.33474  -0.04457   3.14239  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.9209     0.2363  -8.130 4.31e-16 ***
input_logprob    3.8823     1.8220   2.131 0.033109 *  
output_logprob   1.3678     0.3644   3.754 0.000174 ***
input_ntokens   -1.9951     1.7451  -1.143 0.252934    
output_ntokens   1.6014     0.6914   2.316 0.020546 *  
input_nchars     3.7921     2.3204   1.634 0.102210    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 325.96  on 299  degrees of freedom
Residual deviance: 243.92  on 294  degrees of freedom
AIC: 255.92

Number of Fisher Scoring iterations: 6


In [151]:
summary(rot13dec_gpt4_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.49084  -0.72461  -0.40703  -0.05883   3.09859  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.7200     0.2094  -8.212  < 2e-16 ***
input_logprob    3.6413     1.6920   2.152 0.031392 *  
output_logprob   1.2077     0.3329   3.628 0.000286 ***
input_ntokens   -1.9499     1.6781  -1.162 0.245242    
output_ntokens   1.6088     0.6552   2.456 0.014067 *  
input_nchars     3.6930     2.1953   1.682 0.092521 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 328.32  on 299  degrees of freedom
Residual deviance: 260.11  on 294  degrees of freedom
AIC: 272.11

Number of Fisher Scoring iterations: 6


In [152]:
summary(rot13dec_gpt35_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.44680  -0.24340  -0.07910  -0.00957   2.46987  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -5.9929     1.0584  -5.662  1.5e-08 ***
input_logprob    5.4775     3.6977   1.481  0.13852    
output_logprob   4.4533     1.3895   3.205  0.00135 ** 
input_ntokens   -6.4843     3.5400  -1.832  0.06700 .  
output_ntokens   0.9994     1.4220   0.703  0.48219    
input_nchars    11.1729     5.0095   2.230  0.02572 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 152.183  on 299  degrees of freedom
Residual deviance:  89.723  on 294  degrees of freedom
AIC: 101.72

Number of Fisher Scoring iterations: 8


In [153]:
summary(rot13dec_gpt35_5shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4937  -0.0867  -0.0087  -0.0002   3.2799  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -9.680      1.895  -5.108 3.26e-07 ***
input_logprob    17.552      5.679   3.091  0.00200 ** 
output_logprob    6.278      1.911   3.284  0.00102 ** 
input_ntokens   -10.350      4.320  -2.396  0.01659 *  
output_ntokens    5.556      1.717   3.235  0.00122 ** 
input_nchars     21.160      6.667   3.174  0.00150 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 181.523  on 299  degrees of freedom
Residual deviance:  60.588  on 294  degrees of freedom
AIC: 72.588

Number of Fisher Scoring iterations: 9


In [154]:
summary(rot13dec_gpt35_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5988  -0.0917  -0.0117  -0.0003   3.2104  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -9.380      1.860  -5.042  4.6e-07 ***
input_logprob    17.134      5.652   3.032  0.00243 ** 
output_logprob    5.940      1.957   3.035  0.00241 ** 
input_ntokens   -19.766      6.054  -3.265  0.00109 ** 
output_ntokens    2.666      2.166   1.231  0.21845    
input_nchars     33.243      9.337   3.560  0.00037 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 176.854  on 299  degrees of freedom
Residual deviance:  56.941  on 294  degrees of freedom
AIC: 68.941

Number of Fisher Scoring iterations: 9


In [155]:
summary(rot13dec_claude3_0shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_rot13dec_claude3_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.2531   0.2266   0.3662   0.5784   1.3517  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.2256     0.2367   9.403  < 2e-16 ***
input_logprob    1.6554     1.7498   0.946    0.344    
output_logprob   1.6171     0.3234   5.000 5.74e-07 ***
input_nchars     3.0810     1.7888   1.722    0.085 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 246.58  on 299  degrees of freedom
Residual deviance: 209.14  on 296  degrees of freedom
AIC: 217.14

Number of Fisher Scoring iterations: 6


In [156]:
summary(rot13dec_claude3_5shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_rot13dec_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7415   0.2097   0.2733   0.3532   1.0016  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      3.0797     0.3064  10.051   <2e-16 ***
input_logprob    4.3000     2.2550   1.907   0.0565 .  
output_logprob   0.7930     0.3458   2.293   0.0218 *  
input_nchars     4.5550     2.3044   1.977   0.0481 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 136.18  on 299  degrees of freedom
Residual deviance: 121.93  on 296  degrees of freedom
AIC: 129.93

Number of Fisher Scoring iterations: 6


In [157]:
summary(rot13dec_claude3_10shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_rot13dec_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7908   0.2143   0.2700   0.3380   0.8803  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      3.1515     0.3143  10.026   <2e-16 ***
input_logprob    4.5381     2.3908   1.898   0.0577 .  
output_logprob   0.6532     0.3586   1.821   0.0685 .  
input_nchars     4.7510     2.4456   1.943   0.0521 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 124.93  on 299  degrees of freedom
Residual deviance: 114.64  on 296  degrees of freedom
AIC: 122.64

Number of Fisher Scoring iterations: 6


In [158]:
summary(rot13dec_gpt35ft_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.44680  -0.24340  -0.07910  -0.00957   2.46987  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -5.9929     1.0584  -5.662  1.5e-08 ***
input_logprob    5.4775     3.6977   1.481  0.13852    
output_logprob   4.4533     1.3895   3.205  0.00135 ** 
input_ntokens   -6.4843     3.5400  -1.832  0.06700 .  
output_ntokens   0.9994     1.4220   0.703  0.48219    
input_nchars    11.1729     5.0095   2.230  0.02572 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 152.183  on 299  degrees of freedom
Residual deviance:  89.723  on 294  degrees of freedom
AIC: 101.72

Number of Fisher Scoring iterations: 8


In [159]:
summary(rot13dec_gpt35ft_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.52907  -0.14029  -0.02940  -0.00182   2.84904  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -7.986      1.571  -5.084  3.7e-07 ***
input_logprob     5.988      4.545   1.318  0.18763    
output_logprob    5.524      1.789   3.088  0.00201 ** 
input_ntokens    -5.780      4.017  -1.439  0.15015    
output_ntokens    5.480      1.800   3.045  0.00233 ** 
input_nchars      6.103      5.294   1.153  0.24893    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 146.96  on 299  degrees of freedom
Residual deviance:  70.84  on 294  degrees of freedom
AIC: 82.84

Number of Fisher Scoring iterations: 9


In [160]:
summary(rot13dec_gpt35ft_100shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.69499  -0.48392  -0.21269  -0.04005   3.11172  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.3835     0.4282  -7.902 2.75e-15 ***
input_logprob    5.0800     2.5442   1.997 0.045860 *  
output_logprob   2.1103     0.5844   3.611 0.000305 ***
input_ntokens   -4.6000     2.3075  -1.994 0.046206 *  
output_ntokens   1.0892     0.9185   1.186 0.235701    
input_nchars     7.9714     3.2609   2.445 0.014503 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 242.98  on 299  degrees of freedom
Residual deviance: 163.08  on 294  degrees of freedom
AIC: 175.08

Number of Fisher Scoring iterations: 7


In [161]:
vif(rot13enc_gpt4_0shot_model)

In [162]:
vif(rot13enc_gpt4_5shot_model)

In [163]:
vif(rot13enc_gpt4_10shot_model)

In [164]:
vif(rot13enc_gpt35_0shot_model)

In [165]:
vif(rot13enc_gpt35_5shot_model)

In [166]:
vif(rot13enc_gpt35_10shot_model)

In [167]:
vif(rot13enc_claude3_0shot_notokens_model)

In [168]:
vif(rot13enc_claude3_5shot_notokens_model)

In [169]:
vif(rot13enc_claude3_10shot_notokens_model)

In [170]:
vif(rot13enc_gpt35ft_0shot_model)

In [171]:
vif(rot13enc_gpt35ft_10shot_model)

In [172]:
vif(rot13enc_gpt35ft_100shot_model)

In [173]:
vif(rot13dec_gpt4_0shot_model)

In [174]:
vif(rot13dec_gpt4_5shot_model)

In [175]:
vif(rot13dec_gpt4_10shot_model)

In [176]:
vif(rot13dec_gpt35_0shot_model)

In [177]:
vif(rot13dec_gpt35_5shot_model)

In [178]:
vif(rot13dec_gpt35_10shot_model)

In [179]:
vif(rot13dec_claude3_0shot_notokens_model)

In [180]:
vif(rot13dec_claude3_5shot_notokens_model)

In [181]:
vif(rot13dec_claude3_10shot_notokens_model)

In [182]:
vif(rot13dec_gpt35ft_0shot_model)

In [183]:
vif(rot13dec_gpt35ft_10shot_model)

In [184]:
vif(rot13dec_gpt35ft_100shot_model)

## Shift ciphers: Few-shot (rot-13 vs. rot-12)

In [845]:
rot13and12enc_gpt4_0shot_df <- read.table(file = 'table_few_rot13and12enc_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_gpt4_5shot_df <- read.table(file = 'table_few_rot13and12enc_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_gpt4_10shot_df <- read.table(file = 'table_few_rot13and12enc_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12enc_gpt35_0shot_df <- read.table(file = 'table_few_rot13and12enc_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_gpt35_5shot_df <- read.table(file = 'table_few_rot13and12enc_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_gpt35_10shot_df <- read.table(file = 'table_few_rot13and12enc_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12enc_claude3_0shot_df <- read.table(file = 'table_few_rot13and12enc_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_claude3_5shot_df <- read.table(file = 'table_few_rot13and12enc_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_claude3_10shot_df <- read.table(file = 'table_few_rot13and12enc_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)

rot13and12enc_gpt35ft_0shot_df <- read.table(file = 'table_few_rot13and12enc_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_gpt35ft_10shot_df <- read.table(file = 'table_few_rot13and12enc_ft_gpt-3.5_10shot_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_gpt35ft_100shot_df <- read.table(file = 'table_few_rot13and12enc_ft_gpt-3.5_100shot_0shot.tsv', sep = '\t', header = TRUE)



rot13and12dec_gpt4_0shot_df <- read.table(file = 'table_few_rot13and12dec_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_gpt4_5shot_df <- read.table(file = 'table_few_rot13and12dec_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_gpt4_10shot_df <- read.table(file = 'table_few_rot13and12dec_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12dec_gpt35_0shot_df <- read.table(file = 'table_few_rot13and12dec_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_gpt35_5shot_df <- read.table(file = 'table_few_rot13and12dec_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_gpt35_10shot_df <- read.table(file = 'table_few_rot13and12dec_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12dec_claude3_0shot_df <- read.table(file = 'table_few_rot13and12dec_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_claude3_5shot_df <- read.table(file = 'table_few_rot13and12dec_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_claude3_10shot_df <- read.table(file = 'table_few_rot13and12dec_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)

rot13and12dec_gpt35ft_0shot_df <- read.table(file = 'table_few_rot13and12dec_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_gpt35ft_10shot_df <- read.table(file = 'table_few_rot13and12dec_ft_gpt-3.5_10shot_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_gpt35ft_100shot_df <- read.table(file = 'table_few_rot13and12dec_ft_gpt-3.5_100shot_0shot.tsv', sep = '\t', header = TRUE)






In [846]:
rot13and12dec_gpt35ft_100shot_df

index,task,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,correct
<int>,<chr>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>
0,rot12dec_highprob_0shot,172,88,-563.12079,172,32,-100.14143,0
1,rot12dec_highprob_0shot,256,133,-784.13098,256,55,-169.54190,0
2,rot12dec_highprob_0shot,144,74,-466.72281,144,28,-77.32568,0
3,rot12dec_highprob_0shot,48,25,-172.43739,48,11,-27.93722,0
4,rot12dec_highprob_0shot,29,17,-117.29462,29,8,-44.12988,0
5,rot12dec_highprob_0shot,137,74,-470.77621,137,26,-69.82657,0
6,rot12dec_highprob_0shot,59,32,-203.57021,59,13,-50.92900,0
7,rot12dec_highprob_0shot,89,48,-318.73489,89,19,-56.93888,0
8,rot12dec_highprob_0shot,63,32,-208.13794,63,13,-50.78751,0
9,rot12dec_highprob_0shot,50,27,-191.91603,50,11,-38.27212,0


In [186]:
scaled_rot13and12enc_gpt4_0shot_df <- scale_taskpair_df(rot13and12enc_gpt4_0shot_df)
scaled_rot13and12enc_gpt4_5shot_df <- scale_taskpair_df(rot13and12enc_gpt4_5shot_df)
scaled_rot13and12enc_gpt4_10shot_df <- scale_taskpair_df(rot13and12enc_gpt4_10shot_df)

scaled_rot13and12enc_gpt35_0shot_df <- scale_taskpair_df(rot13and12enc_gpt35_0shot_df)
scaled_rot13and12enc_gpt35_5shot_df <- scale_taskpair_df(rot13and12enc_gpt35_5shot_df)
scaled_rot13and12enc_gpt35_10shot_df <- scale_taskpair_df(rot13and12enc_gpt35_10shot_df)

scaled_rot13and12enc_claude3_0shot_df <- scale_taskpair_df(rot13and12enc_claude3_0shot_df)
scaled_rot13and12enc_claude3_5shot_df <- scale_taskpair_df(rot13and12enc_claude3_5shot_df)
scaled_rot13and12enc_claude3_10shot_df <- scale_taskpair_df(rot13and12enc_claude3_10shot_df)

scaled_rot13and12enc_gpt35ft_0shot_df <- scale_taskpair_df(rot13and12enc_gpt35ft_0shot_df)
scaled_rot13and12enc_gpt35ft_10shot_df <- scale_taskpair_df(rot13and12enc_gpt35ft_10shot_df)
scaled_rot13and12enc_gpt35ft_100shot_df <- scale_taskpair_df(rot13and12enc_gpt35ft_100shot_df)




scaled_rot13and12dec_gpt4_0shot_df <- scale_taskpair_df(rot13and12dec_gpt4_0shot_df)
scaled_rot13and12dec_gpt4_5shot_df <- scale_taskpair_df(rot13and12dec_gpt4_5shot_df)
scaled_rot13and12dec_gpt4_10shot_df <- scale_taskpair_df(rot13and12dec_gpt4_10shot_df)

scaled_rot13and12dec_gpt35_0shot_df <- scale_taskpair_df(rot13and12dec_gpt35_0shot_df)
scaled_rot13and12dec_gpt35_5shot_df <- scale_taskpair_df(rot13and12dec_gpt35_5shot_df)
scaled_rot13and12dec_gpt35_10shot_df <- scale_taskpair_df(rot13and12dec_gpt35_10shot_df)

scaled_rot13and12dec_claude3_0shot_df <- scale_taskpair_df(rot13and12dec_claude3_0shot_df)
scaled_rot13and12dec_claude3_5shot_df <- scale_taskpair_df(rot13and12dec_claude3_5shot_df)
scaled_rot13and12dec_claude3_10shot_df <- scale_taskpair_df(rot13and12dec_claude3_10shot_df)

scaled_rot13and12dec_gpt35ft_0shot_df <- scale_taskpair_df(rot13and12dec_gpt35ft_0shot_df)
scaled_rot13and12dec_gpt35ft_10shot_df <- scale_taskpair_df(rot13and12dec_gpt35ft_10shot_df)
scaled_rot13and12dec_gpt35ft_100shot_df <- scale_taskpair_df(rot13and12dec_gpt35ft_100shot_df)



In [189]:
model_1312enc_gpt4_0shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt4_0shot_df, family=binomial)
model_1312enc_gpt4_5shot <- glm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt4_5shot_df, family=binomial)
model_1312enc_gpt4_10shot <- glm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt4_10shot_df, family=binomial)

model_1312enc_gpt35_0shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt35_0shot_df, family=binomial)
model_1312enc_gpt35_5shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt35_5shot_df, family=binomial)
model_1312enc_gpt35_10shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt35_10shot_df, family=binomial)


model_1312enc_claude3_0shot_notokens <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_claude3_0shot_df, family=binomial)
model_1312enc_claude3_5shot_notokens <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_claude3_5shot_df, family=binomial)
model_1312enc_claude3_10shot_notokens <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_claude3_10shot_df, family=binomial)


model_1312enc_gpt35ft_0shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt35ft_0shot_df, family=binomial)
model_1312enc_gpt35ft_10shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt35ft_10shot_df, family=binomial)
model_1312enc_gpt35ft_100shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_gpt35ft_100shot_df, family=binomial)









In [190]:
model_1312dec_gpt4_0shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt4_0shot_df, family=binomial)
model_1312dec_gpt4_5shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt4_5shot_df, family=binomial)
model_1312dec_gpt4_10shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt4_10shot_df, family=binomial)

model_1312dec_gpt35_0shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt35_0shot_df, family=binomial)
model_1312dec_gpt35_5shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt35_5shot_df, family=binomial)
model_1312dec_gpt35_10shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt35_10shot_df, family=binomial)


model_1312dec_claude3_0shot_notokens <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_claude3_0shot_df, family=binomial)
model_1312dec_claude3_5shot_notokens <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_claude3_5shot_df, family=binomial)
model_1312dec_claude3_10shot_notokens <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_claude3_10shot_df, family=binomial)


model_1312dec_gpt35ft_0shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt35ft_0shot_df, family=binomial)
model_1312dec_gpt35ft_10shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt35ft_10shot_df, family=binomial)
model_1312dec_gpt35ft_100shot <- bayesglm(correct ~ task + input_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_gpt35ft_100shot_df, family=binomial)









In [191]:
summary(model_1312enc_gpt4_0shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt4_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.72506  -0.16730  -0.06688  -0.01226   2.01228  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -7.94480    1.78595  -4.448 8.65e-06 ***
taskrot13enc_highprob_0shot  4.14644    1.57770   2.628  0.00858 ** 
input_nchars                 0.02407    1.05850   0.023  0.98185    
input_ntokens                1.69637    1.04792   1.619  0.10549    
output_ntokens              -0.78348    1.18255  -0.663  0.50763    
input_logprob                1.83956    1.05650   1.741  0.08165 .  
output_logprob               2.05346    1.43909   1.427  0.15360    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family take

In [192]:
summary(model_1312enc_gpt4_5shot)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt4_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.21273  -0.14995  -0.03780  -0.00216   2.61720  

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -8.36178    1.85713  -4.503 6.72e-06 ***
taskrot13enc_highprob_5shot   2.68344    1.27115   2.111   0.0348 *  
input_nchars                 10.04417    6.16002   1.631   0.1030    
input_ntokens                 4.30025    1.88350   2.283   0.0224 *  
output_ntokens              -10.26287    5.19754  -1.975   0.0483 *  
input_logprob                 0.07894    1.25548   0.063   0.9499    
output_logprob                8.09134    5.24060   1.544   0.1226    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family t

In [193]:
summary(model_1312enc_gpt4_10shot)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt4_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.20072  -0.17732  -0.05789  -0.00507   2.55896  

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                    -7.666      1.638  -4.679 2.89e-06 ***
taskrot13enc_highprob_10shot    3.105      1.250   2.485   0.0130 *  
input_nchars                    6.913      5.512   1.254   0.2098    
input_ntokens                   3.264      1.615   2.021   0.0433 *  
output_ntokens                 -6.013      4.490  -1.339   0.1805    
input_logprob                   1.865      1.418   1.316   0.1883    
output_logprob                  5.940      4.811   1.235   0.2169    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family 

In [194]:
summary(model_1312enc_gpt35_0shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt35_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.15582  -0.14753  -0.05797  -0.01427   2.39885  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -8.0533     1.9340  -4.164 3.13e-05 ***
taskrot13enc_highprob_0shot   2.6807     1.4694   1.824   0.0681 .  
input_nchars                 -0.5503     1.1180  -0.492   0.6226    
input_ntokens                 0.0593     1.0078   0.059   0.9531    
output_ntokens               -0.7417     1.1847  -0.626   0.5313    
input_logprob                 1.0840     1.0726   1.011   0.3122    
output_logprob                0.8403     1.1743   0.716   0.4743    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family tak

In [195]:
summary(model_1312enc_gpt35_5shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt35_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.73580  -0.21179  -0.10475  -0.04891   2.39210  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -6.3208     1.5362  -4.115 3.88e-05 ***
taskrot13enc_highprob_5shot   2.4617     1.4379   1.712   0.0869 .  
input_nchars                 -0.3495     1.0522  -0.332   0.7398    
input_ntokens                 0.9452     1.0293   0.918   0.3585    
output_ntokens               -0.6618     1.1244  -0.589   0.5561    
input_logprob                 0.5349     0.9016   0.593   0.5530    
output_logprob                0.9029     1.1528   0.783   0.4335    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family tak

In [196]:
summary(model_1312enc_gpt35_10shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt35_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.77484  -0.17982  -0.08848  -0.03675   2.39949  

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -6.7202     1.6604  -4.047 5.18e-05 ***
taskrot13enc_highprob_10shot   2.2755     1.4409   1.579    0.114    
input_nchars                  -0.5053     1.0831  -0.467    0.641    
input_ntokens                  0.3206     0.9899   0.324    0.746    
output_ntokens                -0.5447     1.1067  -0.492    0.623    
input_logprob                  0.4672     0.9261   0.504    0.614    
output_logprob                 0.7386     1.1240   0.657    0.511    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial f

In [209]:
summary(model_1312enc_claude3_0shot_notokens)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_claude3_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8019  -0.5510  -0.2816   0.7567   2.9289  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -2.4718     0.3987  -6.200 5.65e-10 ***
taskrot13enc_highprob_0shot   2.5807     0.4848   5.323 1.02e-07 ***
input_nchars                  4.2355     1.7854   2.372   0.0177 *  
input_logprob                 0.7670     0.5224   1.468   0.1420    
output_logprob                4.3029     1.8183   2.366   0.0180 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 256.41  on 199  degrees of freedom
Residual deviance: 170.62  on 195  degrees of freedom
AIC: 180.62

Number of Fisher Scoring iterations: 5


In [211]:
summary(model_1312enc_claude3_5shot_notokens)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1357  -0.9705   0.6114   0.7691   1.7366  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  0.98137    0.26613   3.688 0.000226 ***
taskrot13enc_highprob_5shot  0.02261    0.40617   0.056 0.955600    
input_nchars                 0.39903    1.38821   0.287 0.773772    
input_logprob                1.30180    0.41653   3.125 0.001776 ** 
output_logprob              -0.13930    1.41287  -0.099 0.921459    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 240.86  on 199  degrees of freedom
Residual deviance: 214.60  on 195  degrees of freedom
AIC: 224.6

Number of Fisher Scoring iterations: 4


In [212]:
summary(model_1312enc_claude3_10shot_notokens)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2226  -0.7293   0.5866   0.7661   1.6295  

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                    1.1774     0.2763   4.261 2.03e-05 ***
taskrot13enc_highprob_10shot  -0.2337     0.4127  -0.566   0.5712    
input_nchars                   1.7116     1.4124   1.212   0.2256    
input_logprob                  1.1034     0.4141   2.665   0.0077 ** 
output_logprob                 1.4223     1.4263   0.997   0.3187    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 237.18  on 199  degrees of freedom
Residual deviance: 210.02  on 195  degrees of freedom
AIC: 220.02

Number of Fisher Scoring iterations: 4


In [197]:
summary(model_1312enc_gpt35ft_0shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt35ft_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.15582  -0.14753  -0.05797  -0.01427   2.39885  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -8.0533     1.9340  -4.164 3.13e-05 ***
taskrot13enc_highprob_0shot   2.6807     1.4694   1.824   0.0681 .  
input_nchars                 -0.5503     1.1180  -0.492   0.6226    
input_ntokens                 0.0593     1.0078   0.059   0.9531    
output_ntokens               -0.7417     1.1847  -0.626   0.5313    
input_logprob                 1.0840     1.0726   1.011   0.3122    
output_logprob                0.8403     1.1743   0.716   0.4743    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family t

In [198]:
summary(model_1312enc_gpt35ft_10shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt35ft_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.92009  -0.10072  -0.02556  -0.00416   2.09150  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -9.7183     2.5189  -3.858 0.000114 ***
taskrot13enc_highprob_0shot   2.1747     1.4960   1.454 0.146027    
input_nchars                 -0.6748     1.2193  -0.553 0.579939    
input_ntokens                -0.7176     1.2073  -0.594 0.552247    
output_ntokens               -0.8354     1.2913  -0.647 0.517693    
input_logprob                 0.8932     1.1345   0.787 0.431128    
output_logprob                1.2649     1.3949   0.907 0.364538    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family 

In [199]:
summary(model_1312enc_gpt35ft_100shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12enc_gpt35ft_100shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.75876  -0.14765  -0.07069  -0.02453   2.58835  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -7.31850    1.87858  -3.896 9.79e-05 ***
taskrot13enc_highprob_0shot  2.01821    1.45405   1.388    0.165    
input_nchars                -0.35691    1.08182  -0.330    0.741    
input_ntokens               -0.09273    1.02162  -0.091    0.928    
output_ntokens              -0.50752    1.12383  -0.452    0.652    
input_logprob                0.88225    1.07855   0.818    0.413    
output_logprob               0.65469    1.12981   0.579    0.562    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family

In [200]:
summary(model_1312dec_gpt4_0shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt4_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.33755  -0.51739  -0.08295  -0.02992   2.34608  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -6.26570    1.68235  -3.724 0.000196 ***
taskrot13dec_highprob_0shot  5.46436    1.68741   3.238 0.001202 ** 
input_nchars                -0.30729    0.92202  -0.333 0.738924    
input_ntokens               -0.01902    0.90445  -0.021 0.983222    
output_ntokens               1.33840    0.72464   1.847 0.064747 .  
input_logprob                0.76472    0.92373   0.828 0.407747    
output_logprob               1.31530    0.62011   2.121 0.033914 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family take

In [201]:
summary(model_1312dec_gpt4_5shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt4_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.50761  -0.48113  -0.08335  -0.02809   2.22563  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -6.33664    1.69061  -3.748 0.000178 ***
taskrot13dec_highprob_5shot  5.51539    1.69412   3.256 0.001131 ** 
input_nchars                 0.20467    0.90602   0.226 0.821281    
input_ntokens               -0.03961    0.89921  -0.044 0.964863    
output_ntokens               0.66195    0.66076   1.002 0.316440    
input_logprob                0.66974    0.90789   0.738 0.460705    
output_logprob               1.38788    0.63767   2.176 0.029520 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family take

In [202]:
summary(model_1312dec_gpt4_10shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt4_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.50720  -0.45583  -0.08130  -0.02787   2.17242  

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -6.3560     1.6901  -3.761 0.000169 ***
taskrot13dec_highprob_10shot   5.5068     1.6934   3.252 0.001146 ** 
input_nchars                   0.5177     0.9466   0.547 0.584405    
input_ntokens                 -0.1161     0.9104  -0.128 0.898535    
output_ntokens                 0.9634     0.6940   1.388 0.165101    
input_logprob                  0.6487     0.9127   0.711 0.477251    
output_logprob                 1.9109     0.6912   2.764 0.005703 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial fa

In [203]:
summary(model_1312dec_gpt35_0shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt35_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.02403  -0.27709  -0.10241  -0.03357   2.64572  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -6.77544    1.60628  -4.218 2.46e-05 ***
taskrot13dec_highprob_0shot  4.01978    1.54314   2.605  0.00919 ** 
input_nchars                 0.24639    0.96999   0.254  0.79949    
input_ntokens               -0.26697    0.97561  -0.274  0.78436    
output_ntokens               0.03153    0.81122   0.039  0.96900    
input_logprob                0.17103    0.94753   0.181  0.85676    
output_logprob               1.77143    0.95576   1.853  0.06382 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family tak

In [204]:
summary(model_1312dec_gpt35_5shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt35_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.45238  -0.12860  -0.04244  -0.00535   1.76936  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -9.1928     1.9745  -4.656 3.23e-06 ***
taskrot13dec_highprob_5shot   4.6418     1.6447   2.822  0.00477 ** 
input_nchars                  0.1614     1.1325   0.143  0.88666    
input_ntokens                -0.6406     1.2132  -0.528  0.59750    
output_ntokens                4.6090     1.4252   3.234  0.00122 ** 
input_logprob                 3.9002     1.8964   2.057  0.03973 *  
output_logprob                3.8558     1.5247   2.529  0.01144 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family tak

In [205]:
summary(model_1312dec_gpt35_10shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt35_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.19666  -0.21499  -0.08088  -0.01507   2.99287  

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -7.6343     1.7355  -4.399 1.09e-05 ***
taskrot13dec_highprob_10shot   4.3272     1.5881   2.725  0.00644 ** 
input_nchars                   0.4269     1.0673   0.400  0.68915    
input_ntokens                 -1.8700     1.4028  -1.333  0.18252    
output_ntokens                 1.3213     0.9715   1.360  0.17381    
input_logprob                  0.4931     1.0549   0.467  0.64019    
output_logprob                 2.1030     1.0475   2.008  0.04469 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial f

In [210]:
summary(model_1312dec_claude3_0shot_notokens)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_claude3_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5409  -0.5166   0.1567   0.2804   2.7056  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -1.4736     0.3162  -4.660 3.17e-06 ***
taskrot13dec_highprob_0shot   4.4218     0.6394   6.916 4.64e-12 ***
input_nchars                  6.9663     2.3659   2.944  0.00324 ** 
input_logprob                 5.6602     2.3448   2.414  0.01578 *  
output_logprob                1.5560     0.6743   2.308  0.02102 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 274.83  on 199  degrees of freedom
Residual deviance: 113.68  on 195  degrees of freedom
AIC: 123.68

Number of Fisher Scoring iterations: 6


In [213]:
summary(model_1312dec_claude3_5shot_notokens)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_claude3_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-3.05577   0.08111   0.12919   0.21765   1.18025  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   4.0497     0.7397   5.475 4.38e-08 ***
taskrot13dec_highprob_5shot   0.5679     0.8999   0.631   0.5280    
input_nchars                  4.6802     2.8349   1.651   0.0988 .  
input_logprob                -0.1330     3.0272  -0.044   0.9650    
output_logprob                5.0880     1.2518   4.065 4.81e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 90.787  on 199  degrees of freedom
Residual deviance: 53.340  on 195  degrees of freedom
AIC: 63.34

Number of Fisher Scoring iterations: 7


In [214]:
summary(model_1312dec_claude3_10shot_notokens)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_claude3_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.32620   0.02149   0.04170   0.09382   1.13690  

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                     6.260      1.518   4.125 3.71e-05 ***
taskrot13dec_highprob_10shot    0.240      1.153   0.208 0.835134    
input_nchars                    6.432      3.409   1.887 0.059207 .  
input_logprob                  -1.335      3.791  -0.352 0.724761    
output_logprob                  8.150      2.399   3.397 0.000682 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 79.406  on 199  degrees of freedom
Residual deviance: 32.556  on 195  degrees of freedom
AIC: 42.556

Number of Fisher Scoring iterations: 9

In [206]:
summary(model_1312dec_gpt35ft_0shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt35ft_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.02403  -0.27709  -0.10241  -0.03357   2.64572  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -6.77544    1.60628  -4.218 2.46e-05 ***
taskrot13dec_highprob_0shot  4.01978    1.54314   2.605  0.00919 ** 
input_nchars                 0.24639    0.96999   0.254  0.79949    
input_ntokens               -0.26697    0.97561  -0.274  0.78436    
output_ntokens               0.03153    0.81122   0.039  0.96900    
input_logprob                0.17103    0.94753   0.181  0.85676    
output_logprob               1.77143    0.95576   1.853  0.06382 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family t

In [207]:
summary(model_1312dec_gpt35ft_10shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt35ft_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.15282  -0.21352  -0.07487  -0.01782   2.38304  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -7.6175     1.6913  -4.504 6.67e-06 ***
taskrot13dec_highprob_0shot   4.0141     1.5520   2.587  0.00970 ** 
input_nchars                 -0.2705     1.0910  -0.248  0.80419    
input_ntokens                -0.5312     1.1328  -0.469  0.63914    
output_ntokens                3.5889     1.2071   2.973  0.00295 ** 
input_logprob                 2.2677     1.4886   1.523  0.12766    
output_logprob                2.9425     1.2870   2.286  0.02223 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family 

In [208]:
summary(model_1312dec_gpt35ft_100shot)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and12dec_gpt35ft_100shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.44568  -0.28830  -0.08755  -0.02046   2.66153  

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -6.89568    1.67420  -4.119 3.81e-05 ***
taskrot13dec_highprob_0shot  4.95818    1.63777   3.027  0.00247 ** 
input_nchars                 0.05138    0.94695   0.054  0.95673    
input_ntokens               -0.80704    1.03197  -0.782  0.43419    
output_ntokens               0.03438    0.74190   0.046  0.96304    
input_logprob                0.70959    0.99432   0.714  0.47545    
output_logprob               0.56659    0.66467   0.852  0.39397    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family

In [215]:
vif(model_1312enc_gpt4_0shot)

In [216]:
vif(model_1312enc_gpt4_5shot)

In [217]:
vif(model_1312enc_gpt4_10shot)

In [218]:
vif(model_1312enc_gpt35_0shot)

In [219]:
vif(model_1312enc_gpt35_5shot)

In [220]:
vif(model_1312enc_gpt35_10shot)

In [223]:
vif(model_1312enc_claude3_0shot_notokens)

In [224]:
vif(model_1312enc_claude3_5shot_notokens)

In [225]:
vif(model_1312enc_claude3_10shot_notokens)

In [226]:
vif(model_1312enc_gpt35ft_0shot)

In [227]:
vif(model_1312enc_gpt35ft_10shot)

In [228]:
vif(model_1312enc_gpt35ft_100shot)

In [229]:
vif(model_1312dec_gpt4_0shot)

In [230]:
vif(model_1312dec_gpt4_5shot)

In [231]:
vif(model_1312dec_gpt4_10shot)

In [232]:
vif(model_1312dec_gpt35_0shot)

In [233]:
vif(model_1312dec_gpt35_5shot)

In [234]:
vif(model_1312dec_gpt35_10shot)

In [238]:
vif(model_1312dec_claude3_0shot_notokens)

In [239]:
vif(model_1312dec_claude3_5shot_notokens)

In [240]:
vif(model_1312dec_claude3_10shot_notokens)

In [241]:
vif(model_1312dec_gpt35ft_0shot)

In [242]:
vif(model_1312dec_gpt35ft_10shot)

In [243]:
vif(model_1312dec_gpt35ft_100shot)

## Shift ciphers: Few-shot (word; rot-13 vs. rot-12)

In [244]:
rot13and12enc_word_gpt4_0shot_df <- read.table(file = 'table_few_rot13and12enc_word_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_gpt4_5shot_df <- read.table(file = 'table_few_rot13and12enc_word_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_gpt4_10shot_df <- read.table(file = 'table_few_rot13and12enc_word_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12enc_word_gpt35_0shot_df <- read.table(file = 'table_few_rot13and12enc_word_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_gpt35_5shot_df <- read.table(file = 'table_few_rot13and12enc_word_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_gpt35_10shot_df <- read.table(file = 'table_few_rot13and12enc_word_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12enc_word_claude3_0shot_df <- read.table(file = 'table_few_rot13and12enc_word_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_claude3_5shot_df <- read.table(file = 'table_few_rot13and12enc_word_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_claude3_10shot_df <- read.table(file = 'table_few_rot13and12enc_word_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)



rot13and12dec_word_gpt4_0shot_df <- read.table(file = 'table_few_rot13and12dec_word_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_gpt4_5shot_df <- read.table(file = 'table_few_rot13and12dec_word_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_gpt4_10shot_df <- read.table(file = 'table_few_rot13and12dec_word_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12dec_word_gpt35_0shot_df <- read.table(file = 'table_few_rot13and12dec_word_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_gpt35_5shot_df <- read.table(file = 'table_few_rot13and12dec_word_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_gpt35_10shot_df <- read.table(file = 'table_few_rot13and12dec_word_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12dec_word_claude3_0shot_df <- read.table(file = 'table_few_rot13and12dec_word_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_claude3_5shot_df <- read.table(file = 'table_few_rot13and12dec_word_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_claude3_10shot_df <- read.table(file = 'table_few_rot13and12dec_word_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)





In [245]:
scaled_rot13and12enc_word_gpt4_0shot_df <- scale_taskpair_df(rot13and12enc_word_gpt4_0shot_df)
scaled_rot13and12enc_word_gpt4_5shot_df <- scale_taskpair_df(rot13and12enc_word_gpt4_5shot_df)
scaled_rot13and12enc_word_gpt4_10shot_df <- scale_taskpair_df(rot13and12enc_word_gpt4_10shot_df)

scaled_rot13and12enc_word_gpt35_0shot_df <- scale_taskpair_df(rot13and12enc_word_gpt35_0shot_df)
scaled_rot13and12enc_word_gpt35_5shot_df <- scale_taskpair_df(rot13and12enc_word_gpt35_5shot_df)
scaled_rot13and12enc_word_gpt35_10shot_df <- scale_taskpair_df(rot13and12enc_word_gpt35_10shot_df)

scaled_rot13and12enc_word_claude3_0shot_df <- scale_taskpair_df(rot13and12enc_word_claude3_0shot_df)
scaled_rot13and12enc_word_claude3_5shot_df <- scale_taskpair_df(rot13and12enc_word_claude3_5shot_df)
scaled_rot13and12enc_word_claude3_10shot_df <- scale_taskpair_df(rot13and12enc_word_claude3_10shot_df)




scaled_rot13and12dec_word_gpt4_0shot_df <- scale_taskpair_df(rot13and12dec_word_gpt4_0shot_df)
scaled_rot13and12dec_word_gpt4_5shot_df <- scale_taskpair_df(rot13and12dec_word_gpt4_5shot_df)
scaled_rot13and12dec_word_gpt4_10shot_df <- scale_taskpair_df(rot13and12dec_word_gpt4_10shot_df)

scaled_rot13and12dec_word_gpt35_0shot_df <- scale_taskpair_df(rot13and12dec_word_gpt35_0shot_df)
scaled_rot13and12dec_word_gpt35_5shot_df <- scale_taskpair_df(rot13and12dec_word_gpt35_5shot_df)
scaled_rot13and12dec_word_gpt35_10shot_df <- scale_taskpair_df(rot13and12dec_word_gpt35_10shot_df)

scaled_rot13and12dec_word_claude3_0shot_df <- scale_taskpair_df(rot13and12dec_word_claude3_0shot_df)
scaled_rot13and12dec_word_claude3_5shot_df <- scale_taskpair_df(rot13and12dec_word_claude3_5shot_df)
scaled_rot13and12dec_word_claude3_10shot_df <- scale_taskpair_df(rot13and12dec_word_claude3_10shot_df)




In [246]:
model_1312enc_word_gpt4_0shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_gpt4_0shot_df, family=binomial)
model_1312enc_word_gpt4_5shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_gpt4_5shot_df, family=binomial)
model_1312enc_word_gpt4_10shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_gpt4_10shot_df, family=binomial)

model_1312enc_word_gpt35_0shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_gpt35_0shot_df, family=binomial)
model_1312enc_word_gpt35_5shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_gpt35_5shot_df, family=binomial)
model_1312enc_word_gpt35_10shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_gpt35_10shot_df, family=binomial)

model_1312enc_word_claude3_0shot_notokens <- glm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_claude3_0shot_df, family=binomial)
model_1312enc_word_claude3_5shot_notokens <- glm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_claude3_5shot_df, family=binomial)
model_1312enc_word_claude3_10shot_notokens <- glm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_claude3_10shot_df, family=binomial)








In [247]:
model_1312dec_word_gpt4_0shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_gpt4_0shot_df, family=binomial)
model_1312dec_word_gpt4_5shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_gpt4_5shot_df, family=binomial)
model_1312dec_word_gpt4_10shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_gpt4_10shot_df, family=binomial)

model_1312dec_word_gpt35_0shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_gpt35_0shot_df, family=binomial)
model_1312dec_word_gpt35_5shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_gpt35_5shot_df, family=binomial)
model_1312dec_word_gpt35_10shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_gpt35_10shot_df, family=binomial)

model_1312dec_word_claude3_0shot_notokens <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_claude3_0shot_df, family=binomial)
model_1312dec_word_claude3_5shot_notokens <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_claude3_5shot_df, family=binomial)
model_1312dec_word_claude3_10shot_notokens <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_claude3_10shot_df, family=binomial)









In [248]:
summary(model_1312enc_word_gpt4_0shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_gpt4_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.50500  -0.08588  -0.07260   0.94652   1.31046  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                       -5.9761     1.7703  -3.376 0.000736 ***
taskrot13enc_highprob_word_0shot   6.3153     1.7837   3.541 0.000399 ***
output_ntokens                     0.1224     0.2427   0.504 0.614047    
input_logprob                      0.1254     0.1992   0.629 0.529078    
output_logprob                    -0.1351     0.2006  -0.673 0.500671    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 239.05  on 199  degrees of freedom
Residual deviance: 135.40  on 195  degrees of freedom
AIC: 145.4

Number 

In [249]:
summary(model_1312enc_word_gpt4_5shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_gpt4_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.66758  -0.10429  -0.06785   0.80435   1.52307  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -5.93536    1.75080  -3.390 0.000699 ***
taskrot13enc_highprob_word_5shot  6.13643    1.76207   3.483 0.000497 ***
output_ntokens                   -0.05662    0.24454  -0.232 0.816897    
input_logprob                     0.44132    0.21071   2.094 0.036216 *  
output_logprob                    0.14053    0.20210   0.695 0.486847    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 237.18  on 199  degrees of freedom
Residual deviance: 132.03  on 195  degrees of freedom
AIC: 142.03

Number

In [250]:
summary(model_1312enc_word_gpt4_10shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_gpt4_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.59488  -0.08957  -0.07061   0.89607   1.36854  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                       -5.91850    1.76120  -3.360 0.000778 ***
taskrot13enc_highprob_word_10shot  6.23391    1.77321   3.516 0.000439 ***
output_ntokens                     0.04144    0.24343   0.170 0.864826    
input_logprob                      0.29065    0.20394   1.425 0.154110    
output_logprob                     0.08811    0.20020   0.440 0.659868    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 240.86  on 199  degrees of freedom
Residual deviance: 134.12  on 195  degrees of freedom
AIC: 144.12


In [254]:
summary(model_1312enc_word_gpt35_0shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_gpt35_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.58526  -0.78905  -0.07185  -0.04656   1.67544  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -5.86211    1.67467  -3.500 0.000464 ***
taskrot13enc_highprob_word_0shot  5.45049    1.68391   3.237 0.001209 ** 
output_ntokens                   -0.32069    0.25128  -1.276 0.201871    
input_logprob                     0.45581    0.21181   2.152 0.031396 *  
output_logprob                    0.03268    0.20188   0.162 0.871391    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 208.20  on 199  degrees of freedom
Residual deviance: 130.19  on 195  degrees of freedom
AIC: 140.19

Numbe

In [255]:
summary(model_1312enc_word_gpt35_5shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_gpt35_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.40471  -0.85919  -0.07816  -0.06050   1.75391  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -5.79600    1.65797  -3.496 0.000473 ***
taskrot13enc_highprob_word_5shot  5.31448    1.66875   3.185 0.001449 ** 
output_ntokens                   -0.15403    0.24709  -0.623 0.533036    
input_logprob                     0.32124    0.20621   1.558 0.119266    
output_logprob                    0.03927    0.20126   0.195 0.845311    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 200.16  on 199  degrees of freedom
Residual deviance: 132.06  on 195  degrees of freedom
AIC: 142.06

Numbe

In [256]:
summary(model_1312enc_word_gpt35_10shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_gpt35_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.36208  -0.83560  -0.07961  -0.06188   1.63410  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                       -5.79282    1.64882  -3.513 0.000443 ***
taskrot13enc_highprob_word_10shot  5.23221    1.65990   3.152 0.001621 ** 
output_ntokens                    -0.12900    0.24884  -0.518 0.604169    
input_logprob                      0.32980    0.20817   1.584 0.113132    
output_logprob                     0.03778    0.20299   0.186 0.852335    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 194.49  on 199  degrees of freedom
Residual deviance: 130.33  on 195  degrees of freedom
AIC: 140.33

In [260]:
summary(model_1312enc_word_claude3_0shot_notokens)


Call:
glm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12enc_word_claude3_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.80638  -0.86593  -0.00225   0.85502   1.83466  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -0.98445    0.22909  -4.297 1.73e-05 ***
taskrot13enc_highprob_word_0shot  1.96949    0.33019   5.965 2.45e-09 ***
input_logprob                     0.32455    0.16255   1.997   0.0459 *  
output_logprob                   -0.08452    0.16434  -0.514   0.6071    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 277.26  on 199  degrees of freedom
Residual deviance: 232.86  on 196  degrees of freedom
AIC: 240.86

Number of Fisher Scoring iterations: 4


In [261]:
summary(model_1312enc_word_claude3_5shot_notokens)


Call:
glm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12enc_word_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8903  -0.9207   0.6354   0.7783   1.7360  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                       -0.7007     0.2161  -3.243  0.00118 ** 
taskrot13enc_highprob_word_5shot   1.9556     0.3324   5.883 4.03e-09 ***
input_logprob                      0.2168     0.1601   1.354  0.17569    
output_logprob                    -0.1424     0.1660  -0.858  0.39110    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 274.83  on 199  degrees of freedom
Residual deviance: 233.54  on 196  degrees of freedom
AIC: 241.54

Number of Fisher Scoring iterations: 4


In [262]:
summary(model_1312enc_word_claude3_10shot_notokens)


Call:
glm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12enc_word_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9740  -0.9233   0.6005   0.7799   1.6769  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                        -0.6724     0.2160  -3.113  0.00185 ** 
taskrot13enc_highprob_word_10shot   2.0091     0.3386   5.933 2.98e-09 ***
input_logprob                       0.2674     0.1617   1.654  0.09813 .  
output_logprob                     -0.1987     0.1682  -1.182  0.23734    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 273.87  on 199  degrees of freedom
Residual deviance: 230.80  on 196  degrees of freedom
AIC: 238.8

Number of Fisher Scoring iterations: 4


In [251]:
summary(model_1312dec_word_gpt4_0shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_gpt4_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.45290  -0.90440  -0.07681  -0.06017   1.45194  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -5.834543   1.699687  -3.433 0.000598 ***
taskrot13dec_highprob_word_0shot  5.687382   1.711709   3.323 0.000892 ***
input_ntokens                     0.167054   0.240132   0.696 0.486632    
input_logprob                     0.324279   0.204018   1.589 0.111956    
output_logprob                    0.008957   0.198272   0.045 0.963966    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 218.10  on 199  degrees of freedom
Residual deviance: 136.06  on 195  degrees of freedom
AIC: 146.06

N

In [252]:
summary(model_1312dec_word_gpt4_5shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_gpt4_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.54289  -0.27995  -0.07504   0.14789   1.46700  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -5.88770    1.72150  -3.420 0.000626 ***
taskrot13dec_highprob_word_5shot  5.87831    1.73407   3.390 0.000699 ***
input_ntokens                     0.25923    0.24240   1.069 0.284876    
input_logprob                     0.39245    0.20711   1.895 0.058109 .  
output_logprob                    0.05709    0.19957   0.286 0.774821    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 224.93  on 199  degrees of freedom
Residual deviance: 134.89  on 195  degrees of freedom
AIC: 144.89

Number 

In [253]:
summary(model_1312dec_word_gpt4_10shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_gpt4_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.46823  -0.93548  -0.07468  -0.05661   1.43962  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                       -5.92226    1.72564  -3.432 0.000599 ***
taskrot13dec_highprob_word_10shot  5.91264    1.73881   3.400 0.000673 ***
input_ntokens                      0.30149    0.24231   1.244 0.213408    
input_logprob                      0.24793    0.19967   1.242 0.214352    
output_logprob                     0.03104    0.19745   0.157 0.875101    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 222.71  on 199  degrees of freedom
Residual deviance: 136.65  on 195  degrees of freedom
AIC: 146.65



In [257]:
summary(model_1312dec_word_gpt35_0shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_gpt35_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.34739  -0.66423  -0.07171  -0.06452   1.83071  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -6.12868    1.68228  -3.643 0.000269 ***
taskrot13dec_highprob_word_0shot  5.48558    1.69681   3.233 0.001226 ** 
input_ntokens                     0.60494    0.26426   2.289 0.022068 *  
input_logprob                     0.05123    0.21394   0.239 0.810748    
output_logprob                    0.05444    0.21741   0.250 0.802284    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 175.87  on 199  degrees of freedom
Residual deviance: 119.00  on 195  degrees of freedom
AIC: 129

Number of

In [258]:
summary(model_1312dec_word_gpt35_5shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_gpt35_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.43081  -0.70119  -0.07344  -0.05195   1.79134  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                       -6.1240     1.6959  -3.611 0.000305 ***
taskrot13dec_highprob_word_5shot   5.6129     1.7106   3.281 0.001033 ** 
input_ntokens                      0.6106     0.2606   2.343 0.019105 *  
input_logprob                      0.1273     0.2087   0.610 0.542090    
output_logprob                     0.1136     0.2125   0.535 0.592925    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 185.49  on 199  degrees of freedom
Residual deviance: 122.93  on 195  degrees of freedom
AIC: 132.93

Number

In [259]:
summary(model_1312dec_word_gpt35_10shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_gpt35_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.79479  -0.67935  -0.07258  -0.04086   1.83991  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                        -6.3142     1.7451  -3.618 0.000297 ***
taskrot13dec_highprob_word_10shot   6.0351     1.7608   3.428 0.000609 ***
input_ntokens                       0.7172     0.2683   2.673 0.007514 ** 
input_logprob                       0.1427     0.2088   0.683 0.494368    
output_logprob                      0.4201     0.2197   1.912 0.055883 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 200.16  on 199  degrees of freedom
Residual deviance: 121.55  on 195  degrees of freedom
AIC: 131.55


In [263]:
summary(model_1312dec_word_claude3_0shot_notokens)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12dec_word_claude3_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.74903  -0.07832  -0.06920   0.82140   1.15033  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -5.99763    1.81851  -3.298 0.000973 ***
taskrot13dec_highprob_word_0shot  6.75020    1.83057   3.687 0.000226 ***
input_logprob                    -0.22592    0.18862  -1.198 0.231014    
output_logprob                   -0.01268    0.20610  -0.062 0.950938    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 255.07  on 199  degrees of freedom
Residual deviance: 125.78  on 196  degrees of freedom
AIC: 133.78

Number of Fisher Scoring iterations: 17


In [264]:
summary(model_1312dec_word_claude3_5shot_notokens)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12dec_word_claude3_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.55671  -0.05527  -0.02742   0.24855   0.64750  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                       -6.4880     2.1837  -2.971  0.00297 ** 
taskrot13dec_highprob_word_5shot   9.8504     2.2890   4.303 1.68e-05 ***
input_logprob                      0.4063     0.3987   1.019  0.30822    
output_logprob                    -0.7302     0.4781  -1.527  0.12668    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 276.939  on 199  degrees of freedom
Residual deviance:  29.849  on 196  degrees of freedom
AIC: 37.849

Number of Fisher Scoring iterations: 18


In [265]:
summary(model_1312dec_word_claude3_10shot_notokens)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12dec_word_claude3_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.66750  -0.05731  -0.03657   0.22280   0.48599  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                        -6.4585     2.2023  -2.933  0.00336 ** 
taskrot13dec_highprob_word_10shot  10.0816     2.3155   4.354 1.34e-05 ***
input_logprob                      -0.3232     0.4364  -0.741  0.45893    
output_logprob                     -0.3921     0.4798  -0.817  0.41386    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 277.079  on 199  degrees of freedom
Residual deviance:  25.659  on 196  degrees of freedom
AIC: 33.659

Number of Fisher Scoring iterations: 18


In [266]:
vif(model_1312enc_word_gpt4_0shot)

In [267]:
vif(model_1312enc_word_gpt4_5shot)

In [268]:
vif(model_1312enc_word_gpt4_10shot)

In [269]:
vif(model_1312enc_word_gpt35_0shot)

In [270]:
vif(model_1312enc_word_gpt35_5shot)

In [271]:
vif(model_1312enc_word_gpt35_10shot)

In [275]:
vif(model_1312enc_word_claude3_0shot_notokens)

In [276]:
vif(model_1312enc_word_claude3_5shot_notokens)

In [277]:
vif(model_1312enc_word_claude3_10shot_notokens)

In [278]:
vif(model_1312dec_word_gpt4_0shot)

In [279]:
vif(model_1312dec_word_gpt4_5shot)

In [280]:
vif(model_1312dec_word_gpt4_10shot)

In [281]:
vif(model_1312dec_word_gpt35_0shot)

In [282]:
vif(model_1312dec_word_gpt35_5shot)

In [283]:
vif(model_1312dec_word_gpt35_10shot)

In [287]:
vif(model_1312dec_word_claude3_0shot_notokens)

In [288]:
vif(model_1312dec_word_claude3_5shot_notokens)

In [289]:
vif(model_1312dec_word_claude3_10shot_notokens)

## Shift ciphers: Few-shot (word overlap; rot-13 vs. rot-12)

In [290]:
rot13and12enc_word_overlap_gpt4_0shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_overlap_gpt4_5shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_overlap_gpt4_10shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12enc_word_overlap_gpt35_0shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_overlap_gpt35_5shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_overlap_gpt35_10shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12enc_word_overlap_claude3_0shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_overlap_claude3_5shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
rot13and12enc_word_overlap_claude3_10shot_df <- read.table(file = 'table_few_rot13and12enc_word_overlap_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)



rot13and12dec_word_overlap_gpt4_0shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_overlap_gpt4_5shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_overlap_gpt4_10shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12dec_word_overlap_gpt35_0shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_overlap_gpt35_5shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_overlap_gpt35_10shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

rot13and12dec_word_overlap_claude3_0shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_overlap_claude3_5shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
rot13and12dec_word_overlap_claude3_10shot_df <- read.table(file = 'table_few_rot13and12dec_word_overlap_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)





In [291]:
scaled_rot13and12enc_word_overlap_gpt4_0shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_gpt4_0shot_df)
scaled_rot13and12enc_word_overlap_gpt4_5shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_gpt4_5shot_df)
scaled_rot13and12enc_word_overlap_gpt4_10shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_gpt4_10shot_df)

scaled_rot13and12enc_word_overlap_gpt35_0shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_gpt35_0shot_df)
scaled_rot13and12enc_word_overlap_gpt35_5shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_gpt35_5shot_df)
scaled_rot13and12enc_word_overlap_gpt35_10shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_gpt35_10shot_df)

scaled_rot13and12enc_word_overlap_claude3_0shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_claude3_0shot_df)
scaled_rot13and12enc_word_overlap_claude3_5shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_claude3_5shot_df)
scaled_rot13and12enc_word_overlap_claude3_10shot_df <- scale_taskpair_df(rot13and12enc_word_overlap_claude3_10shot_df)




scaled_rot13and12dec_word_overlap_gpt4_0shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_gpt4_0shot_df)
scaled_rot13and12dec_word_overlap_gpt4_5shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_gpt4_5shot_df)
scaled_rot13and12dec_word_overlap_gpt4_10shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_gpt4_10shot_df)

scaled_rot13and12dec_word_overlap_gpt35_0shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_gpt35_0shot_df)
scaled_rot13and12dec_word_overlap_gpt35_5shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_gpt35_5shot_df)
scaled_rot13and12dec_word_overlap_gpt35_10shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_gpt35_10shot_df)

scaled_rot13and12dec_word_overlap_claude3_0shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_claude3_0shot_df)
scaled_rot13and12dec_word_overlap_claude3_5shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_claude3_5shot_df)
scaled_rot13and12dec_word_overlap_claude3_10shot_df <- scale_taskpair_df(rot13and12dec_word_overlap_claude3_10shot_df)




In [292]:
model_1312enc_word_overlap_gpt4_0shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_gpt4_0shot_df, family=binomial)
model_1312enc_word_overlap_gpt4_5shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_gpt4_5shot_df, family=binomial)
model_1312enc_word_overlap_gpt4_10shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_gpt4_10shot_df, family=binomial)

model_1312enc_word_overlap_gpt35_0shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_gpt35_0shot_df, family=binomial)
model_1312enc_word_overlap_gpt35_5shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_gpt35_5shot_df, family=binomial)
model_1312enc_word_overlap_gpt35_10shot <- bayesglm(correct ~ task + output_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_gpt35_10shot_df, family=binomial)

model_1312enc_word_overlap_claude3_0shot_notokens <- glm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_claude3_0shot_df, family=binomial)
model_1312enc_word_overlap_claude3_5shot_notokens <- glm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_claude3_5shot_df, family=binomial)
model_1312enc_word_overlap_claude3_10shot_notokens <- glm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12enc_word_overlap_claude3_10shot_df, family=binomial)








In [293]:
model_1312dec_word_overlap_gpt4_0shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_gpt4_0shot_df, family=binomial)
model_1312dec_word_overlap_gpt4_5shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_gpt4_5shot_df, family=binomial)
model_1312dec_word_overlap_gpt4_10shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_gpt4_10shot_df, family=binomial)

model_1312dec_word_overlap_gpt35_0shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_gpt35_0shot_df, family=binomial)
model_1312dec_word_overlap_gpt35_5shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_gpt35_5shot_df, family=binomial)
model_1312dec_word_overlap_gpt35_10shot <- bayesglm(correct ~ task + input_ntokens + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_gpt35_10shot_df, family=binomial)

model_1312dec_word_overlap_claude3_0shot_notokens <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_claude3_0shot_df, family=binomial)
model_1312dec_word_overlap_claude3_5shot_notokens <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_claude3_5shot_df, family=binomial)
model_1312dec_word_overlap_claude3_10shot_notokens <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_rot13and12dec_word_overlap_claude3_10shot_df, family=binomial)









In [294]:
summary(model_1312enc_word_overlap_gpt4_0shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_overlap_gpt4_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.37433  -0.08662  -0.07636   1.04582   1.30037  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                              -5.84135    1.72930  -3.378 0.000730
taskrot13enc_highprob_word_overlap_0shot  5.94554    1.74287   3.411 0.000646
output_ntokens                           -0.02817    0.24926  -0.113 0.910007
input_logprob                             0.08589    0.19891   0.432 0.665898
output_logprob                           -0.12427    0.21745  -0.571 0.567680
                                            
(Intercept)                              ***
taskrot13enc_highprob_word_overlap_0shot ***
output_ntokens                              
input_logprob                            

In [295]:
summary(model_1312enc_word_overlap_gpt4_5shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_overlap_gpt4_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.47767  -0.08541  -0.07488   0.99576   1.22001  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                              -5.86719    1.74763  -3.357 0.000787
taskrot13enc_highprob_word_overlap_5shot  6.10749    1.76162   3.467 0.000526
output_ntokens                            0.03730    0.25014   0.149 0.881447
input_logprob                            -0.13295    0.20066  -0.663 0.507621
output_logprob                           -0.04016    0.21847  -0.184 0.854148
                                            
(Intercept)                              ***
taskrot13enc_highprob_word_overlap_5shot ***
output_ntokens                              
input_logprob                            

In [296]:
summary(model_1312enc_word_overlap_gpt4_10shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_overlap_gpt4_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.41357  -0.08376  -0.07499   1.02726   1.15544  

Coefficients:
                                          Estimate Std. Error z value Pr(>|z|)
(Intercept)                               -5.89506    1.75390  -3.361 0.000776
taskrot13enc_highprob_word_overlap_10shot  6.16292    1.76851   3.485 0.000493
output_ntokens                             0.13560    0.25078   0.541 0.588685
input_logprob                              0.03378    0.19940   0.169 0.865478
output_logprob                             0.01684    0.21755   0.077 0.938313
                                             
(Intercept)                               ***
taskrot13enc_highprob_word_overlap_10shot ***
output_ntokens                               
input_logprob                 

In [300]:
summary(model_1312enc_word_overlap_gpt35_0shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_overlap_gpt35_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.45761  -0.84826  -0.07936  -0.05799   1.61691  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                               -5.7196     1.6653  -3.434 0.000594
taskrot13enc_highprob_word_overlap_0shot   5.3749     1.6779   3.203 0.001358
output_ntokens                            -0.1672     0.2553  -0.655 0.512469
input_logprob                              0.2376     0.2069   1.148 0.250889
output_logprob                             0.1900     0.2243   0.847 0.397034
                                            
(Intercept)                              ***
taskrot13enc_highprob_word_overlap_0shot ** 
output_ntokens                              
input_logprob                           

In [301]:
summary(model_1312enc_word_overlap_gpt35_5shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_overlap_gpt35_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.58496  -0.75699  -0.07846  -0.05555   1.74180  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                               -5.6829     1.6329  -3.480 0.000501
taskrot13enc_highprob_word_overlap_5shot   5.0805     1.6451   3.088 0.002014
output_ntokens                            -0.3101     0.2654  -1.168 0.242682
input_logprob                              0.2524     0.2149   1.174 0.240219
output_logprob                             0.2669     0.2317   1.152 0.249254
                                            
(Intercept)                              ***
taskrot13enc_highprob_word_overlap_5shot ** 
output_ntokens                              
input_logprob                           

In [302]:
summary(model_1312enc_word_overlap_gpt35_10shot)


Call:
bayesglm(formula = correct ~ task + output_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12enc_word_overlap_gpt35_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.42529  -0.78736  -0.08053  -0.06094   1.70563  

Coefficients:
                                          Estimate Std. Error z value Pr(>|z|)
(Intercept)                                -5.6731     1.6261  -3.489 0.000485
taskrot13enc_highprob_word_overlap_10shot   5.0221     1.6392   3.064 0.002186
output_ntokens                             -0.2153     0.2636  -0.817 0.413928
input_logprob                               0.1985     0.2135   0.930 0.352565
output_logprob                              0.2721     0.2309   1.178 0.238670
                                             
(Intercept)                               ***
taskrot13enc_highprob_word_overlap_10shot ** 
output_ntokens                               
input_logprob                

In [309]:
summary(model_1312enc_word_overlap_claude3_0shot_notokens)


Call:
glm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12enc_word_overlap_claude3_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1847  -0.9140   0.4977   0.7801   1.7278  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                               -0.7454     0.2259  -3.299  0.00097
taskrot13enc_highprob_word_overlap_0shot   2.3377     0.3716   6.292 3.14e-10
input_logprob                              0.2793     0.1717   1.626  0.10387
output_logprob                            -0.4725     0.1759  -2.686  0.00724
                                            
(Intercept)                              ***
taskrot13enc_highprob_word_overlap_0shot ***
input_logprob                               
output_logprob                           ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial 

In [310]:
summary(model_1312enc_word_overlap_claude3_5shot_notokens)


Call:
glm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12enc_word_overlap_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1923   0.4837   0.6044   0.7242   1.1665  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                               1.10211    0.24017   4.589 4.46e-06
taskrot13enc_highprob_word_overlap_5shot  0.56345    0.36799   1.531   0.1257
input_logprob                             0.39283    0.18063   2.175   0.0297
output_logprob                            0.04906    0.18464   0.266   0.7905
                                            
(Intercept)                              ***
taskrot13enc_highprob_word_overlap_5shot    
input_logprob                            *  
output_logprob                              
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial 

In [311]:
summary(model_1312enc_word_overlap_claude3_10shot_notokens)


Call:
glm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12enc_word_overlap_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7346   0.2428   0.3779   0.5137   1.0703  

Coefficients:
                                          Estimate Std. Error z value Pr(>|z|)
(Intercept)                                 2.9125     0.4553   6.397 1.58e-10
taskrot13enc_highprob_word_overlap_10shot  -0.9738     0.5262  -1.851  0.06421
input_logprob                               0.6764     0.2451   2.760  0.00578
output_logprob                             -0.3260     0.2303  -1.416  0.15680
                                             
(Intercept)                               ***
taskrot13enc_highprob_word_overlap_10shot .  
input_logprob                             ** 
output_logprob                               
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter fo

In [297]:
summary(model_1312dec_word_overlap_gpt4_0shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_overlap_gpt4_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.78947  -0.11357  -0.07029   0.76409   1.63100  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                               -5.9393     1.7417  -3.410 0.000649
taskrot13dec_highprob_word_overlap_0shot   6.0413     1.7563   3.440 0.000582
input_ntokens                              0.2015     0.2551   0.790 0.429660
input_logprob                              0.4408     0.2289   1.926 0.054142
output_logprob                            -0.3239     0.2095  -1.546 0.122058
                                            
(Intercept)                              ***
taskrot13dec_highprob_word_overlap_0shot ***
input_ntokens                               
input_logprob                            .

In [298]:
summary(model_1312dec_word_overlap_gpt4_5shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_overlap_gpt4_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.72877  -0.11138  -0.07332   0.79268   1.46661  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                              -5.90439    1.73375  -3.406 0.000660
taskrot13dec_highprob_word_overlap_5shot  5.97951    1.74836   3.420 0.000626
input_ntokens                             0.28195    0.25550   1.104 0.269802
input_logprob                             0.51077    0.23168   2.205 0.027482
output_logprob                           -0.03207    0.20385  -0.157 0.874987
                                            
(Intercept)                              ***
taskrot13dec_highprob_word_overlap_5shot ***
input_ntokens                               
input_logprob                            *

In [299]:
summary(model_1312dec_word_overlap_gpt4_10shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_overlap_gpt4_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.86010  -0.09649  -0.06627   0.84461   1.48677  

Coefficients:
                                          Estimate Std. Error z value Pr(>|z|)
(Intercept)                                -5.9578     1.7734  -3.360 0.000781
taskrot13dec_highprob_word_overlap_10shot   6.3237     1.7887   3.535 0.000407
input_ntokens                               0.1822     0.2560   0.712 0.476643
input_logprob                               0.3889     0.2302   1.689 0.091158
output_logprob                             -0.3186     0.2115  -1.507 0.131863
                                             
(Intercept)                               ***
taskrot13dec_highprob_word_overlap_10shot ***
input_ntokens                                
input_logprob                  

In [303]:
summary(model_1312dec_word_overlap_gpt35_0shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_overlap_gpt35_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.23747  -0.86777  -0.07980  -0.06711   1.62815  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                              -5.89354    1.68322  -3.501 0.000463
taskrot13dec_highprob_word_overlap_0shot  5.52861    1.69861   3.255 0.001135
input_ntokens                             0.34756    0.26044   1.335 0.182035
input_logprob                             0.22427    0.22532   0.995 0.319574
output_logprob                           -0.05377    0.20346  -0.264 0.791549
                                            
(Intercept)                              ***
taskrot13dec_highprob_word_overlap_0shot ** 
input_ntokens                               
input_logprob                            

In [304]:
summary(model_1312dec_word_overlap_gpt35_5shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_overlap_gpt35_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.22469  -0.97317  -0.07995  -0.07227   1.45773  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                              -5.71773    1.66190  -3.440 0.000581
taskrot13dec_highprob_word_overlap_5shot  5.34732    1.67518   3.192 0.001412
input_ntokens                            -0.09599    0.25207  -0.381 0.703341
input_logprob                             0.05221    0.21924   0.238 0.811778
output_logprob                            0.05857    0.20133   0.291 0.771123
                                            
(Intercept)                              ***
taskrot13dec_highprob_word_overlap_5shot ** 
input_ntokens                               
input_logprob                            

In [305]:
summary(model_1312dec_word_overlap_gpt35_10shot)


Call:
bayesglm(formula = correct ~ task + input_ntokens + input_logprob + 
    output_logprob, family = binomial, data = scaled_rot13and12dec_word_overlap_gpt35_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3362  -0.9606  -0.1681  -0.1488   2.8713  

Coefficients:
                                            Estimate Std. Error z value
(Intercept)                               -4.2217357  0.8330924  -5.068
taskrot13dec_highprob_word_overlap_10shot  3.8539386  0.8642935   4.459
input_ntokens                             -0.0009611  0.2478700  -0.004
input_logprob                              0.2141073  0.2187059   0.979
output_logprob                             0.0562009  0.1984158   0.283
                                          Pr(>|z|)    
(Intercept)                               4.03e-07 ***
taskrot13dec_highprob_word_overlap_10shot 8.23e-06 ***
input_ntokens                                0.997    
input_logprob                                0

In [306]:
summary(model_1312dec_word_overlap_claude3_0shot_notokens)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12dec_word_overlap_claude3_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.68917  -0.08513  -0.06971   0.82904   1.23489  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                              -6.01133    1.80834  -3.324 0.000887
taskrot13dec_highprob_word_overlap_0shot  6.65376    1.82108   3.654 0.000258
input_logprob                            -0.35590    0.19309  -1.843 0.065303
output_logprob                            0.05716    0.21012   0.272 0.785605
                                            
(Intercept)                              ***
taskrot13dec_highprob_word_overlap_0shot ***
input_logprob                            .  
output_logprob                              
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion paramete

In [307]:
summary(model_1312dec_word_overlap_claude3_5shot_notokens)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12dec_word_overlap_claude3_5shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.60221  -0.34582   0.04131   0.06108   2.43316  

Coefficients:
                                         Estimate Std. Error z value Pr(>|z|)
(Intercept)                               -2.8675     0.4729  -6.064 1.33e-09
taskrot13dec_highprob_word_overlap_5shot   9.2516     2.1723   4.259 2.06e-05
input_logprob                             -0.4909     0.4421  -1.110    0.267
output_logprob                             0.1129     0.3859   0.292    0.770
                                            
(Intercept)                              ***
taskrot13dec_highprob_word_overlap_5shot ***
input_logprob                               
output_logprob                              
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion paramete

In [308]:
summary(model_1312dec_word_overlap_claude3_10shot_notokens)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_rot13and12dec_word_overlap_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9027  -0.6976   0.1464   0.1735   1.8854  

Coefficients:
                                          Estimate Std. Error z value Pr(>|z|)
(Intercept)                               -1.22233    0.24838  -4.921 8.60e-07
taskrot13dec_highprob_word_overlap_10shot  5.54533    0.89950   6.165 7.05e-10
input_logprob                              0.03734    0.25203   0.148    0.882
output_logprob                             0.26998    0.23369   1.155    0.248
                                             
(Intercept)                               ***
taskrot13dec_highprob_word_overlap_10shot ***
input_logprob                                
output_logprob                               
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion paramet

In [312]:
vif(model_1312enc_word_overlap_gpt4_0shot)

In [313]:
vif(model_1312enc_word_overlap_gpt4_5shot)

In [314]:
vif(model_1312enc_word_overlap_gpt4_10shot)

In [315]:
vif(model_1312enc_word_overlap_gpt35_0shot)

In [316]:
vif(model_1312enc_word_overlap_gpt35_5shot)

In [317]:
vif(model_1312enc_word_overlap_gpt35_10shot)

In [321]:
vif(model_1312enc_word_overlap_claude3_0shot_notokens)

In [322]:
vif(model_1312enc_word_overlap_claude3_5shot_notokens)

In [323]:
vif(model_1312enc_word_overlap_claude3_10shot_notokens)

In [324]:
vif(model_1312dec_word_overlap_gpt4_0shot)

In [325]:
vif(model_1312dec_word_overlap_gpt4_5shot)

In [326]:
vif(model_1312dec_word_overlap_gpt4_10shot)

In [327]:
vif(model_1312dec_word_overlap_gpt35_0shot)

In [328]:
vif(model_1312dec_word_overlap_gpt35_5shot)

In [329]:
vif(model_1312dec_word_overlap_gpt35_10shot)

In [331]:
vif(model_1312dec_word_overlap_claude3_0shot_notokens)

In [332]:
vif(model_1312dec_word_overlap_claude3_5shot_notokens)

In [333]:
vif(model_1312dec_word_overlap_claude3_10shot_notokens)

# Reversal

In [334]:
# Read in data
revenc_gpt4_df <- read.table(file = 'table_revenc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
revenc_gpt35_df <- read.table(file = 'table_revenc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
revenc_llama3_df <- read.table(file = 'table_revenc_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
revenc_claude3_df <- read.table(file = 'table_revenc_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
revenc_gemini1_df <- read.table(file = 'table_revenc_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


revdec_gpt4_df <- read.table(file = 'table_revdec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
revdec_gpt35_df <- read.table(file = 'table_revdec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
revdec_llama3_df <- read.table(file = 'table_revdec_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
revdec_claude3_df <- read.table(file = 'table_revdec_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
revdec_gemini1_df <- read.table(file = 'table_revdec_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [335]:
# Z-score data
scaled_revenc_gpt4_df <- scale_df(revenc_gpt4_df)
scaled_revenc_gpt35_df <- scale_df(revenc_gpt35_df)
scaled_revenc_llama3_df <- scale_df(revenc_llama3_df)
scaled_revenc_claude3_df <- scale_df(revenc_claude3_df)
scaled_revenc_gemini1_df <- scale_df(revenc_gemini1_df)


scaled_revdec_gpt4_df <- scale_df(revdec_gpt4_df)
scaled_revdec_gpt35_df <- scale_df(revdec_gpt35_df)
scaled_revdec_llama3_df <- scale_df(revdec_llama3_df)
scaled_revdec_claude3_df <- scale_df(revdec_claude3_df)
scaled_revdec_gemini1_df <- scale_df(revdec_gemini1_df)


In [336]:

revenc_gpt4_model <- correct_vs_length_and_prob(scaled_revenc_gpt4_df, include_output_chars=FALSE)
revenc_gpt35_model <- correct_vs_length_and_prob(scaled_revenc_gpt35_df, include_output_chars=FALSE)
revenc_llama3_model <- correct_vs_length_and_prob(scaled_revenc_llama3_df, include_output_chars=FALSE)
revenc_claude3_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
               data=scaled_revenc_claude3_df, family=binomial)
revenc_gemini1_model <- correct_vs_length_and_prob(scaled_revenc_gemini1_df, include_output_chars=FALSE)

revdec_gpt4_model <- correct_vs_length_and_prob(scaled_revdec_gpt4_df, include_output_chars=FALSE)
revdec_gpt35_model <- correct_vs_length_and_prob(scaled_revdec_gpt35_df, include_output_chars=FALSE)
revdec_llama3_model <- correct_vs_length_and_prob(scaled_revdec_llama3_df, include_output_chars=FALSE)
revdec_claude3_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
               data=scaled_revdec_claude3_df, family=binomial)
revdec_gemini1_model <- correct_vs_length_and_prob(scaled_revdec_gemini1_df, include_output_chars=FALSE)



In [337]:
summary(revenc_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4668   0.3250   0.4665   0.5949   1.7972  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.71991    0.17799   9.663   <2e-16 ***
input_logprob  -0.03883    0.20693  -0.188   0.8511    
output_logprob -0.30283    0.77852  -0.389   0.6973    
input_ntokens   3.06200    1.90858   1.604   0.1086    
output_ntokens -3.65279    2.03329  -1.796   0.0724 .  
input_nchars   -0.67044    0.65079  -1.030   0.3029    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 288.81  on 299  degrees of freedom
Residual deviance: 245.57  on 294  degrees of freedom
AIC: 257.57

Number of Fisher Scoring iterations: 5


In [338]:
summary(revenc_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5268  -0.9842  -0.4943   1.0049   2.1479  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.64275    0.14184  -4.532 5.85e-06 ***
input_logprob   0.27657    0.22757   1.215    0.224    
output_logprob -0.11149    0.86103  -0.129    0.897    
input_ntokens  -1.68717    1.72867  -0.976    0.329    
output_ntokens  0.01594    1.73855   0.009    0.993    
input_nchars    0.64733    0.68799   0.941    0.347    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 400.34  on 299  degrees of freedom
Residual deviance: 341.51  on 294  degrees of freedom
AIC: 353.51

Number of Fisher Scoring iterations: 5


In [339]:
summary(revenc_llama3_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.65358  -0.30337  -0.17592  -0.06698   3.03519  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.6375     0.7918  -5.857 4.71e-09 ***
input_logprob    0.9064     0.9316   0.973    0.331    
output_logprob  -2.3569     3.1214  -0.755    0.450    
input_ntokens    1.1798     4.6818   0.252    0.801    
output_ntokens  -3.8041     4.5225  -0.841    0.400    
input_nchars    -1.2790     2.3890  -0.535    0.592    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 87.687  on 299  degrees of freedom
Residual deviance: 74.111  on 294  degrees of freedom
AIC: 86.111

Number of Fisher Scoring iterations: 8


In [340]:
summary(revenc_claude3_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_revenc_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.0122  -0.3549   0.2623   0.4553   1.9621  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.4785     0.2319   6.375 1.83e-10 ***
input_logprob    3.4693     0.3881   8.939  < 2e-16 ***
output_logprob  -2.7827     0.9074  -3.067  0.00216 ** 
input_nchars    -0.2981     0.8085  -0.369  0.71239    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 368.20  on 299  degrees of freedom
Residual deviance: 196.57  on 296  degrees of freedom
AIC: 204.57

Number of Fisher Scoring iterations: 6


In [341]:
summary(revenc_gemini1_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.84128  -0.55520  -0.19226  -0.01592   2.31691  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.9033     0.3848  -7.544 4.55e-14 ***
input_logprob    1.9918     0.5024   3.965 7.35e-05 ***
output_logprob   0.1596     1.5527   0.103   0.9181    
input_ntokens    3.3884     2.8724   1.180   0.2381    
output_ntokens  -6.4500     2.9922  -2.156   0.0311 *  
input_nchars     1.9975     1.1393   1.753   0.0796 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 302.99  on 299  degrees of freedom
Residual deviance: 199.61  on 294  degrees of freedom
AIC: 211.61

Number of Fisher Scoring iterations: 7


In [347]:
summary(revdec_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8716   0.1950   0.3063   0.4420   1.9434  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.1370     0.2382   8.972  < 2e-16 ***
input_logprob   -2.9227     0.9709  -3.010  0.00261 ** 
output_logprob   2.2588     0.3278   6.891 5.55e-12 ***
input_ntokens   -0.1544     2.3128  -0.067  0.94677    
output_ntokens  -0.9864     2.2231  -0.444  0.65727    
input_nchars    -0.4750     0.7494  -0.634  0.52616    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 282.84  on 299  degrees of freedom
Residual deviance: 193.97  on 294  degrees of freedom
AIC: 205.97

Number of Fisher Scoring iterations: 6


In [348]:
summary(revdec_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9183  -0.9433   0.5918   0.8400   2.5700  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.0747     0.1386   0.539  0.58981    
input_logprob   -2.2254     0.8629  -2.579  0.00991 ** 
output_logprob   1.3848     0.2447   5.659 1.52e-08 ***
input_ntokens   -3.0624     1.8011  -1.700  0.08908 .  
output_ntokens   1.2449     1.7323   0.719  0.47235    
input_nchars    -0.5018     0.7103  -0.706  0.47989    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 413.63  on 299  degrees of freedom
Residual deviance: 328.84  on 294  degrees of freedom
AIC: 340.84

Number of Fisher Scoring iterations: 5


In [349]:
summary(revdec_llama3_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8550  -0.7805  -0.2430   0.8383   3.2149  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.2890     0.2062  -6.251 4.09e-10 ***
input_logprob   -0.2878     1.1042  -0.261   0.7944    
output_logprob   1.5947     0.3430   4.649 3.33e-06 ***
input_ntokens   -3.4502     2.0516  -1.682   0.0926 .  
output_ntokens   2.1893     1.9701   1.111   0.2665    
input_nchars     0.1994     0.9012   0.221   0.8249    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 387.22  on 299  degrees of freedom
Residual deviance: 277.87  on 294  degrees of freedom
AIC: 289.87

Number of Fisher Scoring iterations: 6


In [350]:
summary(revdec_claude3_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_revdec_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.2949  -0.4483   0.3295   0.5556   2.4010  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.7515     0.1841   4.083 4.45e-05 ***
input_logprob   -1.7020     0.8125  -2.095   0.0362 *  
output_logprob   3.3464     0.3708   9.024  < 2e-16 ***
input_nchars     0.4681     0.7721   0.606   0.5443    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 395.37  on 299  degrees of freedom
Residual deviance: 221.77  on 296  degrees of freedom
AIC: 229.77

Number of Fisher Scoring iterations: 5


In [351]:
summary(revdec_gemini1_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0432  -0.6414  -0.1063   0.7578   2.8660  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.4662     0.2461  -5.958 2.55e-09 ***
input_logprob   -4.2670     1.1733  -3.637 0.000276 ***
output_logprob   3.5286     0.4839   7.292 3.05e-13 ***
input_ntokens   -1.4551     2.6720  -0.545 0.586057    
output_ntokens  -0.0578     2.5931  -0.022 0.982216    
input_nchars    -2.1372     0.8939  -2.391 0.016816 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 399.40  on 299  degrees of freedom
Residual deviance: 258.77  on 294  degrees of freedom
AIC: 270.77

Number of Fisher Scoring iterations: 6


In [342]:
vif(revenc_gpt4_model)

In [343]:
vif(revenc_gpt35_model)

In [344]:
vif(revenc_llama3_model)

In [345]:
vif(revenc_claude3_notokens_model)

In [346]:
vif(revenc_gemini1_model)

In [352]:
vif(revdec_gpt4_model)

In [353]:
vif(revdec_gpt35_model)

In [354]:
vif(revdec_llama3_model)

In [355]:
vif(revdec_claude3_notokens_model)

In [356]:
vif(revdec_gemini1_model)

## Reversal (few-shot)

In [357]:
revenc_gpt4_0shot_df <- read.table(file = 'table_few_revenc_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
revenc_gpt4_5shot_df <- read.table(file = 'table_few_revenc_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
revenc_gpt4_10shot_df <- read.table(file = 'table_few_revenc_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

revenc_gpt35_0shot_df <- read.table(file = 'table_few_revenc_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
revenc_gpt35_5shot_df <- read.table(file = 'table_few_revenc_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
revenc_gpt35_10shot_df <- read.table(file = 'table_few_revenc_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

revenc_claude3_0shot_df <- read.table(file = 'table_few_revenc_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
revenc_claude3_5shot_df <- read.table(file = 'table_few_revenc_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
revenc_claude3_10shot_df <- read.table(file = 'table_few_revenc_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)

revenc_gpt35ft_0shot_df <- read.table(file = 'table_few_revenc_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
revenc_gpt35ft_10shot_df <- read.table(file = 'table_few_revenc_ft_gpt-3.5_10shot_0shot.tsv', sep = '\t', header = TRUE)
revenc_gpt35ft_100shot_df <- read.table(file = 'table_few_revenc_ft_gpt-3.5_100shot_0shot.tsv', sep = '\t', header = TRUE)



revdec_gpt4_0shot_df <- read.table(file = 'table_few_revdec_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
revdec_gpt4_5shot_df <- read.table(file = 'table_few_revdec_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
revdec_gpt4_10shot_df <- read.table(file = 'table_few_revdec_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

revdec_gpt35_0shot_df <- read.table(file = 'table_few_revdec_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
revdec_gpt35_5shot_df <- read.table(file = 'table_few_revdec_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
revdec_gpt35_10shot_df <- read.table(file = 'table_few_revdec_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

revdec_claude3_0shot_df <- read.table(file = 'table_few_revdec_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
revdec_claude3_5shot_df <- read.table(file = 'table_few_revdec_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
revdec_claude3_10shot_df <- read.table(file = 'table_few_revdec_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)

revdec_gpt35ft_0shot_df <- read.table(file = 'table_few_revdec_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
revdec_gpt35ft_10shot_df <- read.table(file = 'table_few_revdec_ft_gpt-3.5_10shot_0shot.tsv', sep = '\t', header = TRUE)
revdec_gpt35ft_100shot_df <- read.table(file = 'table_few_revdec_ft_gpt-3.5_100shot_0shot.tsv', sep = '\t', header = TRUE)





In [358]:
scaled_revenc_gpt4_0shot_df <- scale_df(revenc_gpt4_0shot_df)
scaled_revenc_gpt4_5shot_df <- scale_df(revenc_gpt4_5shot_df)
scaled_revenc_gpt4_10shot_df <- scale_df(revenc_gpt4_10shot_df)

scaled_revenc_gpt35_0shot_df <- scale_df(revenc_gpt35_0shot_df)
scaled_revenc_gpt35_5shot_df <- scale_df(revenc_gpt35_5shot_df)
scaled_revenc_gpt35_10shot_df <- scale_df(revenc_gpt35_10shot_df)

scaled_revenc_claude3_0shot_df <- scale_df(revenc_claude3_0shot_df)
scaled_revenc_claude3_5shot_df <- scale_df(revenc_claude3_5shot_df)
scaled_revenc_claude3_10shot_df <- scale_df(revenc_claude3_10shot_df)

scaled_revenc_gpt35ft_0shot_df <- scale_df(revenc_gpt35ft_0shot_df)
scaled_revenc_gpt35ft_10shot_df <- scale_df(revenc_gpt35ft_10shot_df)
scaled_revenc_gpt35ft_100shot_df <- scale_df(revenc_gpt35ft_100shot_df)





scaled_revdec_gpt4_0shot_df <- scale_df(revdec_gpt4_0shot_df)
scaled_revdec_gpt4_5shot_df <- scale_df(revdec_gpt4_5shot_df)
scaled_revdec_gpt4_10shot_df <- scale_df(revdec_gpt4_10shot_df)

scaled_revdec_gpt35_0shot_df <- scale_df(revdec_gpt35_0shot_df)
scaled_revdec_gpt35_5shot_df <- scale_df(revdec_gpt35_5shot_df)
scaled_revdec_gpt35_10shot_df <- scale_df(revdec_gpt35_10shot_df)

scaled_revdec_claude3_0shot_df <- scale_df(revdec_claude3_0shot_df)
scaled_revdec_claude3_5shot_df <- scale_df(revdec_claude3_5shot_df)
scaled_revdec_claude3_10shot_df <- scale_df(revdec_claude3_10shot_df)

scaled_revdec_gpt35ft_0shot_df <- scale_df(revdec_gpt35ft_0shot_df)
scaled_revdec_gpt35ft_10shot_df <- scale_df(revdec_gpt35ft_10shot_df)
scaled_revdec_gpt35ft_100shot_df <- scale_df(revdec_gpt35ft_100shot_df)





In [359]:
revenc_gpt4_0shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt4_0shot_df, include_output_chars=FALSE)
revenc_gpt4_5shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt4_5shot_df, include_output_chars=FALSE)
revenc_gpt4_10shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt4_10shot_df, include_output_chars=FALSE)

revenc_gpt35_0shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt35_0shot_df, include_output_chars=FALSE)
revenc_gpt35_5shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt35_5shot_df, include_output_chars=FALSE)
revenc_gpt35_10shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt35_10shot_df, include_output_chars=FALSE)

revenc_claude3_0shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                       data=scaled_revenc_claude3_0shot_df, family=binomial)
revenc_claude3_5shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                       data=scaled_revenc_claude3_5shot_df, family=binomial)
revenc_claude3_10shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                       data=scaled_revenc_claude3_10shot_df, family=binomial)

revenc_gpt35ft_0shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt35ft_0shot_df, include_output_chars=FALSE)
revenc_gpt35ft_10shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt35ft_10shot_df, include_output_chars=FALSE)
revenc_gpt35ft_100shot_model <- correct_vs_length_and_prob(scaled_revenc_gpt35ft_100shot_df, include_output_chars=FALSE)






revdec_gpt4_0shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt4_0shot_df, include_output_chars=FALSE)
revdec_gpt4_5shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt4_5shot_df, include_output_chars=FALSE)
revdec_gpt4_10shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt4_10shot_df, include_output_chars=FALSE)

revdec_gpt35_0shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt35_0shot_df, include_output_chars=FALSE)
revdec_gpt35_5shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt35_5shot_df, include_output_chars=FALSE)
revdec_gpt35_10shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt35_10shot_df, include_output_chars=FALSE)

revdec_claude3_0shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                       data=scaled_revdec_claude3_0shot_df, family=binomial)
revdec_claude3_5shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                       data=scaled_revdec_claude3_5shot_df, family=binomial)
revdec_claude3_10shot_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                       data=scaled_revdec_claude3_10shot_df, family=binomial)

revdec_gpt35ft_0shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt35ft_0shot_df, include_output_chars=FALSE)
revdec_gpt35ft_10shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt35ft_10shot_df, include_output_chars=FALSE)
revdec_gpt35ft_100shot_model <- correct_vs_length_and_prob(scaled_revdec_gpt35ft_100shot_df, include_output_chars=FALSE)




In [360]:
summary(revenc_gpt4_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7724  -1.3048   0.8114   0.9516   1.3441  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.54906    0.12233   4.488 7.18e-06 ***
input_logprob   0.04037    0.19725   0.205   0.8378    
output_logprob  1.09198    0.76038   1.436   0.1510    
input_ntokens  -0.41127    1.50968  -0.272   0.7853    
output_ntokens -0.03916    1.60724  -0.024   0.9806    
input_nchars    1.25566    0.62196   2.019   0.0435 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 395.37  on 299  degrees of freedom
Residual deviance: 383.45  on 294  degrees of freedom
AIC: 395.45

Number of Fisher Scoring iterations: 4


In [361]:
summary(revenc_gpt4_5shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7003   0.2640   0.3240   0.4443   2.0388  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.4404     0.2344  10.412  < 2e-16 ***
input_logprob    0.9747     0.2896   3.365 0.000765 ***
output_logprob  -0.8956     1.1521  -0.777 0.436932    
input_ntokens    2.7231     2.2690   1.200 0.230078    
output_ntokens  -3.1408     2.4359  -1.289 0.197276    
input_nchars    -0.4452     0.9030  -0.493 0.622010    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 207.91  on 299  degrees of freedom
Residual deviance: 176.00  on 294  degrees of freedom
AIC: 188

Number of Fisher Scoring iterations: 5


In [362]:
summary(revenc_gpt4_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9202   0.1799   0.2382   0.3494   2.2531  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      3.0997     0.3245   9.553  < 2e-16 ***
input_logprob    1.8088     0.4306   4.201 2.66e-05 ***
output_logprob  -2.3215     1.4852  -1.563    0.118    
input_ntokens   -1.3065     2.8214  -0.463    0.643    
output_ntokens  -0.1720     3.0438  -0.057    0.955    
input_nchars     0.1404     1.2206   0.115    0.908    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 162.33  on 299  degrees of freedom
Residual deviance: 125.70  on 294  degrees of freedom
AIC: 137.7

Number of Fisher Scoring iterations: 6


In [363]:
summary(revenc_gpt35_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.7212  -0.3173  -0.2123  -0.1241   3.1788  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.0128     0.5441  -7.375 1.64e-13 ***
input_logprob    0.7716     0.8280   0.932    0.351    
output_logprob  -3.6619     2.4840  -1.474    0.140    
input_ntokens    1.3292     4.6376   0.287    0.774    
output_ntokens  -5.3240     4.6328  -1.149    0.250    
input_nchars    -0.4499     1.8481  -0.243    0.808    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 94.321  on 299  degrees of freedom
Residual deviance: 83.014  on 294  degrees of freedom
AIC: 95.014

Number of Fisher Scoring iterations: 7


In [364]:
summary(revenc_gpt35_5shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0891  -1.0396   0.6280   0.8216   1.8870  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.8296     0.1346   6.164  7.1e-10 ***
input_logprob    0.4585     0.2142   2.140   0.0323 *  
output_logprob  -1.7273     0.8349  -2.069   0.0386 *  
input_ntokens    1.6824     1.6162   1.041   0.2979    
output_ntokens  -3.4834     1.7415  -2.000   0.0455 *  
input_nchars    -0.3067     0.6589  -0.465   0.6416    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 376.12  on 299  degrees of freedom
Residual deviance: 336.54  on 294  degrees of freedom
AIC: 348.54

Number of Fisher Scoring iterations: 4


In [365]:
summary(revenc_gpt35_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9570  -1.2825   0.7460   0.8879   1.7345  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.6819     0.1268   5.377 7.57e-08 ***
input_logprob    0.2716     0.2041   1.331   0.1832    
output_logprob  -1.5604     0.7974  -1.957   0.0504 .  
input_ntokens    1.0557     1.5516   0.680   0.4963    
output_ntokens  -2.6204     1.6640  -1.575   0.1153    
input_nchars    -0.2603     0.6279  -0.415   0.6784    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 385.93  on 299  degrees of freedom
Residual deviance: 364.50  on 294  degrees of freedom
AIC: 376.5

Number of Fisher Scoring iterations: 4


In [367]:
summary(revenc_claude3_0shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_revenc_claude3_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0099  -1.0665   0.6869   0.8516   1.7561  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.5708     0.1300   4.391 1.13e-05 ***
input_logprob    1.2788     0.2236   5.718 1.08e-08 ***
output_logprob  -0.6750     0.6839  -0.987    0.324    
input_nchars     0.1225     0.6314   0.194    0.846    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 396.42  on 299  degrees of freedom
Residual deviance: 353.21  on 296  degrees of freedom
AIC: 361.21

Number of Fisher Scoring iterations: 4


In [368]:
summary(revenc_claude3_5shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_revenc_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9588  -0.2969   0.2639   0.4830   2.2448  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.52522    0.21466   7.105  1.2e-12 ***
input_logprob   3.19041    0.38857   8.211  < 2e-16 ***
output_logprob -0.04998    0.93363  -0.054   0.9573    
input_nchars    1.95692    0.90437   2.164   0.0305 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 355.77  on 299  degrees of freedom
Residual deviance: 203.49  on 296  degrees of freedom
AIC: 211.49

Number of Fisher Scoring iterations: 6


In [369]:
summary(revenc_claude3_10shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_revenc_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.3384  -0.2399   0.2081   0.4193   2.3578  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.7696     0.2533   6.988 2.80e-12 ***
input_logprob    3.7207     0.4534   8.206 2.29e-16 ***
output_logprob   0.2509     1.0084   0.249  0.80348    
input_nchars     2.7675     0.9988   2.771  0.00559 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 353.87  on 299  degrees of freedom
Residual deviance: 180.36  on 296  degrees of freedom
AIC: 188.36

Number of Fisher Scoring iterations: 6


In [370]:
summary(revenc_gpt35ft_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.7212  -0.3173  -0.2123  -0.1241   3.1788  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.0128     0.5441  -7.375 1.64e-13 ***
input_logprob    0.7716     0.8280   0.932    0.351    
output_logprob  -3.6619     2.4840  -1.474    0.140    
input_ntokens    1.3292     4.6376   0.287    0.774    
output_ntokens  -5.3240     4.6328  -1.149    0.250    
input_nchars    -0.4499     1.8481  -0.243    0.808    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 94.321  on 299  degrees of freedom
Residual deviance: 83.014  on 294  degrees of freedom
AIC: 95.014

Number of Fisher Scoring iterations: 7


In [371]:
summary(revenc_gpt35ft_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8249   0.2267   0.3042   0.4343   1.7280  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.5758     0.2548  10.111   <2e-16 ***
input_logprob   -0.2251     0.2823  -0.797    0.425    
output_logprob  -0.8832     1.1173  -0.791    0.429    
input_ntokens    3.6907     2.2943   1.609    0.108    
output_ntokens  -6.4935     2.5547  -2.542    0.011 *  
input_nchars     0.7879     0.9298   0.847    0.397    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 207.91  on 299  degrees of freedom
Residual deviance: 164.59  on 294  degrees of freedom
AIC: 176.59

Number of Fisher Scoring iterations: 6


In [372]:
summary(revenc_gpt35ft_100shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.67179   0.08833   0.13382   0.21978   0.99083  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      4.3190     0.5599   7.713 1.23e-14 ***
input_logprob   -0.5834     0.4566  -1.278   0.2013    
output_logprob   1.3609     1.7257   0.789   0.4303    
input_ntokens    1.3297     4.0491   0.328   0.7426    
output_ntokens  -4.4573     4.5893  -0.971   0.3314    
input_nchars     2.7755     1.4864   1.867   0.0619 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 100.766  on 299  degrees of freedom
Residual deviance:  65.657  on 294  degrees of freedom
AIC: 77.657

Number of Fisher Scoring iterations: 7


In [373]:
summary(revdec_gpt4_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.70732   0.01725   0.41929   0.61930   2.26361  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.4570     0.1824   7.989 1.35e-15 ***
input_logprob   -2.0154     1.0098  -1.996   0.0459 *  
output_logprob   2.1287     0.3047   6.987 2.81e-12 ***
input_ntokens    1.1696     2.0408   0.573   0.5666    
output_ntokens  -1.1016     1.9369  -0.569   0.5695    
input_nchars    -0.4709     0.7911  -0.595   0.5517    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 337.40  on 299  degrees of freedom
Residual deviance: 261.96  on 294  degrees of freedom
AIC: 273.96

Number of Fisher Scoring iterations: 5


In [374]:
summary(revdec_gpt4_5shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1446  -0.7630   0.5580   0.6839   2.4993  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.19173    0.14744   8.083 6.32e-16 ***
input_logprob  -1.28684    0.87434  -1.472    0.141    
output_logprob  1.15995    0.23635   4.908 9.22e-07 ***
input_ntokens  -2.23679    1.86541  -1.199    0.230    
output_ntokens  1.41117    1.73583   0.813    0.416    
input_nchars   -0.06773    0.68961  -0.098    0.922    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 341.72  on 299  degrees of freedom
Residual deviance: 300.04  on 294  degrees of freedom
AIC: 312.04

Number of Fisher Scoring iterations: 4


In [375]:
summary(revdec_gpt4_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0753   0.4547   0.5565   0.6847   2.2460  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.2835     0.1487   8.633  < 2e-16 ***
input_logprob   -0.9205     0.8671  -1.062  0.28844    
output_logprob   0.8792     0.2286   3.847  0.00012 ***
input_ntokens   -0.2252     1.8581  -0.121  0.90352    
output_ntokens  -0.2544     1.7374  -0.146  0.88357    
input_nchars    -0.3038     0.6856  -0.443  0.65770    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 328.32  on 299  degrees of freedom
Residual deviance: 297.52  on 294  degrees of freedom
AIC: 309.52

Number of Fisher Scoring iterations: 4


In [376]:
summary(revdec_gpt35_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7625  -0.9985  -0.2170   0.9224   2.8136  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.1929     0.1369  -1.409 0.158754    
input_logprob   -3.1882     0.9177  -3.474 0.000512 ***
output_logprob   1.4756     0.2707   5.452 4.99e-08 ***
input_ntokens   -3.7092     1.7915  -2.070 0.038415 *  
output_ntokens   0.5806     1.6814   0.345 0.729883    
input_nchars     0.1258     0.6755   0.186 0.852319    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 415.67  on 299  degrees of freedom
Residual deviance: 342.25  on 294  degrees of freedom
AIC: 354.25

Number of Fisher Scoring iterations: 5


In [377]:
summary(revdec_gpt35_5shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7034  -0.9021  -0.3918   0.9676   2.9710  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.912155   0.165114  -5.524 3.31e-08 ***
input_logprob  -2.585621   0.982059  -2.633  0.00847 ** 
output_logprob  1.807566   0.330407   5.471 4.48e-08 ***
input_ntokens  -1.695625   1.878373  -0.903  0.36668    
output_ntokens -0.728780   1.793173  -0.406  0.68443    
input_nchars    0.008317   0.730075   0.011  0.99091    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 394.29  on 299  degrees of freedom
Residual deviance: 316.85  on 294  degrees of freedom
AIC: 328.85

Number of Fisher Scoring iterations: 5


In [378]:
summary(revdec_gpt35_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7708  -0.8421  -0.3337   0.9063   3.3797  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.2873     0.1966  -6.547 5.89e-11 ***
input_logprob   -2.7229     1.0709  -2.543    0.011 *  
output_logprob   2.2493     0.3910   5.752 8.82e-09 ***
input_ntokens   -1.8578     1.9992  -0.929    0.353    
output_ntokens  -1.0974     1.9104  -0.574    0.566    
input_nchars     0.4822     0.7799   0.618    0.536    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 380.51  on 299  degrees of freedom
Residual deviance: 290.36  on 294  degrees of freedom
AIC: 302.36

Number of Fisher Scoring iterations: 6


In [380]:
summary(revdec_claude3_0shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_revdec_claude3_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3676  -0.6178   0.3699   0.6926   3.1296  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.02133    0.16292   0.131    0.896    
input_logprob  -1.44632    0.81237  -1.780    0.075 .  
output_logprob  3.22590    0.38073   8.473   <2e-16 ***
input_nchars    0.41218    0.74283   0.555    0.579    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 413.63  on 299  degrees of freedom
Residual deviance: 263.21  on 296  degrees of freedom
AIC: 271.21

Number of Fisher Scoring iterations: 5


In [381]:
summary(revdec_claude3_5shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_revdec_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9640  -0.3232   0.2376   0.4987   2.1198  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.3761     0.1936   1.943    0.052 .  
input_logprob   -1.4486     0.9794  -1.479    0.139    
output_logprob   4.5708     0.5256   8.696   <2e-16 ***
input_nchars     1.2844     0.9158   1.403    0.161    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 405.37  on 299  degrees of freedom
Residual deviance: 193.90  on 296  degrees of freedom
AIC: 201.9

Number of Fisher Scoring iterations: 6


In [382]:
summary(revdec_claude3_10shot_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars, 
    family = binomial, data = scaled_revdec_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.1260  -0.3535   0.2454   0.4953   2.9017  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.8040     0.1978   4.065  4.8e-05 ***
input_logprob   -0.7982     0.9747  -0.819   0.4129    
output_logprob   4.1272     0.4699   8.783  < 2e-16 ***
input_nchars     1.8548     0.9353   1.983   0.0474 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 395.37  on 299  degrees of freedom
Residual deviance: 196.93  on 296  degrees of freedom
AIC: 204.93

Number of Fisher Scoring iterations: 6


In [383]:
summary(revdec_gpt35ft_0shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7625  -0.9985  -0.2170   0.9224   2.8136  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.1929     0.1369  -1.409 0.158754    
input_logprob   -3.1882     0.9177  -3.474 0.000512 ***
output_logprob   1.4756     0.2707   5.452 4.99e-08 ***
input_ntokens   -3.7092     1.7915  -2.070 0.038415 *  
output_ntokens   0.5806     1.6814   0.345 0.729883    
input_nchars     0.1258     0.6755   0.186 0.852319    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 415.67  on 299  degrees of freedom
Residual deviance: 342.25  on 294  degrees of freedom
AIC: 354.25

Number of Fisher Scoring iterations: 5


In [384]:
summary(revdec_gpt35ft_10shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6878   0.2415   0.2942   0.3663   1.1433  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.9415     0.2795  10.525   <2e-16 ***
input_logprob   -2.2179     1.2944  -1.713   0.0866 .  
output_logprob   0.2732     0.3213   0.850   0.3951    
input_ntokens   -1.2518     3.1875  -0.393   0.6945    
output_ntokens  -0.3711     2.9084  -0.128   0.8985    
input_nchars    -0.9763     1.0013  -0.975   0.3296    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 141.63  on 299  degrees of freedom
Residual deviance: 128.38  on 294  degrees of freedom
AIC: 140.38

Number of Fisher Scoring iterations: 6


In [385]:
summary(revdec_gpt35ft_100shot_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.2158   0.1015   0.1292   0.1840   0.9701  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      4.5505     0.6138   7.414 1.23e-13 ***
input_logprob   -1.6989     1.9818  -0.857    0.391    
output_logprob   0.1197     0.4732   0.253    0.800    
input_ntokens    4.6514     5.9132   0.787    0.432    
output_ntokens  -5.2971     5.5206  -0.960    0.337    
input_nchars    -1.8680     1.4825  -1.260    0.208    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 58.823  on 299  degrees of freedom
Residual deviance: 48.638  on 294  degrees of freedom
AIC: 60.638

Number of Fisher Scoring iterations: 7


In [386]:
vif(revenc_gpt4_0shot_model)

In [387]:
vif(revenc_gpt4_5shot_model)

In [388]:
vif(revenc_gpt4_10shot_model)

In [389]:
vif(revenc_gpt35_0shot_model)

In [390]:
vif(revenc_gpt35_5shot_model)

In [391]:
vif(revenc_gpt35_10shot_model)

In [393]:
vif(revenc_claude3_0shot_notokens_model)

In [394]:
vif(revenc_claude3_5shot_notokens_model)

In [395]:
vif(revenc_claude3_10shot_notokens_model)

In [396]:
vif(revenc_gpt35ft_0shot_model)

In [397]:
vif(revenc_gpt35ft_10shot_model)

In [398]:
vif(revenc_gpt35ft_100shot_model)

In [399]:
vif(revdec_gpt4_0shot_model)

In [400]:
vif(revdec_gpt4_5shot_model)

In [401]:
vif(revdec_gpt4_10shot_model)

In [402]:
vif(revdec_gpt35_0shot_model)

In [403]:
vif(revdec_gpt35_5shot_model)

In [404]:
vif(revdec_gpt35_10shot_model)

In [406]:
vif(revdec_claude3_0shot_notokens_model)

In [407]:
vif(revdec_claude3_5shot_notokens_model)

In [408]:
vif(revdec_claude3_10shot_notokens_model)

In [409]:
vif(revdec_gpt35ft_0shot_model)

In [410]:
vif(revdec_gpt35ft_10shot_model)

In [411]:
vif(revdec_gpt35ft_100shot_model)

# Swap

In [499]:
# Read in data
swap_next_base_gpt4_df <- read.table(file = 'table_swap_next_base_gpt-4-0613.tsv', sep = '\t', header = TRUE)
swap_next_base_gpt35_df <- read.table(file = 'table_swap_next_base_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
swap_next_base_llama3_df <- read.table(file = 'table_swap_next_base_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
swap_next_base_claude3_df <- read.table(file = 'table_swap_next_base_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
swap_next_base_gemini1_df <- read.table(file = 'table_swap_next_base_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)



In [500]:
# Z-score data
scaled_swap_next_base_gpt4_df <- scale_df(swap_next_base_gpt4_df)
scaled_swap_next_base_gpt35_df <- scale_df(swap_next_base_gpt35_df)
scaled_swap_next_base_llama3_df <- scale_df(swap_next_base_llama3_df)
scaled_swap_next_base_claude3_df <- scale_df(swap_next_base_claude3_df)
scaled_swap_next_base_gemini1_df <- scale_df(swap_next_base_gemini1_df)


In [502]:
swap_next_base_gpt4_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gpt4_df, family=binomial)
swap_next_base_gpt35_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gpt35_df, family=binomial)
swap_next_base_llama3_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_llama3_df, family=binomial)
swap_next_base_claude3_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars, 
                                 data=scaled_swap_next_base_claude3_df, family=binomial)
swap_next_base_gemini1_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gemini1_df, family=binomial)



In [503]:
vif(swap_next_base_gpt4_model)

In [504]:
vif(swap_next_base_gpt35_model)

In [505]:
vif(swap_next_base_llama3_model)

In [507]:
vif(swap_next_base_claude3_notokens_model)

In [508]:
vif(swap_next_base_gemini1_model)

### Rerunning with just output logprob

In [509]:
swap_next_base_gpt4_model <- glm(correct ~ output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gpt4_df, family=binomial)
swap_next_base_gpt35_model <- glm(correct ~ output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gpt35_df, family=binomial)
swap_next_base_llama3_model <- glm(correct ~ output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_llama3_df, family=binomial)
swap_next_base_claude3_notokens_model <- glm(correct ~ output_logprob + input_nchars, 
                                 data=scaled_swap_next_base_claude3_df, family=binomial)
swap_next_base_gemini1_model <- glm(correct ~ output_logprob + input_ntokens + input_nchars, 
                                 data=scaled_swap_next_base_gemini1_df, family=binomial)


In [510]:
summary(swap_next_base_gpt4_model)


Call:
glm(formula = correct ~ output_logprob + input_ntokens + input_nchars, 
    family = binomial, data = scaled_swap_next_base_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.23047  -0.54883  -0.07368   0.65085   2.53538  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.6818     0.1924  -3.543 0.000396 ***
output_logprob   3.5219     0.4123   8.541  < 2e-16 ***
input_ntokens    2.2323     0.7371   3.029 0.002457 ** 
input_nchars    -0.5648     0.6971  -0.810 0.417840    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 413.97  on 299  degrees of freedom
Residual deviance: 245.89  on 296  degrees of freedom
AIC: 253.89

Number of Fisher Scoring iterations: 6


In [511]:
summary(swap_next_base_gpt35_model)


Call:
glm(formula = correct ~ output_logprob + input_ntokens + input_nchars, 
    family = binomial, data = scaled_swap_next_base_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6911  -0.9031  -0.3101   0.9510   2.2064  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.8507     0.1628  -5.224 1.75e-07 ***
output_logprob   1.9801     0.2885   6.864 6.69e-12 ***
input_ntokens    0.2248     0.5931   0.379    0.705    
input_nchars     0.4359     0.5867   0.743    0.457    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 397.45  on 299  degrees of freedom
Residual deviance: 316.07  on 296  degrees of freedom
AIC: 324.07

Number of Fisher Scoring iterations: 5


In [512]:
summary(swap_next_base_llama3_model)


Call:
glm(formula = correct ~ output_logprob + input_ntokens + input_nchars, 
    family = binomial, data = scaled_swap_next_base_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1919  -0.6191  -0.3622  -0.1355   2.7830  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.4657     0.2874  -8.580  < 2e-16 ***
output_logprob   1.9033     0.4450   4.277  1.9e-05 ***
input_ntokens    1.4375     0.8111   1.772   0.0763 .  
input_nchars    -0.9708     0.8139  -1.193   0.2330    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 242.98  on 299  degrees of freedom
Residual deviance: 205.60  on 296  degrees of freedom
AIC: 213.6

Number of Fisher Scoring iterations: 6


In [513]:
summary(swap_next_base_claude3_notokens_model)


Call:
glm(formula = correct ~ output_logprob + input_nchars, family = binomial, 
    data = scaled_swap_next_base_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7952  -0.7624  -0.3688   0.6949   2.7320  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.4617     0.1960  -7.459 8.72e-14 ***
output_logprob   2.2039     0.3387   6.507 7.67e-11 ***
input_nchars     1.4735     0.2318   6.357 2.05e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 349.96  on 299  degrees of freedom
Residual deviance: 271.76  on 297  degrees of freedom
AIC: 277.76

Number of Fisher Scoring iterations: 5


In [514]:
summary(swap_next_base_gemini1_model)


Call:
glm(formula = correct ~ output_logprob + input_ntokens + input_nchars, 
    family = binomial, data = scaled_swap_next_base_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8323  -0.7581  -0.3135   0.9698   2.3680  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.51788    0.21474  -7.068 1.57e-12 ***
output_logprob  2.53175    0.37295   6.788 1.13e-11 ***
input_ntokens   0.08339    0.57508   0.145    0.885    
input_nchars    0.95996    0.58024   1.654    0.098 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 363.07  on 299  degrees of freedom
Residual deviance: 276.47  on 296  degrees of freedom
AIC: 284.47

Number of Fisher Scoring iterations: 6


In [515]:
vif(swap_next_base_gpt4_model)

In [516]:
vif(swap_next_base_gpt35_model)

In [517]:
vif(swap_next_base_llama3_model)

In [518]:
vif(swap_next_base_claude3_notokens_model)

In [519]:
vif(swap_next_base_gemini1_model)

# Pig Latin

In [412]:
# Read in data
pigenc_gpt4_df <- read.table(file = 'table_pig_ay_enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigenc_gpt35_df <- read.table(file = 'table_pig_ay_enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigenc_llama3_df <- read.table(file = 'table_pig_ay_enc_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
pigenc_claude3_df <- read.table(file = 'table_pig_ay_enc_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
pigenc_gemini1_df <- read.table(file = 'table_pig_ay_enc_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


pigdec_gpt4_df <- read.table(file = 'table_pig_ay_dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigdec_gpt35_df <- read.table(file = 'table_pig_ay_dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigdec_llama3_df <- read.table(file = 'table_pig_ay_dec_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
pigdec_claude3_df <- read.table(file = 'table_pig_ay_dec_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
pigdec_gemini1_df <- read.table(file = 'table_pig_ay_dec_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)



In [477]:
# Z-score data
scaled_pigenc_gpt4_df <- scale_df(pigenc_gpt4_df)
scaled_pigenc_gpt35_df <- scale_df(pigenc_gpt35_df)
scaled_pigenc_llama3_df <- scale_df(pigenc_llama3_df)
scaled_pigenc_claude3_df <- scale_df(pigenc_claude3_df)
scaled_pigenc_gemini1_df <- scale_df(pigenc_gemini1_df)

scaled_pigdec_gpt4_df <- scale_df(pigdec_gpt4_df)
scaled_pigdec_gpt35_df <- scale_df(pigdec_gpt35_df)
scaled_pigdec_llama3_df <- scale_df(pigdec_llama3_df)
scaled_pigdec_claude3_df <- scale_df(pigdec_claude3_df)
scaled_pigdec_gemini1_df <- scale_df(pigdec_gemini1_df)


In [478]:

pigenc_gpt4_model <- correct_vs_length_and_prob(scaled_pigenc_gpt4_df, include_output_chars=TRUE)
pigenc_gpt35_model <- correct_vs_length_and_prob(scaled_pigenc_gpt35_df, include_output_chars=TRUE)
pigenc_llama3_model <- correct_vs_length_and_prob(scaled_pigenc_llama3_df, include_output_chars=TRUE)
pigenc_claude3_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars + output_nchars, 
               data=scaled_pigenc_claude3_df, family=binomial)
pigenc_gemini1_model <- bayesglm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens + input_nchars + output_nchars, 
               data=scaled_pigenc_gemini1_df, family=binomial)


pigdec_gpt4_model <- correct_vs_length_and_prob(scaled_pigdec_gpt4_df, include_output_chars=TRUE)
pigdec_gpt35_model <- correct_vs_length_and_prob(scaled_pigdec_gpt35_df, include_output_chars=TRUE)
pigdec_llama3_model <- correct_vs_length_and_prob(scaled_pigdec_llama3_df, include_output_chars=TRUE)
pigdec_claude3_notokens_model <- glm(correct ~ input_logprob + output_logprob + input_nchars + output_nchars, 
               data=scaled_pigdec_claude3_df, family=binomial)
pigdec_gemini1_model <- correct_vs_length_and_prob(scaled_pigdec_gemini1_df, include_output_chars=TRUE)


In [479]:
summary(pigenc_gpt4_model) 


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7494  -1.0129  -0.6483   1.1876   2.0373  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.52418    0.12731  -4.117 3.83e-05 ***
input_logprob  -0.02844    0.18989  -0.150  0.88097    
output_logprob  1.93966    0.70569   2.749  0.00598 ** 
input_ntokens   1.89304    0.79457   2.382  0.01720 *  
output_ntokens  1.90174    0.99917   1.903  0.05700 .  
input_nchars    5.93802    2.95840   2.007  0.04473 *  
output_nchars  -8.30880    3.67071  -2.264  0.02360 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 400.34  on 299  degrees of freedom
Residual deviance: 368.70  on 293  degrees of freedom
AIC: 382.7

Number of 

In [480]:
summary(pigenc_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3523  -0.8287  -0.6536   1.1613   2.2372  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.16906    0.14519  -8.052 8.15e-16 ***
input_logprob   0.08184    0.21390   0.383   0.7020    
output_logprob  0.91205    0.74120   1.230   0.2185    
input_ntokens   1.72350    0.86877   1.984   0.0473 *  
output_ntokens -0.86339    1.09634  -0.788   0.4310    
input_nchars   -1.56653    3.29903  -0.475   0.6349    
output_nchars   1.17719    4.02761   0.292   0.7701    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 341.72  on 299  degrees of freedom
Residual deviance: 321.69  on 293  degrees of freedom
AIC: 335.69

Number of

In [481]:
summary(pigenc_llama3_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.93408  -0.28116  -0.13852  -0.04986   2.91409  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.9210     0.8098  -6.077 1.23e-09 ***
input_logprob    1.0246     0.8088   1.267   0.2052    
output_logprob   4.8864     2.0443   2.390   0.0168 *  
input_ntokens    1.5128     2.3806   0.635   0.5251    
output_ntokens  -3.1689     2.5719  -1.232   0.2179    
input_nchars    -3.2595     8.5377  -0.382   0.7026    
output_nchars    8.5942    10.6653   0.806   0.4203    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 94.321  on 299  degrees of freedom
Residual deviance: 72.334  on 293  degrees of freedom
AIC: 86.334


In [482]:
summary(pigenc_claude3_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars + 
    output_nchars, family = binomial, data = scaled_pigenc_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9696  -1.1947   0.7783   0.9582   1.8215  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)   
(Intercept)      0.4018     0.1228   3.273  0.00107 **
input_logprob    0.5265     0.1763   2.987  0.00282 **
output_logprob   1.4502     0.5434   2.669  0.00761 **
input_nchars    -2.1124     1.8110  -1.166  0.24345   
output_nchars    3.7199     1.9646   1.894  0.05829 . 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 405.37  on 299  degrees of freedom
Residual deviance: 381.55  on 295  degrees of freedom
AIC: 391.55

Number of Fisher Scoring iterations: 4


In [483]:
summary(pigenc_gemini1_model)


Call:
bayesglm(formula = correct ~ input_logprob + output_logprob + 
    input_ntokens + output_ntokens + input_nchars + output_nchars, 
    family = binomial, data = scaled_pigenc_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.02482  -0.02482  -0.02482  -0.02482  -0.02482  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)   
(Intercept)    -8.085e+00  3.104e+00  -2.605  0.00919 **
input_logprob   3.247e-17  1.136e+00   0.000  1.00000   
output_logprob  5.669e-17  1.155e+00   0.000  1.00000   
input_ntokens  -3.019e-17  1.154e+00   0.000  1.00000   
output_ntokens -4.324e-17  1.155e+00   0.000  1.00000   
input_nchars   -2.179e-17  1.155e+00   0.000  1.00000   
output_nchars  -7.215e-17  1.155e+00   0.000  1.00000   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 0.00000  on 299  degrees of freedom
Residual deviance: 0.18483  on

In [490]:
summary(pigdec_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8736  -0.7337   0.4224   0.7328   2.1890  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.6525     0.1543   4.229 2.34e-05 ***
input_logprob    0.8155     0.7951   1.026   0.3050    
output_logprob   2.0507     0.2562   8.004 1.21e-15 ***
input_ntokens    0.9519     1.1533   0.825   0.4091    
output_ntokens  -1.5573     0.9316  -1.672   0.0946 .  
input_nchars     5.8314     4.3050   1.355   0.1756    
output_nchars   -2.8202     3.4841  -0.809   0.4183    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 401.25  on 299  degrees of freedom
Residual deviance: 292.16  on 293  degrees of freedom
AIC: 306.16

Number of

In [491]:
summary(pigdec_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6092  -0.6466  -0.1985   0.6533   3.3620  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.0072     0.2743  -7.317 2.54e-13 ***
input_logprob    1.4048     0.8884   1.581   0.1138    
output_logprob   3.1852     0.4590   6.940 3.93e-12 ***
input_ntokens    0.3708     1.2622   0.294   0.7689    
output_ntokens  -2.3121     1.0911  -2.119   0.0341 *  
input_nchars     4.7209     4.8613   0.971   0.3315    
output_nchars   -0.2726     3.9111  -0.070   0.9444    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 357.64  on 299  degrees of freedom
Residual deviance: 236.09  on 293  degrees of freedom
AIC: 250.09

Number of

In [492]:
summary(pigdec_llama3_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7477  -0.4628  -0.2011  -0.0447   3.3760  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.6808     0.4985  -7.384 1.53e-13 ***
input_logprob    1.2451     1.2405   1.004   0.3155    
output_logprob   3.0160     0.6441   4.682 2.84e-06 ***
input_ntokens   -0.4705     1.6656  -0.282   0.7776    
output_ntokens  -3.6114     1.6562  -2.180   0.0292 *  
input_nchars     9.3290     6.8764   1.357   0.1749    
output_nchars   -3.4633     5.3252  -0.650   0.5155    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 216.14  on 299  degrees of freedom
Residual deviance: 153.79  on 293  degrees of freedom
AIC: 167.79

Number of

In [493]:
summary(pigdec_claude3_notokens_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_nchars + 
    output_nchars, family = binomial, data = scaled_pigdec_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9139  -0.7932   0.4075   0.7116   2.3150  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.9195     0.1618   5.681 1.34e-08 ***
input_logprob    1.6519     0.6756   2.445   0.0145 *  
output_logprob   1.9426     0.2552   7.613 2.68e-14 ***
input_nchars     0.9702     2.4748   0.392   0.6950    
output_nchars    2.3140     2.3535   0.983   0.3255    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 388.47  on 299  degrees of freedom
Residual deviance: 288.68  on 295  degrees of freedom
AIC: 298.68

Number of Fisher Scoring iterations: 5


In [489]:
summary(pigdec_gemini1_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4519  -0.5331  -0.2496  -0.0669   3.2458  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.0544     0.4011  -7.615 2.64e-14 ***
input_logprob    0.4918     1.0347   0.475   0.6346    
output_logprob   3.1686     0.6107   5.188 2.12e-07 ***
input_ntokens   -2.0981     1.8059  -1.162   0.2453    
output_ntokens   2.6574     1.0863   2.446   0.0144 *  
input_nchars    -5.8321     5.3644  -1.087   0.2770    
output_nchars    7.5124     4.3299   1.735   0.0827 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 242.98  on 299  degrees of freedom
Residual deviance: 175.71  on 293  degrees of freedom
AIC: 189.71

Number of

In [484]:
vif(pigenc_gpt4_model)

In [485]:
vif(pigenc_gpt35_model)

In [486]:
vif(pigenc_llama3_model)

In [487]:
vif(pigenc_claude3_notokens_model)

In [488]:
vif(pigenc_gemini1_model)

In [494]:
vif(pigdec_gpt4_model)

In [495]:
vif(pigdec_gpt35_model)

In [496]:
vif(pigdec_llama3_model)

In [497]:
vif(pigdec_claude3_notokens_model)

In [498]:
vif(pigdec_gemini1_model)

### Comparing Pig Latin and Boar Etruscan

In [413]:
# Read in data
pigboarenc_gpt4_df <- read.table(file = 'table_pig_boar_enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigboarenc_gpt35_df <- read.table(file = 'table_pig_boar_enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigboarenc_llama3_df <- read.table(file = 'table_pig_boar_enc_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
pigboarenc_claude3_df <- read.table(file = 'table_pig_boar_enc_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
pigboarenc_gemini1_df <- read.table(file = 'table_pig_boar_enc_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


pigboardec_gpt4_df <- read.table(file = 'table_pig_boar_dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigboardec_gpt35_df <- read.table(file = 'table_pig_boar_dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigboardec_llama3_df <- read.table(file = 'table_pig_boar_dec_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
pigboardec_claude3_df <- read.table(file = 'table_pig_boar_dec_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
pigboardec_gemini1_df <- read.table(file = 'table_pig_boar_dec_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [414]:
# Z-score data
scaled_pigboarenc_gpt4_df <- scale_taskpair_df(pigboarenc_gpt4_df)
scaled_pigboarenc_gpt35_df <- scale_taskpair_df(pigboarenc_gpt35_df)
scaled_pigboarenc_llama3_df <- scale_taskpair_df(pigboarenc_llama3_df)
scaled_pigboarenc_claude3_df <- scale_taskpair_df(pigboarenc_claude3_df)
scaled_pigboarenc_gemini1_df <- scale_taskpair_df(pigboarenc_gemini1_df)

scaled_pigboardec_gpt4_df <- scale_taskpair_df(pigboardec_gpt4_df)
scaled_pigboardec_gpt35_df <- scale_taskpair_df(pigboardec_gpt35_df)
scaled_pigboardec_llama3_df <- scale_taskpair_df(pigboardec_llama3_df)
scaled_pigboardec_claude3_df <- scale_taskpair_df(pigboardec_claude3_df)
scaled_pigboardec_gemini1_df <- scale_taskpair_df(pigboardec_gemini1_df)


In [415]:
model_pigtaskenc4 <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboarenc_gpt4_df, family=binomial)


In [416]:
model_pigtaskenc35 <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboarenc_gpt35_df, family=binomial)

In [417]:
model_pigtaskencllama <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboarenc_llama3_df, family=binomial)

In [418]:
model_pigtaskencclaude_notokens <- glm(correct ~ task + input_nchars + output_nchars + input_logprob + output_logprob, 
               data=scaled_pigboarenc_claude3_df, family=binomial)

In [419]:
model_pigtaskencgemini <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboarenc_gemini1_df, family=binomial)

In [420]:
model_pigtaskdec4 <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_gpt4_df, family=binomial)

In [421]:
model_pigtaskdec35 <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_gpt35_df, family=binomial)

In [422]:
model_pigtaskdecllama <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_llama3_df, family=binomial)

In [423]:
model_pigtaskdecclaude_notokens <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_claude3_df, family=binomial)

In [424]:
model_pigtaskdecgemini <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigboardec_gemini1_df, family=binomial)

In [426]:
summary(model_pigtaskenc4)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboarenc_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2647  -0.7784  -0.5503   1.0503   2.3496  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.56003    0.23023  -2.432 0.014998 *  
taskuv         -1.32615    0.38979  -3.402 0.000668 ***
input_nchars    0.22833    0.77500   0.295 0.768286    
output_nchars   0.17641    0.87201   0.202 0.839682    
input_ntokens  -0.06654    0.61653  -0.108 0.914051    
output_ntokens  0.16345    0.72926   0.224 0.822658    
input_logprob  -0.05815    0.31163  -0.187 0.851964    
output_logprob  1.02052    0.70345   1.451 0.146853    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 229.22  on 199  degr

In [425]:
summary(model_pigtaskenc35)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboarenc_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.37642  -0.50706  -0.09424  -0.03471   2.18253  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.2589     0.3004  -4.191 2.78e-05 ***
taskuv          -4.7283     1.5892  -2.975  0.00293 ** 
input_nchars    -0.1590     0.8572  -0.186  0.85283    
output_nchars   -0.3834     0.9585  -0.400  0.68912    
input_ntokens    1.6958     0.9195   1.844  0.06516 .  
output_ntokens  -0.9901     0.9850  -1.005  0.31484    
input_logprob    0.5392     0.4245   1.270  0.20403    
output_logprob   0.6935     0.8150   0.851  0.39483    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 169.08  o

In [427]:
summary(model_pigtaskencllama)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboarenc_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.54503  -0.28170  -0.12513  -0.07656   2.51722  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.44821    0.59838  -5.763 8.29e-09 ***
taskuv         -2.24203    1.42591  -1.572    0.116    
input_nchars    0.37363    1.02270   0.365    0.715    
output_nchars   0.28062    1.04511   0.269    0.788    
input_ntokens   0.85383    1.03096   0.828    0.408    
output_ntokens  0.01689    1.00491   0.017    0.987    
input_logprob   0.16972    0.69847   0.243    0.808    
output_logprob  2.06324    1.31996   1.563    0.118    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 46.763  

In [428]:
summary(model_pigtaskencclaude_notokens)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_logprob + output_logprob, family = binomial, data = scaled_pigboarenc_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1767  -0.3992  -0.2554   0.7077   2.5009  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.6572     0.2391   2.749  0.00599 ** 
taskuv          -3.7466     0.5492  -6.822 8.99e-12 ***
input_nchars    -0.4619     2.9257  -0.158  0.87456    
output_nchars    2.0027     3.0885   0.648  0.51670    
input_logprob    1.1594     0.4157   2.789  0.00529 ** 
output_logprob   0.7607     0.9130   0.833  0.40475    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 260.19  on 199  degrees of freedom
Residual deviance: 153.84  on 194  degrees of freedom
AIC: 165.84

Number of Fisher Scoring iterations: 5


In [429]:
summary(model_pigtaskencgemini)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboarenc_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.03007  -0.03007  -0.03007  -0.03007  -0.03007  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)  
(Intercept)    -7.701e+00  3.323e+00  -2.318   0.0205 *
taskuv         -7.194e-16  2.234e+00   0.000   1.0000  
input_nchars   -1.559e-16  1.157e+00   0.000   1.0000  
output_nchars  -1.615e-16  1.157e+00   0.000   1.0000  
input_ntokens  -1.305e-16  1.156e+00   0.000   1.0000  
output_ntokens -2.319e-16  1.156e+00   0.000   1.0000  
input_logprob   2.520e-16  1.146e+00   0.000   1.0000  
output_logprob  2.649e-16  1.157e+00   0.000   1.0000  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 0.00000

In [430]:
summary(model_pigtaskdec4)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2592   0.2391   0.4923   0.6698   1.7133  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.89288    0.31533   6.003 1.94e-09 ***
taskuv         -0.91583    0.41160  -2.225   0.0261 *  
input_nchars    0.40501    1.04353   0.388   0.6979    
output_nchars   2.20902    1.05734   2.089   0.0367 *  
input_ntokens  -0.23040    0.73565  -0.313   0.7541    
output_ntokens -1.01859    0.68521  -1.487   0.1371    
input_logprob   0.07216    0.66647   0.108   0.9138    
output_logprob  1.49702    0.34215   4.375 1.21e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 218.10  on 199  degr

In [431]:
summary(model_pigtaskdec35)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9060  -0.8991  -0.3178   0.9207   2.4139  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.02765    0.24673   0.112   0.9108    
taskuv         -0.51736    0.36100  -1.433   0.1518    
input_nchars    0.77663    1.12490   0.690   0.4899    
output_nchars   2.27877    1.07207   2.126   0.0335 *  
input_ntokens  -0.27435    0.72280  -0.380   0.7043    
output_ntokens -0.99061    0.63655  -1.556   0.1197    
input_logprob   0.78394    0.68171   1.150   0.2502    
output_logprob  1.81746    0.37748   4.815 1.47e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 276.54  on 199  deg

In [432]:
summary(model_pigtaskdecllama)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.50354  -0.40506  -0.19716  -0.06962   2.56405  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.00487    0.41321  -4.852 1.22e-06 ***
taskuv         -2.84284    0.73808  -3.852 0.000117 ***
input_nchars    0.66283    1.11878   0.592 0.553546    
output_nchars   1.61809    1.13223   1.429 0.152969    
input_ntokens   0.03212    0.91094   0.035 0.971869    
output_ntokens -1.79404    0.95084  -1.887 0.059188 .  
input_logprob  -0.25825    0.85514  -0.302 0.762651    
output_logprob  2.34781    0.70694   3.321 0.000897 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 146.770 

In [433]:
summary(model_pigtaskdecclaude_notokens)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7265   0.1463   0.2565   0.4696   1.2947  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.5126     0.4169   6.027 1.67e-09 ***
taskuv          -0.2260     0.5017  -0.450   0.6524    
input_nchars     1.2182     1.2808   0.951   0.3415    
output_nchars    2.1089     1.2412   1.699   0.0893 .  
input_ntokens    0.3000     0.8211   0.365   0.7149    
output_ntokens  -0.6233     0.7753  -0.804   0.4214    
input_logprob   -0.3644     0.7847  -0.464   0.6424    
output_logprob   3.3501     0.5655   5.924 3.15e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 185.49  on 199  d

In [434]:
summary(model_pigtaskdecgemini)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7071  -0.8027  -0.6106   0.8842   2.2585  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.9673     0.2664  -3.631 0.000283 ***
taskuv          -0.4291     0.4411  -0.973 0.330600    
input_nchars     0.4164     0.9581   0.435 0.663828    
output_nchars    1.3806     0.9006   1.533 0.125288    
input_ntokens   -0.7010     0.7428  -0.944 0.345309    
output_ntokens   0.2381     0.5731   0.415 0.677789    
input_logprob    0.5377     0.6578   0.817 0.413704    
output_logprob   0.7203     0.3359   2.144 0.032011 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 227.10  on 199  d

In [435]:
vif(model_pigtaskenc4)

In [436]:
vif(model_pigtaskenc35)

In [437]:
vif(model_pigtaskencllama)

In [438]:
vif(model_pigtaskencclaude_notokens)

In [439]:
vif(model_pigtaskencgemini)

In [440]:
vif(model_pigtaskdec4)

In [441]:
vif(model_pigtaskdec35)

In [442]:
vif(model_pigtaskdecllama)

In [443]:
vif(model_pigtaskdecclaude_notokens)

In [444]:
vif(model_pigtaskdecgemini)

### Comparing Pig Latin variants

In [445]:
# Read in data
pigprobenc_gpt4_df <- read.table(file = 'table_pig_prob_enc_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigprobenc_gpt35_df <- read.table(file = 'table_pig_prob_enc_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigprobenc_llama3_df <- read.table(file = 'table_pig_prob_enc_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
pigprobenc_claude3_df <- read.table(file = 'table_pig_prob_enc_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
pigprobenc_gemini1_df <- read.table(file = 'table_pig_prob_enc_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


pigprobdec_gpt4_df <- read.table(file = 'table_pig_prob_dec_gpt-4-0613.tsv', sep = '\t', header = TRUE)
pigprobdec_gpt35_df <- read.table(file = 'table_pig_prob_dec_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
pigprobdec_llama3_df <- read.table(file = 'table_pig_prob_dec_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
pigprobdec_claude3_df <- read.table(file = 'table_pig_prob_dec_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
pigprobdec_gemini1_df <- read.table(file = 'table_pig_prob_dec_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [446]:
# Z-score data
scaled_pigprobenc_gpt4_df <- scale_taskpair_prob_df(pigprobenc_gpt4_df)
scaled_pigprobenc_gpt35_df <- scale_taskpair_prob_df(pigprobenc_gpt35_df)
scaled_pigprobenc_llama3_df <- scale_taskpair_prob_df(pigprobenc_llama3_df)
scaled_pigprobenc_claude3_df <- scale_taskpair_prob_df(pigprobenc_claude3_df)
scaled_pigprobenc_gemini1_df <- scale_taskpair_prob_df(pigprobenc_gemini1_df)


scaled_pigprobdec_gpt4_df <- scale_taskpair_prob_df(pigprobdec_gpt4_df)
scaled_pigprobdec_gpt35_df <- scale_taskpair_prob_df(pigprobdec_gpt35_df)
scaled_pigprobdec_llama3_df <- scale_taskpair_prob_df(pigprobdec_llama3_df)
scaled_pigprobdec_claude3_df <- scale_taskpair_prob_df(pigprobdec_claude3_df)
scaled_pigprobdec_gemini1_df <- scale_taskpair_prob_df(pigprobdec_gemini1_df)


In [447]:
model_pigprobenc4 <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobenc_gpt4_df, family=binomial)


In [448]:
model_pigprobenc35 <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobenc_gpt35_df, family=binomial)


In [449]:
model_pigprobencllama <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobenc_llama3_df, family=binomial)

In [450]:
model_pigprobencclaude_notokens <- glm(correct ~ task + input_nchars + output_nchars + input_logprob + output_logprob, 
               data=scaled_pigprobenc_claude3_df, family=binomial)

In [451]:
model_pigprobencgemini <- bayesglm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobenc_gemini1_df, family=binomial)

In [452]:
model_pigprobdec4 <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobdec_gpt4_df, family=binomial)

In [453]:
model_pigprobdec35 <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobdec_gpt35_df, family=binomial)

In [454]:
model_pigprobdecllama <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobdec_llama3_df, family=binomial)

In [455]:
model_pigprobdecclaude_notokens <- glm(correct ~ task + input_nchars + output_nchars + input_logprob + output_logprob, 
               data=scaled_pigprobdec_claude3_df, family=binomial)

In [456]:
model_pigprobdecgemini <- glm(correct ~ task + input_nchars + output_nchars + input_ntokens + output_ntokens + input_logprob + output_logprob, 
               data=scaled_pigprobdec_gemini1_df, family=binomial)

In [457]:
summary(model_pigprobenc4)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobenc_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6702  -0.7804  -0.4988   0.9214   2.5315  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.26069    0.12461 -10.117  < 2e-16 ***
task            0.74354    0.12438   5.978 2.26e-09 ***
input_nchars    5.28312    1.50249   3.516 0.000438 ***
output_nchars  -6.55400    1.90123  -3.447 0.000566 ***
input_ntokens   0.82896    0.60496   1.370 0.170602    
output_ntokens  1.82726    0.71174   2.567 0.010249 *  
input_logprob   0.01677    0.22750   0.074 0.941239    
output_logprob  1.90058    0.58417   3.253 0.001140 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 581.26  on 499  degrees o

In [458]:
summary(model_pigprobenc35)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobenc_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5192  -0.6000  -0.3587  -0.1589   2.6930  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.1511     0.1796 -11.979  < 2e-16 ***
task             0.7805     0.1537   5.079 3.79e-07 ***
input_nchars     4.3925     1.8793   2.337  0.01943 *  
output_nchars   -6.2121     2.3870  -2.602  0.00926 ** 
input_ntokens    2.1263     0.7775   2.735  0.00624 ** 
output_ntokens   1.7178     0.9240   1.859  0.06302 .  
input_logprob    0.4887     0.2990   1.634  0.10217    
output_logprob   2.4526     0.7408   3.311  0.00093 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 452.70  on 499  degrees 

In [459]:
summary(model_pigprobencllama)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobenc_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.50398  -0.15774  -0.10782  -0.06932   2.95611  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -5.2222274  0.6390486  -8.172 3.04e-16 ***
task            0.5819070  0.4460139   1.305   0.1920    
input_nchars    0.9383301  1.1716183   0.801   0.4232    
output_nchars   0.1747071  1.0690839   0.163   0.8702    
input_ntokens   1.5302040  1.1728088   1.305   0.1920    
output_ntokens  0.0777853  1.0070698   0.077   0.9384    
input_logprob   0.0006321  0.7138732   0.001   0.9993    
output_logprob  3.4791678  1.5290128   2.275   0.0229 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null 

In [460]:
summary(model_pigprobencclaude_notokens)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_logprob + output_logprob, family = binomial, data = scaled_pigprobenc_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3794  -1.0796   0.7049   0.9016   1.6251  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.57524    0.09935   5.790 7.05e-09 ***
task            0.26764    0.10105   2.649  0.00808 ** 
input_nchars    2.05299    1.14400   1.795  0.07272 .  
output_nchars  -0.65488    1.16802  -0.561  0.57502    
input_logprob   1.19098    0.20129   5.917 3.28e-09 ***
output_logprob  0.42310    0.44252   0.956  0.33901    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 660.01  on 499  degrees of freedom
Residual deviance: 598.51  on 494  degrees of freedom
AIC: 610.51

Number of Fisher Scoring iterations: 4


In [461]:
summary(model_pigprobencgemini)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobenc_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.81934  -0.11691  -0.01817  -0.00595   2.36739  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -7.75370    1.80862  -4.287 1.81e-05 ***
task            3.66364    1.18920   3.081  0.00206 ** 
input_nchars    0.17808    0.88659   0.201  0.84080    
output_nchars   0.24332    0.93666   0.260  0.79504    
input_ntokens  -0.54473    0.83715  -0.651  0.51524    
output_ntokens -0.43357    0.94552  -0.459  0.64656    
input_logprob   0.09438    0.49540   0.191  0.84891    
output_logprob  0.23512    0.86431   0.272  0.78560    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 127.720

In [462]:
summary(model_pigprobdec4)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7178   0.2276   0.4047   0.5621   2.2107  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.9096     0.1555  12.281  < 2e-16 ***
task             0.2296     0.1350   1.700  0.08911 .  
input_nchars    -3.5093     2.4045  -1.459  0.14443    
output_nchars    6.7374     2.0216   3.333  0.00086 ***
input_ntokens    0.1912     0.7882   0.243  0.80833    
output_ntokens   0.1227     0.7785   0.158  0.87477    
input_logprob    1.6197     0.6849   2.365  0.01803 *  
output_logprob   1.7782     0.2850   6.240 4.39e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 483.31  on 499  degrees o

In [463]:
summary(model_pigprobdec35)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.44809  -0.93313   0.04602   0.97348   2.74208  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.05707    0.10349  -0.551 0.581348    
task            0.11682    0.10485   1.114 0.265176    
input_nchars    1.12060    1.64945   0.679 0.496899    
output_nchars   2.39500    1.37888   1.737 0.082402 .  
input_ntokens   0.65655    0.59733   1.099 0.271708    
output_ntokens -2.08243    0.55620  -3.744 0.000181 ***
input_logprob   0.99006    0.50218   1.972 0.048665 *  
output_logprob  1.59434    0.24255   6.573 4.92e-11 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 693.15  on 499

In [464]:
summary(model_pigprobdecllama)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.04903  -0.45988  -0.22205  -0.05397   3.03315  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.0282     0.2910 -10.406  < 2e-16 ***
task             0.2537     0.1492   1.701   0.0890 .  
input_nchars    -1.7561     2.9612  -0.593   0.5532    
output_nchars    4.7444     2.2878   2.074   0.0381 *  
input_ntokens   -1.3361     1.1131  -1.200   0.2300    
output_ntokens  -1.8439     0.9863  -1.870   0.0615 .  
input_logprob   -1.2765     0.9887  -1.291   0.1967    
output_logprob   3.1073     0.5138   6.048 1.47e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 429.59  on 49

In [465]:
summary(model_pigprobdecclaude_notokens)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_logprob + output_logprob, family = binomial, data = scaled_pigprobdec_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.3812   0.1650   0.3182   0.5198   1.7229  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.9077     0.1666  11.450   <2e-16 ***
task             0.3666     0.1449   2.530   0.0114 *  
input_nchars    -1.0436     1.6489  -0.633   0.5268    
output_nchars    4.2890     1.6607   2.583   0.0098 ** 
input_logprob    0.3166     0.6256   0.506   0.6128    
output_logprob   3.0264     0.3412   8.871   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 519.21  on 499  degrees of freedom
Residual deviance: 340.00  on 494  degrees of freedom
AIC: 352

Number of Fisher Scoring iterations: 5


In [466]:
summary(model_pigprobdecgemini)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7330  -0.8858  -0.7006   1.2491   2.0477  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.86794    0.10289  -8.436  < 2e-16 ***
task            0.30144    0.10296   2.928  0.00341 ** 
input_nchars    0.08141    1.49867   0.054  0.95668    
output_nchars   1.35880    1.26825   1.071  0.28399    
input_ntokens  -0.01773    0.75317  -0.024  0.98122    
output_ntokens  0.29095    0.52735   0.552  0.58114    
input_logprob   0.94471    0.51737   1.826  0.06785 .  
output_logprob  0.61953    0.21546   2.875  0.00404 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 619.10  on 499  degree

In [467]:
vif(model_pigprobenc4)

In [468]:
vif(model_pigprobenc35)

In [469]:
vif(model_pigprobencllama)

In [470]:
vif(model_pigprobencclaude_notokens)

In [471]:
vif(model_pigprobencgemini)

In [472]:
vif(model_pigprobdec4)

In [473]:
vif(model_pigprobdec35)

In [474]:
vif(model_pigprobdecllama)

In [475]:
vif(model_pigprobdecclaude_notokens)

In [476]:
vif(model_pigprobdecgemini)

# Acronyms

In [534]:
# Read in data
acronym_gpt4_outp_df <- read.table(file = 'table_acronym_varyoutp_gpt-4-0613.tsv', sep = '\t', header = TRUE)
acronym_gpt35_outp_df <- read.table(file = 'table_acronym_varyoutp_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
acronym_llama3_outp_df <- read.table(file = 'table_acronym_varyoutp_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
acronym_claude3_outp_df <- read.table(file = 'table_acronym_varyoutp_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
acronym_gemini1_outp_df <- read.table(file = 'table_acronym_varyoutp_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)



acronym_gpt4_inp_df <- read.table(file = 'table_acronym_varyinp_gpt-4-0613.tsv', sep = '\t', header = TRUE)
acronym_gpt35_inp_df <- read.table(file = 'table_acronym_varyinp_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
acronym_llama3_inp_df <- read.table(file = 'table_acronym_varyinp_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
acronym_claude3_inp_df <- read.table(file = 'table_acronym_varyinp_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
acronym_gemini1_inp_df <- read.table(file = 'table_acronym_varyinp_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)



In [535]:
# Z-score data
scaled_acronym_gpt4_outp_df <- scale_df(acronym_gpt4_outp_df)
scaled_acronym_gpt35_outp_df <- scale_df(acronym_gpt35_outp_df)
scaled_acronym_llama3_outp_df <- scale_df(acronym_llama3_outp_df)
scaled_acronym_claude3_outp_df <- scale_df(acronym_claude3_outp_df)
scaled_acronym_gemini1_outp_df <- scale_df(acronym_gemini1_outp_df)



scaled_acronym_gpt4_inp_df <- scale_df(acronym_gpt4_inp_df)
scaled_acronym_gpt35_inp_df <- scale_df(acronym_gpt35_inp_df)
scaled_acronym_llama3_inp_df <- scale_df(acronym_llama3_inp_df)
scaled_acronym_claude3_inp_df <- scale_df(acronym_claude3_inp_df)
scaled_acronym_gemini1_inp_df <- scale_df(acronym_gemini1_inp_df)


In [536]:
acronym_gpt4_outp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_gpt4_outp_df, family=binomial)
acronym_gpt35_outp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_gpt35_outp_df, family=binomial)
acronym_llama3_outp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_llama3_outp_df, family=binomial)
acronym_claude3_outp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_claude3_outp_df, family=binomial)
acronym_gemini1_outp_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_gemini1_outp_df, family=binomial)

acronym_gpt4_inp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_gpt4_inp_df, family=binomial)
acronym_gpt35_inp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_gpt35_inp_df, family=binomial)
acronym_llama3_inp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_llama3_inp_df, family=binomial)
acronym_claude3_inp_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_acronym_claude3_inp_df, family=binomial)
acronym_gemini1_inp_model <- glm(correct ~ input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_gemini1_inp_df, family=binomial)

In [537]:
summary(acronym_gpt4_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt4_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8290  -1.4607   0.7400   0.8019   1.0212  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.02684    0.03230  31.795  < 2e-16 ***
input_logprob   0.01891    0.03233   0.585    0.559    
output_logprob  0.20402    0.03182   6.412 1.44e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5788.3  on 4999  degrees of freedom
Residual deviance: 5746.4  on 4997  degrees of freedom
AIC: 5752.4

Number of Fisher Scoring iterations: 4


In [548]:
summary(acronym_gpt35_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt35_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1969  -0.8597  -0.7189   1.3686   2.1452  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.95746    0.03234 -29.605   <2e-16 ***
input_logprob   0.04300    0.03167   1.357    0.175    
output_logprob  0.39609    0.03396  11.664   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5970.6  on 4999  degrees of freedom
Residual deviance: 5821.2  on 4997  degrees of freedom
AIC: 5827.2

Number of Fisher Scoring iterations: 4


In [549]:
summary(acronym_llama3_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_llama3_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3484  -1.1670  -0.9873   1.1695   1.4434  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.02601    0.02840  -0.916    0.360    
input_logprob   0.02880    0.02845   1.012    0.311    
output_logprob  0.17736    0.02865   6.191 5.98e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6930.7  on 4999  degrees of freedom
Residual deviance: 6890.0  on 4997  degrees of freedom
AIC: 6896

Number of Fisher Scoring iterations: 3


In [550]:
summary(acronym_claude3_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_claude3_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5110  -1.2910   0.9989   1.0606   1.1764  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.30242    0.02865  10.555  < 2e-16 ***
input_logprob   0.10624    0.02891   3.676 0.000237 ***
output_logprob  0.02519    0.02868   0.878 0.379817    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6819.1  on 4999  degrees of freedom
Residual deviance: 6804.4  on 4997  degrees of freedom
AIC: 6810.4

Number of Fisher Scoring iterations: 4


In [551]:
summary(acronym_gemini1_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens, family = binomial, data = scaled_acronym_gemini1_outp_df)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.442  -1.250   1.030   1.095   1.251  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.19899    0.02848   6.988  2.8e-12 ***
input_logprob  -0.01475    0.02925  -0.504  0.61401    
output_logprob  0.09420    0.02940   3.204  0.00135 ** 
input_ntokens  -0.06154    0.02922  -2.106  0.03522 *  
output_ntokens  0.07906    0.02950   2.680  0.00737 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6882.6  on 4999  degrees of freedom
Residual deviance: 6864.1  on 4995  degrees of freedom
AIC: 6874.1

Number of Fisher Scoring iterations: 4


In [538]:
summary(acronym_gpt4_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt4_inp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8250  -1.4850   0.7384   0.7858   0.9456  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.07649    0.03261  33.008  < 2e-16 ***
input_logprob   0.05493    0.03259   1.685   0.0919 .  
output_logprob  0.15358    0.03244   4.735 2.19e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5681.9  on 4999  degrees of freedom
Residual deviance: 5656.6  on 4997  degrees of freedom
AIC: 5662.6

Number of Fisher Scoring iterations: 4


In [539]:
summary(acronym_gpt35_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt35_inp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1457  -0.9082  -0.8348   1.4117   1.7552  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.72733    0.03035 -23.961  < 2e-16 ***
input_logprob   0.14452    0.03027   4.775 1.80e-06 ***
output_logprob  0.14814    0.03056   4.847 1.25e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6323.2  on 4999  degrees of freedom
Residual deviance: 6276.6  on 4997  degrees of freedom
AIC: 6282.6

Number of Fisher Scoring iterations: 4


In [540]:
summary(acronym_llama3_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_llama3_inp_df)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.409  -1.216   1.030   1.124   1.296  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.12721    0.02841   4.477 7.56e-06 ***
input_logprob   0.11318    0.02846   3.976 7.00e-05 ***
output_logprob  0.08528    0.02845   2.998  0.00272 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6911.5  on 4999  degrees of freedom
Residual deviance: 6886.6  on 4997  degrees of freedom
AIC: 6892.6

Number of Fisher Scoring iterations: 3


In [541]:
summary(acronym_claude3_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_claude3_inp_df)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.238  -1.175  -1.121   1.180   1.234  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)  
(Intercept)    -0.004802   0.028296  -0.170   0.8652  
input_logprob   0.052745   0.028306   1.863   0.0624 .
output_logprob  0.020921   0.028302   0.739   0.4598  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6931.4  on 4999  degrees of freedom
Residual deviance: 6927.4  on 4997  degrees of freedom
AIC: 6933.4

Number of Fisher Scoring iterations: 3


In [542]:
summary(acronym_gemini1_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens, family = binomial, data = scaled_acronym_gemini1_inp_df)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.596  -1.171   0.885   1.148   1.498  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.04031    0.02860   1.409   0.1587    
input_logprob   0.03670    0.04661   0.787   0.4310    
output_logprob -0.18499    0.02942  -6.288 3.22e-10 ***
input_ntokens  -0.08279    0.04674  -1.771   0.0765 .  
output_ntokens  0.17473    0.02961   5.901 3.62e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6929.6  on 4999  degrees of freedom
Residual deviance: 6820.4  on 4995  degrees of freedom
AIC: 6830.4

Number of Fisher Scoring iterations: 4


In [552]:
vif(acronym_gpt4_outp_model)

In [553]:
vif(acronym_gpt35_outp_model)

In [554]:
vif(acronym_llama3_outp_model)

In [555]:
vif(acronym_claude3_outp_model)

In [556]:
vif(acronym_gemini1_outp_model)

In [543]:
vif(acronym_gpt4_inp_model)

In [544]:
vif(acronym_gpt35_inp_model)

In [545]:
vif(acronym_llama3_inp_model)

In [546]:
vif(acronym_claude3_inp_model)

In [547]:
vif(acronym_gemini1_inp_model)

### Vary task

In [520]:
# Read in data
acronym_gpt4_1and2_df <- read.table(file = 'table_acronym_varytask_gpt-4-0613.tsv', sep = '\t', header = TRUE)
acronym_gpt35_1and2_df <- read.table(file = 'table_acronym_varytask_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
acronym_llama3_1and2_df <- read.table(file = 'table_acronym_varytask_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
acronym_claude3_1and2_df <- read.table(file = 'table_acronym_varytask_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
acronym_gemini1_1and2_df <- read.table(file = 'table_acronym_varytask_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)



In [521]:
# Z-score data
scaled_acronym_gpt4_1and2_df <- scale_taskpair_df(acronym_gpt4_1and2_df)
scaled_acronym_gpt35_1and2_df <- scale_taskpair_df(acronym_gpt35_1and2_df)
scaled_acronym_llama3_1and2_df <- scale_taskpair_df(acronym_llama3_1and2_df)
scaled_acronym_claude3_1and2_df <- scale_taskpair_df(acronym_claude3_1and2_df)
scaled_acronym_gemini1_1and2_df <- scale_taskpair_df(acronym_gemini1_1and2_df)


In [523]:
acronym_gpt4_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_acronym_gpt4_1and2_df, family=binomial)
acronym_gpt35_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_acronym_gpt35_1and2_df, family=binomial)
acronym_llama3_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_acronym_llama3_1and2_df, family=binomial)
acronym_claude3_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob, 
               data=scaled_acronym_claude3_1and2_df, family=binomial)
acronym_gemini1_1and2_model <- bayesglm(correct ~ task + input_logprob + output_logprob + input_ntokens + output_ntokens, 
               data=scaled_acronym_gemini1_1and2_df, family=binomial)



In [524]:
summary(acronym_gpt4_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_acronym_gpt4_1and2_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9392  -0.2452  -0.2053   0.6980   2.8814  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.18066    0.07532  15.674  < 2e-16 ***
taskacronym2   -4.82454    0.21268 -22.684  < 2e-16 ***
input_logprob  -0.06966    0.06591  -1.057 0.290499    
output_logprob  0.26550    0.07002   3.792 0.000149 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2682.0  on 1999  degrees of freedom
Residual deviance: 1323.3  on 1996  degrees of freedom
AIC: 1331.3

Number of Fisher Scoring iterations: 7


In [525]:
summary(acronym_gpt35_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_acronym_gpt35_1and2_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.20491  -0.93012  -0.02338  -0.02057   1.57853  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.44672    0.06506  -6.866 6.58e-12 ***
taskacronym2   -7.80307    1.83266  -4.258 2.06e-05 ***
input_logprob   0.04556    0.06047   0.753   0.4512    
output_logprob  0.15608    0.06549   2.383   0.0172 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1976.4  on 1999  degrees of freedom
Residual deviance: 1332.3  on 1996  degrees of freedom
AIC: 1340.3

Number of Fisher Scoring iterations: 16


In [526]:
summary(acronym_llama3_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_acronym_llama3_1and2_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3724  -0.7730  -0.7293   1.1056   1.7489  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.17562    0.06350   2.766  0.00568 ** 
taskacronym2   -1.32769    0.09746 -13.623  < 2e-16 ***
input_logprob   0.06269    0.04765   1.316  0.18834    
output_logprob  0.01791    0.04824   0.371  0.71049    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2678.5  on 1999  degrees of freedom
Residual deviance: 2478.8  on 1996  degrees of freedom
AIC: 2486.8

Number of Fisher Scoring iterations: 4


In [527]:
summary(acronym_claude3_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_acronym_claude3_1and2_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.55128  -0.02471  -0.02164   0.96943   1.22218  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.31243    0.06426   4.862 1.16e-06 ***
taskacronym2   -8.64273    1.91519  -4.513 6.40e-06 ***
input_logprob   0.09186    0.06021   1.526   0.1271    
output_logprob  0.13524    0.06430   2.103   0.0355 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2403.2  on 1999  degrees of freedom
Residual deviance: 1355.8  on 1996  degrees of freedom
AIC: 1363.8

Number of Fisher Scoring iterations: 16


In [528]:
summary(acronym_gemini1_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob + 
    input_ntokens + output_ntokens, family = binomial, data = scaled_acronym_gemini1_1and2_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.60634  -0.02784  -0.02153   0.92808   1.38161  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.18084    0.06433   2.811  0.00494 ** 
taskacronym2   -8.51671    1.90214  -4.477 7.55e-06 ***
input_logprob  -0.02875    0.06076  -0.473  0.63610    
output_logprob -0.18888    0.06605  -2.860  0.00424 ** 
input_ntokens  -0.15048    0.06961  -2.162  0.03065 *  
output_ntokens  0.12957    0.06563   1.974  0.04837 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2346.8  on 1999  degrees of freedom
Residual deviance: 1357.6  on 1994  degrees of freedom
AIC: 1369.6

Number of Fisher Scoring iterations: 16


In [529]:
vif(acronym_gpt4_1and2_model)

In [530]:
vif(acronym_gpt35_1and2_model)

In [531]:
vif(acronym_llama3_1and2_model)

In [532]:
vif(acronym_claude3_1and2_model)

In [533]:
vif(acronym_gemini1_1and2_model)

# Multiplication

In [809]:
# Read in data
mult_gpt4_df <- read.table(file = 'table_multiplication_gpt-4-0613.tsv', sep = '\t', header = TRUE)
mult_gpt35_df <- read.table(file = 'table_multiplication_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
mult_llama3_df <- read.table(file = 'table_multiplication_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
mult_claude3_df <- read.table(file = 'table_multiplication_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
mult_gemini1_df <- read.table(file = 'table_multiplication_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [810]:
mult_gpt4_df$method <- factor(mult_gpt4_df$method)
mult_gpt35_df$method <- factor(mult_gpt35_df$method)
mult_llama3_df$method <- factor(mult_llama3_df$method)
mult_claude3_df$method <- factor(mult_claude3_df$method)
mult_gemini1_df$method <- factor(mult_gemini1_df$method)



In [811]:
contrasts(mult_gpt4_df$method) <- contr.sum(4)
contrasts(mult_gpt35_df$method) <- contr.sum(4)
contrasts(mult_llama3_df$method) <- contr.sum(4)
contrasts(mult_claude3_df$method) <- contr.sum(4)
contrasts(mult_gemini1_df$method) <- contr.sum(4)


In [812]:
mult_gpt4_model <- glmer(correct ~ method + (1|index), 
               data=mult_gpt4_df, family=binomial)
mult_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=mult_gpt4_df, family=binomial)   

In [813]:
mult_gpt35_model <- glmer(correct ~ method + (1|index), 
               data=mult_gpt35_df, family=binomial)
mult_gpt35_null_model <- glmer(correct ~ (1|index), 
               data=mult_gpt35_df, family=binomial)   

In [814]:
mult_llama3_model <- glmer(correct ~ method + (1|index), 
               data=mult_llama3_df, family=binomial)
mult_llama3_null_model <- glmer(correct ~ (1|index), 
               data=mult_llama3_df, family=binomial) 

In [815]:
mult_claude3_model <- glmer(correct ~ method + (1|index), 
               data=mult_claude3_df, family=binomial)
mult_claude3_null_model <- glmer(correct ~ (1|index), 
               data=mult_claude3_df, family=binomial) 

In [816]:
mult_gemini1_model <- glmer(correct ~ method + (1|index), 
               data=mult_gemini1_df, family=binomial)
mult_gemini1_null_model <- glmer(correct ~ (1|index), 
               data=mult_gemini1_df, family=binomial) 

In [817]:
anova(mult_gpt4_model,mult_gpt4_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_gpt4_null_model,2,431.4605,439.4434,-213.7303,427.4605,,,
mult_gpt4_model,5,393.3867,413.344,-191.6933,383.3867,44.07383,3.0,1.455654e-09


In [818]:
anova(mult_gpt35_model,mult_gpt35_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_gpt35_null_model,2,395.8202,403.8031,-195.9101,391.8202,,,
mult_gpt35_model,5,324.5254,344.4827,-157.2627,314.5254,77.29482,3.0,1.16727e-16


In [819]:
anova(mult_llama3_model,mult_llama3_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_llama3_null_model,2,360.8588,368.8417,-178.4294,356.8588,,,
mult_llama3_model,5,352.3373,372.2946,-171.1686,342.3373,14.52148,3.0,0.002274787


In [820]:
anova(mult_claude3_model,mult_claude3_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_claude3_null_model,2,323.5376,331.5205,-159.7688,319.5376,,,
mult_claude3_model,5,287.409,307.3663,-138.7045,277.409,42.12863,3.0,3.767707e-09


In [821]:
anova(mult_gemini1_model,mult_gemini1_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_gemini1_null_model,2,182.3111,190.294,-89.15554,178.3111,,,
mult_gemini1_model,5,169.414,189.3714,-79.70702,159.414,18.89705,3.0,0.0002871311


In [822]:
gpt4_mult_multcomp <- glht(mult_gpt4_model, linfct=mcp(method="Tukey"))


In [823]:
summary(gpt4_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                                                             Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0  -2.1294
multiplication_number - multiplication_allcaps == 0            1.0515
multiplication_word - multiplication_allcaps == 0              0.3911
multiplication_number - multiplication_alternatingcaps == 0    3.1809
multiplication_word - multiplication_alternatingcaps == 0      2.5205
multiplication_word - multiplication_number == 0              -0.6604
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0     0.5432  -3.920
multiplication_number - multiplication_allcaps == 0              0.4521   2.326
multiplication_word - multiplication_allcaps == 0 

In [824]:
gpt35_mult_multcomp <- glht(mult_gpt35_model, linfct=mcp(method="Tukey"))


In [825]:
summary(gpt35_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_gpt35_df, 
    family = binomial)

Linear Hypotheses:
                                                             Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0  -2.8676
multiplication_number - multiplication_allcaps == 0            1.8968
multiplication_word - multiplication_allcaps == 0              0.1711
multiplication_number - multiplication_alternatingcaps == 0    4.7644
multiplication_word - multiplication_alternatingcaps == 0      3.0387
multiplication_word - multiplication_number == 0              -1.7257
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0     1.0538  -2.721
multiplication_number - multiplication_allcaps == 0              0.4042   4.693
multiplication_word - multiplication_allcaps == 0

In [826]:
llama3_mult_multcomp <- glht(mult_llama3_model, linfct=mcp(method="Tukey"))


In [827]:
summary(llama3_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_llama3_df, 
    family = binomial)

Linear Hypotheses:
                                                             Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0  -1.4804
multiplication_number - multiplication_allcaps == 0            0.4960
multiplication_word - multiplication_allcaps == 0              0.1283
multiplication_number - multiplication_alternatingcaps == 0    1.9764
multiplication_word - multiplication_alternatingcaps == 0      1.6087
multiplication_word - multiplication_number == 0              -0.3677
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0     0.5894  -2.512
multiplication_number - multiplication_allcaps == 0              0.5030   0.986
multiplication_word - multiplication_allcaps == 

In [828]:
claude3_mult_multcomp <- glht(mult_claude3_model, linfct=mcp(method="Tukey"))


In [829]:
summary(claude3_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_claude3_df, 
    family = binomial)

Linear Hypotheses:
                                                               Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0 -2.979e+00
multiplication_number - multiplication_allcaps == 0          -8.900e-01
multiplication_word - multiplication_allcaps == 0             1.910e-06
multiplication_number - multiplication_alternatingcaps == 0   2.089e+00
multiplication_word - multiplication_alternatingcaps == 0     2.979e+00
multiplication_word - multiplication_number == 0              8.900e-01
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0  6.918e-01  -4.306
multiplication_number - multiplication_allcaps == 0           6.177e-01  -1.441
multiplication_word - multiplicat

In [830]:
gemini1_mult_multcomp <- glht(mult_gemini1_model, linfct=mcp(method="Tukey"))


In [831]:
summary(gemini1_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_gemini1_df, 
    family = binomial)

Linear Hypotheses:
                                                             Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0  -2.8042
multiplication_number - multiplication_allcaps == 0            1.7662
multiplication_word - multiplication_allcaps == 0             -0.4004
multiplication_number - multiplication_alternatingcaps == 0    4.5704
multiplication_word - multiplication_alternatingcaps == 0      2.4038
multiplication_word - multiplication_number == 0              -2.1666
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0     1.3055  -2.148
multiplication_number - multiplication_allcaps == 0              0.9242   1.911
multiplication_word - multiplication_allcaps ==

# Linear function

In [728]:
# Read in data
linfwd_gpt4_df <- read.table(file = 'table_conversion_fwd_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linfwd_gpt35_df <- read.table(file = 'table_conversion_fwd_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
linfwd_llama3_df <- read.table(file = 'table_conversion_fwd_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
linfwd_claude3_df <- read.table(file = 'table_conversion_fwd_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
linfwd_gemini1_df <- read.table(file = 'table_conversion_fwd_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


linrev_gpt4_df <- read.table(file = 'table_conversion_rev_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linrev_gpt35_df <- read.table(file = 'table_conversion_rev_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
linrev_llama3_df <- read.table(file = 'table_conversion_rev_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
linrev_claude3_df <- read.table(file = 'table_conversion_rev_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
linrev_gemini1_df <- read.table(file = 'table_conversion_rev_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [729]:
scale_lin_df <- function(df) {
    new_df <- data.frame(scale(df[3:5]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    new_df$task <- factor(df$task)
    
    return(new_df)
}

In [730]:
scaled_linfwd_gpt4_df <- scale_lin_df(linfwd_gpt4_df)
scaled_linfwd_gpt35_df <- scale_lin_df(linfwd_gpt35_df)
scaled_linfwd_llama3_df <- scale_lin_df(linfwd_llama3_df)
scaled_linfwd_claude3_df <- scale_lin_df(linfwd_claude3_df)
scaled_linfwd_gemini1_df <- scale_lin_df(linfwd_gemini1_df)


scaled_linrev_gpt4_df <- scale_lin_df(linrev_gpt4_df)
scaled_linrev_gpt35_df <- scale_lin_df(linrev_gpt35_df)
scaled_linrev_llama3_df <- scale_lin_df(linrev_llama3_df)
scaled_linrev_claude3_df <- scale_lin_df(linrev_claude3_df)
scaled_linrev_gemini1_df <- scale_lin_df(linrev_gemini1_df)

In [731]:
linfwd_gpt4_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_gpt4_df, family=binomial)
linfwd_gpt35_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_gpt35_df, family=binomial)
linfwd_llama3_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_llama3_df, family=binomial)
linfwd_claude3_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_claude3_df, family=binomial)
linfwd_gemini1_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwd_gemini1_df, family=binomial)

linrev_gpt4_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_gpt4_df, family=binomial)
linrev_gpt35_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_gpt35_df, family=binomial)
linrev_llama3_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_llama3_df, family=binomial)
linrev_claude3_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_claude3_df, family=binomial)
linrev_gemini1_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrev_gemini1_df, family=binomial)

In [732]:
summary(linfwd_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.38997  -0.40293  -0.08789  -0.03170   2.28552  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)   
(Intercept)          -0.8416     0.2924  -2.878  0.00400 **
taskconversion_fake  -5.4764     1.6658  -3.288  0.00101 **
input                -0.5816     0.8363  -0.695  0.48679   
output               -0.5344     0.7688  -0.695  0.48703   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 179.15  on 199  degrees of freedom
Residual deviance: 104.27  on 196  degrees of freedom
AIC: 112.27

Number of Fisher Scoring iterations: 18


In [733]:
summary(linfwd_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.93263  -0.26941  -0.04796  -0.01200   2.05953  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -0.1080     0.3119  -0.346 0.729269    
taskconversion_fake  -6.9788     1.8301  -3.813 0.000137 ***
input                -0.8874     0.9413  -0.943 0.345840    
output               -0.8883     0.8660  -1.026 0.305031    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 218.099  on 199  degrees of freedom
Residual deviance:  88.084  on 196  degrees of freedom
AIC: 96.084

Number of Fisher Scoring iterations: 16


In [734]:
summary(linfwd_llama3_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3860   0.3302   0.4838   0.6707   1.1087  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)           2.0222     0.3676   5.502 3.76e-08 ***
taskconversion_fake  -0.6365     0.5159  -1.234    0.217    
input                -0.8564     0.6791  -1.261    0.207    
output                0.1551     0.7049   0.220    0.826    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 188.56  on 199  degrees of freedom
Residual deviance: 170.44  on 196  degrees of freedom
AIC: 178.44

Number of Fisher Scoring iterations: 12


In [735]:
summary(linfwd_claude3_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8226  -0.7272  -0.4137   0.7383   2.5266  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)           0.3710     0.2600   1.427    0.154    
taskconversion_fake  -2.6370     0.4526  -5.826 5.66e-09 ***
input                -0.2738     0.7048  -0.388    0.698    
output               -0.5623     0.6860  -0.820    0.412    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 256.41  on 199  degrees of freedom
Residual deviance: 187.89  on 196  degrees of freedom
AIC: 195.89

Number of Fisher Scoring iterations: 10


In [736]:
summary(linfwd_gemini1_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3089  -0.2023  -0.1470  -0.1127   3.1970  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -4.6907     0.9684  -4.844 1.27e-06 ***
taskconversion_fake   0.5185     1.0694   0.485    0.628    
input                 0.6711     0.8106   0.828    0.408    
output                0.0417     0.8144   0.051    0.959    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 31.153  on 199  degrees of freedom
Residual deviance: 28.702  on 196  degrees of freedom
AIC: 36.702

Number of Fisher Scoring iterations: 16


In [737]:
summary(linrev_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.40818  -0.60031  -0.07781  -0.02974   1.97667  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -0.3915     0.2716  -1.442 0.149357    
taskconversion_fakeinverse  -5.6718     1.6886  -3.359 0.000782 ***
input                       -0.4566     0.8349  -0.547 0.584435    
output                      -0.4366     0.7182  -0.608 0.543210    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 210.76  on 199  degrees of freedom
Residual deviance: 122.78  on 196  degrees of freedom
AIC: 130.78

Number of Fisher Scoring iterations: 18


In [738]:
summary(linrev_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8763  -0.2377  -0.1307   0.7056   3.2600  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  0.4090     0.2653   1.541    0.123    
taskconversion_fakeinverse  -4.9606     0.9044  -5.485 4.13e-08 ***
input                       -0.1544     0.8135  -0.190    0.849    
output                      -0.6415     0.7081  -0.906    0.365    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 246.02  on 199  degrees of freedom
Residual deviance: 131.42  on 196  degrees of freedom
AIC: 139.42

Number of Fisher Scoring iterations: 11


In [739]:
summary(linrev_llama3_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2572   0.3589   0.5206   0.6887   1.2611  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  1.0836     0.2763   3.922 8.77e-05 ***
taskconversion_fakeinverse   0.6159     0.4651   1.324    0.185    
input                       -0.1441     0.6643  -0.217    0.828    
output                      -0.5817     0.6447  -0.902    0.367    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 213.27  on 199  degrees of freedom
Residual deviance: 193.11  on 196  degrees of freedom
AIC: 201.11

Number of Fisher Scoring iterations: 9


In [740]:
summary(linrev_claude3_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7616  -0.6884  -0.4960   0.8348   2.2110  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  0.6961     0.2527   2.754  0.00588 ** 
taskconversion_fakeinverse  -2.4356     0.4185  -5.819 5.91e-09 ***
input                        0.3545     0.6973   0.508  0.61112    
output                      -0.7565     0.6614  -1.144  0.25270    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 269.99  on 199  degrees of freedom
Residual deviance: 209.87  on 196  degrees of freedom
AIC: 217.87

Number of Fisher Scoring iterations: 14


In [741]:
summary(linrev_gemini1_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.37546  -0.22349  -0.11494  -0.07701   2.91799  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -3.9416     0.7851  -5.020 5.16e-07 ***
taskconversion_fakeinverse  -1.9485     1.4403  -1.353    0.176    
input                       -0.4655     0.9106  -0.511    0.609    
output                      -0.4255     0.8301  -0.513    0.608    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 31.153  on 199  degrees of freedom
Residual deviance: 25.592  on 196  degrees of freedom
AIC: 33.592

Number of Fisher Scoring iterations: 21


In [742]:
vif(linfwd_gpt4_model)

In [743]:
vif(linfwd_gpt35_model)

In [744]:
vif(linfwd_llama3_model)

In [745]:
vif(linfwd_claude3_model)

In [746]:
vif(linfwd_gemini1_model)

In [747]:
vif(linrev_gpt4_model)

In [748]:
vif(linrev_gpt35_model)

In [749]:
vif(linrev_llama3_model)

In [750]:
vif(linrev_claude3_model)

In [751]:
vif(linrev_gemini1_model)

### Comparing methods

In [None]:
# We ended up not using this part, but instead using the OOD part

In [None]:
linmethod_gpt4_df <- read.table(file = 'table_conversion_method_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linmethod_gpt35_df <- read.table(file = 'table_conversion_method_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)


In [None]:
scaled_linmethod_gpt4_df <- scale_lin_df(linmethod_gpt4_df)
scaled_linmethod_gpt35_df <- scale_lin_df(linmethod_gpt35_df)


In [None]:
contrasts(scaled_linmethod_gpt4_df$task) <- contr.sum(3)
contrasts(scaled_linmethod_gpt35_df$task) <- contr.sum(3)


In [None]:
linmethod_gpt4_model <- glmer(correct ~ task + (1|index), 
               data=scaled_linmethod_gpt4_df, family=binomial)
linmethod_gpt35_model <- glmer(correct ~ task + (1|index), 
               data=scaled_linmethod_gpt35_df , family=binomial)


In [None]:
linmethod_gpt4_null_model <- glmer(correct ~ (1|index), 
               data=scaled_linmethod_gpt4_df, family=binomial)
linmethod_gpt35_null_model <- glmer(correct ~ (1|index), 
               data=scaled_linmethod_gpt35_df , family=binomial)


In [None]:
summary(linmethod_gpt4_model)

In [None]:
summary(linmethod_gpt35_model)

In [None]:
anova(linmethod_gpt4_model,linmethod_gpt4_null_model,test="Chisq")

In [None]:
anova(linmethod_gpt35_model,linmethod_gpt35_null_model,test="Chisq")

In [None]:
gpt4_lin_multcomp <- glht(linmethod_gpt4_model, linfct=mcp(task="Tukey"))
gpt35_lin_multcomp <- glht(linmethod_gpt35_model, linfct=mcp(task="Tukey"))


In [None]:
summary(gpt4_lin_multcomp)

In [None]:
summary(gpt35_lin_multcomp)

### Basic test: OOD

In [752]:
# Read in data
linfwdood_gpt4_df <- read.table(file = 'table_conversion_ood_fwd_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linfwdood_gpt35_df <- read.table(file = 'table_conversion_ood_fwd_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
linfwdood_llama3_df <- read.table(file = 'table_conversion_ood_fwd_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
linfwdood_claude3_df <- read.table(file = 'table_conversion_ood_fwd_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
linfwdood_gemini1_df <- read.table(file = 'table_conversion_ood_fwd_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


linrevood_gpt4_df <- read.table(file = 'table_conversion_ood_rev_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linrevood_gpt35_df <- read.table(file = 'table_conversion_ood_rev_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
linrevood_llama3_df <- read.table(file = 'table_conversion_ood_rev_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
linrevood_claude3_df <- read.table(file = 'table_conversion_ood_rev_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
linrevood_gemini1_df <- read.table(file = 'table_conversion_ood_rev_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [753]:
scaled_linfwdood_gpt4_df <- scale_lin_df(linfwdood_gpt4_df)
scaled_linfwdood_gpt35_df <- scale_lin_df(linfwdood_gpt35_df)
scaled_linfwdood_llama3_df <- scale_lin_df(linfwdood_llama3_df)
scaled_linfwdood_claude3_df <- scale_lin_df(linfwdood_claude3_df)
scaled_linfwdood_gemini1_df <- scale_lin_df(linfwdood_gemini1_df)

scaled_linrevood_gpt4_df <- scale_lin_df(linrevood_gpt4_df)
scaled_linrevood_gpt35_df <- scale_lin_df(linrevood_gpt35_df)
scaled_linrevood_llama3_df <- scale_lin_df(linrevood_llama3_df)
scaled_linrevood_claude3_df <- scale_lin_df(linrevood_claude3_df)
scaled_linrevood_gemini1_df <- scale_lin_df(linrevood_gemini1_df)

In [754]:
linfwdood_gpt4_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwdood_gpt4_df, family=binomial)
linfwdood_gpt35_model <- glm(correct ~ task + input + output, 
               data=scaled_linfwdood_gpt35_df, family=binomial)
linfwdood_llama3_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwdood_llama3_df, family=binomial)
linfwdood_claude3_model <- glm(correct ~ task + input + output, 
               data=scaled_linfwdood_claude3_df, family=binomial)
linfwdood_gemini1_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linfwdood_gemini1_df, family=binomial)


linrevood_gpt4_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrevood_gpt4_df, family=binomial)
linrevood_gpt35_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrevood_gpt35_df, family=binomial)
linrevood_llama3_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrevood_llama3_df, family=binomial)
linrevood_claude3_model <- glm(correct ~ task + input + output, 
               data=scaled_linrevood_claude3_df, family=binomial)
linrevood_gemini1_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linrevood_gemini1_df, family=binomial)

In [755]:
summary(linfwdood_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.7808  -0.3932  -0.1228  -0.0655   2.4532  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -2.2336     0.5434  -4.110 3.95e-05 ***
taskconversion_ood_fake  -3.6606     1.6255  -2.252   0.0243 *  
input                    -0.5776     0.7349  -0.786   0.4319    
output                   -0.1299     0.7855  -0.165   0.8687    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 85.193  on 199  degrees of freedom
Residual deviance: 64.937  on 196  degrees of freedom
AIC: 72.937

Number of Fisher Scoring iterations: 21


In [756]:
summary(linfwdood_gpt35_model)


Call:
glm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.49051  -0.29198  -0.17157  -0.05437   2.81301  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)
(Intercept)              -1.6873     4.1137  -0.410    0.682
taskconversion_ood_fake  -4.1387     6.8204  -0.607    0.544
input                    -0.8731     6.7937  -0.129    0.898
output                   -0.7120     7.4484  -0.096    0.924

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 130.033  on 199  degrees of freedom
Residual deviance:  83.488  on 196  degrees of freedom
AIC: 91.488

Number of Fisher Scoring iterations: 8


In [757]:
summary(linfwdood_llama3_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.54685  -0.20901  -0.11648  -0.05595   2.38931  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -5.8880     1.5081  -3.904 9.46e-05 ***
taskconversion_ood_fake   1.7079     1.5515   1.101    0.271    
input                    -0.4229     0.7577  -0.558    0.577    
output                   -0.8453     0.9707  -0.871    0.384    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 39.216  on 199  degrees of freedom
Residual deviance: 30.002  on 196  degrees of freedom
AIC: 38.002

Number of Fisher Scoring iterations: 20


In [758]:
summary(linfwdood_claude3_model)


Call:
glm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.0111  -0.8869  -0.4530  -0.2997   2.2195  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)
(Intercept)               -1.617      1.165  -1.388    0.165
taskconversion_ood_fake    0.204      2.213   0.092    0.927
input                     -1.763      1.803  -0.978    0.328
output                     1.779      2.055   0.866    0.387

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 210.76  on 199  degrees of freedom
Residual deviance: 187.84  on 196  degrees of freedom
AIC: 195.84

Number of Fisher Scoring iterations: 5


In [759]:
summary(linfwdood_gemini1_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.03007  -0.03007  -0.03007  -0.03007  -0.03007  

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)  
(Intercept)             -7.701e+00  3.324e+00  -2.317   0.0205 *
taskconversion_ood_fake -2.695e-16  2.240e+00   0.000   1.0000  
input                   -1.204e-16  1.125e+00   0.000   1.0000  
output                  -1.684e-16  1.129e+00   0.000   1.0000  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 0.00000  on 199  degrees of freedom
Residual deviance: 0.18087  on 196  degrees of freedom
AIC: 8.1809

Number of Fisher Scoring iterations: 23


In [760]:
summary(linrevood_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.85211  -0.69168  -0.09061  -0.07368   1.85504  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)   
(Intercept)                     -1.2347     0.4630  -2.666  0.00767 **
taskconversion_ood_fakeinverse  -4.4110     1.7146  -2.573  0.01009 * 
input                           -0.1056     0.8171  -0.129  0.89714   
output                           0.3170     0.6315   0.502  0.61571   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 146.77  on 199  degrees of freedom
Residual deviance: 109.74  on 196  degrees of freedom
AIC: 117.74

Number of Fisher Scoring iterations: 20


In [761]:
summary(linrevood_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.05965  -0.83455  -0.08504  -0.06645   1.60478  

Coefficients:
                                Estimate Std. Error z value Pr(>|z|)   
(Intercept)                    -0.792035   0.458688  -1.727  0.08421 . 
taskconversion_ood_fakeinverse -4.812195   1.725003  -2.790  0.00528 **
input                          -0.310161   0.823762  -0.377  0.70653   
output                          0.003018   0.631042   0.005  0.99618   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 185.49  on 199  degrees of freedom
Residual deviance: 128.92  on 196  degrees of freedom
AIC: 136.92

Number of Fisher Scoring iterations: 19


In [762]:
summary(linrevood_llama3_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.03007  -0.03007  -0.03007  -0.03007  -0.03007  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)  
(Intercept)                    -7.701e+00  3.324e+00  -2.317   0.0205 *
taskconversion_ood_fakeinverse -4.066e-16  2.238e+00   0.000   1.0000  
input                           3.050e-17  1.129e+00   0.000   1.0000  
output                          2.461e-16  1.126e+00   0.000   1.0000  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 0.00000  on 199  degrees of freedom
Residual deviance: 0.18087  on 196  degrees of freedom
AIC: 8.1809

Number of Fisher Scoring iterations: 23


In [763]:
summary(linrevood_claude3_model)


Call:
glm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4726  -0.5075  -0.4394   0.9688   2.2499  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)
(Intercept)                     -0.6762     0.9203  -0.735    0.462
taskconversion_ood_fakeinverse  -0.5683     1.8056  -0.315    0.753
input                           -1.9481     1.8237  -1.068    0.285
output                           1.7303     1.4929   1.159    0.246

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 255.07  on 199  degrees of freedom
Residual deviance: 199.81  on 196  degrees of freedom
AIC: 207.81

Number of Fisher Scoring iterations: 5


In [764]:
summary(linrevood_gemini1_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.22038  -0.11695  -0.07297  -0.04757   2.78091  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)    
(Intercept)                     -6.4660     1.7349  -3.727 0.000194 ***
taskconversion_ood_fakeinverse   1.3082     1.6342   0.800 0.423426    
input                           -0.3456     0.9525  -0.363 0.716704    
output                          -0.7206     0.9781  -0.737 0.461282    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 12.5916  on 199  degrees of freedom
Residual deviance:  9.8651  on 196  degrees of freedom
AIC: 17.865

Number of Fisher Scoring iterations: 24


In [765]:
vif(linfwdood_gpt4_model)

In [766]:
vif(linfwdood_gpt35_model)

In [767]:
vif(linfwdood_llama3_model)

In [768]:
vif(linfwdood_claude3_model)

In [769]:
vif(linfwdood_gemini1_model)

In [770]:
vif(linrevood_gpt4_model)

In [771]:
vif(linrevood_gpt35_model)

In [772]:
vif(linrevood_llama3_model)

In [773]:
vif(linrevood_claude3_model)

In [774]:
vif(linrevood_gemini1_model)

### Comparing methods: OOD

In [775]:
linmethodood_gpt4_df <- read.table(file = 'table_conversion_ood_method_gpt-4-0613.tsv', sep = '\t', header = TRUE)
linmethodood_gpt35_df <- read.table(file = 'table_conversion_ood_method_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
linmethodood_llama3_df <- read.table(file = 'table_conversion_ood_method_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
linmethodood_claude3_df <- read.table(file = 'table_conversion_ood_method_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
linmethodood_gemini1_df <- read.table(file = 'table_conversion_ood_method_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)




In [776]:
scaled_linmethodood_gpt4_df <- scale_lin_df(linmethodood_gpt4_df)
scaled_linmethodood_gpt35_df <- scale_lin_df(linmethodood_gpt35_df)
scaled_linmethodood_llama3_df <- scale_lin_df(linmethodood_llama3_df)
scaled_linmethodood_claude3_df <- scale_lin_df(linmethodood_claude3_df)
scaled_linmethodood_gemini1_df <- scale_lin_df(linmethodood_gemini1_df)



In [777]:
contrasts(scaled_linmethodood_gpt4_df$task) <- contr.sum(3)
contrasts(scaled_linmethodood_gpt35_df$task) <- contr.sum(3)
contrasts(scaled_linmethodood_llama3_df$task) <- contr.sum(3)
contrasts(scaled_linmethodood_claude3_df$task) <- contr.sum(3)
contrasts(scaled_linmethodood_gemini1_df$task) <- contr.sum(3)


In [787]:
linmethodood_gpt4_model <- bglmer(correct ~ task + (1|index), 
               data=scaled_linmethodood_gpt4_df, family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
linmethodood_gpt35_model <- bglmer(correct ~ task + (1|index), 
               data=scaled_linmethodood_gpt35_df , family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
linmethodood_llama3_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linmethodood_llama3_df , family=binomial)
#linmethodood_claude3_model <- bglmer(correct ~ task + (1|index), 
#               data=scaled_linmethodood_claude3_df , family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
#linmethodood_claude3_model <- bglmer(correct ~ task + (1|index), 
#               data=scaled_linmethodood_claude3_df , family=binomial)
linmethodood_claude3_model <- bayesglm(correct ~ task + input + output, 
               data=scaled_linmethodood_claude3_df , family=binomial)
linmethodood_gemini1_model <- bglmer(correct ~ task + (1|index), 
               data=scaled_linmethodood_gemini1_df , family=binomial)



In [788]:
linmethodood_gpt4_null_model <- bglmer(correct ~ (1|index), 
               data=scaled_linmethodood_gpt4_df, family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
linmethodood_gpt35_null_model <- bglmer(correct ~ (1|index), 
               data=scaled_linmethodood_gpt35_df , family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
linmethodood_llama3_null_model <- bayesglm(correct ~  input + output, 
               data=scaled_linmethodood_llama3_df , family=binomial)
linmethodood_claude3_null_model <- bayesglm(correct ~  input + output, 
               data=scaled_linmethodood_claude3_df , family=binomial)
linmethodood_gemini1_null_model <- bglmer(correct ~ (1|index), 
               data=scaled_linmethodood_gemini1_df , family=binomial, control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))



In [789]:
summary(linmethodood_gpt4_model)

Cov prior  : index ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)
Prior dev  : -4.5409

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [bglmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethodood_gpt4_df
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))

     AIC      BIC   logLik deviance df.resid 
   293.5    308.3   -142.7    285.5      296 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.1554 -0.2182 -0.0490  0.1089  4.5829 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 20.64    4.543   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  -1.9844     0.7911  -2.508  0.01213 * 
task1        -4.0201     1.2351  -3.255  0.00113 **
task2         3.4597     1.1213   3.085  0.00203 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.

In [790]:
summary(linmethodood_gpt35_model)

Cov prior  : index ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)
Prior dev  : -4.068

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [bglmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethodood_gpt35_df
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))

     AIC      BIC   logLik deviance df.resid 
   309.3    324.1   -150.6    301.3      296 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-8.5770 -0.4138 -0.0273  0.1937  2.1967 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 15.06    3.881   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.1687     0.5473  -2.135 0.032740 *  
task1        -2.7432     0.7012  -3.912 9.15e-05 ***
task2         3.1288     0.8316   3.762 0.000168 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.0

In [791]:
summary(linmethodood_llama3_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linmethodood_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.34618  -0.28850  -0.06479  -0.05863   2.53700  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -5.20309    0.99300  -5.240 1.61e-07 ***
task1       -1.09288    1.02632  -1.065    0.287    
task2       -1.09288    1.02632  -1.065    0.287    
input        0.07428    0.76622   0.097    0.923    
output       0.07428    0.76622   0.097    0.923    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 50.860  on 299  degrees of freedom
Residual deviance: 40.338  on 295  degrees of freedom
AIC: 50.338

Number of Fisher Scoring iterations: 21


In [792]:
summary(linmethodood_claude3_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linmethodood_claude3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.24258  -0.95536   0.09639   1.14401   1.53292  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  1.53949    0.44855   3.432 0.000599 ***
task1       -2.13742    0.45983  -4.648 3.35e-06 ***
task2        3.75987    0.87696   4.287 1.81e-05 ***
input       -0.06424    0.72739  -0.088 0.929630    
output      -0.06424    0.72739  -0.088 0.929630    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 401.25  on 299  degrees of freedom
Residual deviance: 268.16  on 295  degrees of freedom
AIC: 278.16

Number of Fisher Scoring iterations: 16


In [793]:
summary(linmethodood_gemini1_model)

Cov prior  : index ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)
Prior dev  : -13.7628

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [bglmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethodood_gemini1_df

     AIC      BIC   logLik deviance df.resid 
    92.7    107.5    -42.3     84.7      296 

Scaled residuals: 
      Min        1Q    Median        3Q       Max 
-0.045255 -0.000792  0.000000  0.001143  0.046064 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 9655     98.26   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -18.126      4.310  -4.205 2.61e-05 ***
task1        -35.507      8.367  -4.244 2.20e-05 ***
task2         31.668      4.334   7.307 2.73e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
     

In [794]:
anova(linmethodood_gpt4_model,linmethodood_gpt4_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethodood_gpt4_null_model,2,387.7033,395.1109,-191.8516,383.7033,,,
linmethodood_gpt4_model,4,293.4944,308.3095,-142.7472,285.4944,98.20891,2.0,4.7228620000000005e-22


In [795]:
anova(linmethodood_gpt35_model,linmethodood_gpt35_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethodood_gpt35_null_model,2,391.4305,398.8381,-193.7153,387.4305,,,
linmethodood_gpt35_model,4,309.2508,324.0659,-150.6254,301.2508,86.17976,2.0,1.9333149999999997e-19


In [796]:
anova(linmethodood_llama3_model,linmethodood_llama3_null_model,test="Chisq")

Unnamed: 0_level_0,Resid. Df,Resid. Dev,Df,Deviance,Pr(>Chi)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,295,40.33844,,,
2,297,50.7367,-2.0,-10.39826,0.005521366


In [797]:
anova(linmethodood_claude3_model,linmethodood_claude3_null_model,test="Chisq")

Unnamed: 0_level_0,Resid. Df,Resid. Dev,Df,Deviance,Pr(>Chi)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,295,268.1559,,,
2,297,400.6999,-2.0,-132.544,1.653641e-29


In [798]:
anova(linmethodood_gemini1_model,linmethodood_gemini1_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethodood_gemini1_null_model,2,373.70742,381.115,-184.85371,369.70742,,,
linmethodood_gemini1_model,4,92.68443,107.4996,-42.34221,84.68443,285.023,2.0,1.282459e-62


In [799]:
gpt4_linood_multcomp <- glht(linmethodood_gpt4_model, linfct=mcp(task="Tukey"))

In [800]:
gpt35_linood_multcomp <- glht(linmethodood_gpt35_model, linfct=mcp(task="Tukey"))

In [801]:
llama3_linood_multcomp <- glht(linmethodood_llama3_model, linfct=mcp(task="Tukey"))

In [802]:
claude3_linood_multcomp <- glht(linmethodood_claude3_model, linfct=mcp(task="Tukey"))

In [803]:
gemini1_linood_multcomp <- glht(linmethodood_gemini1_model, linfct=mcp(task="Tukey"))

In [804]:
summary(gpt4_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: bglmer(formula = correct ~ task + (1 | index), data = scaled_linmethodood_gpt4_df, 
    family = binomial, control = glmerControl(optimizer = "bobyqa", 
        optCtrl = list(maxfun = 2e+05)))

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                 7.480
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0          4.581
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0   -2.899
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                   2.332
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0            1.430
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0      1.114
                                       

In [805]:
summary(gpt35_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: bglmer(formula = correct ~ task + (1 | index), data = scaled_linmethodood_gpt35_df, 
    family = binomial, control = glmerControl(optimizer = "bobyqa", 
        optCtrl = list(maxfun = 2e+05)))

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                5.8721
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0         2.3577
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  -3.5144
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                  1.5005
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0           0.7224
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0     1.0590
                                      

In [806]:
summary(llama3_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linmethodood_llama3_df)

Linear Hypotheses:
                                                                       Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0              3.553e-15
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0       3.279e+00
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0 3.279e+00
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0               1.793e+00
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0        1.746e+00
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  1.746e+00
                                                                      z value
conversion_ood_actualprimed - convers

In [807]:
summary(claude3_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linmethodood_claude3_df)

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                5.8973
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0         0.5150
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  -5.3823
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                  1.3214
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0           0.2891
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0     1.3253
                                                                      z value
conversion_ood_actualprimed - conversion

In [808]:
summary(gemini1_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: bglmer(formula = correct ~ task + (1 | index), data = scaled_linmethodood_gemini1_df, 
    family = binomial)

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                67.176
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0         39.346
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  -27.829
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                  12.557
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0           12.691
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0      2.718
                                                                      z value
conversion_ood_actualprimed - conversion_ood_

# Counting

In [557]:
# Read in data
countwords_gpt4_df <- read.table(file = 'table_counting_words_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_gpt35_df <- read.table(file = 'table_counting_words_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_llama3_df <- read.table(file = 'table_counting_words_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
countwords_claude3_df <- read.table(file = 'table_counting_words_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
countwords_gemini1_df <- read.table(file = 'table_counting_words_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)



countchars_gpt4_df <- read.table(file = 'table_counting_chars_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countchars_gpt35_df <- read.table(file = 'table_counting_chars_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countchars_llama3_df <- read.table(file = 'table_counting_chars_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
countchars_claude3_df <- read.table(file = 'table_counting_chars_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
countchars_gemini1_df <- read.table(file = 'table_counting_chars_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)




In [558]:
# Z-score data
scaled_countwords_gpt4_df <- scale_df_with_index(countwords_gpt4_df)
scaled_countwords_gpt35_df <- scale_df_with_index(countwords_gpt35_df)
scaled_countwords_llama3_df <- scale_df_with_index(countwords_llama3_df)
scaled_countwords_claude3_df <- scale_df_with_index(countwords_claude3_df)
scaled_countwords_gemini1_df <- scale_df_with_index(countwords_gemini1_df)



scaled_countchars_gpt4_df <- scale_df_with_index(countchars_gpt4_df)
scaled_countchars_gpt35_df <- scale_df_with_index(countchars_gpt35_df)
scaled_countchars_llama3_df <- scale_df_with_index(countchars_llama3_df)
scaled_countchars_claude3_df <- scale_df_with_index(countchars_claude3_df)
scaled_countchars_gemini1_df <- scale_df_with_index(countchars_gemini1_df)



In [559]:
countwords_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt4_df, family=binomial)
countwords_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt35_df, family=binomial)
countwords_llama3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_llama3_df, family=binomial)
countwords_claude3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_claude3_df, family=binomial)
countwords_gemini1_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_gemini1_df, family=binomial)



countchars_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_gpt4_df, family=binomial)
countchars_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_gpt35_df, family=binomial)
countchars_llama3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_llama3_df, family=binomial)
countchars_claude3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_claude3_df, family=binomial)
countchars_gemini1_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_gemini1_df, family=binomial)



“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [560]:
summary(countwords_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.57541  -0.16382  -0.04986  -0.02723   2.91722  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.35701    0.15836 -21.199   <2e-16 ***
index          -0.19417    0.10797  -1.798   0.0721 .  
input_logprob  -0.10037    0.08225  -1.220   0.2224    
output_logprob  3.59077    0.17453  20.574   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3264.4  on 2999  degrees of freedom
Residual deviance: 1137.6  on 2996  degrees of freedom
AIC: 1145.6

Number of Fisher Scoring iterations: 7


In [561]:
summary(countwords_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0485  -0.2589  -0.0675  -0.0346   3.4371  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -4.31424    0.19524 -22.098   <2e-16 ***
index           0.06015    0.10323   0.583    0.560    
input_logprob  -0.01359    0.05816  -0.234    0.815    
output_logprob  2.95668    0.15685  18.851   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2283.9  on 2999  degrees of freedom
Residual deviance: 1074.0  on 2996  degrees of freedom
AIC: 1082

Number of Fisher Scoring iterations: 7


In [562]:
summary(countwords_llama3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2355  -0.0029   0.0000   0.0000   3.8106  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -19.55755    1.66061 -11.777   <2e-16 ***
index          -12.92851    1.38981  -9.302   <2e-16 ***
input_logprob   -0.08741    0.08517  -1.026    0.305    
output_logprob   0.70170    0.44924   1.562    0.118    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1777.9  on 2999  degrees of freedom
Residual deviance:  385.2  on 2996  degrees of freedom
AIC: 393.2

Number of Fisher Scoring iterations: 11


In [563]:
summary(countwords_claude3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_claude3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-3.13366  -0.18146  -0.07462   0.06983   3.12381  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.64527    0.12493 -21.173   <2e-16 ***
index           1.20371    0.11671  10.313   <2e-16 ***
input_logprob  -0.19547    0.08116  -2.408    0.016 *  
output_logprob  4.42207    0.19597  22.565   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3452.9  on 2999  degrees of freedom
Residual deviance: 1211.2  on 2996  degrees of freedom
AIC: 1219.2

Number of Fisher Scoring iterations: 7


In [564]:
summary(countwords_gemini1_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2806  -0.1770  -0.0376  -0.0155   3.3341  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -4.89036    0.23717 -20.619  < 2e-16 ***
index          -0.60018    0.12079  -4.969 6.73e-07 ***
input_logprob  -0.09796    0.06653  -1.473    0.141    
output_logprob  3.17820    0.18626  17.063  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2476.51  on 2999  degrees of freedom
Residual deviance:  975.69  on 2996  degrees of freedom
AIC: 983.69

Number of Fisher Scoring iterations: 8


In [565]:
summary(countchars_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.46780  -0.15937  -0.03976   0.07145   3.08559  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.08292    0.15416 -19.998  < 2e-16 ***
index          -0.82241    0.10574  -7.778  7.4e-15 ***
input_logprob  -0.05048    0.07837  -0.644     0.52    
output_logprob  3.55316    0.17447  20.365  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3505.5  on 2999  degrees of freedom
Residual deviance: 1070.3  on 2996  degrees of freedom
AIC: 1078.3

Number of Fisher Scoring iterations: 7


In [566]:
summary(countchars_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0301  -0.5265  -0.2664   0.4923   2.3786  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.08394    0.05786 -18.733   <2e-16 ***
index          -0.71066    0.07709  -9.218   <2e-16 ***
input_logprob  -0.06039    0.05344  -1.130    0.258    
output_logprob  1.44736    0.08107  17.852   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3836.9  on 2999  degrees of freedom
Residual deviance: 2254.9  on 2996  degrees of freedom
AIC: 2262.9

Number of Fisher Scoring iterations: 5


In [567]:
summary(countchars_llama3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1747  -0.0554  -0.0063  -0.0015   4.3360  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -6.74905    0.38547 -17.508   <2e-16 ***
index          -1.13772    0.13529  -8.409   <2e-16 ***
input_logprob   0.10056    0.09419   1.068    0.286    
output_logprob  4.94964    0.30828  16.056   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2751.11  on 2999  degrees of freedom
Residual deviance:  726.91  on 2996  degrees of freedom
AIC: 734.91

Number of Fisher Scoring iterations: 9


In [568]:
summary(countchars_claude3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6175  -0.5283  -0.3022   0.4887   2.5693  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.89150    0.05511 -16.177  < 2e-16 ***
index          -0.24147    0.07641  -3.160  0.00158 ** 
input_logprob   0.12046    0.05366   2.245  0.02477 *  
output_logprob  1.82698    0.08599  21.247  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3911.2  on 2999  degrees of freedom
Residual deviance: 2314.6  on 2996  degrees of freedom
AIC: 2322.6

Number of Fisher Scoring iterations: 5


In [569]:
summary(countchars_gemini1_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9741  -0.2426  -0.1023  -0.0558   3.4105  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.88848    0.12435 -23.229  < 2e-16 ***
index          -0.40575    0.09277  -4.374 1.22e-05 ***
input_logprob  -0.06626    0.06824  -0.971    0.332    
output_logprob  2.65102    0.12810  20.695  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3179.1  on 2999  degrees of freedom
Residual deviance: 1383.4  on 2996  degrees of freedom
AIC: 1391.4

Number of Fisher Scoring iterations: 7


In [570]:
vif(countwords_gpt4_model)

In [571]:
vif(countwords_gpt35_model)

In [572]:
vif(countwords_llama3_model)

In [573]:
vif(countwords_claude3_model)

In [574]:
vif(countwords_gemini1_model)

In [575]:
vif(countchars_gpt4_model)

In [576]:
vif(countchars_gpt35_model)

In [577]:
vif(countchars_llama3_model)

In [578]:
vif(countchars_claude3_model)

In [579]:
vif(countchars_gemini1_model)

### Varying input

In [603]:
# Read in data
countwords_both_gpt4_df <- read.table(file = 'table_counting_words_both_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_both_gpt35_df <- read.table(file = 'table_counting_words_both_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_both_llama3_df <- read.table(file = 'table_counting_words_both_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
countwords_both_claude3_df <- read.table(file = 'table_counting_words_both_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
countwords_both_gemini1_df <- read.table(file = 'table_counting_words_both_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


countchars_both_gpt4_df <- read.table(file = 'table_counting_chars_both_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countchars_both_gpt35_df <- read.table(file = 'table_counting_chars_both_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countchars_both_llama3_df <- read.table(file = 'table_counting_chars_both_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
countchars_both_claude3_df <- read.table(file = 'table_counting_chars_both_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
countchars_both_gemini1_df <- read.table(file = 'table_counting_chars_both_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)



In [604]:
# Z-score data
scaled_countwords_both_gpt4_df <- scale_df_with_index(countwords_both_gpt4_df)
scaled_countwords_both_gpt35_df <- scale_df_with_index(countwords_both_gpt35_df)
scaled_countwords_both_llama3_df <- scale_df_with_index(countwords_both_llama3_df)
scaled_countwords_both_claude3_df <- scale_df_with_index(countwords_both_claude3_df)
scaled_countwords_both_gemini1_df <- scale_df_with_index(countwords_both_gemini1_df)



scaled_countchars_both_gpt4_df <- scale_df_with_index(countchars_both_gpt4_df)
scaled_countchars_both_gpt35_df <- scale_df_with_index(countchars_both_gpt35_df)
scaled_countchars_both_llama3_df <- scale_df_with_index(countchars_both_llama3_df)
scaled_countchars_both_claude3_df <- scale_df_with_index(countchars_both_claude3_df)
scaled_countchars_both_gemini1_df <- scale_df_with_index(countchars_both_gemini1_df)


In [605]:
countwords_both_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_gpt4_df, family=binomial)
countwords_both_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_gpt35_df, family=binomial)
countwords_both_llama3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_llama3_df, family=binomial)
countwords_both_claude3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_claude3_df, family=binomial)
countwords_both_gemini1_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_both_gemini1_df, family=binomial)


countchars_both_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_gpt4_df, family=binomial)
countchars_both_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_gpt35_df, family=binomial)
countchars_both_llama3_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_llama3_df, family=binomial)
countchars_both_claude3_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_claude3_df, family=binomial)
countchars_both_gemini1_model <- glm(correct ~ index + input_logprob + output_logprob + input_ntokens, 
                   data=scaled_countchars_both_gemini1_df, family=binomial)


In [606]:
summary(countwords_both_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.85516  -0.14205  -0.04240  -0.02356   3.10665  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.597483   0.121388 -29.636   <2e-16 ***
index           0.006453   0.079863   0.081   0.9356    
input_logprob   0.117880   0.054903   2.147   0.0318 *  
output_logprob  3.927246   0.138109  28.436   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6427.5  on 5999  degrees of freedom
Residual deviance: 2162.5  on 5996  degrees of freedom
AIC: 2170.5

Number of Fisher Scoring iterations: 7


In [607]:
summary(countwords_both_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8573  -0.3448  -0.1093  -0.0759   3.2168  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.47416    0.09882 -35.157  < 2e-16 ***
index           0.43608    0.07043   6.192 5.95e-10 ***
input_logprob  -0.11203    0.05037  -2.224   0.0261 *  
output_logprob  2.63160    0.09325  28.222  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4749.4  on 5999  degrees of freedom
Residual deviance: 2581.4  on 5996  degrees of freedom
AIC: 2589.4

Number of Fisher Scoring iterations: 7


In [608]:
summary(countwords_both_llama3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0807  -0.0182  -0.0004   0.0000   6.4976  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -12.94239    0.71725 -18.044  < 2e-16 ***
index           -6.69316    0.62785 -10.661  < 2e-16 ***
input_logprob    0.03139    0.08290   0.379    0.705    
output_logprob   2.20461    0.27893   7.904 2.71e-15 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3532.37  on 5999  degrees of freedom
Residual deviance:  943.47  on 5996  degrees of freedom
AIC: 951.47

Number of Fisher Scoring iterations: 10


In [609]:
summary(countwords_both_claude3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.0231  -0.1686  -0.0694   0.0785   3.1789  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.78614    0.09305 -29.944   <2e-16 ***
index           1.11689    0.08064  13.850   <2e-16 ***
input_logprob   0.08618    0.05237   1.646   0.0999 .  
output_logprob  4.44465    0.14135  31.444   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6826.0  on 5999  degrees of freedom
Residual deviance: 2379.7  on 5996  degrees of freedom
AIC: 2387.7

Number of Fisher Scoring iterations: 7


In [610]:
summary(countwords_both_gemini1_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_both_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2936  -0.1409  -0.0254  -0.0093   3.4855  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -5.44407    0.19449 -27.991  < 2e-16 ***
index          -0.72120    0.09169  -7.866 3.67e-15 ***
input_logprob   0.11904    0.06032   1.973   0.0484 *  
output_logprob  3.45681    0.14819  23.326  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4819.5  on 5999  degrees of freedom
Residual deviance: 1778.1  on 5996  degrees of freedom
AIC: 1786.1

Number of Fisher Scoring iterations: 8


In [611]:
summary(countchars_both_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.9362  -0.1098  -0.0262   0.0312   3.4657  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.1767     0.1621 -25.768  < 2e-16 ***
index           -0.6219     0.1174  -5.296 1.18e-07 ***
input_logprob   -0.6803     0.1281  -5.313 1.08e-07 ***
output_logprob   3.3781     0.1396  24.202  < 2e-16 ***
input_ntokens   -2.5456     0.3061  -8.315  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6819.6  on 5999  degrees of freedom
Residual deviance: 1846.0  on 5995  degrees of freedom
AIC: 1856

Number of Fisher Scoring iterations: 8


In [612]:
summary(countchars_both_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2093  -0.3962  -0.1922   0.2725   2.6262  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.86072    0.05794 -32.115  < 2e-16 ***
index          -0.38644    0.08268  -4.674 2.95e-06 ***
input_logprob   0.49788    0.07428   6.702 2.05e-11 ***
output_logprob  2.01289    0.07029  28.636  < 2e-16 ***
input_ntokens  -0.24959    0.12127  -2.058   0.0396 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 7098.4  on 5999  degrees of freedom
Residual deviance: 3590.7  on 5995  degrees of freedom
AIC: 3600.7

Number of Fisher Scoring iterations: 6


In [613]:
summary(countchars_both_llama3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.0612  -0.0435  -0.0056  -0.0007   4.2653  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -7.2487     0.3041 -23.833  < 2e-16 ***
index           -0.9411     0.1475  -6.379 1.78e-10 ***
input_logprob   -0.3386     0.1223  -2.769  0.00562 ** 
output_logprob   4.6506     0.2186  21.273  < 2e-16 ***
input_ntokens   -1.5947     0.3648  -4.371 1.24e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5406.7  on 5999  degrees of freedom
Residual deviance: 1393.4  on 5995  degrees of freedom
AIC: 1403.4

Number of Fisher Scoring iterations: 9


In [614]:
summary(countchars_both_claude3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8143  -0.5165  -0.2630   0.4236   2.7214  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.26980    0.03970  -6.796 1.07e-11 ***
index          -0.06894    0.07888  -0.874    0.382    
input_logprob  -1.05462    0.08541 -12.348  < 2e-16 ***
output_logprob  2.14899    0.06970  30.832  < 2e-16 ***
input_ntokens  -0.53436    0.10465  -5.106 3.29e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 8208.1  on 5999  degrees of freedom
Residual deviance: 4300.8  on 5995  degrees of freedom
AIC: 4310.8

Number of Fisher Scoring iterations: 5


In [615]:
summary(countchars_both_gemini1_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob + 
    input_ntokens, family = binomial, data = scaled_countchars_both_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1754  -0.1772  -0.0488  -0.0170   3.7965  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -4.04660    0.13403 -30.191  < 2e-16 ***
index          -0.17309    0.11755  -1.472   0.1409    
input_logprob   0.44829    0.08143   5.506 3.68e-08 ***
output_logprob  3.32526    0.12103  27.474  < 2e-16 ***
input_ntokens  -0.32724    0.16804  -1.947   0.0515 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5802.6  on 5999  degrees of freedom
Residual deviance: 2191.4  on 5995  degrees of freedom
AIC: 2201.4

Number of Fisher Scoring iterations: 7


In [616]:
vif(countwords_both_gpt4_model)

In [617]:
vif(countwords_both_gpt35_model)

In [618]:
vif(countwords_both_llama3_model)

In [619]:
vif(countwords_both_claude3_model)

In [620]:
vif(countwords_both_gemini1_model)

In [621]:
vif(countchars_both_gpt4_model)

In [622]:
vif(countchars_both_gpt35_model)

In [623]:
vif(countchars_both_llama3_model)

In [624]:
vif(countchars_both_claude3_model)

In [625]:
vif(countchars_both_gemini1_model)

### Varying output

In [580]:
# Read in data
countwords_binary_gpt4_df <- read.table(file = 'table_counting_words_binary_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_binary_gpt35_df <- read.table(file = 'table_counting_words_binary_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_binary_llama3_df <- read.table(file = 'table_counting_words_binary_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
countwords_binary_claude3_df <- read.table(file = 'table_counting_words_binary_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
countwords_binary_gemini1_df <- read.table(file = 'table_counting_words_binary_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


countchars_binary_gpt4_df <- read.table(file = 'table_counting_chars_binary_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countchars_binary_gpt35_df <- read.table(file = 'table_counting_chars_binary_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countchars_binary_llama3_df <- read.table(file = 'table_counting_chars_binary_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
countchars_binary_claude3_df <- read.table(file = 'table_counting_chars_binary_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
countchars_binary_gemini1_df <- read.table(file = 'table_counting_chars_binary_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [581]:
# Z-score data
scaled_countwords_binary_gpt4_df <- scale_df_with_index(countwords_binary_gpt4_df)
scaled_countwords_binary_gpt35_df <- scale_df_with_index(countwords_binary_gpt35_df)
scaled_countwords_binary_llama3_df <- scale_df_with_index(countwords_binary_llama3_df)
scaled_countwords_binary_claude3_df <- scale_df_with_index(countwords_binary_claude3_df)
scaled_countwords_binary_gemini1_df <- scale_df_with_index(countwords_binary_gemini1_df)



scaled_countchars_binary_gpt4_df <- scale_df_with_index(countchars_binary_gpt4_df)
scaled_countchars_binary_gpt35_df <- scale_df_with_index(countchars_binary_gpt35_df)
scaled_countchars_binary_llama3_df <- scale_df_with_index(countchars_binary_llama3_df)
scaled_countchars_binary_claude3_df <- scale_df_with_index(countchars_binary_claude3_df)
scaled_countchars_binary_gemini1_df <- scale_df_with_index(countchars_binary_gemini1_df)


In [582]:
countwords_binary_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_gpt4_df, family=binomial)
countwords_binary_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_gpt35_df, family=binomial)
countwords_binary_llama3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_llama3_df, family=binomial)
countwords_binary_claude3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_claude3_df, family=binomial)
countwords_binary_gemini1_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_binary_gemini1_df, family=binomial)


countchars_binary_gpt4_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_gpt4_df, family=binomial)
countchars_binary_gpt35_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_gpt35_df, family=binomial)
countchars_binary_llama3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_llama3_df, family=binomial)
countchars_binary_claude3_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_claude3_df, family=binomial)
countchars_binary_gemini1_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countchars_binary_gemini1_df, family=binomial)


“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [583]:
summary(countwords_binary_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.13331  -0.37089  -0.08181  -0.04723   2.96403  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.73270    0.12591 -21.704  < 2e-16 ***
index           0.52051    0.08623   6.036 1.58e-09 ***
input_logprob   0.12293    0.06573   1.870   0.0615 .  
output_logprob  3.22597    0.15411  20.933  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2633.6  on 2399  degrees of freedom
Residual deviance: 1259.9  on 2396  degrees of freedom
AIC: 1267.9

Number of Fisher Scoring iterations: 7


In [584]:
summary(countwords_binary_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6350  -0.3409  -0.1558  -0.0843   3.4431  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.8385     0.1616 -23.747   <2e-16 ***
index            1.0924     0.0941  11.609   <2e-16 ***
input_logprob   -0.2086     0.1051  -1.984   0.0473 *  
output_logprob   1.8626     0.1338  13.917   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1154.44  on 2399  degrees of freedom
Residual deviance:  764.18  on 2396  degrees of freedom
AIC: 772.18

Number of Fisher Scoring iterations: 7


In [592]:
summary(countwords_binary_llama3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.54376  -0.00089  -0.00002   0.00000   2.99657  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)
(Intercept)    -22.6723    29.3206  -0.773    0.439
index           -1.4921     3.2166  -0.464    0.643
input_logprob   -0.4407     0.3664  -1.203    0.229
output_logprob   9.6374    19.4494   0.496    0.620

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 59.169  on 2399  degrees of freedom
Residual deviance: 36.622  on 2396  degrees of freedom
AIC: 44.622

Number of Fisher Scoring iterations: 16


In [587]:
summary(countwords_binary_claude3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6886  -0.3079  -0.1161   0.3457   2.6246  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.24281    0.08178 -15.197  < 2e-16 ***
index           0.42105    0.07520   5.599 2.16e-08 ***
input_logprob   0.02800    0.06487   0.432    0.666    
output_logprob  3.21623    0.13943  23.067  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3171.4  on 2399  degrees of freedom
Residual deviance: 1445.0  on 2396  degrees of freedom
AIC: 1453

Number of Fisher Scoring iterations: 6


In [588]:
summary(countwords_binary_gemini1_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_binary_gemini1_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.84744  -0.31605  -0.12956  -0.04227   2.76151  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.77198    0.17371 -21.715   <2e-16 ***
index          -0.81426    0.08395  -9.699   <2e-16 ***
input_logprob   0.09063    0.06469   1.401    0.161    
output_logprob  2.07263    0.13711  15.117   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1792.9  on 2399  degrees of freedom
Residual deviance: 1032.3  on 2396  degrees of freedom
AIC: 1040.3

Number of Fisher Scoring iterations: 7


In [585]:
summary(countchars_binary_gpt4_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8510  -0.4543  -0.1344  -0.0681   2.2621  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.65994    0.11628 -22.875   <2e-16 ***
index           0.01801    0.06977   0.258    0.796    
input_logprob  -0.09741    0.06844  -1.423    0.155    
output_logprob  2.54242    0.11741  21.655   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2488.0  on 2399  degrees of freedom
Residual deviance: 1364.7  on 2396  degrees of freedom
AIC: 1372.7

Number of Fisher Scoring iterations: 6


In [586]:
summary(countchars_binary_gpt35_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8611  -0.6616  -0.2926   0.5626   2.4637  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.48069    0.06776 -21.851   <2e-16 ***
index          -0.53780    0.05743  -9.365   <2e-16 ***
input_logprob  -0.03054    0.05630  -0.542    0.588    
output_logprob  1.37065    0.07117  19.259   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2821.3  on 2399  degrees of freedom
Residual deviance: 1974.3  on 2396  degrees of freedom
AIC: 1982.3

Number of Fisher Scoring iterations: 5


In [591]:
summary(countchars_binary_llama3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.28104  -0.07501  -0.01256  -0.00135   3.05189  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -6.9931     0.4221 -16.568   <2e-16 ***
index           -1.4732     0.1214 -12.140   <2e-16 ***
input_logprob    0.1542     0.1103   1.399    0.162    
output_logprob   4.5468     0.3132  14.518   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1827.86  on 2399  degrees of freedom
Residual deviance:  593.92  on 2396  degrees of freedom
AIC: 601.92

Number of Fisher Scoring iterations: 8


In [589]:
summary(countchars_binary_claude3_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1773  -0.6392  -0.3139   0.6397   2.5528  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.87348    0.05843 -14.949  < 2e-16 ***
index          -0.07358    0.05833  -1.262    0.207    
input_logprob   0.22605    0.05539   4.081 4.48e-05 ***
output_logprob  1.70411    0.07359  23.155  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3150.0  on 2399  degrees of freedom
Residual deviance: 2113.8  on 2396  degrees of freedom
AIC: 2121.8

Number of Fisher Scoring iterations: 5


In [590]:
summary(countchars_binary_gemini1_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countchars_binary_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9811  -0.4063  -0.1055  -0.0422   3.7119  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.25211    0.14635 -22.222  < 2e-16 ***
index          -0.18999    0.07104  -2.674  0.00749 ** 
input_logprob   0.06340    0.07523   0.843  0.39937    
output_logprob  2.75162    0.13137  20.946  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2313.4  on 2399  degrees of freedom
Residual deviance: 1188.2  on 2396  degrees of freedom
AIC: 1196.2

Number of Fisher Scoring iterations: 7


In [593]:
vif(countwords_binary_gpt4_model)

In [594]:
vif(countwords_binary_gpt35_model)

In [595]:
vif(countwords_binary_llama3_model)

In [596]:
vif(countwords_binary_claude3_model)

In [597]:
vif(countwords_binary_gemini1_model)

In [598]:
vif(countchars_binary_gpt4_model)

In [599]:
vif(countchars_binary_gpt35_model)

In [600]:
vif(countchars_binary_llama3_model)

In [601]:
vif(countchars_binary_claude3_model)

In [602]:
vif(countchars_binary_gemini1_model)

## Counting: few shot

In [626]:
countwords_gpt35_0shot_df <- read.table(file = 'table_few_counting_words_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_gpt4_0shot_df <- read.table(file = 'table_few_counting_words_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_claude3_0shot_df <- read.table(file = 'table_few_counting_words_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)

countwords_gpt35_5shot_df <- read.table(file = 'table_few_counting_words_5shot_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_gpt4_5shot_df <- read.table(file = 'table_few_counting_words_5shot_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_claude3_5shot_df <- read.table(file = 'table_few_counting_words_5shot_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)

countwords_gpt35_10shot_df <- read.table(file = 'table_few_counting_words_10shot_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_gpt4_10shot_df <- read.table(file = 'table_few_counting_words_10shot_gpt-4-0613.tsv', sep = '\t', header = TRUE)
countwords_claude3_10shot_df <- read.table(file = 'table_few_counting_words_10shot_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)



countwords_gpt35ft_0shot_df <- read.table(file = 'table_few_counting_words_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
countwords_gpt35ft_10shot_df <- read.table(file = 'table_few_counting_words_ft:gpt-3.5-turbo-0613:personal:count-10shot:9NYCyc4X.tsv', sep = '\t', header = TRUE)
countwords_gpt35ft_100shot_df <- read.table(file = 'table_few_counting_words_ft:gpt-3.5-turbo-0613:personal:count-100shot:9NYN8hZQ.tsv', sep = '\t', header = TRUE)




In [627]:
scaled_countwords_gpt35_0shot_df <- scale_df_with_index_demos(countwords_gpt35_0shot_df)
scaled_countwords_gpt4_0shot_df <- scale_df_with_index_demos(countwords_gpt4_0shot_df)
scaled_countwords_claude3_0shot_df <- scale_df_with_index_demos(countwords_claude3_0shot_df)

scaled_countwords_gpt35_5shot_df <- scale_df_with_index_demos(countwords_gpt35_5shot_df)
scaled_countwords_gpt4_5shot_df <- scale_df_with_index_demos(countwords_gpt4_5shot_df)
scaled_countwords_claude3_5shot_df <- scale_df_with_index_demos(countwords_claude3_5shot_df)

scaled_countwords_gpt35_10shot_df <- scale_df_with_index_demos(countwords_gpt35_10shot_df)
scaled_countwords_gpt4_10shot_df <- scale_df_with_index_demos(countwords_gpt4_10shot_df)
scaled_countwords_claude3_10shot_df <- scale_df_with_index_demos(countwords_claude3_10shot_df)

scaled_countwords_gpt35ft_0shot_df <- scale_df_with_index_demos(countwords_gpt35ft_0shot_df)
scaled_countwords_gpt35ft_10shot_df <- scale_df_with_index_demos(countwords_gpt35ft_10shot_df)
scaled_countwords_gpt35ft_100shot_df <- scale_df_with_index_demos(countwords_gpt35ft_100shot_df)


In [628]:
head(scaled_countwords_gpt35_0shot_df)

Unnamed: 0_level_0,index,demonstration_count,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,correct
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,-1.71453,,-1.741827,-1.739786,-4.4424536,-3.006613,,2.316207,1
2,-1.71453,,-1.738059,-1.771344,8.7549067,-3.006613,,2.316207,1
3,-1.71453,,-1.726757,-1.755565,-1.9608672,-3.006613,,2.316207,1
4,-1.71453,,-1.738059,-1.739786,-4.623797,-3.006613,,2.316207,1
5,-1.71453,,-1.726757,-1.739786,-3.510291,-3.006613,,2.316207,1
6,-1.71453,,-1.726757,-1.739786,-0.5541231,-3.006613,,2.316207,1


In [629]:
countwords_gpt35_0shot_df_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt35_0shot_df, family=binomial)
countwords_gpt4_0shot_df_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt4_0shot_df, family=binomial)
countwords_claude3_0shot_df_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_claude3_0shot_df, family=binomial)


countwords_gpt35_5shot_df_model <- glm(correct ~ index + demonstration_count + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt35_5shot_df, family=binomial)
countwords_gpt4_5shot_df_model <- glm(correct ~ index + demonstration_count + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt4_5shot_df, family=binomial)
countwords_claude3_5shot_df_model <- glm(correct ~ index + demonstration_count + input_logprob + output_logprob, 
                   data=scaled_countwords_claude3_5shot_df, family=binomial)


countwords_gpt35_10shot_df_model <- glm(correct ~ index + demonstration_count + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt35_10shot_df, family=binomial)
countwords_gpt4_10shot_df_model <- glm(correct ~ index + demonstration_count + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt4_10shot_df, family=binomial)
countwords_claude3_10shot_df_model <- glm(correct ~ index + demonstration_count + input_logprob + output_logprob, 
                   data=scaled_countwords_claude3_10shot_df, family=binomial)



countwords_gpt35ft_0shot_df_model <- glm(correct ~ index + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt35ft_0shot_df, family=binomial)
countwords_gpt35ft_10shot_df_model <- glm(correct ~ index + demonstration_count + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt35ft_10shot_df, family=binomial)
countwords_gpt35ft_100shot_df_model <- glm(correct ~ index + demonstration_count + input_logprob + output_logprob, 
                   data=scaled_countwords_gpt35ft_100shot_df, family=binomial)

“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [630]:
summary(countwords_gpt35_0shot_df_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_gpt35_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0485  -0.2589  -0.0675  -0.0346   3.4371  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -4.31424    0.19524 -22.098   <2e-16 ***
index           0.06015    0.10323   0.583    0.560    
input_logprob  -0.01359    0.05816  -0.234    0.815    
output_logprob  2.95668    0.15685  18.851   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2283.9  on 2999  degrees of freedom
Residual deviance: 1074.0  on 2996  degrees of freedom
AIC: 1082

Number of Fisher Scoring iterations: 7


In [631]:
summary(countwords_gpt35_5shot_df_model)


Call:
glm(formula = correct ~ index + demonstration_count + input_logprob + 
    output_logprob, family = binomial, data = scaled_countwords_gpt35_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8751  -0.0004   0.0000   0.0000   3.2441  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -21.9756    51.3595  -0.428    0.669    
index                 2.8995     0.3988   7.271 3.56e-13 ***
demonstration_count  -2.9641   223.6628  -0.013    0.989    
input_logprob         0.1365     0.1027   1.330    0.184    
output_logprob       12.9133     1.4086   9.167  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1167.42  on 2999  degrees of freedom
Residual deviance:  178.66  on 2995  degrees of freedom
AIC: 188.66

Number of Fisher Scoring iterations: 19


In [632]:
summary(countwords_gpt35_10shot_df_model)


Call:
glm(formula = correct ~ index + demonstration_count + input_logprob + 
    output_logprob, family = binomial, data = scaled_countwords_gpt35_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7936  -0.1398  -0.0323  -0.0188   4.1623  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -6.03330    0.35319 -17.082  < 2e-16 ***
index                1.15123    0.14443   7.971 1.58e-15 ***
demonstration_count -0.01000    0.11849  -0.084    0.933    
input_logprob       -0.01160    0.04996  -0.232    0.816    
output_logprob       3.59808    0.24678  14.580  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1179.29  on 2999  degrees of freedom
Residual deviance:  598.07  on 2995  degrees of freedom
AIC: 608.07

Number of Fisher Scoring iterations: 8


In [633]:
summary(countwords_gpt4_0shot_df_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_gpt4_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.57541  -0.16382  -0.04986  -0.02723   2.91722  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -3.35701    0.15836 -21.199   <2e-16 ***
index          -0.19417    0.10797  -1.798   0.0721 .  
input_logprob  -0.10037    0.08225  -1.220   0.2224    
output_logprob  3.59077    0.17453  20.574   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3264.4  on 2999  degrees of freedom
Residual deviance: 1137.6  on 2996  degrees of freedom
AIC: 1145.6

Number of Fisher Scoring iterations: 7


In [634]:
summary(countwords_gpt4_5shot_df_model)


Call:
glm(formula = correct ~ index + demonstration_count + input_logprob + 
    output_logprob, family = binomial, data = scaled_countwords_gpt4_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0528  -0.0748  -0.0080  -0.0009   4.7868  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -6.26081    0.36232 -17.280  < 2e-16 ***
index               -4.41831    0.38160 -11.578  < 2e-16 ***
demonstration_count  0.16083    0.22598   0.712    0.477    
input_logprob       -0.18717    0.08729  -2.144    0.032 *  
output_logprob       1.71799    0.23558   7.293 3.04e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2977.29  on 2999  degrees of freedom
Residual deviance:  818.79  on 2995  degrees of freedom
AIC: 828.79

Number of Fisher Scoring iterations: 9


In [635]:
summary(countwords_gpt4_10shot_df_model)


Call:
glm(formula = correct ~ index + demonstration_count + input_logprob + 
    output_logprob, family = binomial, data = scaled_countwords_gpt4_10shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.22289  -0.18376  -0.06156  -0.02080   2.97925  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -3.60310    0.16996 -21.199  < 2e-16 ***
index               -1.73828    0.17057 -10.191  < 2e-16 ***
demonstration_count -0.23628    0.07134  -3.312 0.000927 ***
input_logprob       -0.07187    0.07690  -0.935 0.349955    
output_logprob       2.15766    0.16217  13.305  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3143.6  on 2999  degrees of freedom
Residual deviance: 1152.2  on 2995  degrees of freedom
AIC: 1162.2

Number of Fisher Scoring iterations: 7


In [636]:
summary(countwords_claude3_0shot_df_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_claude3_0shot_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-3.13366  -0.18146  -0.07462   0.06983   3.12381  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.64527    0.12493 -21.173   <2e-16 ***
index           1.20371    0.11671  10.313   <2e-16 ***
input_logprob  -0.19547    0.08116  -2.408    0.016 *  
output_logprob  4.42207    0.19597  22.565   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3452.9  on 2999  degrees of freedom
Residual deviance: 1211.2  on 2996  degrees of freedom
AIC: 1219.2

Number of Fisher Scoring iterations: 7


In [637]:
summary(countwords_claude3_5shot_df_model)


Call:
glm(formula = correct ~ index + demonstration_count + input_logprob + 
    output_logprob, family = binomial, data = scaled_countwords_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0724  -0.4881  -0.2710   0.3701   2.8303  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -1.65559    0.06613 -25.035   <2e-16 ***
index               -0.01925    0.08251  -0.233    0.816    
demonstration_count -0.08298    0.05791  -1.433    0.152    
input_logprob        0.07244    0.05602   1.293    0.196    
output_logprob       1.81364    0.08918  20.336   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3423.6  on 2999  degrees of freedom
Residual deviance: 2161.1  on 2995  degrees of freedom
AIC: 2171.1

Number of Fisher Scoring iterations: 5


In [638]:
summary(countwords_claude3_10shot_df_model)


Call:
glm(formula = correct ~ index + demonstration_count + input_logprob + 
    output_logprob, family = binomial, data = scaled_countwords_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0495  -0.6192  -0.3984   0.5991   2.3850  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -1.169574   0.052396 -22.322   <2e-16 ***
index               -0.003974   0.073811  -0.054   0.9571    
demonstration_count -0.091239   0.047221  -1.932   0.0533 .  
input_logprob       -0.042695   0.049495  -0.863   0.3884    
output_logprob       1.412293   0.076489  18.464   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3644.6  on 2999  degrees of freedom
Residual deviance: 2659.1  on 2995  degrees of freedom
AIC: 2669.1

Number of Fisher Scoring iterations: 5


In [639]:
summary(countwords_gpt35ft_0shot_df_model)


Call:
glm(formula = correct ~ index + input_logprob + output_logprob, 
    family = binomial, data = scaled_countwords_gpt35ft_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0485  -0.2589  -0.0675  -0.0346   3.4371  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -4.31424    0.19524 -22.098   <2e-16 ***
index           0.06015    0.10323   0.583    0.560    
input_logprob  -0.01359    0.05816  -0.234    0.815    
output_logprob  2.95668    0.15685  18.851   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2283.9  on 2999  degrees of freedom
Residual deviance: 1074.0  on 2996  degrees of freedom
AIC: 1082

Number of Fisher Scoring iterations: 7


In [640]:
summary(countwords_gpt35ft_10shot_df_model)


Call:
glm(formula = correct ~ index + demonstration_count + input_logprob + 
    output_logprob, family = binomial, data = scaled_countwords_gpt35ft_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3279  -0.5756  -0.3759  -0.3391   2.4806  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -2.02072    0.06241 -32.378  < 2e-16 ***
index                0.36091    0.07934   4.549 5.39e-06 ***
demonstration_count -0.07857    0.05804  -1.354    0.176    
input_logprob        0.05938    0.04305   1.379    0.168    
output_logprob       1.09891    0.07997  13.741  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2487.2  on 2999  degrees of freedom
Residual deviance: 2202.1  on 2995  degrees of freedom
AIC: 2212.1

Number of Fisher Scoring iterations: 5


In [641]:
summary(countwords_gpt35ft_100shot_df_model)


Call:
glm(formula = correct ~ index + demonstration_count + input_logprob + 
    output_logprob, family = binomial, data = scaled_countwords_gpt35ft_100shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7869  -0.4426  -0.1961  -0.1188   3.2408  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -2.72621    0.10035 -27.168  < 2e-16 ***
index               -0.34358    0.10043  -3.421 0.000623 ***
demonstration_count  0.24157    0.06927   3.488 0.000487 ***
input_logprob        0.08805    0.05062   1.739 0.081968 .  
output_logprob       1.66124    0.09972  16.659  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2608.0  on 2999  degrees of freedom
Residual deviance: 1660.1  on 2995  degrees of freedom
AIC: 1670.1

Number of Fisher Scoring iterations: 6


In [642]:
vif(countwords_gpt35_0shot_df_model)

In [643]:
vif(countwords_gpt35_5shot_df_model)

In [644]:
vif(countwords_gpt35_10shot_df_model)

In [645]:
vif(countwords_gpt4_0shot_df_model)

In [646]:
vif(countwords_gpt4_5shot_df_model)

In [647]:
vif(countwords_gpt4_10shot_df_model)

In [648]:
vif(countwords_claude3_0shot_df_model)

In [649]:
vif(countwords_claude3_5shot_df_model)

In [650]:
vif(countwords_claude3_10shot_df_model)

In [651]:
vif(countwords_gpt35ft_0shot_df_model)

In [652]:
vif(countwords_gpt35ft_10shot_df_model)

In [653]:
vif(countwords_gpt35ft_100shot_df_model)

# Sorting

In [654]:
# Read in data
sortwords_gpt4_df <- read.table(file = 'table_sortwords_gpt-4-0613.tsv', sep = '\t', header = TRUE)
sortwords_gpt35_df <- read.table(file = 'table_sortwords_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
sortwords_llama3_df <- read.table(file = 'table_sortwords_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
sortwords_claude3_df <- read.table(file = 'table_sortwords_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
sortwords_gemini1_df <- read.table(file = 'table_sortwords_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


sortnumbers_gpt4_df <- read.table(file = 'table_sortnumbers_gpt-4-0613.tsv', sep = '\t', header = TRUE)
sortnumbers_gpt35_df <- read.table(file = 'table_sortnumbers_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
sortnumbers_llama3_df <- read.table(file = 'table_sortnumbers_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
sortnumbers_claude3_df <- read.table(file = 'table_sortnumbers_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
sortnumbers_gemini1_df <- read.table(file = 'table_sortnumbers_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)


In [655]:
# Z-score data
scaled_sortwords_gpt4_df <- scale_taskpair_df(sortwords_gpt4_df)
scaled_sortwords_gpt35_df <- scale_taskpair_df(sortwords_gpt35_df)
scaled_sortwords_llama3_df <- scale_taskpair_df(sortwords_llama3_df)
scaled_sortwords_claude3_df <- scale_taskpair_df(sortwords_claude3_df)
scaled_sortwords_gemini1_df <- scale_taskpair_df(sortwords_gemini1_df)



scaled_sortnumbers_gpt4_df <- scale_taskpair_df(sortnumbers_gpt4_df)
scaled_sortnumbers_gpt35_df <- scale_taskpair_df(sortnumbers_gpt35_df)
scaled_sortnumbers_llama3_df <- scale_taskpair_df(sortnumbers_llama3_df)
scaled_sortnumbers_claude3_df <- scale_taskpair_df(sortnumbers_claude3_df)
scaled_sortnumbers_gemini1_df <- scale_taskpair_df(sortnumbers_gemini1_df)



In [700]:
sortnumbers_gemini1_df

index,task,input_nchars,input_ntokens,input_logprob,output_nchars,output_ntokens,output_logprob,correct
<int>,<chr>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>
0,ascending,64,64,-121.90702,64,64,-104.21918,1
1,ascending,58,58,-104.33441,58,58,-96.52242,1
2,ascending,52,52,-99.49874,52,52,-88.16746,0
3,ascending,91,91,-166.68651,91,91,-133.25128,0
4,ascending,82,82,-145.13611,82,82,-124.06780,1
5,ascending,117,117,-206.94763,117,117,-157.55121,1
6,ascending,99,99,-176.64972,99,99,-144.69113,0
7,ascending,64,64,-113.86436,64,64,-90.82893,1
8,ascending,75,75,-134.33185,75,75,-109.77455,1
9,ascending,57,57,-113.19421,57,57,-92.94866,1


In [656]:
sort_gpt4_words_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt4_df, family=binomial)
sort_gpt35_words_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt35_df, family=binomial)
sort_llama3_words_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_llama3_df, family=binomial)
sort_claude3_words_model <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_sortwords_claude3_df, family=binomial)
sort_gemini1_words_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gemini1_df, family=binomial)



In [678]:
sort_gpt4_numbers_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortnumbers_gpt4_df, family=binomial)
sort_gpt35_numbers_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortnumbers_gpt35_df, family=binomial)
sort_llama3_numbers_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortnumbers_llama3_df, family=binomial)
sort_claude3_numbers_model <- bayesglm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_sortnumbers_claude3_df, family=binomial)
sort_gemini1_numbers_model <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_sortnumbers_gemini1_df, family=binomial)


In [679]:
summary(sort_gpt4_words_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2232  -0.8916   0.4892   0.7003   1.9249  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.59101    0.31446   5.059 4.20e-07 ***
taskrev        -2.38837    0.45625  -5.235 1.65e-07 ***
input_nchars    0.38206    0.64249   0.595    0.552    
input_ntokens  -0.67072    0.87302  -0.768    0.442    
input_logprob   0.17060    0.91596   0.186    0.852    
output_logprob  0.02066    0.78236   0.026    0.979    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 273.33  on 199  degrees of freedom
Residual deviance: 211.06  on 194  degrees of freedom
AIC: 223.06

Number of Fisher Scoring iterations: 4


In [680]:
summary(sort_gpt35_words_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9744  -0.6160  -0.3827   0.7223   2.2874  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.0275     0.2797   3.674 0.000239 ***
taskrev         -2.6897     0.4560  -5.898 3.67e-09 ***
input_nchars    -0.2244     0.7079  -0.317 0.751250    
input_ntokens    0.2777     0.9500   0.292 0.770045    
input_logprob   -0.4364     0.9579  -0.456 0.648674    
output_logprob   0.9603     0.8631   1.113 0.265902    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 275.64  on 199  degrees of freedom
Residual deviance: 188.36  on 194  degrees of freedom
AIC: 200.36

Number of Fisher Scoring iterations: 4


In [681]:
summary(sort_llama3_words_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9234  -0.7136  -0.4497   0.8262   2.3513  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.6689     0.2638   2.535   0.0112 *  
taskrev         -2.2123     0.4329  -5.111 3.21e-07 ***
input_nchars     0.3251     0.6560   0.496   0.6202    
input_ntokens    0.6636     0.8773   0.756   0.4494    
input_logprob    1.0254     0.8902   1.152   0.2494    
output_logprob   0.4593     0.7988   0.575   0.5653    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 272.74  on 199  degrees of freedom
Residual deviance: 210.93  on 194  degrees of freedom
AIC: 222.93

Number of Fisher Scoring iterations: 4


In [682]:
summary(sort_claude3_words_model)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_sortwords_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5999  -0.9017   0.3678   0.6900   1.7255  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.4958     0.4139   6.030 1.64e-09 ***
taskrev         -2.4497     0.5338  -4.589 4.44e-06 ***
input_nchars    -0.7735     0.5686  -1.360    0.174    
input_logprob   -0.5703     0.9007  -0.633    0.527    
output_logprob   0.2755     0.8414   0.327    0.743    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 240.86  on 199  degrees of freedom
Residual deviance: 186.29  on 195  degrees of freedom
AIC: 196.29

Number of Fisher Scoring iterations: 5


In [683]:
summary(sort_gemini1_words_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1503  -0.3723  -0.1979   0.6304   2.5365  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.73867    0.29533   2.501   0.0124 *  
taskrev        -4.26069    0.65643  -6.491 8.54e-11 ***
input_nchars   -0.26137    0.73016  -0.358   0.7204    
input_ntokens  -0.17588    0.87150  -0.202   0.8401    
input_logprob   0.58838    1.11636   0.527   0.5982    
output_logprob -0.09457    1.00494  -0.094   0.9250    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 257.72  on 199  degrees of freedom
Residual deviance: 143.78  on 194  degrees of freedom
AIC: 155.78

Number of Fisher Scoring iterations: 6


In [684]:
summary(sort_gpt4_numbers_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortnumbers_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2265   0.4663   0.5864   0.6988   0.9245  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.52436    0.35167   4.335 1.46e-05 ***
taskdescending -0.03131    0.58923  -0.053    0.958    
input_nchars   -8.09348    6.40191  -1.264    0.206    
input_ntokens   6.68226    6.03090   1.108    0.268    
input_logprob  -1.26602    1.20055  -1.055    0.292    
output_logprob  0.19837    0.89414   0.222    0.824    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 194.49  on 199  degrees of freedom
Residual deviance: 188.87  on 194  degrees of freedom
AIC: 200.87

Number of Fisher Scoring iterations: 4


In [685]:
summary(sort_gpt35_numbers_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortnumbers_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9344  -1.0194   0.6502   0.8800   2.0047  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.7018     0.3059   2.294   0.0218 *  
taskdescending  -0.2664     0.5108  -0.522   0.6020    
input_nchars   -23.7603     6.0042  -3.957 7.58e-05 ***
input_ntokens   24.8100     5.7384   4.324 1.54e-05 ***
input_logprob    1.1925     1.0635   1.121   0.2621    
output_logprob   0.1633     0.7746   0.211   0.8330    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 264.63  on 199  degrees of freedom
Residual deviance: 233.73  on 194  degrees of freedom
AIC: 245.73

Number of Fisher Scoring iterations: 4


In [686]:
summary(sort_llama3_numbers_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortnumbers_llama3_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.91798  -0.97352   0.03537   0.99783   1.89837  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)
(Intercept)     0.04089    0.29319   0.139    0.889
taskdescending -0.08731    0.49212  -0.177    0.859
input_nchars    0.78842    5.64171   0.140    0.889
input_ntokens  -1.35101    5.36785  -0.252    0.801
input_logprob  -0.84342    1.02307  -0.824    0.410
output_logprob  1.15444    0.79511   1.452    0.147

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 277.26  on 199  degrees of freedom
Residual deviance: 243.13  on 194  degrees of freedom
AIC: 255.13

Number of Fisher Scoring iterations: 4


In [687]:
summary(sort_claude3_numbers_model)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_sortnumbers_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
0.03007  0.03007  0.03007  0.03007  0.03007  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)  
(Intercept)     7.701e+00  3.323e+00   2.318   0.0205 *
taskdescending -3.337e-16  2.234e+00   0.000   1.0000  
input_nchars   -1.922e-16  1.137e+00   0.000   1.0000  
input_logprob   1.799e-16  1.137e+00   0.000   1.0000  
output_logprob  1.302e-16  1.137e+00   0.000   1.0000  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 0.00000  on 199  degrees of freedom
Residual deviance: 0.18087  on 195  degrees of freedom
AIC: 10.181

Number of Fisher Scoring iterations: 23


In [688]:
summary(sort_gemini1_numbers_model)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_sortnumbers_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9611  -1.0655   0.6576   0.9837   1.6869  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)  
(Intercept)     0.71281    0.29564   2.411   0.0159 *
taskdescending -0.65409    0.48634  -1.345   0.1787  
input_nchars    0.24302    0.99496   0.244   0.8070  
input_logprob   0.06579    0.99501   0.066   0.9473  
output_logprob  0.76906    0.76924   1.000   0.3174  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 271.45  on 199  degrees of freedom
Residual deviance: 246.34  on 195  degrees of freedom
AIC: 256.34

Number of Fisher Scoring iterations: 4


In [689]:
vif(sort_gpt4_words_model)

In [690]:
vif(sort_gpt35_words_model)

In [691]:
vif(sort_llama3_words_model)

In [692]:
vif(sort_claude3_words_model)

In [693]:
vif(sort_gemini1_words_model)

In [694]:
vif(sort_gpt4_numbers_model)

In [695]:
vif(sort_gpt35_numbers_model)

In [696]:
vif(sort_llama3_numbers_model)

In [697]:
vif(sort_claude3_numbers_model)

In [698]:
vif(sort_gemini1_numbers_model)

## Sorting: few-shot

In [701]:
sortwords_gpt4_0shot_df <- read.table(file = 'table_few_sortwords_gpt-4-0613_0shot.tsv', sep = '\t', header = TRUE)
sortwords_gpt4_5shot_df <- read.table(file = 'table_few_sortwords_gpt-4-0613_5shot.tsv', sep = '\t', header = TRUE)
sortwords_gpt4_10shot_df <- read.table(file = 'table_few_sortwords_gpt-4-0613_10shot.tsv', sep = '\t', header = TRUE)

sortwords_gpt35_0shot_df <- read.table(file = 'table_few_sortwords_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
sortwords_gpt35_5shot_df <- read.table(file = 'table_few_sortwords_gpt-3.5-turbo-0613_5shot.tsv', sep = '\t', header = TRUE)
sortwords_gpt35_10shot_df <- read.table(file = 'table_few_sortwords_gpt-3.5-turbo-0613_10shot.tsv', sep = '\t', header = TRUE)

sortwords_claude3_0shot_df <- read.table(file = 'table_few_sortwords_claude-3-opus-20240229_0shot.tsv', sep = '\t', header = TRUE)
sortwords_claude3_5shot_df <- read.table(file = 'table_few_sortwords_claude-3-opus-20240229_5shot.tsv', sep = '\t', header = TRUE)
sortwords_claude3_10shot_df <- read.table(file = 'table_few_sortwords_claude-3-opus-20240229_10shot.tsv', sep = '\t', header = TRUE)

sortwords_gpt35ft_0shot_df <- read.table(file = 'table_few_sortwords_gpt-3.5-turbo-0613_0shot.tsv', sep = '\t', header = TRUE)
sortwords_gpt35ft_10shot_df <- read.table(file = 'table_few_sortwords_ft_gpt-3.5_10shot_0shot.tsv', sep = '\t', header = TRUE)
sortwords_gpt35ft_100shot_df <- read.table(file = 'table_few_sortwords_ft_gpt-3.5_100shot_0shot.tsv', sep = '\t', header = TRUE)



In [702]:
scaled_sortwords_gpt4_0shot_df <- scale_taskpair_df(sortwords_gpt4_0shot_df)
scaled_sortwords_gpt4_5shot_df <- scale_taskpair_df(sortwords_gpt4_5shot_df)
scaled_sortwords_gpt4_10shot_df <- scale_taskpair_df(sortwords_gpt4_10shot_df)

scaled_sortwords_gpt35_0shot_df <- scale_taskpair_df(sortwords_gpt35_0shot_df)
scaled_sortwords_gpt35_5shot_df <- scale_taskpair_df(sortwords_gpt35_5shot_df)
scaled_sortwords_gpt35_10shot_df <- scale_taskpair_df(sortwords_gpt35_10shot_df)

scaled_sortwords_claude3_0shot_df <- scale_taskpair_df(sortwords_claude3_0shot_df)
scaled_sortwords_claude3_5shot_df <- scale_taskpair_df(sortwords_claude3_5shot_df)
scaled_sortwords_claude3_10shot_df <- scale_taskpair_df(sortwords_claude3_10shot_df)

scaled_sortwords_gpt35ft_0shot_df <- scale_taskpair_df(sortwords_gpt35ft_0shot_df)
scaled_sortwords_gpt35ft_10shot_df <- scale_taskpair_df(sortwords_gpt35ft_10shot_df)
scaled_sortwords_gpt35ft_100shot_df <- scale_taskpair_df(sortwords_gpt35ft_100shot_df)



In [703]:
sort_gpt4_words_0shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt4_0shot_df, family=binomial)
sort_gpt4_words_5shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt4_5shot_df, family=binomial)
sort_gpt4_words_10shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt4_10shot_df, family=binomial)

sort_gpt35_words_0shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt35_0shot_df, family=binomial)
sort_gpt35_words_5shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt35_5shot_df, family=binomial)
sort_gpt35_words_10shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt35_10shot_df, family=binomial)

sort_claude3_words_0shot_model <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_sortwords_claude3_0shot_df, family=binomial)
sort_claude3_words_5shot_model <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_sortwords_claude3_5shot_df, family=binomial)
sort_claude3_words_10shot_model <- glm(correct ~ task + input_nchars + input_logprob + output_logprob, 
               data=scaled_sortwords_claude3_10shot_df, family=binomial)

sort_gpt35ft_words_0shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt35ft_0shot_df, family=binomial)
sort_gpt35ft_words_10shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt35ft_10shot_df, family=binomial)
sort_gpt35ft_words_100shot_model <- glm(correct ~ task + input_nchars + input_ntokens + input_logprob + output_logprob, 
               data=scaled_sortwords_gpt35ft_100shot_df, family=binomial)


In [704]:
summary(sort_gpt4_words_0shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt4_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2232  -0.8916   0.4892   0.7003   1.9249  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.59101    0.31446   5.059 4.20e-07 ***
taskrev        -2.38837    0.45625  -5.235 1.65e-07 ***
input_nchars    0.38206    0.64249   0.595    0.552    
input_ntokens  -0.67072    0.87302  -0.768    0.442    
input_logprob   0.17060    0.91596   0.186    0.852    
output_logprob  0.02066    0.78236   0.026    0.979    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 273.33  on 199  degrees of freedom
Residual deviance: 211.06  on 194  degrees of freedom
AIC: 223.06

Number of Fisher Scoring iterations: 4


In [705]:
summary(sort_gpt4_words_5shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt4_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4508  -0.9117   0.4363   0.8825   1.9478  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.4301     0.3101   4.612 3.99e-06 ***
taskrev         -1.9968     0.4592  -4.349 1.37e-05 ***
input_nchars    -0.3746     0.6191  -0.605    0.545    
input_ntokens   -1.0427     0.8488  -1.228    0.219    
input_logprob    0.1611     0.8894   0.181    0.856    
output_logprob  -0.6855     0.7699  -0.890    0.373    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 272.12  on 199  degrees of freedom
Residual deviance: 216.80  on 194  degrees of freedom
AIC: 228.8

Number of Fisher Scoring iterations: 4


In [706]:
summary(sort_gpt4_words_10shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt4_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4338  -0.8919   0.4163   0.8721   2.0843  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.22240    0.30449   4.015 5.96e-05 ***
taskrev        -2.07336    0.45828  -4.524 6.06e-06 ***
input_nchars    0.02336    0.63802   0.037    0.971    
input_ntokens  -0.46182    0.86232  -0.536    0.592    
input_logprob   0.70765    0.90908   0.778    0.436    
output_logprob -0.14975    0.79495  -0.188    0.851    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 276.28  on 199  degrees of freedom
Residual deviance: 211.18  on 194  degrees of freedom
AIC: 223.18

Number of Fisher Scoring iterations: 4


In [707]:
summary(sort_gpt35_words_0shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt35_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9744  -0.6160  -0.3827   0.7223   2.2874  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.0275     0.2797   3.674 0.000239 ***
taskrev         -2.6897     0.4560  -5.898 3.67e-09 ***
input_nchars    -0.2244     0.7079  -0.317 0.751250    
input_ntokens    0.2777     0.9500   0.292 0.770045    
input_logprob   -0.4364     0.9579  -0.456 0.648674    
output_logprob   0.9603     0.8631   1.113 0.265902    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 275.64  on 199  degrees of freedom
Residual deviance: 188.36  on 194  degrees of freedom
AIC: 200.36

Number of Fisher Scoring iterations: 4


In [708]:
summary(sort_gpt35_words_5shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt35_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2415  -0.3527  -0.2093   0.5892   2.4737  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.4277     0.3250   4.393 1.12e-05 ***
taskrev         -4.4741     0.6359  -7.036 1.98e-12 ***
input_nchars    -0.5028     0.8579  -0.586    0.558    
input_ntokens    0.6435     1.1608   0.554    0.579    
input_logprob    0.4324     1.1582   0.373    0.709    
output_logprob   0.2925     1.0397   0.281    0.778    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 272.74  on 199  degrees of freedom
Residual deviance: 133.35  on 194  degrees of freedom
AIC: 145.35

Number of Fisher Scoring iterations: 5


In [709]:
summary(sort_gpt35_words_10shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt35_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1388  -0.3154  -0.2010   0.6273   2.5041  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.29535    0.31525   4.109 3.98e-05 ***
taskrev        -4.61100    0.67115  -6.870 6.41e-12 ***
input_nchars   -0.43935    0.84611  -0.519    0.604    
input_ntokens   1.08217    1.14550   0.945    0.345    
input_logprob   1.18459    1.13698   1.042    0.297    
output_logprob -0.04718    1.02447  -0.046    0.963    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 269.99  on 199  degrees of freedom
Residual deviance: 135.34  on 194  degrees of freedom
AIC: 147.34

Number of Fisher Scoring iterations: 6


In [710]:
summary(sort_claude3_words_0shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_sortwords_claude3_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5999  -0.9017   0.3678   0.6900   1.7255  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.4958     0.4139   6.030 1.64e-09 ***
taskrev         -2.4497     0.5338  -4.589 4.44e-06 ***
input_nchars    -0.7735     0.5686  -1.360    0.174    
input_logprob   -0.5703     0.9007  -0.633    0.527    
output_logprob   0.2755     0.8414   0.327    0.743    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 240.86  on 199  degrees of freedom
Residual deviance: 186.29  on 195  degrees of freedom
AIC: 196.29

Number of Fisher Scoring iterations: 5


In [711]:
summary(sort_claude3_words_5shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_sortwords_claude3_5shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7322  -0.6620   0.3289   0.6953   1.9452  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.7131     0.4437   6.115 9.66e-10 ***
taskrev         -2.3976     0.5664  -4.233 2.30e-05 ***
input_nchars    -1.7442     0.6112  -2.854  0.00432 ** 
input_logprob   -1.2213     0.9360  -1.305  0.19194    
output_logprob   0.2408     0.8671   0.278  0.78127    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 229.22  on 199  degrees of freedom
Residual deviance: 171.39  on 195  degrees of freedom
AIC: 181.39

Number of Fisher Scoring iterations: 5


In [712]:
summary(sort_claude3_words_10shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_logprob + 
    output_logprob, family = binomial, data = scaled_sortwords_claude3_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5686  -0.9305   0.3866   0.8042   1.7693  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.4884     0.4058   6.132 8.70e-10 ***
taskrev         -2.5678     0.5349  -4.801 1.58e-06 ***
input_nchars    -0.9337     0.5592  -1.670    0.095 .  
input_logprob    0.3025     0.8588   0.352    0.725    
output_logprob  -0.6707     0.8058  -0.832    0.405    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 242.63  on 199  degrees of freedom
Residual deviance: 192.04  on 195  degrees of freedom
AIC: 202.04

Number of Fisher Scoring iterations: 5


In [713]:
summary(sort_gpt35ft_words_0shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt35ft_0shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9744  -0.6160  -0.3827   0.7223   2.2874  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.0275     0.2797   3.674 0.000239 ***
taskrev         -2.6897     0.4560  -5.898 3.67e-09 ***
input_nchars    -0.2244     0.7079  -0.317 0.751250    
input_ntokens    0.2777     0.9500   0.292 0.770045    
input_logprob   -0.4364     0.9579  -0.456 0.648674    
output_logprob   0.9603     0.8631   1.113 0.265902    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 275.64  on 199  degrees of freedom
Residual deviance: 188.36  on 194  degrees of freedom
AIC: 200.36

Number of Fisher Scoring iterations: 4


In [714]:
summary(sort_gpt35ft_words_10shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt35ft_10shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1989  -0.4093  -0.2284   0.6397   2.5066  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.1051     0.2958   3.736 0.000187 ***
taskrev         -4.3033     0.6346  -6.782 1.19e-11 ***
input_nchars    -1.9947     0.8055  -2.476 0.013275 *  
input_ntokens   -0.1799     1.0374  -0.173 0.862342    
input_logprob   -0.5007     1.0462  -0.479 0.632207    
output_logprob  -1.1052     0.9436  -1.171 0.241483    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 264.63  on 199  degrees of freedom
Residual deviance: 152.81  on 194  degrees of freedom
AIC: 164.81

Number of Fisher Scoring iterations: 5


In [715]:
summary(sort_gpt35ft_words_100shot_model)


Call:
glm(formula = correct ~ task + input_nchars + input_ntokens + 
    input_logprob + output_logprob, family = binomial, data = scaled_sortwords_gpt35ft_100shot_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0837  -1.0745   0.5679   0.9910   1.7993  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.89431    0.27052   3.306 0.000947 ***
taskrev        -1.22741    0.41057  -2.990 0.002794 ** 
input_nchars    0.01696    0.59383   0.029 0.977216    
input_ntokens  -0.90881    0.81037  -1.121 0.262084    
input_logprob  -0.41405    0.83837  -0.494 0.621395    
output_logprob  0.25244    0.72975   0.346 0.729401    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 274.37  on 199  degrees of freedom
Residual deviance: 236.56  on 194  degrees of freedom
AIC: 248.56

Number of Fisher Scoring iterations: 4


In [716]:
vif(sort_gpt4_words_0shot_model)

In [717]:
vif(sort_gpt4_words_5shot_model)

In [718]:
vif(sort_gpt4_words_10shot_model)

In [719]:
vif(sort_gpt35_words_0shot_model)

In [720]:
vif(sort_gpt35_words_5shot_model)

In [721]:
vif(sort_gpt35_words_10shot_model)

In [722]:
vif(sort_claude3_words_0shot_model)

In [723]:
vif(sort_claude3_words_5shot_model)

In [724]:
vif(sort_claude3_words_10shot_model)

In [725]:
vif(sort_gpt35ft_words_0shot_model)

In [726]:
vif(sort_gpt35ft_words_10shot_model)

In [727]:
vif(sort_gpt35ft_words_100shot_model)

# Birthdays

In [832]:
# Read in data
birthdays_gpt4_df <- read.table(file = 'table_birthdays_gpt-4-0613.tsv', sep = '\t', header = TRUE)
birthdays_gpt35_df <- read.table(file = 'table_birthdays_gpt-3.5-turbo-0613.tsv', sep = '\t', header = TRUE)
birthdays_llama3_df <- read.table(file = 'table_birthdays_llama-3-70b-chat-hf.tsv', sep = '\t', header = TRUE)
birthdays_claude3_df <- read.table(file = 'table_birthdays_claude-3-opus-20240229.tsv', sep = '\t', header = TRUE)
birthdays_gemini1_df <- read.table(file = 'table_birthdays_gemini-1.0-pro-001.tsv', sep = '\t', header = TRUE)




In [833]:
# Z-score data
scaled_birthdays_gpt4_df <- scale_df(birthdays_gpt4_df)
scaled_birthdays_gpt35_df <- scale_df(birthdays_gpt35_df)
scaled_birthdays_llama3_df <- scale_df(birthdays_llama3_df)
scaled_birthdays_claude3_df <- scale_df(birthdays_claude3_df)
scaled_birthdays_gemini1_df <- scale_df(birthdays_gemini1_df)


In [834]:
birthdays_gpt4_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_gpt4_df, family=binomial)
birthdays_gpt35_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_gpt35_df, family=binomial)
birthdays_llama3_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_llama3_df, family=binomial)
birthdays_claude3_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_claude3_df, family=binomial)
birthdays_gemini1_model <- glm(correct ~ input_logprob + output_logprob, 
               data=scaled_birthdays_gemini1_df, family=binomial)


In [835]:
summary(birthdays_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8769  -0.6683   0.1843   0.5264   2.0459  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.4605     0.1915   7.626 2.43e-14 ***
input_logprob    2.2209     0.2229   9.964  < 2e-16 ***
output_logprob   0.1372     0.1440   0.953    0.341    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 505.92  on 399  degrees of freedom
Residual deviance: 305.07  on 397  degrees of freedom
AIC: 311.07

Number of Fisher Scoring iterations: 6


In [836]:
summary(birthdays_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9662  -0.4500   0.1340   0.4589   2.3463  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    0.880415   0.178538   4.931 8.17e-07 ***
input_logprob  2.770293   0.258676  10.710  < 2e-16 ***
output_logprob 0.001702   0.156567   0.011    0.991    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 544.23  on 399  degrees of freedom
Residual deviance: 261.61  on 397  degrees of freedom
AIC: 267.61

Number of Fisher Scoring iterations: 6


In [837]:
summary(birthdays_llama3_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_llama3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8146  -0.6755   0.2494   0.6386   1.9743  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.98044    0.15739   6.229 4.68e-10 ***
input_logprob   1.95895    0.19021  10.299  < 2e-16 ***
output_logprob  0.04273    0.13614   0.314    0.754    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 528.22  on 399  degrees of freedom
Residual deviance: 337.40  on 397  degrees of freedom
AIC: 343.4

Number of Fisher Scoring iterations: 5


In [838]:
summary(birthdays_claude3_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_claude3_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9396  -0.5980   0.1731   0.5308   2.2327  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.2421     0.1820   6.825 8.82e-12 ***
input_logprob    2.3169     0.2240  10.342  < 2e-16 ***
output_logprob  -0.1189     0.1456  -0.817    0.414    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 522.73  on 399  degrees of freedom
Residual deviance: 299.73  on 397  degrees of freedom
AIC: 305.73

Number of Fisher Scoring iterations: 6


In [839]:
summary(birthdays_gemini1_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_birthdays_gemini1_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3773  -0.4686  -0.1945   0.3959   2.8288  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.2473     0.1514  -1.634    0.102    
input_logprob    2.4797     0.2278  10.884   <2e-16 ***
output_logprob   0.1166     0.1508   0.773    0.439    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 552.56  on 399  degrees of freedom
Residual deviance: 284.26  on 397  degrees of freedom
AIC: 290.26

Number of Fisher Scoring iterations: 5


In [840]:
vif(birthdays_gpt4_model)

In [841]:
vif(birthdays_gpt35_model)

In [842]:
vif(birthdays_llama3_model)

In [843]:
vif(birthdays_claude3_model)

In [844]:
vif(birthdays_gemini1_model)