In [352]:
# Might need to install arm - if so, uncomment the following line
#install.packages("arm")
library(arm)
library(lme4)
library(blme)
library(ggplot2)
library(stringr)
library(psycho)
library(glmnet)
library(car)
library("multcomp")

Loading required package: mvtnorm

Loading required package: survival

Loading required package: TH.data


Attaching package: ‘TH.data’


The following object is masked from ‘package:MASS’:

    geyser




In [2]:
# Global plot styling: make a black-and-white theme with a large base font the
# default for ggplot figures drawn after this cell runs.
base_size <- 20
theme_set(
  theme_bw(base_size = base_size) +
    theme(
      # Major grid lines stay visible; the override below is disabled.
      # panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.title.y = element_text(angle = 90, vjust = 0.5),
      axis.text.x = element_text(angle = 0, hjust = 0.5),
      axis.title.x = element_blank()
    )
)

In [3]:
#' Logistic regression of correctness on log-probability and length predictors
#'
#' The model always contains `input_logprob`, `output_logprob`,
#' `input_ntokens` and `output_ntokens`. The two character-count predictors
#' are optional and are appended in the order `input_nchars`, `output_nchars`.
#' All four combinations of the two flags are supported, including leaving
#' both character counts out.
#'
#' @param df Data frame holding `correct` and the predictor columns used.
#' @param include_output_chars Single TRUE/FALSE: add `output_nchars`?
#' @param include_input_chars Single TRUE/FALSE: add `input_nchars`?
#' @return The fitted binomial `glm` object.
correct_vs_length_and_prob <- function(df, include_output_chars = TRUE, include_input_chars = TRUE) {
  predictors <- c("input_logprob", "output_logprob", "input_ntokens", "output_ntokens")
  if (include_input_chars) {
    predictors <- c(predictors, "input_nchars")
  }
  if (include_output_chars) {
    predictors <- c(predictors, "output_nchars")
  }

  model_formula <- reformulate(predictors, response = "correct")

  # Splice the formula into the call before evaluating it, so the Call printed
  # by summary() lists the actual terms instead of a variable name.
  eval(bquote(glm(.(model_formula), data = df, family = binomial)))
}

In [274]:
# For Z-scoring datasets

#' Z-score columns 2-7 of a results table
#'
#' Columns are selected by position, so the six numeric predictors must sit in
#' positions 2 through 7. `index` and `correct` are looked up by name.
#'
#' @param df Data frame with numeric columns 2:7 plus `index` and `correct`.
#' @return A new data frame: the six standardized columns, then `index`
#'   converted to a factor, then `correct` copied over unscaled.
scale_df <- function(df) {
  standardized <- data.frame(scale(df[2:7]))
  standardized[["index"]] <- as.factor(df$index)
  standardized[["correct"]] <- df$correct
  standardized
}

#' Z-score columns 3-8 of a task-pair results table
#'
#' Columns are selected by position (3 through 8). `index`, `task` and
#' `correct` are looked up by name.
#'
#' @param df Data frame with numeric columns 3:8 plus `index`, `task` and
#'   `correct`.
#' @return A new data frame: the six standardized columns, then `index` and
#'   `task` converted to factors, then `correct` copied over unscaled.
scale_taskpair_df <- function(df) {
  standardized <- data.frame(scale(df[3:8]))
  standardized[["index"]] <- as.factor(df$index)
  standardized[["task"]] <- as.factor(df$task)
  standardized[["correct"]] <- df$correct
  standardized
}

#' Z-score columns 2-8 of a task-probability results table
#'
#' Seven columns are selected by position (2 through 8) and standardized.
#' Unlike `scale_taskpair_df`, no `task` factor is re-attached here.
#' NOTE(review): presumably `task` is one of the numeric columns 2:8 in these
#' tables and is z-scored with the rest -- confirm against the TSV header.
#'
#' @param df Data frame with numeric columns 2:8 plus `index` and `correct`.
#' @return A new data frame: the seven standardized columns, then `index`
#'   converted to a factor, then `correct` copied over unscaled.
scale_taskpair_prob_df <- function(df) {
  standardized <- data.frame(scale(df[2:8]))
  standardized[["index"]] <- as.factor(df$index)
  standardized[["correct"]] <- df$correct
  standardized
}



#' Z-score columns 1-7 of a results table
#'
#' Columns are selected by position (1 through 7), so whatever sits in column 1
#' is standardized along with the other six. `correct` is looked up by name.
#'
#' @param df Data frame with numeric columns 1:7 plus `correct`.
#' @return A new data frame: the seven standardized columns followed by
#'   `correct` copied over unscaled.
scale_df_with_index <- function(df) {
  standardized <- data.frame(scale(df[1:7]))
  standardized[["correct"]] <- df$correct
  standardized
}

# Shift ciphers

In [5]:
# Load the rot-13 result tables (tab-separated, with a header row):
# one file per direction (enc/dec) and per model.
rot13enc_gpt4_df <- read.table("table_rot13enc_gpt-4.tsv", header = TRUE, sep = "\t")
rot13enc_gpt35_df <- read.table("table_rot13enc_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

rot13dec_gpt4_df <- read.table("table_rot13dec_gpt-4.tsv", header = TRUE, sep = "\t")
rot13dec_gpt35_df <- read.table("table_rot13dec_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

In [6]:
# Z-score data: standardize the predictor columns of each rot-13 table with
# scale_df(), which also keeps `index` (as a factor) and `correct` (unscaled).
scaled_rot13enc_gpt4_df <- scale_df(rot13enc_gpt4_df)
scaled_rot13enc_gpt35_df <- scale_df(rot13enc_gpt35_df)

scaled_rot13dec_gpt4_df <- scale_df(rot13dec_gpt4_df)
scaled_rot13dec_gpt35_df <- scale_df(rot13dec_gpt35_df)


In [7]:

# Fit one logistic regression per direction and model for rot-13.
# output_nchars is left out of these fits; input_nchars stays in by default.
rot13enc_gpt4_model <- correct_vs_length_and_prob(
  scaled_rot13enc_gpt4_df,
  include_output_chars = FALSE
)
rot13enc_gpt35_model <- correct_vs_length_and_prob(
  scaled_rot13enc_gpt35_df,
  include_output_chars = FALSE
)

rot13dec_gpt4_model <- correct_vs_length_and_prob(
  scaled_rot13dec_gpt4_df,
  include_output_chars = FALSE
)
rot13dec_gpt35_model <- correct_vs_length_and_prob(
  scaled_rot13dec_gpt35_df,
  include_output_chars = FALSE
)

“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [8]:
summary(rot13enc_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.06663  -0.44366  -0.14606  -0.01354   2.97169  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.1193     0.5713  -7.211 5.57e-13 ***
input_logprob    1.2158     0.4594   2.646  0.00814 ** 
output_logprob   7.0316     3.1573   2.227  0.02594 *  
input_ntokens    2.9504     0.9824   3.003  0.00267 ** 
output_ntokens  -1.5485     2.5145  -0.616  0.53802    
input_nchars     3.0854     3.3378   0.924  0.35529    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 246.58  on 299  degrees of freedom
Residual deviance: 156.14  on 294  degrees of freedom
AIC: 168.14

Number of Fisher Scoring iterations: 7


In [9]:
summary(rot13enc_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28769  -0.08616  -0.01909  -0.00030   3.10549  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -10.7542     2.5495  -4.218 2.46e-05 ***
input_logprob   -0.2191     1.0850  -0.202   0.8399    
output_logprob  19.6737     9.8166   2.004   0.0451 *  
input_ntokens    4.3058     2.6034   1.654   0.0982 .  
output_ntokens   4.5089     7.3753   0.611   0.5410    
input_nchars     3.5266     8.5951   0.410   0.6816    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 80.845  on 299  degrees of freedom
Residual deviance: 37.569  on 294  degrees of freedom
AIC: 49.569

Number of Fisher Scoring iterations: 10


In [10]:
summary(rot13dec_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4640  -0.8038  -0.4112   0.8902   2.5726  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.4509     0.1977  -7.340 2.13e-13 ***
input_logprob    1.4829     1.7607   0.842   0.3996    
output_logprob   1.7068     0.3173   5.380 7.46e-08 ***
input_ntokens   -3.0153     1.6183  -1.863   0.0624 .  
output_ntokens   0.5686     0.6128   0.928   0.3535    
input_nchars     4.0308     2.1224   1.899   0.0575 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 357.64  on 299  degrees of freedom
Residual deviance: 285.30  on 294  degrees of freedom
AIC: 297.3

Number of Fisher Scoring iterations: 6


In [11]:
summary(rot13dec_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.44662  -0.32654  -0.10349  -0.01189   2.92757  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -5.0436     0.7792  -6.473 9.62e-11 ***
input_logprob    3.2597     3.3367   0.977   0.3286    
output_logprob   3.8136     0.8978   4.247 2.16e-05 ***
input_ntokens   -5.2379     2.7949  -1.874   0.0609 .  
output_ntokens   2.4683     1.0515   2.347   0.0189 *  
input_nchars     5.5853     3.6081   1.548   0.1216    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 203.69  on 299  degrees of freedom
Residual deviance: 125.76  on 294  degrees of freedom
AIC: 137.76

Number of Fisher Scoring iterations: 8


In [12]:
# Variance inflation factors (vif) for each rot-13 model, as a check on
# collinearity among the predictors.
vif(rot13enc_gpt4_model)

In [13]:
vif(rot13enc_gpt35_model)

In [14]:
vif(rot13dec_gpt4_model)

In [15]:
vif(rot13dec_gpt35_model)

### Distance as output

In [16]:
# Load the rot-13 encoding tables that carry a `distance` column
# (tab-separated, with a header row), one file per model.
rot13encdist_gpt4_df <- read.table("table_rot13enc_gpt-4_dist.tsv", header = TRUE, sep = "\t")
rot13encdist_gpt35_df <- read.table("table_rot13enc_gpt-3.5-turbo_dist.tsv", header = TRUE, sep = "\t")

In [17]:
# Drop columns 1 and 8 by position.
# NOTE(review): the earlier note here only mentioned removing "correct";
# presumably column 1 is the row index and column 8 is "correct" -- confirm
# against the TSV header. The data frame is overwritten in place, so
# re-running this cell would drop two more columns.
rot13encdist_gpt4_df <- rot13encdist_gpt4_df[, -c(1, 8)]
rot13encdist_gpt35_df <- rot13encdist_gpt35_df[, -c(1, 8)]

# Z-score every remaining column (the outcome used below is scaled as well).
rot13encdist_gpt4_df <- data.frame(scale(rot13encdist_gpt4_df))
rot13encdist_gpt35_df <- data.frame(scale(rot13encdist_gpt35_df))

In [19]:
# Regress the z-scored distance on the same predictors used for correctness.
# No `family` is supplied, so glm() falls back to its default (gaussian),
# i.e. an ordinary linear model.
model_dist4 <- glm(
  distance ~ input_logprob + output_logprob + input_ntokens + output_ntokens +
    input_nchars,
  data = rot13encdist_gpt4_df
)
model_dist35 <- glm(
  distance ~ input_logprob + output_logprob + input_ntokens + output_ntokens +
    input_nchars,
  data = rot13encdist_gpt35_df
)

In [20]:
summary(model_dist4)


Call:
glm(formula = distance ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, data = rot13encdist_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.68257  -0.33897  -0.02941   0.30112   1.84476  

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -2.154e-16  3.249e-02   0.000 1.000000    
input_logprob  -1.267e-01  4.416e-02  -2.870 0.004400 ** 
output_logprob -4.197e-02  3.234e-01  -0.130 0.896813    
input_ntokens  -1.169e+00  1.215e-01  -9.622  < 2e-16 ***
output_ntokens  1.270e+00  3.278e-01   3.874 0.000132 ***
input_nchars    4.705e-01  4.318e-01   1.090 0.276816    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.3166029)

    Null deviance: 299.000  on 299  degrees of freedom
Residual deviance:  93.081  on 294  degrees of freedom
AIC: 514.27

Number of Fisher Scoring iterations: 2


In [21]:
summary(model_dist35)


Call:
glm(formula = distance ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, data = rot13encdist_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1626  -0.2882  -0.0866   0.1133  10.7318  

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)   
(Intercept)    -9.047e-17  4.855e-02   0.000  1.00000   
input_logprob  -2.171e-01  6.600e-02  -3.290  0.00112 **
output_logprob -1.539e-01  4.833e-01  -0.318  0.75043   
input_ntokens  -5.903e-01  1.816e-01  -3.251  0.00128 **
output_ntokens -1.081e-01  4.899e-01  -0.221  0.82552   
input_nchars    8.754e-01  6.454e-01   1.356  0.17601   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.707216)

    Null deviance: 299.00  on 299  degrees of freedom
Residual deviance: 207.92  on 294  degrees of freedom
AIC: 755.38

Number of Fisher Scoring iterations: 2


### Comparing rot-2 to rot-13

In [200]:
# Load the combined rot-13 / rot-2 result tables (tab-separated, with a header
# row): one file per direction (enc/dec) and per model.
rot13and2enc_gpt4_df <- read.table("table_rot13and2enc_gpt-4.tsv", header = TRUE, sep = "\t")
rot13and2enc_gpt35_df <- read.table("table_rot13and2enc_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

rot13and2dec_gpt4_df <- read.table("table_rot13and2dec_gpt-4.tsv", header = TRUE, sep = "\t")
rot13and2dec_gpt35_df <- read.table("table_rot13and2dec_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

In [201]:
# Z-score data: scale_taskpair_df() standardizes columns 3:8 and keeps
# `index` and `task` as factors alongside the unscaled `correct`.
scaled_rot13and2enc_gpt4_df <- scale_taskpair_df(rot13and2enc_gpt4_df)
scaled_rot13and2enc_gpt35_df <- scale_taskpair_df(rot13and2enc_gpt35_df)

scaled_rot13and2dec_gpt4_df <- scale_taskpair_df(rot13and2dec_gpt4_df)
scaled_rot13and2dec_gpt35_df <- scale_taskpair_df(rot13and2dec_gpt35_df)


In [214]:
# rot-2 vs rot-13, encoding, GPT-4: binomial bayesglm of correctness on the
# `task` factor plus the length and log-probability predictors.
model_taskenc4 <- bayesglm(
  correct ~ task + input_nchars + input_ntokens + output_ntokens +
    input_logprob + output_logprob,
  data = scaled_rot13and2enc_gpt4_df,
  family = binomial
)


In [215]:
# rot-2 vs rot-13, encoding, GPT-3.5: binomial bayesglm of correctness on the
# `task` factor plus the length and log-probability predictors.
model_taskenc35 <- bayesglm(
  correct ~ task + input_nchars + input_ntokens + output_ntokens +
    input_logprob + output_logprob,
  data = scaled_rot13and2enc_gpt35_df,
  family = binomial
)

In [216]:
# rot-2 vs rot-13, decoding, GPT-4: binomial bayesglm of correctness on the
# `task` factor plus the length and log-probability predictors.
model_taskdec4 <- bayesglm(
  correct ~ task + input_nchars + input_ntokens + output_ntokens +
    input_logprob + output_logprob,
  data = scaled_rot13and2dec_gpt4_df,
  family = binomial
)

In [217]:
# rot-2 vs rot-13, decoding, GPT-3.5: binomial bayesglm of correctness on the
# `task` factor plus the length and log-probability predictors.
model_taskdec35 <- bayesglm(
  correct ~ task + input_nchars + input_ntokens + output_ntokens +
    input_logprob + output_logprob,
  data = scaled_rot13and2dec_gpt35_df,
  family = binomial
)

In [212]:
summary(model_taskenc4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2enc_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.25996  -0.21828  -0.07806  -0.01271   2.80272  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -2.87795    0.60339  -4.770 1.85e-06 ***
taskrot2enc_highprob -4.34689    1.55003  -2.804  0.00504 ** 
input_nchars          0.00124    1.05843   0.001  0.99907    
input_ntokens         1.97498    0.87549   2.256  0.02408 *  
output_ntokens       -1.08950    1.26502  -0.861  0.38910    
input_logprob         0.92065    0.71414   1.289  0.19734    
output_logprob        2.60027    1.44862   1.795  0.07265 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 134.373  on 199  degrees of free

In [218]:
summary(model_taskenc35)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2enc_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.44232  -0.14866  -0.08226  -0.03545   2.65354  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -5.2583     1.4729  -3.570 0.000357 ***
taskrot2enc_highprob  -1.5737     1.4743  -1.067 0.285808    
input_nchars          -0.5263     1.1338  -0.464 0.642517    
input_ntokens          0.2504     1.0285   0.243 0.807675    
output_ntokens        -0.3844     1.1247  -0.342 0.732540    
input_logprob          0.9301     1.1364   0.818 0.413105    
output_logprob         0.5483     1.1348   0.483 0.628938    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 22.401  on 199  degrees of free

In [219]:
summary(model_taskdec4)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2dec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4273  -0.3833  -0.2060   0.7264   2.7140  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.2299     0.2493  -0.922   0.3564    
taskrot2dec_highprob  -3.6513     0.7045  -5.183 2.19e-07 ***
input_nchars           0.5057     0.9219   0.549   0.5833    
input_ntokens         -0.6930     0.9403  -0.737   0.4611    
output_ntokens         0.7964     0.5920   1.345   0.1785    
input_logprob          0.4522     0.8507   0.532   0.5950    
output_logprob         0.9671     0.4980   1.942   0.0521 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 227.10  on 199  degrees of freedom
Residua

In [221]:
summary(model_taskdec35)


Call:
bayesglm(formula = correct ~ task + input_nchars + input_ntokens + 
    output_ntokens + input_logprob + output_logprob, family = binomial, 
    data = scaled_rot13and2dec_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.30362  -0.24753  -0.08608  -0.01631   2.30323  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -2.4093     0.5178  -4.653 3.27e-06 ***
taskrot2dec_highprob  -4.6894     1.5828  -2.963  0.00305 ** 
input_nchars           0.0204     0.9765   0.021  0.98334    
input_ntokens         -0.7847     1.0785  -0.728  0.46685    
output_ntokens         2.1414     0.8479   2.525  0.01156 *  
input_logprob          0.7056     0.9973   0.708  0.47923    
output_logprob         2.9299     0.9103   3.219  0.00129 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 142.737  on 199  degrees of fre

In [223]:
# Variance inflation factors (vif) for each rot-2 vs rot-13 model, as a check
# on collinearity among the predictors.
vif(model_taskenc4)

In [224]:
vif(model_taskenc35)

In [225]:
vif(model_taskdec4)

In [226]:
vif(model_taskdec35)

# Reversal

In [231]:
# Load the reversal result tables (tab-separated, with a header row):
# one file per direction (enc/dec) and per model.
revenc_gpt4_df <- read.table("table_revenc_gpt-4.tsv", header = TRUE, sep = "\t")
revenc_gpt35_df <- read.table("table_revenc_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

revdec_gpt4_df <- read.table("table_revdec_gpt-4.tsv", header = TRUE, sep = "\t")
revdec_gpt35_df <- read.table("table_revdec_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

In [232]:
# Z-score data: standardize the predictor columns of each reversal table with
# scale_df(), which also keeps `index` (as a factor) and `correct` (unscaled).
scaled_revenc_gpt4_df <- scale_df(revenc_gpt4_df)
scaled_revenc_gpt35_df <- scale_df(revenc_gpt35_df)

scaled_revdec_gpt4_df <- scale_df(revdec_gpt4_df)
scaled_revdec_gpt35_df <- scale_df(revdec_gpt35_df)

In [233]:

# Fit one logistic regression per direction and model for reversal.
# output_nchars is left out of these fits; input_nchars stays in by default.
revenc_gpt4_model <- correct_vs_length_and_prob(
  scaled_revenc_gpt4_df,
  include_output_chars = FALSE
)
revenc_gpt35_model <- correct_vs_length_and_prob(
  scaled_revenc_gpt35_df,
  include_output_chars = FALSE
)

revdec_gpt4_model <- correct_vs_length_and_prob(
  scaled_revdec_gpt4_df,
  include_output_chars = FALSE
)
revdec_gpt35_model <- correct_vs_length_and_prob(
  scaled_revdec_gpt35_df,
  include_output_chars = FALSE
)


In [234]:
summary(revenc_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3581   0.4014   0.5172   0.6303   1.6860  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.51883    0.16136   9.413   <2e-16 ***
input_logprob  -0.03095    0.19608  -0.158    0.875    
output_logprob -0.43336    0.70858  -0.612    0.541    
input_ntokens   1.78562    1.81154   0.986    0.324    
output_ntokens -2.41642    1.92213  -1.257    0.209    
input_nchars   -0.57747    0.61481  -0.939    0.348    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 302.99  on 299  degrees of freedom
Residual deviance: 272.09  on 294  degrees of freedom
AIC: 284.09

Number of Fisher Scoring iterations: 4


In [235]:
summary(revenc_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5339  -0.9830  -0.4953   1.0080   2.1167  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.6251     0.1412  -4.429 9.48e-06 ***
input_logprob    0.3767     0.2217   1.699   0.0893 .  
output_logprob  -0.2740     0.8132  -0.337   0.7361    
input_ntokens   -1.6740     1.7242  -0.971   0.3316    
output_ntokens  -0.1954     1.7194  -0.114   0.9095    
input_nchars     0.7770     0.6848   1.135   0.2565    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 401.25  on 299  degrees of freedom
Residual deviance: 342.73  on 294  degrees of freedom
AIC: 354.73

Number of Fisher Scoring iterations: 5


In [236]:
summary(revdec_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9589   0.1682   0.2843   0.4231   2.0008  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      2.2378     0.2585   8.656  < 2e-16 ***
input_logprob   -2.5944     0.9742  -2.663  0.00774 ** 
output_logprob   2.3787     0.3426   6.943 3.85e-12 ***
input_ntokens    0.0314     2.3512   0.013  0.98934    
output_ntokens  -1.5059     2.2602  -0.666  0.50522    
input_nchars     0.3893     0.7958   0.489  0.62465    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 282.84  on 299  degrees of freedom
Residual deviance: 189.52  on 294  degrees of freedom
AIC: 201.52

Number of Fisher Scoring iterations: 6


In [237]:
summary(revdec_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0957  -0.8704   0.5145   0.8071   2.6480  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.01288    0.14432  -0.089  0.92887    
input_logprob  -2.73153    0.87909  -3.107  0.00189 ** 
output_logprob  1.63015    0.25914   6.291 3.16e-10 ***
input_ntokens  -2.72167    1.83206  -1.486  0.13739    
output_ntokens  0.19701    1.79154   0.110  0.91244    
input_nchars   -0.24920    0.73670  -0.338  0.73517    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 414.55  on 299  degrees of freedom
Residual deviance: 314.54  on 294  degrees of freedom
AIC: 326.54

Number of Fisher Scoring iterations: 5


In [238]:
# Variance inflation factors (vif) for each reversal model, as a check on
# collinearity among the predictors.
vif(revenc_gpt4_model)

In [239]:
vif(revenc_gpt35_model)

In [240]:
vif(revdec_gpt4_model)

In [241]:
vif(revdec_gpt35_model)

# Pig Latin

In [242]:
# Load the Pig Latin result tables (tab-separated, with a header row):
# one file per direction (enc/dec) and per model.
pigenc_gpt4_df <- read.table("table_pig_ay_enc_gpt-4.tsv", header = TRUE, sep = "\t")
pigenc_gpt35_df <- read.table("table_pig_ay_enc_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

pigdec_gpt4_df <- read.table("table_pig_ay_dec_gpt-4.tsv", header = TRUE, sep = "\t")
pigdec_gpt35_df <- read.table("table_pig_ay_dec_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

In [243]:
# Z-score data: standardize the predictor columns of each Pig Latin table with
# scale_df(), which also keeps `index` (as a factor) and `correct` (unscaled).
scaled_pigenc_gpt4_df <- scale_df(pigenc_gpt4_df)
scaled_pigenc_gpt35_df <- scale_df(pigenc_gpt35_df)

scaled_pigdec_gpt4_df <- scale_df(pigdec_gpt4_df)
scaled_pigdec_gpt35_df <- scale_df(pigdec_gpt35_df)

In [244]:

# Fit one logistic regression per direction and model for Pig Latin.
# Both character-count predictors (input_nchars and output_nchars) are kept.
pigenc_gpt4_model <- correct_vs_length_and_prob(
  scaled_pigenc_gpt4_df,
  include_output_chars = TRUE
)
pigenc_gpt35_model <- correct_vs_length_and_prob(
  scaled_pigenc_gpt35_df,
  include_output_chars = TRUE
)

pigdec_gpt4_model <- correct_vs_length_and_prob(
  scaled_pigdec_gpt4_df,
  include_output_chars = TRUE
)
pigdec_gpt35_model <- correct_vs_length_and_prob(
  scaled_pigdec_gpt35_df,
  include_output_chars = TRUE
)

In [245]:
summary(pigenc_gpt4_model) 


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7900  -1.0455  -0.5875   1.1576   1.8598  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)   
(Intercept)    -0.39170    0.12576  -3.115  0.00184 **
input_logprob  -0.07581    0.18159  -0.417  0.67633   
output_logprob  2.05411    0.68406   3.003  0.00267 **
input_ntokens   1.98187    0.78916   2.511  0.01203 * 
output_ntokens  1.27363    0.97383   1.308  0.19092   
input_nchars    6.18673    2.91719   2.121  0.03394 * 
output_nchars  -7.97255    3.61588  -2.205  0.02746 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 407.52  on 299  degrees of freedom
Residual deviance: 373.93  on 293  degrees of freedom
AIC: 387.93

Number of Fisher 

In [246]:
summary(pigenc_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3236  -0.8120  -0.6535   1.1288   2.1831  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -1.1692     0.1452  -8.054 7.99e-16 ***
input_logprob    0.1436     0.2074   0.692   0.4889    
output_logprob   1.1147     0.7234   1.541   0.1233    
input_ntokens    1.5496     0.8680   1.785   0.0742 .  
output_ntokens  -0.9289     1.0852  -0.856   0.3920    
input_nchars    -2.3922     3.2853  -0.728   0.4665    
output_nchars    2.5116     4.0163   0.625   0.5317    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 341.72  on 299  degrees of freedom
Residual deviance: 321.71  on 293  degrees of freedom
AIC: 335.71

Number of

In [247]:
summary(pigdec_gpt4_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7284  -0.8505   0.4236   0.7521   2.4140  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.5198     0.1491   3.487 0.000488 ***
input_logprob    1.4257     0.7709   1.849 0.064402 .  
output_logprob   1.9253     0.2431   7.918  2.4e-15 ***
input_ntokens    1.3736     1.1236   1.223 0.221505    
output_ntokens  -1.1396     0.9170  -1.243 0.213953    
input_nchars     1.4535     4.2228   0.344 0.730698    
output_nchars    1.1972     3.4386   0.348 0.727713    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 406.12  on 299  degrees of freedom
Residual deviance: 298.86  on 293  degrees of freedom
AIC: 312.86

Number of

In [248]:
summary(pigdec_gpt35_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + input_ntokens + 
    output_ntokens + input_nchars + output_nchars, family = binomial, 
    data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8275  -0.6129  -0.2020   0.6722   3.3606  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.0469     0.2798  -7.315 2.57e-13 ***
input_logprob    1.1926     0.8695   1.372   0.1702    
output_logprob   3.2094     0.4543   7.064 1.62e-12 ***
input_ntokens   -0.6667     1.2701  -0.525   0.5996    
output_ntokens  -1.9695     1.0861  -1.813   0.0698 .  
input_nchars     6.1606     4.9011   1.257   0.2088    
output_nchars   -1.2790     3.9378  -0.325   0.7453    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 357.64  on 299  degrees of freedom
Residual deviance: 232.40  on 293  degrees of freedom
AIC: 246.4

Number of 

In [249]:
# Variance inflation factors (vif) for each Pig Latin model, as a check on
# collinearity among the predictors.
vif(pigenc_gpt4_model)

In [250]:
vif(pigenc_gpt35_model)

In [251]:
vif(pigdec_gpt4_model)

In [252]:
vif(pigdec_gpt35_model)

### Comparing Pig Latin and Boar Etruscan

In [253]:
# Load the combined Pig Latin / Boar Etruscan result tables (tab-separated,
# with a header row): one file per direction (enc/dec) and per model.
pigboarenc_gpt4_df <- read.table("table_pig_boar_enc_gpt-4.tsv", header = TRUE, sep = "\t")
pigboarenc_gpt35_df <- read.table("table_pig_boar_enc_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

pigboardec_gpt4_df <- read.table("table_pig_boar_dec_gpt-4.tsv", header = TRUE, sep = "\t")
pigboardec_gpt35_df <- read.table("table_pig_boar_dec_gpt-3.5-turbo.tsv", header = TRUE, sep = "\t")

In [254]:
# Z-score data: scale_taskpair_df() standardizes columns 3:8 and keeps
# `index` and `task` as factors alongside the unscaled `correct`.
scaled_pigboarenc_gpt4_df <- scale_taskpair_df(pigboarenc_gpt4_df)
scaled_pigboarenc_gpt35_df <- scale_taskpair_df(pigboarenc_gpt35_df)

scaled_pigboardec_gpt4_df <- scale_taskpair_df(pigboardec_gpt4_df)
scaled_pigboardec_gpt35_df <- scale_taskpair_df(pigboardec_gpt35_df)

In [255]:
# Pig Latin vs Boar Etruscan, encoding, GPT-4: binomial bayesglm of correctness
# on the `task` factor plus the length and log-probability predictors.
model_pigtaskenc4 <- bayesglm(
  correct ~ task + input_nchars + output_nchars + input_ntokens +
    output_ntokens + input_logprob + output_logprob,
  data = scaled_pigboarenc_gpt4_df,
  family = binomial
)


In [256]:
# Pig Latin vs Boar Etruscan, encoding, GPT-3.5: binomial bayesglm of
# correctness on the `task` factor plus the length and log-probability
# predictors.
model_pigtaskenc35 <- bayesglm(
  correct ~ task + input_nchars + output_nchars + input_ntokens +
    output_ntokens + input_logprob + output_logprob,
  data = scaled_pigboarenc_gpt35_df,
  family = binomial
)

In [257]:
# Pig Latin vs Boar Etruscan, decoding, GPT-4: binomial bayesglm of correctness
# on the `task` factor plus the length and log-probability predictors.
model_pigtaskdec4 <- bayesglm(
  correct ~ task + input_nchars + output_nchars + input_ntokens +
    output_ntokens + input_logprob + output_logprob,
  data = scaled_pigboardec_gpt4_df,
  family = binomial
)

In [269]:
# Pig Latin vs Boar Etruscan, decoding, GPT-3.5: binomial bayesglm of
# correctness on the `task` factor plus the length and log-probability
# predictors.
model_pigtaskdec35 <- bayesglm(
  correct ~ task + input_nchars + output_nchars + input_ntokens +
    output_ntokens + input_logprob + output_logprob,
  data = scaled_pigboardec_gpt35_df,
  family = binomial
)

In [259]:
summary(model_pigtaskenc4)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboarenc_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3106  -0.7606  -0.5599   1.0719   2.3386  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.49126    0.22847  -2.150 0.031540 *  
taskuv         -1.29990    0.38177  -3.405 0.000662 ***
input_nchars    0.26154    0.77435   0.338 0.735545    
output_nchars   0.15138    0.86944   0.174 0.861779    
input_ntokens   0.04368    0.60986   0.072 0.942904    
output_ntokens  0.07993    0.72119   0.111 0.911752    
input_logprob  -0.11694    0.29707  -0.394 0.693847    
output_logprob  1.11571    0.69711   1.600 0.109491    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 235.27  on 199  degr

In [260]:
summary(model_pigtaskenc35)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboarenc_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.34676  -0.56454  -0.09404  -0.04128   2.12791  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.1179010  0.2790838  -4.006 6.19e-05 ***
taskuv         -4.7583957  1.5923777  -2.988  0.00281 ** 
input_nchars   -0.0001285  0.8357634   0.000  0.99988    
output_nchars  -0.1209601  0.9211562  -0.131  0.89553    
input_ntokens   1.2875572  0.8349638   1.542  0.12306    
output_ntokens -0.5590609  0.8881533  -0.629  0.52905    
input_logprob   0.4117290  0.3975704   1.036  0.30038    
output_logprob  1.0567468  0.8243820   1.282  0.19989    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null d

In [261]:
summary(model_pigtaskdec4)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3852   0.1500   0.4247   0.6446   2.1237  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     2.03290    0.33517   6.065 1.32e-09 ***
taskuv         -1.09352    0.42567  -2.569   0.0102 *  
input_nchars    0.50073    1.03124   0.486   0.6273    
output_nchars   1.78137    1.02638   1.736   0.0826 .  
input_ntokens  -0.07019    0.73409  -0.096   0.9238    
output_ntokens -0.16773    0.65056  -0.258   0.7965    
input_logprob   0.11102    0.68410   0.162   0.8711    
output_logprob  1.94851    0.37242   5.232 1.68e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 220.43  on 199  degr

In [270]:
summary(model_pigtaskdec35)


Call:
bayesglm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigboardec_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9328  -0.8885  -0.2885   0.8980   2.2525  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.004897   0.250383   0.020    0.984    
taskuv         -0.571599   0.366717  -1.559    0.119    
input_nchars    0.998868   1.160111   0.861    0.389    
output_nchars   1.970346   1.071603   1.839    0.066 .  
input_ntokens  -0.567395   0.740321  -0.766    0.443    
output_ntokens -0.737507   0.627864  -1.175    0.240    
input_logprob   0.496922   0.660331   0.753    0.452    
output_logprob  2.041873   0.383887   5.319 1.04e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 276.28  on

In [263]:
vif(model_pigtaskenc4)

In [264]:
vif(model_pigtaskenc35)

In [265]:
vif(model_pigtaskdec4)

In [266]:
vif(model_pigtaskdec35)

### Comparing Pig Latin variants

In [271]:
# Load the Pig Latin variant result tables (tab-separated, with a header row):
# the `enc` and `dec` tables for GPT-4 and GPT-3.5-turbo.
pigprobenc_gpt4_df <- read.table(
    file = "table_pig_prob_enc_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
pigprobenc_gpt35_df <- read.table(
    file = "table_pig_prob_enc_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

pigprobdec_gpt4_df <- read.table(
    file = "table_pig_prob_dec_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
pigprobdec_gpt35_df <- read.table(
    file = "table_pig_prob_dec_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

In [275]:
# Z-score data: pass each Pig Latin variant table through
# scale_taskpair_prob_df (a helper defined outside this section).
# NOTE(review): the model summaries for these tables report a single numeric
# `task` coefficient, so `task` appears to be numeric here rather than a
# factor -- confirm against scale_taskpair_prob_df.
scaled_pigprobenc_gpt4_df <- scale_taskpair_prob_df(pigprobenc_gpt4_df)
scaled_pigprobenc_gpt35_df <- scale_taskpair_prob_df(pigprobenc_gpt35_df)

scaled_pigprobdec_gpt4_df <- scale_taskpair_prob_df(pigprobdec_gpt4_df)
scaled_pigprobdec_gpt35_df <- scale_taskpair_prob_df(pigprobdec_gpt35_df)

In [277]:
# Logistic regression on the scaled GPT-4 `enc` table: correctness as a
# function of task plus character counts, token counts and log-probabilities
# of the input and output.
model_pigprobenc4 <- glm(
    correct ~ task +
        input_nchars + output_nchars +
        input_ntokens + output_ntokens +
        input_logprob + output_logprob,
    data = scaled_pigprobenc_gpt4_df,
    family = binomial
)


In [278]:
# Logistic regression on the scaled GPT-3.5-turbo `enc` table: correctness as
# a function of task plus character counts, token counts and
# log-probabilities of the input and output.
model_pigprobenc35 <- glm(
    correct ~ task +
        input_nchars + output_nchars +
        input_ntokens + output_ntokens +
        input_logprob + output_logprob,
    data = scaled_pigprobenc_gpt35_df,
    family = binomial
)


In [279]:
# Logistic regression on the scaled GPT-4 `dec` table: correctness as a
# function of task plus character counts, token counts and log-probabilities
# of the input and output.
model_pigprobdec4 <- glm(
    correct ~ task +
        input_nchars + output_nchars +
        input_ntokens + output_ntokens +
        input_logprob + output_logprob,
    data = scaled_pigprobdec_gpt4_df,
    family = binomial
)

In [280]:
# Logistic regression on the scaled GPT-3.5-turbo `dec` table: correctness as
# a function of task plus character counts, token counts and
# log-probabilities of the input and output.
model_pigprobdec35 <- glm(
    correct ~ task +
        input_nchars + output_nchars +
        input_ntokens + output_ntokens +
        input_logprob + output_logprob,
    data = scaled_pigprobdec_gpt35_df,
    family = binomial
)

In [281]:
summary(model_pigprobenc4)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobenc_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8277  -0.7579  -0.4865   0.8862   2.4740  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.29650    0.12717 -10.195  < 2e-16 ***
task            0.77758    0.12631   6.156 7.46e-10 ***
input_nchars    5.24036    1.50782   3.475 0.000510 ***
output_nchars  -6.38759    1.90197  -3.358 0.000784 ***
input_ntokens   1.31861    0.61597   2.141 0.032296 *  
output_ntokens  1.46049    0.71862   2.032 0.042117 *  
input_logprob   0.04002    0.22077   0.181 0.856154    
output_logprob  2.14571    0.58126   3.691 0.000223 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 579.24  on 499  degrees o

In [282]:
summary(model_pigprobenc35)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobenc_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5288  -0.5968  -0.3621  -0.1589   2.6269  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.1750     0.1812 -12.001  < 2e-16 ***
task             0.8329     0.1577   5.282 1.28e-07 ***
input_nchars     4.8477     1.8771   2.583 0.009806 ** 
output_nchars   -6.5050     2.3896  -2.722 0.006485 ** 
input_ntokens    1.9108     0.7666   2.493 0.012678 *  
output_ntokens   1.9158     0.9199   2.083 0.037290 *  
input_logprob    0.4825     0.2880   1.675 0.093839 .  
output_logprob   2.5555     0.7243   3.528 0.000419 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 449.49  on 499  degrees 

In [283]:
summary(model_pigprobdec4)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_gpt4_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7103   0.2187   0.4316   0.6099   2.1449  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      1.7858     0.1491  11.981  < 2e-16 ***
task             0.2061     0.1298   1.587   0.1125    
input_nchars    -4.9866     2.3315  -2.139   0.0325 *  
output_nchars    7.9504     1.9907   3.994 6.50e-05 ***
input_ntokens   -0.1614     0.7548  -0.214   0.8307    
output_ntokens   0.4510     0.7559   0.597   0.5507    
input_logprob    1.3863     0.6421   2.159   0.0309 *  
output_logprob   1.6532     0.2650   6.239 4.42e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 503.16  on 499  degrees o

In [284]:
summary(model_pigprobdec35)


Call:
glm(formula = correct ~ task + input_nchars + output_nchars + 
    input_ntokens + output_ntokens + input_logprob + output_logprob, 
    family = binomial, data = scaled_pigprobdec_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4052  -0.9218   0.3219   0.9826   2.6449  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.04271    0.10284  -0.415 0.677920    
task            0.07490    0.10425   0.718 0.472508    
input_nchars    1.39771    1.64214   0.851 0.394687    
output_nchars   2.21900    1.37007   1.620 0.105314    
input_ntokens   0.28319    0.59429   0.477 0.633703    
output_ntokens -1.89277    0.54879  -3.449 0.000563 ***
input_logprob   1.06417    0.48929   2.175 0.029636 *  
output_logprob  1.47622    0.22985   6.422 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 693.14  on 499  degrees 

In [285]:
vif(model_pigprobenc4)

In [286]:
vif(model_pigprobenc35)

In [287]:
vif(model_pigprobdec4)

In [288]:
vif(model_pigprobdec35)

# Acronyms

In [291]:
# Load the acronym result tables (tab-separated, with a header row):
# the `varyoutp` and `varyinp` tables for GPT-4 and GPT-3.5-turbo.
acronym_gpt4_outp_df <- read.table(
    file = "table_acronym_varyoutp_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
acronym_gpt35_outp_df <- read.table(
    file = "table_acronym_varyoutp_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

acronym_gpt4_inp_df <- read.table(
    file = "table_acronym_varyinp_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
acronym_gpt35_inp_df <- read.table(
    file = "table_acronym_varyinp_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)


In [292]:
# Z-score data: scale_df standardizes columns 2-7 of each table, turns `index`
# into a factor and carries `correct` over unscaled.
scaled_acronym_gpt4_outp_df <- scale_df(acronym_gpt4_outp_df)
scaled_acronym_gpt35_outp_df <- scale_df(acronym_gpt35_outp_df)

scaled_acronym_gpt4_inp_df <- scale_df(acronym_gpt4_inp_df)
scaled_acronym_gpt35_inp_df <- scale_df(acronym_gpt35_inp_df)

In [293]:
# Logistic regressions of correctness on the (z-scored) input and output
# log-probabilities, one per model and per acronym table.
acronym_gpt4_outp_model <- glm(
    correct ~ input_logprob + output_logprob,
    data = scaled_acronym_gpt4_outp_df,
    family = binomial
)
acronym_gpt35_outp_model <- glm(
    correct ~ input_logprob + output_logprob,
    data = scaled_acronym_gpt35_outp_df,
    family = binomial
)
acronym_gpt4_inp_model <- glm(
    correct ~ input_logprob + output_logprob,
    data = scaled_acronym_gpt4_inp_df,
    family = binomial
)
acronym_gpt35_inp_model <- glm(
    correct ~ input_logprob + output_logprob,
    data = scaled_acronym_gpt35_inp_df,
    family = binomial
)

In [294]:
summary(acronym_gpt4_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt4_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8568  -1.4834   0.7522   0.8102   0.9465  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.01459    0.03216  31.544  < 2e-16 ***
input_logprob   0.02365    0.03219   0.734    0.463    
output_logprob  0.18045    0.03215   5.613 1.98e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5808.6  on 4999  degrees of freedom
Residual deviance: 5775.8  on 4997  degrees of freedom
AIC: 5781.8

Number of Fisher Scoring iterations: 4


In [295]:
summary(acronym_gpt35_outp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt35_outp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1974  -0.8503  -0.7280   1.3761   1.9420  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.95022    0.03213 -29.576   <2e-16 ***
input_logprob   0.02860    0.03171   0.902    0.367    
output_logprob  0.35613    0.03269  10.894   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5970.6  on 4999  degrees of freedom
Residual deviance: 5845.1  on 4997  degrees of freedom
AIC: 5851.1

Number of Fisher Scoring iterations: 4


In [296]:
summary(acronym_gpt4_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt4_inp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7203  -1.5978   0.7551   0.7788   0.8319  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.07431    0.03249  33.069   <2e-16 ***
input_logprob   0.06318    0.03256   1.940   0.0523 .  
output_logprob -0.02536    0.03220  -0.788   0.4309    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5675.5  on 4999  degrees of freedom
Residual deviance: 5671.1  on 4997  degrees of freedom
AIC: 5677.1

Number of Fisher Scoring iterations: 4


In [297]:
summary(acronym_gpt35_inp_model)


Call:
glm(formula = correct ~ input_logprob + output_logprob, family = binomial, 
    data = scaled_acronym_gpt35_inp_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.0004  -0.9175  -0.8470   1.4090   1.6041  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.725846   0.030267 -23.982  < 2e-16 ***
input_logprob   0.153745   0.030205   5.090 3.58e-07 ***
output_logprob -0.005357   0.030269  -0.177     0.86    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6320.3  on 4999  degrees of freedom
Residual deviance: 6294.3  on 4997  degrees of freedom
AIC: 6300.3

Number of Fisher Scoring iterations: 4


In [298]:
vif(acronym_gpt4_outp_model)

In [299]:
vif(acronym_gpt35_outp_model)

In [300]:
vif(acronym_gpt4_inp_model)

In [301]:
vif(acronym_gpt35_inp_model)

### Vary task

In [302]:
# Load the acronym `varytask` tables (tab-separated, with a header row)
# for GPT-4 and GPT-3.5-turbo.
acronym_gpt4_1and2_df <- read.table(
    file = "table_acronym_varytask_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
acronym_gpt35_1and2_df <- read.table(
    file = "table_acronym_varytask_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)


In [303]:
# Z-score data: scale_taskpair_df standardizes columns 3-8 of each table,
# turns `index` and `task` into factors and carries `correct` over unscaled.
scaled_acronym_gpt4_1and2_df <- scale_taskpair_df(acronym_gpt4_1and2_df)
scaled_acronym_gpt35_1and2_df <- scale_taskpair_df(acronym_gpt35_1and2_df)

In [305]:
# Bayesian logistic regressions (arm::bayesglm) of correctness on task plus
# the input and output log-probabilities, one per model.
acronym_gpt4_1and2_model <- bayesglm(
    correct ~ task + input_logprob + output_logprob,
    data = scaled_acronym_gpt4_1and2_df,
    family = binomial
)
acronym_gpt35_1and2_model <- bayesglm(
    correct ~ task + input_logprob + output_logprob,
    data = scaled_acronym_gpt35_1and2_df,
    family = binomial
)

In [306]:
summary(acronym_gpt4_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_acronym_gpt4_1and2_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7957  -0.2273  -0.2151   0.7175   2.7694  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     1.19572    0.07495  15.954   <2e-16 ***
taskacronym2   -4.88826    0.21820 -22.402   <2e-16 ***
input_logprob  -0.07488    0.06637  -1.128    0.259    
output_logprob -0.03782    0.06980  -0.542    0.588    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2685.4  on 1999  degrees of freedom
Residual deviance: 1308.3  on 1996  degrees of freedom
AIC: 1316.3

Number of Fisher Scoring iterations: 7


In [307]:
summary(acronym_gpt35_1and2_model)


Call:
bayesglm(formula = correct ~ task + input_logprob + output_logprob, 
    family = binomial, data = scaled_acronym_gpt35_1and2_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.06256  -0.98056  -0.02311  -0.02243   1.43915  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.43479    0.06477  -6.713 1.91e-11 ***
taskacronym2   -7.80389    1.83306  -4.257 2.07e-05 ***
input_logprob   0.04176    0.06051   0.690    0.490    
output_logprob -0.01684    0.06481  -0.260    0.795    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1982.0  on 1999  degrees of freedom
Residual deviance: 1340.1  on 1996  degrees of freedom
AIC: 1348.1

Number of Fisher Scoring iterations: 16


In [308]:
vif(acronym_gpt4_1and2_model)

In [309]:
vif(acronym_gpt35_1and2_model)

# Multiplication

In [415]:
# Load the multiplication result tables (tab-separated, with a header row)
# for GPT-4 and GPT-3.5-turbo.
mult_gpt4_df <- read.table(
    file = "table_multiplication_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
mult_gpt35_df <- read.table(
    file = "table_multiplication_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

In [416]:
# Convert `method` to a factor in both tables so that contrasts can be
# assigned to it and it enters the models as a categorical predictor.
mult_gpt4_df$method <- factor(mult_gpt4_df$method)
mult_gpt35_df$method <- factor(mult_gpt35_df$method)

In [417]:
head(mult_gpt4_df)

Unnamed: 0_level_0,index,method,correct
Unnamed: 0_level_1,<int>,<fct>,<int>
1,0,multiplication_number,0
2,1,multiplication_number,0
3,2,multiplication_number,1
4,3,multiplication_number,0
5,4,multiplication_number,1
6,5,multiplication_number,1


In [418]:
# Use sum-to-zero (deviation) coding for `method`. The number of levels is
# read from the factor itself instead of being hard-coded, so the assignment
# keeps working if a table contains a different number of methods; for the
# current four-level factor the contrast matrix is unchanged.
contrasts(mult_gpt4_df$method) <- contr.sum(nlevels(mult_gpt4_df$method))
contrasts(mult_gpt35_df$method) <- contr.sum(nlevels(mult_gpt35_df$method))

In [419]:
# GPT-4 mixed-effects logistic regressions (lme4::glmer) with a random
# intercept for each `index` value: the full model adds `method` as a fixed
# effect, the null model is intercept-only. The pair is compared with
# anova() in a later cell.
mult_gpt4_model <- glmer(
    correct ~ method + (1 | index),
    data = mult_gpt4_df,
    family = binomial
)
mult_gpt4_null_model <- glmer(
    correct ~ (1 | index),
    data = mult_gpt4_df,
    family = binomial
)

In [429]:
# GPT-3.5-turbo mixed-effects logistic regressions (lme4::glmer) with a
# random intercept for each `index` value: the full model adds `method` as a
# fixed effect, the null model is intercept-only.
# NOTE(review): this cell printed a "Model failed to converge" warning; the
# output does not say which of the two fits triggered it -- confirm.
mult_gpt35_model <- glmer(
    correct ~ method + (1 | index),
    data = mult_gpt35_df,
    family = binomial
)
mult_gpt35_null_model <- glmer(
    correct ~ (1 | index),
    data = mult_gpt35_df,
    family = binomial
)

“Model failed to converge with max|grad| = 0.0232921 (tol = 0.002, component 1)”


In [434]:
# Refit the GPT-3.5-turbo intercept-only model with nAGQ = 0, overwriting the
# earlier fit of the same name.
# NOTE(review): mult_gpt35_model is fitted with glmer's default nAGQ and is
# compared against this model with anova(); confirm that comparing fits made
# under different nAGQ settings is intended.
mult_gpt35_null_model <- glmer(
    correct ~ (1 | index),
    data = mult_gpt35_df,
    family = binomial,
    nAGQ = 0
)

In [422]:
anova(mult_gpt4_model,mult_gpt4_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_gpt4_null_model,2,423.6559,431.6388,-209.8279,419.6559,,,
mult_gpt4_model,5,374.989,394.9463,-182.4945,364.989,54.66686,3.0,8.086872e-12


In [423]:
anova(mult_gpt35_model,mult_gpt35_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
mult_gpt35_null_model,2,406.3199,414.3028,-201.1599,402.3199,,,
mult_gpt35_model,5,300.3687,320.326,-145.1843,290.3687,111.9512,3.0,4.1724379999999996e-24


In [395]:
# All pairwise (Tukey) comparisons between the levels of `method` in the
# GPT-4 multiplication model (multcomp::glht).
gpt4_mult_multcomp <- glht(
    mult_gpt4_model,
    linfct = mcp(method = "Tukey")
)


In [396]:
summary(gpt4_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                                                             Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0  -2.4389
multiplication_number - multiplication_allcaps == 0            1.5418
multiplication_word - multiplication_allcaps == 0              0.4810
multiplication_number - multiplication_alternatingcaps == 0    3.9807
multiplication_word - multiplication_alternatingcaps == 0      2.9200
multiplication_word - multiplication_number == 0              -1.0607
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0     0.6010  -4.058
multiplication_number - multiplication_allcaps == 0              0.5238   2.943
multiplication_word - multiplication_allcaps == 0 

In [397]:
# All pairwise (Tukey) comparisons between the levels of `method` in the
# GPT-3.5-turbo multiplication model (multcomp::glht).
gpt35_mult_multcomp <- glht(
    mult_gpt35_model,
    linfct = mcp(method = "Tukey")
)


In [398]:
summary(gpt35_mult_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ method + (1 | index), data = mult_gpt35_df, 
    family = binomial)

Linear Hypotheses:
                                                             Estimate
multiplication_alternatingcaps - multiplication_allcaps == 0 -26.9816
multiplication_number - multiplication_allcaps == 0            3.1096
multiplication_word - multiplication_allcaps == 0              0.6637
multiplication_number - multiplication_alternatingcaps == 0   30.0912
multiplication_word - multiplication_alternatingcaps == 0     27.6453
multiplication_word - multiplication_number == 0              -2.4458
                                                             Std. Error z value
multiplication_alternatingcaps - multiplication_allcaps == 0   110.3420  -0.245
multiplication_number - multiplication_allcaps == 0              0.6901   4.506
multiplication_word - multiplication_allcaps == 0

# Linear function

In [461]:
# Load the linear-function (conversion) result tables, tab-separated with a
# header row: the `fwd` and `rev` tables for GPT-4 and GPT-3.5-turbo.
linfwd_gpt4_df <- read.table(
    file = "table_conversion_fwd_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
linfwd_gpt35_df <- read.table(
    file = "table_conversion_fwd_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

linrev_gpt4_df <- read.table(
    file = "table_conversion_rev_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
linrev_gpt35_df <- read.table(
    file = "table_conversion_rev_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

In [462]:
head(linfwd_gpt4_df)

Unnamed: 0_level_0,index,task,input,output,correct
Unnamed: 0_level_1,<int>,<chr>,<int>,<dbl>,<int>
1,0,conversion_actual,328,622.4,1
2,1,conversion_actual,941,1725.8,0
3,2,conversion_actual,476,888.8,0
4,3,conversion_actual,230,446.0,1
5,4,conversion_actual,577,1070.6,0
6,5,conversion_actual,64,147.2,1


In [463]:
# Z-score a conversion results table for modelling.
#
# Columns 3 through 5 of `df` are standardized with scale(); in the tables
# shown by head() these are `input`, `output` and `correct`. The scaled
# `correct` column is then replaced by the raw values from `df`, and `index`
# and `task` are attached as factors.
#
# Returns a data frame with the scaled columns followed by `index` and `task`.
scale_lin_df <- function(df) {
    zscored <- data.frame(scale(df[3:5]))

    # Grouping variables stay categorical; the outcome stays on its raw scale.
    zscored$index <- as.factor(df$index)
    zscored$correct <- df$correct
    zscored$task <- factor(df$task)

    zscored
}

In [464]:
# Z-score the forward and reverse conversion tables with scale_lin_df
# (standardizes columns 3-5, keeps `correct` raw, makes `index` and `task`
# factors).
scaled_linfwd_gpt4_df <- scale_lin_df(linfwd_gpt4_df)
scaled_linfwd_gpt35_df <- scale_lin_df(linfwd_gpt35_df)
scaled_linrev_gpt4_df <- scale_lin_df(linrev_gpt4_df)
scaled_linrev_gpt35_df <- scale_lin_df(linrev_gpt35_df)

In [465]:
# Bayesian logistic regressions (arm::bayesglm) of correctness on task plus
# the z-scored input and output values, one per direction (fwd / rev) and
# per model.
linfwd_gpt4_model <- bayesglm(
    correct ~ task + input + output,
    data = scaled_linfwd_gpt4_df,
    family = binomial
)
linfwd_gpt35_model <- bayesglm(
    correct ~ task + input + output,
    data = scaled_linfwd_gpt35_df,
    family = binomial
)
linrev_gpt4_model <- bayesglm(
    correct ~ task + input + output,
    data = scaled_linrev_gpt4_df,
    family = binomial
)
linrev_gpt35_model <- bayesglm(
    correct ~ task + input + output,
    data = scaled_linrev_gpt35_df,
    family = binomial
)

In [466]:
summary(linfwd_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.42945  -0.37014  -0.08700  -0.02867   2.35864  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -0.8712     0.2998  -2.906 0.003657 ** 
taskconversion_fake  -5.5359     1.6714  -3.312 0.000926 ***
input                -0.6289     0.8501  -0.740 0.459422    
output               -0.5865     0.7817  -0.750 0.453112    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 179.15  on 199  degrees of freedom
Residual deviance: 101.14  on 196  degrees of freedom
AIC: 109.14

Number of Fisher Scoring iterations: 18


In [467]:
summary(linfwd_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwd_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.90680  -0.26623  -0.04941  -0.01212   2.09370  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -0.1773     0.3132  -0.566 0.571370    
taskconversion_fake  -6.9130     1.8222  -3.794 0.000148 ***
input                -0.8923     0.9433  -0.946 0.344168    
output               -0.8935     0.8679  -1.029 0.303249    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 215.71  on 199  degrees of freedom
Residual deviance:  87.78  on 196  degrees of freedom
AIC: 95.78

Number of Fisher Scoring iterations: 16


In [468]:
summary(linrev_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.31706  -0.60593  -0.08244  -0.03245   2.04165  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)   
(Intercept)                 -0.5760     0.2755  -2.091  0.03656 * 
taskconversion_fakeinverse  -5.4503     1.6646  -3.274  0.00106 **
input                       -0.4500     0.8331  -0.540  0.58908   
output                      -0.4228     0.7169  -0.590  0.55535   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 200.16  on 199  degrees of freedom
Residual deviance: 121.38  on 196  degrees of freedom
AIC: 129.38

Number of Fisher Scoring iterations: 18


In [469]:
summary(linrev_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrev_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9265  -0.3663  -0.1901   0.6846   3.0292  

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  0.3585     0.2662   1.347    0.178    
taskconversion_fakeinverse  -3.9916     0.6329  -6.307 2.84e-10 ***
input                       -0.4516     0.7993  -0.565    0.572    
output                      -0.4623     0.7021  -0.658    0.510    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 249.22  on 199  degrees of freedom
Residual deviance: 143.46  on 196  degrees of freedom
AIC: 151.46

Number of Fisher Scoring iterations: 10


In [470]:
vif(linfwd_gpt4_model)

In [471]:
vif(linfwd_gpt35_model)

In [472]:
vif(linrev_gpt4_model)

In [473]:
vif(linrev_gpt35_model)

### Comparing methods

In [553]:
# Load the conversion `method` comparison tables (tab-separated, with a
# header row) for GPT-4 and GPT-3.5-turbo.
linmethod_gpt4_df <- read.table(
    file = "table_conversion_method_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
linmethod_gpt35_df <- read.table(
    file = "table_conversion_method_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

In [554]:
# Z-score the method-comparison tables with scale_lin_df, which also turns
# `index` and `task` into factors.
# NOTE(review): scale_lin_df scales columns 3-5 by position; this presumably
# relies on these tables sharing the column layout of the fwd/rev conversion
# tables -- confirm.
scaled_linmethod_gpt4_df <- scale_lin_df(linmethod_gpt4_df)
scaled_linmethod_gpt35_df <- scale_lin_df(linmethod_gpt35_df)

In [555]:
# Use sum-to-zero (deviation) coding for `task`. The number of levels is read
# from the factor itself instead of being hard-coded, so the assignment keeps
# working if a table contains a different number of tasks; for the current
# three-level factor the contrast matrix is unchanged.
contrasts(scaled_linmethod_gpt4_df$task) <-
    contr.sum(nlevels(scaled_linmethod_gpt4_df$task))
contrasts(scaled_linmethod_gpt35_df$task) <-
    contr.sum(nlevels(scaled_linmethod_gpt35_df$task))

In [556]:
# Mixed-effects logistic regressions (lme4::glmer) of correctness on `task`,
# with a random intercept for each `index` value, one per model.
linmethod_gpt4_model <- glmer(
    correct ~ task + (1 | index),
    data = scaled_linmethod_gpt4_df,
    family = binomial
)
linmethod_gpt35_model <- glmer(
    correct ~ task + (1 | index),
    data = scaled_linmethod_gpt35_df,
    family = binomial
)

In [557]:
# Intercept-only counterparts of the task models (random intercept per
# `index` value only); these are the baselines for the anova() comparisons
# in later cells.
linmethod_gpt4_null_model <- glmer(
    correct ~ (1 | index),
    data = scaled_linmethod_gpt4_df,
    family = binomial
)
linmethod_gpt35_null_model <- glmer(
    correct ~ (1 | index),
    data = scaled_linmethod_gpt35_df,
    family = binomial
)

In [558]:
summary(linmethod_gpt4_model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethod_gpt4_df

     AIC      BIC   logLik deviance df.resid 
   312.0    326.9   -152.0    304.0      296 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.11776 -0.21885  0.04133  0.47220  2.12911 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 12.37    3.517   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   0.8832     0.4568   1.933   0.0532 .  
task1        -2.7261     0.5910  -4.612 3.98e-06 ***
task2         2.4401     0.5332   4.576 4.74e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
      (Intr) task1 
task1 -0.353       
task2  0.341 -0.867

In [559]:
summary(linmethod_gpt35_model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethod_gpt35_df

     AIC      BIC   logLik deviance df.resid 
   323.5    338.3   -157.7    315.5      296 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.1779 -0.3871  0.2243  0.4592  2.5832 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 5.182    2.276   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   1.3673     0.3554   3.847  0.00012 ***
task1        -1.5999     0.3003  -5.328 9.91e-08 ***
task2         1.8548     0.3486   5.321 1.03e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
      (Intr) task1 
task1 -0.381       
task2  0.412 -0.734

In [560]:
anova(linmethod_gpt4_model,linmethod_gpt4_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethod_gpt4_null_model,2,389.4639,396.8714,-192.7319,385.4639,,,
linmethod_gpt4_model,4,312.0415,326.8567,-152.0208,304.0415,81.42231,2.0,2.086268e-18


In [561]:
anova(linmethod_gpt35_model,linmethod_gpt35_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethod_gpt35_null_model,2,375.9768,383.3843,-185.9884,371.9768,,,
linmethod_gpt35_model,4,323.4639,338.279,-157.7319,315.4639,56.51286,2.0,5.350427e-13


In [562]:
# All pairwise (Tukey) comparisons between the levels of `task` in the
# method-comparison models (multcomp::glht), one per model.
gpt4_lin_multcomp <- glht(
    linmethod_gpt4_model,
    linfct = mcp(task = "Tukey")
)
gpt35_lin_multcomp <- glht(
    linmethod_gpt35_model,
    linfct = mcp(task = "Tukey")
)


In [563]:
summary(gpt4_lin_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ task + (1 | index), data = scaled_linmethod_gpt4_df, 
    family = binomial)

Linear Hypotheses:
                                                              Estimate
conversion_actualprimed - conversion_actual == 0                5.1662
conversion_actualprimedcontrol - conversion_actual == 0         3.0121
conversion_actualprimedcontrol - conversion_actualprimed == 0  -2.1540
                                                              Std. Error
conversion_actualprimed - conversion_actual == 0                  1.0864
conversion_actualprimedcontrol - conversion_actual == 0           0.7671
conversion_actualprimedcontrol - conversion_actualprimed == 0     0.6273
                                                              z value Pr(>|z|)
conversion_actualprimed - conversion_actual == 0                4.755  < 0.001
conversion_actualprimedcontrol - con

In [564]:
summary(gpt35_lin_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ task + (1 | index), data = scaled_linmethod_gpt35_df, 
    family = binomial)

Linear Hypotheses:
                                                              Estimate
conversion_actualprimed - conversion_actual == 0                3.4547
conversion_actualprimedcontrol - conversion_actual == 0         1.3450
conversion_actualprimedcontrol - conversion_actualprimed == 0  -2.1097
                                                              Std. Error
conversion_actualprimed - conversion_actual == 0                  0.6044
conversion_actualprimedcontrol - conversion_actual == 0           0.4183
conversion_actualprimedcontrol - conversion_actualprimed == 0     0.5187
                                                              z value Pr(>|z|)
conversion_actualprimed - conversion_actual == 0                5.716  < 0.001
conversion_actualprimedcontrol - co

### Basic test: OOD

In [493]:
# Load the OOD conversion result tables (tab-separated, with a header row):
# the `fwd` and `rev` tables for GPT-4 and GPT-3.5-turbo.
linfwdood_gpt4_df <- read.table(
    file = "table_conversion_ood_fwd_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
linfwdood_gpt35_df <- read.table(
    file = "table_conversion_ood_fwd_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

linrevood_gpt4_df <- read.table(
    file = "table_conversion_ood_rev_gpt-4.tsv",
    sep = "\t",
    header = TRUE
)
linrevood_gpt35_df <- read.table(
    file = "table_conversion_ood_rev_gpt-3.5-turbo.tsv",
    sep = "\t",
    header = TRUE
)

In [494]:
# Z-score the OOD conversion tables with scale_lin_df (standardizes columns
# 3-5, keeps `correct` raw, makes `index` and `task` factors).
# NOTE(review): presumably these tables share the column layout of the
# in-distribution conversion tables -- confirm.
scaled_linfwdood_gpt4_df <- scale_lin_df(linfwdood_gpt4_df)
scaled_linfwdood_gpt35_df <- scale_lin_df(linfwdood_gpt35_df)
scaled_linrevood_gpt4_df <- scale_lin_df(linrevood_gpt4_df)
scaled_linrevood_gpt35_df <- scale_lin_df(linrevood_gpt35_df)

In [495]:
# Bayesian logistic regressions (arm::bayesglm) of correctness on task plus
# the z-scored input and output values for the OOD conversion tables, one
# per direction (fwd / rev) and per model.
linfwdood_gpt4_model <- bayesglm(
    correct ~ task + input + output,
    data = scaled_linfwdood_gpt4_df,
    family = binomial
)
linfwdood_gpt35_model <- bayesglm(
    correct ~ task + input + output,
    data = scaled_linfwdood_gpt35_df,
    family = binomial
)
linrevood_gpt4_model <- bayesglm(
    correct ~ task + input + output,
    data = scaled_linrevood_gpt4_df,
    family = binomial
)
linrevood_gpt35_model <- bayesglm(
    correct ~ task + input + output,
    data = scaled_linrevood_gpt35_df,
    family = binomial
)

In [496]:
summary(linfwdood_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.05592  -0.36821  -0.11506  -0.05064   2.49494  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -1.9547     0.5514  -3.545 0.000392 ***
taskconversion_ood_fake  -4.2855     1.6816  -2.548 0.010819 *  
input                    -0.7789     0.7672  -1.015 0.309970    
output                   -0.2698     0.8175  -0.330 0.741382    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 106.554  on 199  degrees of freedom
Residual deviance:  72.877  on 196  degrees of freedom
AIC: 80.877

Number of Fisher Scoring iterations: 21


In [497]:
# Coefficient table and deviance statistics for the GPT-3.5 forward-OOD fit.
summary(linfwdood_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linfwdood_gpt35_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.45508  -0.33380  -0.19971  -0.06544   2.69346  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)   
(Intercept)              -1.7558     0.5609  -3.130  0.00175 **
taskconversion_ood_fake  -3.4753     1.1415  -3.044  0.00233 **
input                    -1.1005     0.7951  -1.384  0.16630   
output                   -0.3347     0.8467  -0.395  0.69267   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 134.373  on 199  degrees of freedom
Residual deviance:  85.759  on 196  degrees of freedom
AIC: 93.759

Number of Fisher Scoring iterations: 15


In [498]:
# Coefficient table and deviance statistics for the GPT-4 reverse-OOD fit.
summary(linrevood_gpt4_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_gpt4_df)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.81547  -0.70844  -0.08794  -0.07734   1.80418  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)   
(Intercept)                     -1.2428     0.4626  -2.686  0.00722 **
taskconversion_ood_fakeinverse  -4.3695     1.7047  -2.563  0.01037 * 
input                           -0.1377     0.8157  -0.169  0.86596   
output                           0.2668     0.6298   0.424  0.67189   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 146.77  on 199  degrees of freedom
Residual deviance: 110.34  on 196  degrees of freedom
AIC: 118.34

Number of Fisher Scoring iterations: 20


In [499]:
# Coefficient table and deviance statistics for the GPT-3.5 reverse-OOD fit.
summary(linrevood_gpt35_model)


Call:
bayesglm(formula = correct ~ task + input + output, family = binomial, 
    data = scaled_linrevood_gpt35_df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.0772  -0.7983  -0.1764  -0.1300   2.9417  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)   
(Intercept)                    -0.85847    0.44571  -1.926  0.05410 . 
taskconversion_ood_fakeinverse -3.30489    1.07731  -3.068  0.00216 **
input                          -0.33654    0.79461  -0.424  0.67191   
output                         -0.03978    0.61685  -0.064  0.94859   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 185.49  on 199  degrees of freedom
Residual deviance: 137.47  on 196  degrees of freedom
AIC: 145.47

Number of Fisher Scoring iterations: 12


In [500]:
# Variance inflation factors for the predictors of the GPT-4 forward-OOD
# model (multicollinearity check).
vif(linfwdood_gpt4_model)

In [501]:
# Variance inflation factors for the predictors of the GPT-3.5 forward-OOD
# model (multicollinearity check).
vif(linfwdood_gpt35_model)

In [502]:
# Variance inflation factors for the predictors of the GPT-4 reverse-OOD
# model (multicollinearity check).
vif(linrevood_gpt4_model)

In [503]:
# Variance inflation factors for the predictors of the GPT-3.5 reverse-OOD
# model (multicollinearity check).
vif(linrevood_gpt35_model)

### Comparing methods: OOD

In [540]:
# Load the OOD method-comparison results (tab-separated, header row), one
# file per model.
linmethodood_gpt4_df <- read.table(
  "table_conversion_ood_method_gpt-4.tsv",
  header = TRUE,
  sep = "\t"
)
linmethodood_gpt35_df <- read.table(
  "table_conversion_ood_method_gpt-3.5-turbo.tsv",
  header = TRUE,
  sep = "\t"
)


In [541]:
# Pass both method-comparison tables through scale_lin_df() (defined earlier
# in the notebook) before modelling.
scaled_linmethodood_gpt4_df <-
  scale_lin_df(linmethodood_gpt4_df)
scaled_linmethodood_gpt35_df <-
  scale_lin_df(linmethodood_gpt35_df)


In [542]:
# Code the three-level `task` factor with sum-to-zero contrasts, so the
# fitted task1/task2 coefficients are deviations of the first two levels from
# the mean across levels rather than differences from a reference level.
contrasts(scaled_linmethodood_gpt4_df$task) <- contr.sum(n = 3)
contrasts(scaled_linmethodood_gpt35_df$task) <- contr.sum(n = 3)

In [543]:
# Mixed-effects logistic regressions of `correct` on `task`, with a random
# intercept for each `index`. The optimizer is bobyqa, and maxfun = 2e5 sets
# its function-evaluation limit.
linmethodood_gpt4_model <- glmer(
  formula = correct ~ task + (1 | index),
  data = scaled_linmethodood_gpt4_df,
  family = binomial,
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
linmethodood_gpt35_model <- glmer(
  formula = correct ~ task + (1 | index),
  data = scaled_linmethodood_gpt35_df,
  family = binomial,
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)


In [544]:
# Null models for the likelihood-ratio comparison: same random intercept for
# `index` and same optimizer settings as the task models, but without the
# `task` fixed effect.
linmethodood_gpt4_null_model <- glmer(
  formula = correct ~ (1 | index),
  data = scaled_linmethodood_gpt4_df,
  family = binomial,
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
linmethodood_gpt35_null_model <- glmer(
  formula = correct ~ (1 | index),
  data = scaled_linmethodood_gpt35_df,
  family = binomial,
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)


In [545]:
# Fixed-effect estimates and random-intercept variance for the GPT-4
# method-comparison model.
summary(linmethodood_gpt4_model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethodood_gpt4_df
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))

     AIC      BIC   logLik deviance df.resid 
   305.7    320.5   -148.9    297.7      296 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-1.75192 -0.36734 -0.03554  0.19293  2.72231 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 11.27    3.357   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.3539     0.4794  -2.824  0.00474 ** 
task1        -2.7629     0.5786  -4.775 1.80e-06 ***
task2         2.5309     0.5218   4.851 1.23e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
      (Intr) task1 
task1  0.456       
task2 -0.452 -0.871

In [546]:
# Fixed-effect estimates and random-intercept variance for the GPT-3.5
# method-comparison model.
summary(linmethodood_gpt35_model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: correct ~ task + (1 | index)
   Data: scaled_linmethodood_gpt35_df
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))

     AIC      BIC   logLik deviance df.resid 
   309.5    324.3   -150.7    301.5      296 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-7.2210 -0.4353 -0.0369  0.2267  2.0395 

Random effects:
 Groups Name        Variance Std.Dev.
 index  (Intercept) 12.78    3.575   
Number of obs: 300, groups:  index, 100

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.0380     0.4862  -2.135   0.0328 *  
task1        -2.4901     0.5786  -4.304 1.68e-05 ***
task2         2.8892     0.6802   4.248 2.16e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
      (Intr) task1 
task1  0.425       
task2 -0.444 -0.887

In [547]:
# Compare the GPT-4 task model against its null model (no `task` term) to
# test the overall effect of task.
# NOTE(review): `test` looks like a glm-style anova argument; confirm that
# anova() for glmer fits uses it rather than silently ignoring it.
anova(linmethodood_gpt4_model,linmethodood_gpt4_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethodood_gpt4_null_model,2,385.4793,392.8868,-190.7396,381.4793,,,
linmethodood_gpt4_model,4,305.7154,320.5306,-148.8577,297.7154,83.76381,2.0,6.470238e-19


In [548]:
# Compare the GPT-3.5 task model against its null model (no `task` term) to
# test the overall effect of task.
# NOTE(review): `test` looks like a glm-style anova argument; confirm that
# anova() for glmer fits uses it rather than silently ignoring it.
anova(linmethodood_gpt35_model,linmethodood_gpt35_null_model,test="Chisq")

Unnamed: 0_level_0,npar,AIC,BIC,logLik,deviance,Chisq,Df,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
linmethodood_gpt35_null_model,2,390.0229,397.4305,-193.0114,386.0229,,,
linmethodood_gpt35_model,4,309.4944,324.3095,-150.7472,301.4944,84.52852,2.0,4.414331999999999e-19


In [549]:
# Set up all pairwise (Tukey) comparisons between the `task` levels of the
# GPT-4 method-comparison model.
gpt4_linood_multcomp <- glht(
  model = linmethodood_gpt4_model,
  linfct = mcp(task = "Tukey")
)

In [550]:
# Set up all pairwise (Tukey) comparisons between the `task` levels of the
# GPT-3.5 method-comparison model.
gpt35_linood_multcomp <- glht(
  model = linmethodood_gpt35_model,
  linfct = mcp(task = "Tukey")
)

In [551]:
# Simultaneous tests for the pairwise Tukey task contrasts (GPT-4).
summary(gpt4_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ task + (1 | index), data = scaled_linmethodood_gpt4_df, 
    family = binomial, control = glmerControl(optimizer = "bobyqa", 
        optCtrl = list(maxfun = 2e+05)))

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                5.2938
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0         2.9949
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  -2.2990
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                  1.0642
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0           0.7484
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0     0.6103
                                        

In [552]:
# Simultaneous tests for the pairwise Tukey task contrasts (GPT-3.5).
summary(gpt35_linood_multcomp)


	 Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: glmer(formula = correct ~ task + (1 | index), data = scaled_linmethodood_gpt35_df, 
    family = binomial, control = glmerControl(optimizer = "bobyqa", 
        optCtrl = list(maxfun = 2e+05)))

Linear Hypotheses:
                                                                      Estimate
conversion_ood_actualprimed - conversion_ood_actual == 0                5.3794
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0         2.0910
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0  -3.2884
                                                                      Std. Error
conversion_ood_actualprimed - conversion_ood_actual == 0                  1.2230
conversion_ood_actualprimedcontrol - conversion_ood_actual == 0           0.6361
conversion_ood_actualprimedcontrol - conversion_ood_actualprimed == 0     0.8879
                                       