In [64]:
# Minicons Installation
# Introduction can be found https://kanishka.xyz/post/minicons-running-large-scale-behavioral-analyses-on-transformer-lms/
# Tutorial and code can be found https://github.com/kanishkamisra/minicons/blob/master/examples/surprisals.md
#!pip install minicons

from minicons import scorer
import pandas as pd
import numpy as np
import json
import csv
import re
import matplotlib.pyplot as plt
#import seaborn as sns
import statsmodels.formula.api as smf
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

#### Resizing Model Embeddings (50527) to Match with Tokenizer Vocabulary Size (50528)

In [6]:
'''
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

model_path = "gpt2-small/checkpoint-trainedtokenizer_100M"

tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# print mismatch
print("Tokenizer vocab size:", len(tokenizer))
print("Model vocab size:", model.config.vocab_size)

# resize model embeddings to match tokenizer
if len(tokenizer) != model.config.vocab_size:
    print(f"Resizing model embeddings from {model.config.vocab_size} → {len(tokenizer)}")
    model.resize_token_embeddings(len(tokenizer))
    model.save_pretrained(model_path)
    print("Saved updated model.")
'''

Tokenizer vocab size: 50258
Model vocab size: 50257
Resizing model embeddings from 50257 → 50258
Saved updated model.
Special tokens map: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


In [65]:
#model_path = "gpt2-small/checkpoint-pretrainedtokenizer_10M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_10M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_10M_whitespace"
#model_path = "gpt2-small/checkpoint-pretrainedtokenizer_100M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_100M"
model_path = "gpt2-small/checkpoint-trainedtokenizer_100M_whitespace"

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)

# wrap with minicons scorer
lm_scorer = scorer.IncrementalLMScorer(model_path, device = "cpu")

In [9]:
print("Special tokens:", tokenizer.all_special_tokens)
print("Special token IDs:", tokenizer.all_special_ids)
print("Special tokens map:", tokenizer.special_tokens_map)

Special tokens: ['<|endoftext|>']
Special token IDs: [50257]
Special tokens map: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


In [12]:
surprisals

[[('<pad>', 0.0),
  ('ĠTheĠ', 6.771759033203125),
  ('balloon', 12.995538711547852),
  ('Ġwa', 3.036536693572998),
  ('s', 0.0033515978138893843),
  ('Ġinf', 14.873947143554688),
  ('lat', 3.8473756313323975),
  ('ingĠfor', 9.053845405578613),
  ('Ġ10', 10.562509536743164),
  ('Ġminute', 3.032411813735962),
  ('s', 0.0010208890307694674)]]

In [37]:
'''
def calculate_surprisal(sentence):
    '''
    #Takes in a sentence, and outputs surprisal values for each word.
    '''
    
    input_sentence = sentence # process per sentence, never in batches to avoid padding
    # token_score() function of Minicons takes in several parameters
    # if surprisal = True, the output value is surprisal instead of log likelihood
    # if base_two = True, the log likelihood will be in base 2
    # see Minicons documentations for details
    # score tokens
    token_surprisals = lm_scorer.token_score(input_sentence, surprisal = True, base_two = True)[0]
    #print(token_surprisals)

    # matching tokens manually back to words using offset mapping
    # tokenizer setup
    encoding = tokenizer(sentence, return_offsets_mapping = True, add_special_tokens = False)
    offsets = encoding['offset_mapping']
    token_ids = encoding['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # filter out special token surprisals (like <pad>) *not needed if we set add_special_tokens to False, but just to be safe
    special_tokens = set(tokenizer.all_special_tokens + ['<pad>'])
    filtered = [
        (token, score, span)
        for (token, score), span in zip(token_surprisals, offsets)
        if token not in special_tokens
    ]

    # prepare: group surprisals by words based on character spans
    words = re.findall(r"\S+", sentence)
    word_spans = []
    i = 0
    for match in re.finditer(r"\S+", sentence):
        start, end = match.span()
        word_spans.append((i, start, end))
        i += 1

    # assign tokens to words based on character alignment (needed since BPE tokenizers break words down into subwords/tokens)
    word_surprisals = []
    word_index = 0
    word_start, word_end = word_spans[word_index][1:3] # previously: word_spans.append((i, start, end)) [0, 1, 2]
    current_surprisal = 0.0
    
    for token, score, (start, end) in filtered:
        if start >= word_end:
            word_surprisals.append((words[word_index], current_surprisal))
            word_index += 1
            if word_index >= len(word_spans):
                break
            word_start, word_end = word_spans[word_index][1:3]
            current_surprisal = 0.0
        current_surprisal += score

    # append final word
    if word_index < len(words):
        word_surprisals.append((words[word_index], current_surprisal))

    return word_surprisals


sentence = 'The teacher realized what the storm rolled in while the student in the first year was studying for the test with great enthusiasm'
calculate_surprisal(sentence)
'''

[('The', 0.0),
 ('teacher', 11.844358444213867),
 ('realized', 13.042094230651855),
 ('what', 6.120745658874512),
 ('the', 3.2175395488739014),
 ('storm', 15.96143913269043),
 ('rolled', 12.999858856201172),
 ('in', 5.186878204345703),
 ('while', 10.483074188232422),
 ('the', 1.5718594789505005),
 ('student', 10.762317657470703),
 ('in', 7.2503581047058105),
 ('the', 1.2104510068893433),
 ('first', 7.770709991455078),
 ('year', 5.073796272277832),
 ('was', 1.570701003074646),
 ('studying', 10.941052436828613),
 ('for', 5.377425670623779),
 ('the', 1.5096884965896606),
 ('test', 7.936253547668457),
 ('with', 9.095281600952148),
 ('great', 9.63071346282959),
 ('enthusiasm', 6.821305751800537)]

In [3]:
def calculate_surprisal(sentence):
    '''
    Takes in a sentence, and outputs surprisal values for each word.
    '''
    
    input_sentence = sentence
    # token_score() function of Minicons takes in several parameters
    # if surprisal = True, the output value is surprisal instead of log likelihood
    # if base_two = True, the log likelihood will be in base 2
    # see Minicons documentations for details
    # score tokens
    token_surprisals = lm_scorer.token_score(input_sentence, surprisal = True, base_two = True)[0]
    #print(token_surprisals)

    '''
    # filter out special tokens (like <pad>)
    special_tokens = set(tokenizer.all_special_tokens + ['<pad>'])
    filtered = [
        (token, score)
        for (token, score) in token_surprisals
        if token not in special_tokens
    ]
    '''

   # expand tokens that contain multiple words
    expanded = []
    for token, score in token_surprisals:
    #for token, score in filtered:
        token = token.strip('Ġ') # remove space marker
        if token.count('Ġ') > 0:
            # multiple words inside
            words = token.split('Ġ')
            words = [word for word in words if word]  # remove empty strings
            for i, word in enumerate(words):
                expanded_token = word
                expanded.append((expanded_token, score / len(words)))  # split surprisal equally
        else:
            expanded.append((token, score))    
    #print(expanded)

    # use regex to split into words and punctuation
    words = re.findall(r'\w+|[^\w\s]', sentence)
    results = []

    token_pointer = 0

    for word in words:
        accumulated = ''
        word_surprisal = 0.0

        while token_pointer < len(expanded):
            token, surprisal = expanded[token_pointer]
            accumulated += token
            word_surprisal += surprisal
            token_pointer += 1

            if accumulated == word:
                results.append((word, word_surprisal))
                break
        else:
            results.append((word, word_surprisal))

    return results


sentence = 'I know that your friend gave a baguette to Mary last weekend.'
calculate_surprisal(sentence)

[('I', 0.0),
 ('know', 4.365269184112549),
 ('that', 3.4667656421661377),
 ('your', 9.112505912780762),
 ('friend', 6.371501922607422),
 ('gave', 10.674038887023926),
 ('a', 4.927915096282959),
 ('baguette', 38.19276142120361),
 ('to', 5.820535182952881),
 ('Mary', 13.028478622436523),
 ('last', 11.46209716796875),
 ('weekend', 7.374884128570557),
 ('.', 2.0132205486297607)]

In [39]:
def calculate_sentence_surprisal(word_surprisals):
    """Returns total surprisal and average surprisal per word."""
    
    scores = [score for word, score in word_surprisals]
    total = sum(scores)
    avg = total / len(scores)
    return total, avg

def sum_region_surprisal(row, region_list):
    '''Sums surprisals of a specified sentence region.'''
    
    sentence_surprisals = row['surprisals']
    region_text = ''
    for region in region_list:
        region_text += (str(row[region]) if pd.notna(row[region]) else '') + ' '
    
    # use regex to split region cleanly into words & punctuation
    region_units = re.findall(r'\w+|[^\w\s]', region_text)
    
    # sum surprisals matching region units
    region_surprisal = 0.0
    for token, score in sentence_surprisals:
        if token in region_units:
            region_surprisal += score
    
    return region_surprisal

def compute_wh_licensing_interaction(sentences):
    """
    sentences: dict with keys
        'fg' = +Filler, +Gap
        'fng' = +Filler, −Gap
        'nfg' = −Filler, +Gap
        'nfng' = −Filler, −Gap
    Each value is a sentence string.

    Returns:
        A dict with total surprisal per sentence,
        average surprisals per sentence,
        and the wh-licensing interaction score.
    """
    
    scores = {}
    for key, sentence in sentences.items():
        word_surprisals = calculate_surprisal(sentence)
        total, avg = calculate_sentence_surprisal(word_surprisals)
        scores[key] = {'total': total, 'avg': avg, 'details': word_surprisals}

    # compute wh-licensing interaction
    interaction = (
        (scores['fng']['total'] - scores['nfng']['total']) -
        (scores['fg']['total'] - scores['nfg']['total'])
    )

    return {
        'scores': scores,
        'interaction': interaction
    }


In [60]:
def split_ends(ends):
    '''Splits off sentence-final punctuation.'''
    
    match = re.match(r'^(.*?)([.!?])$', ends.strip())
    if match:
        return pd.Series([match.group(1), match.group(2)])
    else:
        return pd.Series([ends, ''])  # no end punctuation
    
def encode_wh_licensor(df):
    """
    Adds a numeric column to the DataFrame:
    - 1 if licensor is "what"
    - -1 otherwise (licensor is "that" or absent)
    """
    df['wh_numeric'] = df['licensor'].apply(lambda x: 1 if str(x).strip().lower() == 'what' else -1)
    return df

def merge_sentence(row):
    '''Merges sentence columns back into full sentence.'''
    
    parts = [
        row['prefix'],
        row['licensor'],
        row['subj'],
        row['verb'],
        row['object'],
        row['modifier']
    ]
    # filter out NaN or empty parts
    non_empty_parts = [str(part) for part in parts if pd.notna(part) and str(part).strip() != '']
    # join with spaces and add end punctuation
    sentence = ' '.join(non_empty_parts) + (row['end'] if pd.notna(row['end']) else '')
    return sentence

#### Statistical Analysis: Mixed-Effects Linear Regression Model

In [57]:
# columns: item_id (indicates sentence set), wh_licensor (0/1), gap (0/1), island_type, surprisal

def mixed_effects_linear_regression(df, iv_list, surprisal, label):
    '''
    Fits mixed-effects model and extracts wh-licensing interaction.
    '''
    for field in iv_list:
        if field == 'wh_numeric' or 'gap_numeric':
            pass
        else:
            df[field] = df[field].astype('category')
        
    interaction_terms = ' * '.join(iv_list)
    random_effects = ' + '.join(iv_list)

    model = smf.mixedlm(
        f"{surprisal} ~ {interaction_terms}",
        df,
        groups = df["item_id"],
        re_formula = f"~{random_effects}"
    )

    result = model.fit()
    interaction_coef = result.params.get('wh_licensor[T.1]:gap[T.1]', None)
    
    print(f"\n=== {label.upper()} ===")
    print(result.summary())
    
    return interaction_coef

#interaction = mixed_effects_linear_regression(df, "construction_type") # label name to be changed according to construction type

#### Double Gap Construction

In [75]:
sentence_df = pd.read_csv('test_sentences/Double Gap Construction.csv')
sentence_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard.
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard.
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard.
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard.
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard.
...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam.
156,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam.
157,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam.
158,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam.


In [76]:
sentence_df[['modifier', 'end']] = sentence_df['modifier'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier,end,wh_numeric,sentence,surprisals
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard,.,-1,James realized that the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),..."
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard,.,-1,James realized that chased the cat through the...,"[(James, 0.0), (realized, 15.564455032348633),..."
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard,.,-1,James realized that the dog chased through the...,"[(James, 0.0), (realized, 15.564455032348633),..."
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard,.,-1,James realized that chased through the yard.,"[(James, 0.0), (realized, 15.564455032348633),..."
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard,.,1,James realized what the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam,.,-1,The principal knows that helped after the exam.,"[(The, 0.0), (principal, 11.270773887634277), ..."
156,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam,.,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ..."
157,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam,.,1,The principal knows what helped the student af...,"[(The, 0.0), (principal, 11.270773887634277), ..."
158,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam,.,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ..."


In [18]:
sentence_df.to_csv(('test_sentences/Double Gap Construction.csv'), index = False)
#sentence_df = pd.read_csv('test_sentences/Double Gap Construction.csv')

Modifier

In [77]:
sentence_df['region_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, ['modifier']), axis = 1)
sentence_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier,end,wh_numeric,sentence,surprisals,region_surprisal
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard,.,-1,James realized that the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",22.018002
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard,.,-1,James realized that chased the cat through the...,"[(James, 0.0), (realized, 15.564455032348633),...",19.303180
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard,.,-1,James realized that the dog chased through the...,"[(James, 0.0), (realized, 15.564455032348633),...",18.409940
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard,.,-1,James realized that chased through the yard.,"[(James, 0.0), (realized, 15.564455032348633),...",18.516019
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard,.,1,James realized what the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",24.675975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam,.,-1,The principal knows that helped after the exam.,"[(The, 0.0), (principal, 11.270773887634277), ...",26.060441
156,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam,.,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",26.442219
157,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam,.,1,The principal knows what helped the student af...,"[(The, 0.0), (principal, 11.270773887634277), ...",23.145472
158,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam,.,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",26.938232


In [49]:
iv_list = ['subject_gap', 'object_gap', 'wh_numeric']
mixed_effects_linear_regression(sentence_df, iv_list, 'region_surprisal', 'double gap modifier')




=== DOUBLE GAP MODIFIER ===
                         Mixed Linear Model Regression Results
Model:                     MixedLM         Dependent Variable:         region_surprisal
No. Observations:          160             Method:                     REML            
No. Groups:                20              Scale:                      0.7720          
Min. group size:           8               Log-Likelihood:             -308.2116       
Max. group size:           8               Converged:                  Yes             
Mean group size:           8.0                                                         
---------------------------------------------------------------------------------------
                                            Coef.  Std.Err.    z    P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------
Intercept                                   27.211    1.024  26.580 0.000 25.205 29.217
subject_gap[T.1]            

Embedded Clause

In [78]:
embed = ['subj', 'object', 'verb', 'modifier', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier,end,wh_numeric,sentence,surprisals,region_surprisal,embed_surprisal
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard,.,-1,James realized that the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",22.018002,56.395520
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard,.,-1,James realized that chased the cat through the...,"[(James, 0.0), (realized, 15.564455032348633),...",19.303180,54.383725
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard,.,-1,James realized that the dog chased through the...,"[(James, 0.0), (realized, 15.564455032348633),...",18.409940,46.729862
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard,.,-1,James realized that chased through the yard.,"[(James, 0.0), (realized, 15.564455032348633),...",18.516019,43.208366
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard,.,1,James realized what the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",24.675975,59.184489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam,.,-1,The principal knows that helped after the exam.,"[(The, 0.0), (principal, 11.270773887634277), ...",26.060441,45.421412
156,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam,.,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",26.442219,66.763715
157,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam,.,1,The principal knows what helped the student af...,"[(The, 0.0), (principal, 11.270773887634277), ...",23.145472,50.321641
158,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam,.,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",26.938232,58.851973


In [79]:
iv_list = ['subject_gap', 'object_gap', 'wh_numeric']
mixed_effects_linear_regression(sentence_df, iv_list, 'embed_surprisal', 'double gap embed')




=== DOUBLE GAP EMBED ===
                     Mixed Linear Model Regression Results
Model:                   MixedLM       Dependent Variable:       embed_surprisal
No. Observations:        160           Method:                   REML           
No. Groups:              20            Scale:                    1.2237         
Min. group size:         8             Log-Likelihood:           -387.2060      
Max. group size:         8             Converged:                No             
Mean group size:         8.0                                                    
--------------------------------------------------------------------------------
                                   Coef.  Std.Err.    z    P>|z|  [0.025  0.975]
--------------------------------------------------------------------------------
Intercept                          65.619    1.869  35.102 0.000  61.955  69.283
subject_gap                       -10.011    0.845 -11.849 0.000 -11.667  -8.355
object_gap              



#### Gap Distance

In [None]:
sentence_df = pd.read_csv('test_sentences/Gap Distance Construction.csv')

In [None]:
sentence_df[['temporal_modifier', 'end']] = sentence_df['temporal_modifier'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
#sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Gap Position == Object

In [None]:
# region surprisals are kept separate, in long format

In [73]:
# create long-format rows
region_rows = []
for region in ['to', 'goal']:
    region_df = sentence_df.copy()
    region_df['region'] = region 
    region_df['region_surprisal'] = region_df.apply(lambda x: sum_region_surprisal(x, [region]), axis = 1)
    region_rows.append(region_df)

# concatenate long-format DataFrame
long_df = pd.concat(region_rows, ignore_index = True)

# filter for object gap
objectgap_df = long_df[long_df['gap_position'] == 'object']
objectgap_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier,end,wh_numeric,sentence,surprisals,region,region_surprisal
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard,.,-1,James realized that the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",modifier,22.018002
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard,.,-1,James realized that chased the cat through the...,"[(James, 0.0), (realized, 15.564455032348633),...",modifier,19.303180
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard,.,-1,James realized that the dog chased through the...,"[(James, 0.0), (realized, 15.564455032348633),...",modifier,18.409940
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard,.,-1,James realized that chased through the yard.,"[(James, 0.0), (realized, 15.564455032348633),...",modifier,18.516019
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard,.,1,James realized what the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",modifier,24.675975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam,.,-1,The principal knows that helped after the exam.,"[(The, 0.0), (principal, 11.270773887634277), ...",end,2.186271
316,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam,.,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",end,1.559880
317,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam,.,1,The principal knows what helped the student af...,"[(The, 0.0), (principal, 11.270773887634277), ...",end,1.875990
318,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam,.,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",end,1.883400


In [None]:
iv_list = ['gap', 'wh_numeric', 'modifier']
mixed_effects_linear_regression(objectgap_df, iv_list, 'region_surprisal', 'gap distance: gap position == object')

Gap Postion == PP/Goal

In [None]:
pp_df = sentence_df.copy()
pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['temporal_modifier']), axis = 1)

# filter for PP gap
ppgap_df = pp_df[pp_df['gap_position'] == 'goal']
ppgap_df

In [None]:
iv_list = ['gap', 'wh_numeric', 'modifier']
mixed_effects_linear_regression(ppgap_df, iv_list, 'region_surprisal', 'gap distance: gap position == pp/goal')

Embedded Clause

In [None]:
embed = ['short modifier', 'medium modifier', 'long modifier', 'subj', 'verb', 'object', 'to', 'goal', 'temporal_modifier', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

Embedded Clause - Object Gap Position

In [None]:
# filter for object gap
embed_objectgap_df = sentence_df[sentence_df['gap_position'] == 'object]

In [None]:
iv_list = ['gap', 'wh_numeric', 'modifier']
mixed_effects_linear_regression(embed_objectgap_df, iv_list, 'embed_surprisal', 'gap distance: embedded clause object gap position')

Embedded Clause - PP/Goal Gap Position

In [None]:
# filter for PP gap
embed_ppgap_df = sentence_df[sentence_df['gap_position'] == 'goal']

In [None]:
iv_list = ['gap', 'wh_numeric', 'modifier']
mixed_effects_linear_regression(embed_ppgap_df, iv_list, 'embed_surprisal', 'gap distance: embedded clause pp/goal gap position')

#### Gap Position

In [None]:
sentence_df = pd.read_csv('test_sentences/Gap Position Construction.csv')

In [None]:
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Subject Position

In [None]:
subject_df = sentence_df.copy()

In [None]:
subject_df['region_surprisal'] = subject_df.apply(lambda x: sum_region_surprisal(x, ['verb']), axis = 1)
subject_df

In [None]:
# filter for subject gap
subject_df = subject_df[subject_df['gap_position'] == 'subject']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(subject_df, iv_list, 'region_surprisal', 'gap position: subject')

Object Position

In [None]:
object_df = sentence_df.copy()

In [None]:
object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['prep']), axis = 1)
object_df

In [None]:
# filter for object gap
object_df = object_df[object_df['gap_position'] == 'object']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(object_df, iv_list, 'region_surprisal', 'gap position: object')

PP/Goal Position

In [None]:
pp_df = sentence_df.copy()

In [None]:
pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['end']), axis = 1)
pp_df

In [None]:
# filter for pp gap
pp_df = pp_df[pp_df['gap_position'] == 'PP']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(pp_df, iv_list, 'region_surprisal', 'gap position: pp')

Embedded Clause

In [None]:
embed = ['apositive', 'NP1', 'verb', 'NP2', 'prep', 'NP3', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

In [None]:
iv_list = ['gap', 'wh_numeric', 'gap_position']
mixed_effects_linear_regression(pp_df, iv_list, 'region_surprisal', 'gap position: embedded clause')

#### Wh-Islands

In [None]:
sentence_df = pd.read_csv('test_sentences/Wh-Islands Construction.csv')

In [None]:
sentence_df[['continuation', 'end']] = sentence_df['continuation'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Continuation

In [None]:
sentence_df['region_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, ['continuation']), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(sentence_df, iv_list, 'region_surprisal', 'wh-islands continuation')

Embedded Clause

In [None]:
embed = ['compl', 'embed 1', 'whether', 'subj 2', 'vp 2', 'obj_2', 'continuation', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(sentence_df, iv_list, 'embed_surprisal', 'adjunct islands embed')

#### Adjunct Islands 

In [None]:
sentence_df = pd.read_csv('test_sentences/Adjunct Islands Construction.csv')

In [None]:
sentence_df[['continuation', 'end']] = sentence_df['continuation'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Continuation

In [None]:
sentence_df['region_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, ['continuation']), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(sentence_df, iv_list, 'region_surprisal', 'adjunct islands continuation')

Embedded Clause

In [None]:
embed = ['adjunct setup', 'subject', 'modifier', 'verb', 'object', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(sentence_df, iv_list, 'embed_surprisal', 'adjunct islands embed')

#### Complex NP Islands

In [None]:
sentence_df = pd.read_csv('test_sentences/Complex NP Islands Construction.csv')

In [None]:
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Subject Condition

In [None]:
subject_df = sentence_df.copy()

In [None]:
subject_df['region_surprisal'] = subject_df.apply(lambda x: sum_region_surprisal(x, ['subj_setup']), axis = 1)
subject_df

In [None]:
# filter for subject condition
subject_df = subject_df[subject_df['subj_obj'] == 'subject']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(subject_df, iv_list, 'region_surprisal', 'complex np islands: subject condition')

Object Condition

In [None]:
object_df = sentence_df.copy()

In [None]:
object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['end']), axis = 1)
object_df

In [None]:
# filter for object gap
object_df = object_df[object_df['subj_obj'] == 'object']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(object_df, iv_list, 'region_surprisal', 'complex np islands: object condition')

Embedded Clause

In [None]:
embed = ['subj', 'that_rc', 'what_rc', 'rc_np', 'prep', 'prep_np', 'subj_setup', 'obj_setup', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

Embedded Clause - Subject Condition

In [None]:
# filter for subject condition
subject_df = sentence_df[sentence_df['subj_obj'] == 'subject']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(subject_df, iv_list, 'region_surprisal', 'complex np islands: embedded clause subject condition')

Embedded Clause - Object Condition

In [None]:
# filter for object gap
object_df = sentence_df[sentence_df['subj_obj'] == 'object']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(object_df, iv_list, 'region_surprisal', 'complex np islands: embedded clause object condition')