In [1]:
# Minicons Installation
# Introduction can be found https://kanishka.xyz/post/minicons-running-large-scale-behavioral-analyses-on-transformer-lms/
# Tutorial and code can be found https://github.com/kanishkamisra/minicons/blob/master/examples/surprisals.md
#!pip install minicons

from minicons import scorer
import pandas as pd
import numpy as np
import json
import csv
import re
import matplotlib.pyplot as plt
#import seaborn as sns
import statsmodels.formula.api as smf
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was too old on your system - pyarrow 10.0.1 is the current minimum supported version as of this release.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


#### Resizing Model Embeddings (50527) to Match with Tokenizer Vocabulary Size (50528)

In [6]:
'''
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

model_path = "gpt2-small/checkpoint-trainedtokenizer_100M"

tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# print mismatch
print("Tokenizer vocab size:", len(tokenizer))
print("Model vocab size:", model.config.vocab_size)

# resize model embeddings to match tokenizer
if len(tokenizer) != model.config.vocab_size:
    print(f"Resizing model embeddings from {model.config.vocab_size} → {len(tokenizer)}")
    model.resize_token_embeddings(len(tokenizer))
    model.save_pretrained(model_path)
    print("Saved updated model.")
'''

Tokenizer vocab size: 50258
Model vocab size: 50257
Resizing model embeddings from 50257 → 50258
Saved updated model.
Special tokens map: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


In [2]:
#model_path = "gpt2-small/checkpoint-pretrainedtokenizer_10M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_10M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_10M_whitespace"
#model_path = "gpt2-small/checkpoint-pretrainedtokenizer_100M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_100M"
model_path = "gpt2-small/checkpoint-trainedtokenizer_100M_whitespace"

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)

# wrap with minicons scorer
lm_scorer = scorer.IncrementalLMScorer(model_path, device = "cpu")

In [9]:
print("Special tokens:", tokenizer.all_special_tokens)
print("Special token IDs:", tokenizer.all_special_ids)
print("Special tokens map:", tokenizer.special_tokens_map)

Special tokens: ['<|endoftext|>']
Special token IDs: [50257]
Special tokens map: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


In [12]:
surprisals

[[('<pad>', 0.0),
  ('ĠTheĠ', 6.771759033203125),
  ('balloon', 12.995538711547852),
  ('Ġwa', 3.036536693572998),
  ('s', 0.0033515978138893843),
  ('Ġinf', 14.873947143554688),
  ('lat', 3.8473756313323975),
  ('ingĠfor', 9.053845405578613),
  ('Ġ10', 10.562509536743164),
  ('Ġminute', 3.032411813735962),
  ('s', 0.0010208890307694674)]]

In [37]:
'''
def calculate_surprisal(sentence):
    '''
    #Takes in a sentence, and outputs surprisal values for each word.
    '''
    
    input_sentence = sentence # process per sentence, never in batches to avoid padding
    # token_score() function of Minicons takes in several parameters
    # if surprisal = True, the output value is surprisal instead of log likelihood
    # if base_two = True, the log likelihood will be in base 2
    # see Minicons documentations for details
    # score tokens
    token_surprisals = lm_scorer.token_score(input_sentence, surprisal = True, base_two = True)[0]
    #print(token_surprisals)

    # matching tokens manually back to words using offset mapping
    # tokenizer setup
    encoding = tokenizer(sentence, return_offsets_mapping = True, add_special_tokens = False)
    offsets = encoding['offset_mapping']
    token_ids = encoding['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # filter out special token surprisals (like <pad>) *not needed if we set add_special_tokens to False, but just to be safe
    special_tokens = set(tokenizer.all_special_tokens + ['<pad>'])
    filtered = [
        (token, score, span)
        for (token, score), span in zip(token_surprisals, offsets)
        if token not in special_tokens
    ]

    # prepare: group surprisals by words based on character spans
    words = re.findall(r"\S+", sentence)
    word_spans = []
    i = 0
    for match in re.finditer(r"\S+", sentence):
        start, end = match.span()
        word_spans.append((i, start, end))
        i += 1

    # assign tokens to words based on character alignment (needed since BPE tokenizers break words down into subwords/tokens)
    word_surprisals = []
    word_index = 0
    word_start, word_end = word_spans[word_index][1:3] # previously: word_spans.append((i, start, end)) [0, 1, 2]
    current_surprisal = 0.0
    
    for token, score, (start, end) in filtered:
        if start >= word_end:
            word_surprisals.append((words[word_index], current_surprisal))
            word_index += 1
            if word_index >= len(word_spans):
                break
            word_start, word_end = word_spans[word_index][1:3]
            current_surprisal = 0.0
        current_surprisal += score

    # append final word
    if word_index < len(words):
        word_surprisals.append((words[word_index], current_surprisal))

    return word_surprisals


sentence = 'The teacher realized what the storm rolled in while the student in the first year was studying for the test with great enthusiasm'
calculate_surprisal(sentence)
'''

[('The', 0.0),
 ('teacher', 11.844358444213867),
 ('realized', 13.042094230651855),
 ('what', 6.120745658874512),
 ('the', 3.2175395488739014),
 ('storm', 15.96143913269043),
 ('rolled', 12.999858856201172),
 ('in', 5.186878204345703),
 ('while', 10.483074188232422),
 ('the', 1.5718594789505005),
 ('student', 10.762317657470703),
 ('in', 7.2503581047058105),
 ('the', 1.2104510068893433),
 ('first', 7.770709991455078),
 ('year', 5.073796272277832),
 ('was', 1.570701003074646),
 ('studying', 10.941052436828613),
 ('for', 5.377425670623779),
 ('the', 1.5096884965896606),
 ('test', 7.936253547668457),
 ('with', 9.095281600952148),
 ('great', 9.63071346282959),
 ('enthusiasm', 6.821305751800537)]

In [3]:
def calculate_surprisal(sentence):
    '''
    Takes in a sentence, and outputs surprisal values for each word.
    '''
    
    input_sentence = sentence
    # token_score() function of Minicons takes in several parameters
    # if surprisal = True, the output value is surprisal instead of log likelihood
    # if base_two = True, the log likelihood will be in base 2
    # see Minicons documentations for details
    # score tokens
    token_surprisals = lm_scorer.token_score(input_sentence, surprisal = True, base_two = True)[0]
    #print(token_surprisals)

    '''
    # filter out special tokens (like <pad>)
    special_tokens = set(tokenizer.all_special_tokens + ['<pad>'])
    filtered = [
        (token, score)
        for (token, score) in token_surprisals
        if token not in special_tokens
    ]
    '''

   # expand tokens that contain multiple words
    expanded = []
    for token, score in token_surprisals:
    #for token, score in filtered:
        token = token.strip('Ġ') # remove space marker
        if token.count('Ġ') > 0:
            # multiple words inside
            words = token.split('Ġ')
            words = [word for word in words if word]  # remove empty strings
            for i, word in enumerate(words):
                expanded_token = word
                expanded.append((expanded_token, score / len(words)))  # split surprisal equally
        else:
            expanded.append((token, score))    
    #print(expanded)

    # use regex to split into words and punctuation
    words = re.findall(r'\w+|[^\w\s]', sentence)
    results = []

    token_pointer = 0

    for word in words:
        accumulated = ''
        word_surprisal = 0.0

        while token_pointer < len(expanded):
            token, surprisal = expanded[token_pointer]
            accumulated += token
            word_surprisal += surprisal
            token_pointer += 1

            if accumulated == word:
                results.append((word, word_surprisal))
                break
        else:
            results.append((word, word_surprisal))

    return results


sentence = 'I know that your friend gave a baguette to Mary last weekend.'
calculate_surprisal(sentence)

[('I', 0.0),
 ('know', 4.365269184112549),
 ('that', 3.4667656421661377),
 ('your', 9.112505912780762),
 ('friend', 6.371501922607422),
 ('gave', 10.674038887023926),
 ('a', 4.927915096282959),
 ('baguette', 38.19276142120361),
 ('to', 5.820535182952881),
 ('Mary', 13.028478622436523),
 ('last', 11.46209716796875),
 ('weekend', 7.374884128570557),
 ('.', 2.0132205486297607)]

In [4]:
def calculate_sentence_surprisal(word_surprisals):
    """Returns total surprisal and average surprisal per word."""
    
    scores = [score for word, score in word_surprisals]
    total = sum(scores)
    avg = total / len(scores)
    return total, avg

def sum_region_surprisal(row, region_list):
    '''Sums surprisals of a specified sentence region.'''
    
    sentence_surprisals = row['surprisals']
    region_text = ''
    for region in region_list:
        region_text += (str(row[region]) if pd.notna(row[region]) else '') + ' '
    
    # use regex to split region cleanly into words & punctuation
    region_units = re.findall(r'\w+|[^\w\s]', region_text)
    
    # sum surprisals matching region units
    region_surprisal = 0.0
    for token, score in sentence_surprisals:
        if token in region_units:
            region_surprisal += score
    
    return region_surprisal

def compute_wh_licensing_interaction(sentences):
    """
    sentences: dict with keys
        'fg' = +Filler, +Gap
        'fng' = +Filler, −Gap
        'nfg' = −Filler, +Gap
        'nfng' = −Filler, −Gap
    Each value is a sentence string.

    Returns:
        A dict with total surprisal per sentence,
        average surprisals per sentence,
        and the wh-licensing interaction score.
    """
    
    scores = {}
    for key, sentence in sentences.items():
        word_surprisals = calculate_surprisal(sentence)
        total, avg = calculate_sentence_surprisal(word_surprisals)
        scores[key] = {'total': total, 'avg': avg, 'details': word_surprisals}

    # compute wh-licensing interaction
    interaction = (
        (scores['fng']['total'] - scores['nfng']['total']) -
        (scores['fg']['total'] - scores['nfg']['total'])
    )

    return {
        'scores': scores,
        'interaction': interaction
    }


In [5]:
def split_ends(ends):
    '''Splits off sentence-final punctuation.'''
    
    match = re.match(r'^(.*?)([.!?])$', ends.strip())
    if match:
        return pd.Series([match.group(1), match.group(2)])
    else:
        return pd.Series([ends, ''])  # no end punctuation
    
def encode_wh_licensor(df):
    """
    Adds a numeric column to the DataFrame:
    - 1 if licensor is a wh-licensor
    - -1 otherwise (licensor is "that" or absent)
    """
    df['wh_numeric'] = df['filler'].apply(lambda x: 1 if x > 0 else -1)
    return df

def merge_sentence(row, syntactic_parts):
    '''Merges sentence columns back into full sentence.'''
    
    parts = [row[part] for part in syntactic_parts]         
    # filter out NaN or empty parts
    non_empty_parts = [str(part) for part in parts if pd.notna(part) and str(part).strip() != '']
    # join with spaces and add end punctuation
    sentence = ' '.join(non_empty_parts) + (row['end'] if pd.notna(row['end']) else '')
    return sentence

#### Statistical Analysis: Mixed-Effects Linear Regression Model

In [14]:
from IPython.display import display
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

def print_summary(result):
    """
    Prints the descriptive header from a statsmodels MixedLMResults summary,
    and displays both the fixed-effects and random-effects components as pandas DataFrames.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ConvergenceWarning)
        
        summary_str = str(result.summary())
        lines = summary_str.split('\n')

        # PART 1: HEADER
        print("\n=== MODEL SUMMARY ===")
        for line in lines:
            if 'Coef.' in line:
                break  # stop when reaching the coefficient table
            pairs = re.findall(r'(\S[^:]*):\s+([^\s][^:]*?)(?=\s{2,}|$)', line)
            for key, value in pairs:
                print(f"{key.strip()}: {value.strip()}")

        # PART 2a: FIXED EFFECTS TABLE
        fixed_table = []
        in_fixed_table = False
        random_start_idx = None

        for i, line in enumerate(lines):
            if 'Coef.' in line:
                in_fixed_table = True
                continue
            if in_fixed_table:
                if not line.strip() or set(line.strip()) in [{'='}, {'-'}]:
                    continue  # skip empty/separator lines
                parts = line.strip().split()
                if len(parts) >= 7:
                    param = parts[0]
                    row = parts[1:7]
                    fixed_table.append([param] + row)
                else:
                    random_start_idx = i
                    break

        if fixed_table:
            df_fixed = pd.DataFrame(fixed_table, columns = [
                'Parameter', 'Coef.', 'Std.Err.', 'z', 'P>|z|', '[0.025', '0.975]'
            ])
            print("\n=== FIXED EFFECTS COEFFICIENTS ===")
            display(df_fixed)
        else:
            print("\nNo fixed-effects data found.")

        # PART 2b: RANDOM EFFECTS / VARIANCE COMPONENTS TABLE
        random_table = []
        if random_start_idx:
            for line in lines[random_start_idx:]:
                if not line.strip() or set(line.strip()) in [{'='}, {'-'}]:
                    continue
                parts = line.strip().split()
                # Look for last token ending in Var or Cov
                varcov_idx = None
                for j in reversed(range(len(parts))):
                    if parts[j].endswith('Var') or parts[j].endswith('Cov'):
                        varcov_idx = j
                        break
                if varcov_idx is not None:
                    param = ' '.join(parts[:varcov_idx + 1])
                    numeric_parts = parts[varcov_idx + 1:]
                    if len(numeric_parts) == 2:
                        val1, val2 = numeric_parts
                    elif len(numeric_parts) == 1:
                        val1, val2 = numeric_parts[0], ''
                    else:
                        val1, val2 = '', ''
                    random_table.append([param, val1, val2])

        if random_table:
            df_random = pd.DataFrame(random_table, columns = ['Parameter', 'Coef.', 'Std.Err.'])
            print("\n=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===")
            display(df_random)
        else:
            print("\nNo random-effects data found.")


In [15]:
# columns: item_id (indicates sentence set), wh_licensor (0/1), gap (0/1), island_type, surprisal

warnings.simplefilter("ignore", ConvergenceWarning)

def mixed_effects_linear_regression(df, iv_list, surprisal, label):
    '''
    Fits mixed-effects model and extracts wh-licensing interaction.
    '''
        
    for field in iv_list:
        if field == 'wh_numeric' or 'gap_numeric':
            pass
        else:
            df[field] = df[field].astype('category')

    interaction_terms = ' * '.join(iv_list)
    random_effects = ' + '.join(iv_list)

    model = smf.mixedlm(
        f"{surprisal} ~ {interaction_terms}",
        df,
        groups = df["item_id"],
        #re_formula = f"~{random_effects}"
        re_formula = "1"
        )

    result = model.fit()
    #interaction_coef = result.params.get('wh_licensor[T.1]:gap[T.1]', None)

    print(f"\n=== {label.upper()} ===")
    #print(result.summary())
    print_summary(result)

    return result

#interaction = mixed_effects_linear_regression(df, "construction_type") # label name to be changed according to construction type

#### Gap Position

In [None]:
sentence_df = pd.read_csv('test_sentences/Gap Position Construction.csv')

In [None]:
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Subject Position

In [None]:
subject_df = sentence_df.copy()

In [None]:
subject_df['region_surprisal'] = subject_df.apply(lambda x: sum_region_surprisal(x, ['verb']), axis = 1)
subject_df

In [None]:
# filter for subject gap
subject_df = subject_df[subject_df['gap_position'] == 'subject']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(subject_df, iv_list, 'region_surprisal', 'gap position: subject')

Object Position

In [None]:
object_df = sentence_df.copy()

In [None]:
object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['prep']), axis = 1)
object_df

In [None]:
# filter for object gap
object_df = object_df[object_df['gap_position'] == 'object']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(object_df, iv_list, 'region_surprisal', 'gap position: object')

PP/Goal Position

In [None]:
pp_df = sentence_df.copy()

In [None]:
pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['end']), axis = 1)
pp_df

In [None]:
# filter for pp gap
pp_df = pp_df[pp_df['gap_position'] == 'PP']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(pp_df, iv_list, 'region_surprisal', 'gap position: pp')

Embedded Clause

In [None]:
embed = ['apositive', 'NP1', 'verb', 'NP2', 'prep', 'NP3', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'gap_position']
mixed_effects_linear_regression(pp_df, iv_list, 'region_surprisal', 'gap position: embedded clause')

#### Gap Distance

In [31]:
sentence_df = pd.read_csv('test_sentences/Gap Distance.csv')
sentence_df['gap_distance'] = sentence_df['gap_distance'].fillna('null')
#sentence_df['gap_distance'].dropna(inplace = True)
#sentence_df = sentence_df[sentence_df['gap_distance'].replace(['nan', 'NaN'], np.nan).notna()]
sentence_df

Unnamed: 0,item_id,condition,filler,gap_distance,gap,DO_gap,IO_gap,prefix,licensor,subj,modifier,verb,object,prep,goal,temp_mod,gap_position
0,1,1,0,,0,0,0,The manager predicts,that,the intern,,forwarded,an important email,to,the client,earlier this morning.,DO
1,1,2,0,short,0,0,0,The manager predicts,that,the intern,who you admire,forwarded,an important email,to,the client,earlier this morning.,DO
2,1,3,0,medium,0,0,0,The manager predicts,that,the intern,who you worked closely with on the project,forwarded,an important email,to,the client,earlier this morning.,DO
3,1,4,0,long,0,0,0,The manager predicts,that,the intern,who you recommended highly after the summer pr...,forwarded,an important email,to,the client,earlier this morning.,DO
4,1,5,0,,0,0,0,The manager predicts,that,the intern,,forwarded,an important email,to,the client,earlier this morning.,IO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,20,28,2,long,0,0,0,It was reported,who,the official,who coordinated disaster relief with agencies ...,allocated,additional resources,to,the emergency responders,following the storm.,IO
636,20,29,2,,1,0,1,It was reported,who,the official,,allocated,additional resources,to,,following the storm.,IO
637,20,30,2,short,1,0,1,It was reported,who,the official,who oversaw safety measures,allocated,additional resources,to,,following the storm.,IO
638,20,31,2,medium,1,0,1,It was reported,who,the official,who briefed the mayor’s office last week,allocated,additional resources,to,,following the storm.,IO


In [32]:
sentence_df[['temp_mod', 'end']] = sentence_df['temp_mod'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
syntactic_parts = ['prefix', 'licensor', 'subj', 'modifier', 'verb', 'object', 'prep', 'goal', 'temp_mod']
sentence_df['sentence'] = sentence_df.apply(lambda x: merge_sentence(x, syntactic_parts), axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Unnamed: 0,item_id,condition,filler,gap_distance,gap,DO_gap,IO_gap,prefix,licensor,subj,...,object,prep,goal,temp_mod,gap_position,end,wh_numeric,gap_numeric,sentence,surprisals
0,1,1,0,,0,0,0,The manager predicts,that,the intern,...,an important email,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p..."
1,1,2,0,short,0,0,0,The manager predicts,that,the intern,...,an important email,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you a...,"[(The, 0.0), (manager, 11.191156387329102), (p..."
2,1,3,0,medium,0,0,0,The manager predicts,that,the intern,...,an important email,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you w...,"[(The, 0.0), (manager, 11.191156387329102), (p..."
3,1,4,0,long,0,0,0,The manager predicts,that,the intern,...,an important email,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you r...,"[(The, 0.0), (manager, 11.191156387329102), (p..."
4,1,5,0,,0,0,0,The manager predicts,that,the intern,...,an important email,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,20,28,2,long,0,0,0,It was reported,who,the official,...,additional resources,to,the emergency responders,following the storm,IO,.,1,-1,It was reported who the official who coordinat...,"[(It, 0.0), (was, 3.0059664249420166), (report..."
636,20,29,2,,1,0,1,It was reported,who,the official,...,additional resources,to,,following the storm,IO,.,1,1,It was reported who the official allocated add...,"[(It, 0.0), (was, 3.0059664249420166), (report..."
637,20,30,2,short,1,0,1,It was reported,who,the official,...,additional resources,to,,following the storm,IO,.,1,1,It was reported who the official who oversaw s...,"[(It, 0.0), (was, 3.0059664249420166), (report..."
638,20,31,2,medium,1,0,1,It was reported,who,the official,...,additional resources,to,,following the storm,IO,.,1,1,It was reported who the official who briefed t...,"[(It, 0.0), (was, 3.0059664249420166), (report..."


Gap Position == Object

In [None]:
# region surprisals are kept separate, in long format
# modifier: short_mod, med_mod, long_mod

In [33]:
# create long-format rows
region_rows = []
for region in ['prep', 'goal']:
    region_df = sentence_df.copy()
    region_df['region'] = region 
    region_df['region_surprisal'] = region_df.apply(lambda x: sum_region_surprisal(x, [region]), axis = 1)
    region_rows.append(region_df)

# concatenate long-format DataFrame
long_df = pd.concat(region_rows, ignore_index = True)

# filter for object gap
objectgap_df = long_df[long_df['gap_position'] == 'DO']
objectgap_df

Unnamed: 0,item_id,condition,filler,gap_distance,gap,DO_gap,IO_gap,prefix,licensor,subj,...,goal,temp_mod,gap_position,end,wh_numeric,gap_numeric,sentence,surprisals,region,region_surprisal
0,1,1,0,,0,0,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.627493
1,1,2,0,short,0,0,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you a...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.458865
2,1,3,0,medium,0,0,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you w...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.568919
3,1,4,0,long,0,0,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you r...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.287273
8,1,9,0,,1,1,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.033572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,20,20,1,long,0,0,0,It was reported,what,the official,...,the emergency responders,following the storm,DO,.,1,-1,It was reported what the official who coordina...,"[(It, 0.0), (was, 3.0059664249420166), (report...",goal,42.282419
1268,20,21,1,,1,1,0,It was reported,what,the official,...,the emergency responders,following the storm,DO,.,1,1,It was reported what the official allocated to...,"[(It, 0.0), (was, 3.0059664249420166), (report...",goal,42.558905
1269,20,22,1,short,1,1,0,It was reported,what,the official,...,the emergency responders,following the storm,DO,.,1,1,It was reported what the official who oversaw ...,"[(It, 0.0), (was, 3.0059664249420166), (report...",goal,39.784677
1270,20,23,1,medium,1,1,0,It was reported,what,the official,...,the emergency responders,following the storm,DO,.,1,1,It was reported what the official who briefed ...,"[(It, 0.0), (was, 3.0059664249420166), (report...",goal,43.953444


In [34]:
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance']
mixed_effects_linear_regression(objectgap_df, iv_list, 'region_surprisal', 'gap distance: gap position == object')


=== GAP DISTANCE: GAP POSITION == OBJECT ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 640
Method: REML
No. Groups: 20
Scale: 107.6590
Min. group size: 32
Log-Likelihood: -2399.0847
Max. group size: 32
Converged: Yes
Mean group size: 32.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,12.937,1.087,11.902,0.0,10.807,15.068
1,gap_distance[T.medium],-0.157,1.16,-0.136,0.892,-2.431,2.116
2,gap_distance[T.null],-0.263,1.16,-0.227,0.821,-2.537,2.011
3,gap_distance[T.short],-0.646,1.16,-0.556,0.578,-2.919,1.628
4,gap_numeric,-0.324,0.82,-0.395,0.693,-1.932,1.284
5,gap_numeric:gap_distance[T.medium],0.094,1.16,0.081,0.935,-2.18,2.368
6,gap_numeric:gap_distance[T.null],0.216,1.16,0.187,0.852,-2.057,2.49
7,gap_numeric:gap_distance[T.short],0.104,1.16,0.09,0.928,-2.169,2.378
8,wh_numeric,0.383,0.82,0.466,0.641,-1.225,1.99
9,wh_numeric:gap_distance[T.medium],0.065,1.16,0.056,0.955,-2.209,2.339



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,10.175,0.43


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1537642d4bb0>

Gap Postion == PP/Goal

In [35]:
pp_df = sentence_df.copy()
pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['temp_mod']), axis = 1)

# filter for PP gap
ppgap_df = pp_df[pp_df['gap_position'] == 'IO']
ppgap_df

Unnamed: 0,item_id,condition,filler,gap_distance,gap,DO_gap,IO_gap,prefix,licensor,subj,...,prep,goal,temp_mod,gap_position,end,wh_numeric,gap_numeric,sentence,surprisals,region_surprisal
4,1,5,0,,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",23.143085
5,1,6,0,short,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern who you a...,"[(The, 0.0), (manager, 11.191156387329102), (p...",22.432522
6,1,7,0,medium,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern who you w...,"[(The, 0.0), (manager, 11.191156387329102), (p...",21.246870
7,1,8,0,long,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern who you r...,"[(The, 0.0), (manager, 11.191156387329102), (p...",21.573574
12,1,13,0,,1,0,1,The manager predicts,that,the intern,...,to,,earlier this morning,IO,.,-1,1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",24.799722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,20,28,2,long,0,0,0,It was reported,who,the official,...,to,the emergency responders,following the storm,IO,.,1,-1,It was reported who the official who coordinat...,"[(It, 0.0), (was, 3.0059664249420166), (report...",35.049214
636,20,29,2,,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official allocated add...,"[(It, 0.0), (was, 3.0059664249420166), (report...",40.583885
637,20,30,2,short,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official who oversaw s...,"[(It, 0.0), (was, 3.0059664249420166), (report...",38.801802
638,20,31,2,medium,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official who briefed t...,"[(It, 0.0), (was, 3.0059664249420166), (report...",41.385231


In [36]:
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance']
mixed_effects_linear_regression(ppgap_df, iv_list, 'region_surprisal', 'gap distance: gap position == pp/goal')


=== GAP DISTANCE: GAP POSITION == PP/GOAL ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 11.2094
Min. group size: 16
Log-Likelihood: -881.3909
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,28.698,2.319,12.377,0.0,24.154,33.243
1,gap_distance[T.medium],-0.639,0.529,-1.207,0.228,-1.676,0.399
2,gap_distance[T.null],-1.479,0.529,-2.793,0.005,-2.516,-0.441
3,gap_distance[T.short],-1.427,0.529,-2.696,0.007,-2.464,-0.389
4,gap_numeric,1.285,0.374,3.433,0.001,0.551,2.019
5,gap_numeric:gap_distance[T.medium],0.118,0.529,0.224,0.823,-0.919,1.156
6,gap_numeric:gap_distance[T.null],-0.158,0.529,-0.298,0.766,-1.195,0.88
7,gap_numeric:gap_distance[T.short],-0.169,0.529,-0.319,0.75,-1.206,0.869
8,wh_numeric,0.385,0.374,1.027,0.304,-0.349,1.118
9,wh_numeric:gap_distance[T.medium],0.097,0.529,0.183,0.855,-0.941,1.134



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,104.725,10.548


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1537647e8520>

Embedded Clause

In [37]:
embed = ['subj', 'modifier', 'verb', 'object', 'prep', 'goal', 'temp_mod', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

Unnamed: 0,item_id,condition,filler,gap_distance,gap,DO_gap,IO_gap,prefix,licensor,subj,...,prep,goal,temp_mod,gap_position,end,wh_numeric,gap_numeric,sentence,surprisals,embed_surprisal
0,1,1,0,,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",97.979562
1,1,2,0,short,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you a...,"[(The, 0.0), (manager, 11.191156387329102), (p...",133.317649
2,1,3,0,medium,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you w...,"[(The, 0.0), (manager, 11.191156387329102), (p...",159.217719
3,1,4,0,long,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you r...,"[(The, 0.0), (manager, 11.191156387329102), (p...",181.743787
4,1,5,0,,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",97.979562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,20,28,2,long,0,0,0,It was reported,who,the official,...,to,the emergency responders,following the storm,IO,.,1,-1,It was reported who the official who coordinat...,"[(It, 0.0), (was, 3.0059664249420166), (report...",296.047738
636,20,29,2,,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official allocated add...,"[(It, 0.0), (was, 3.0059664249420166), (report...",95.482954
637,20,30,2,short,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official who oversaw s...,"[(It, 0.0), (was, 3.0059664249420166), (report...",160.610425
638,20,31,2,medium,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official who briefed t...,"[(It, 0.0), (was, 3.0059664249420166), (report...",182.147220


Embedded Clause - Object Gap Position

In [38]:
# filter for object gap
embed_objectgap_df = sentence_df[sentence_df['gap_position'] == 'DO']
embed_objectgap_df

Unnamed: 0,item_id,condition,filler,gap_distance,gap,DO_gap,IO_gap,prefix,licensor,subj,...,prep,goal,temp_mod,gap_position,end,wh_numeric,gap_numeric,sentence,surprisals,embed_surprisal
0,1,1,0,,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",97.979562
1,1,2,0,short,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you a...,"[(The, 0.0), (manager, 11.191156387329102), (p...",133.317649
2,1,3,0,medium,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you w...,"[(The, 0.0), (manager, 11.191156387329102), (p...",159.217719
3,1,4,0,long,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you r...,"[(The, 0.0), (manager, 11.191156387329102), (p...",181.743787
8,1,9,0,,1,1,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,DO,.,-1,1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",71.131123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,20,20,1,long,0,0,0,It was reported,what,the official,...,to,the emergency responders,following the storm,DO,.,1,-1,It was reported what the official who coordina...,"[(It, 0.0), (was, 3.0059664249420166), (report...",286.078320
628,20,21,1,,1,1,0,It was reported,what,the official,...,to,the emergency responders,following the storm,DO,.,1,1,It was reported what the official allocated to...,"[(It, 0.0), (was, 3.0059664249420166), (report...",104.167779
629,20,22,1,short,1,1,0,It was reported,what,the official,...,to,the emergency responders,following the storm,DO,.,1,1,It was reported what the official who oversaw ...,"[(It, 0.0), (was, 3.0059664249420166), (report...",159.900716
630,20,23,1,medium,1,1,0,It was reported,what,the official,...,to,the emergency responders,following the storm,DO,.,1,1,It was reported what the official who briefed ...,"[(It, 0.0), (was, 3.0059664249420166), (report...",176.812508


In [39]:
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance']
mixed_effects_linear_regression(embed_objectgap_df, iv_list, 'embed_surprisal', 'gap distance: embedded clause object gap position')


=== GAP DISTANCE: EMBEDDED CLAUSE OBJECT GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 112.8945
Min. group size: 16
Log-Likelihood: -1219.6698
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,212.223,3.855,55.048,0.0,204.667,219.779
1,gap_distance[T.medium],-52.387,1.68,-31.183,0.0,-55.679,-49.094
2,gap_distance[T.null],-122.472,1.68,-72.901,0.0,-125.765,-119.18
3,gap_distance[T.short],-78.675,1.68,-46.83,0.0,-81.967,-75.382
4,gap_numeric,-12.439,1.188,-10.471,0.0,-14.768,-10.111
5,gap_numeric:gap_distance[T.medium],-0.307,1.68,-0.183,0.855,-3.6,2.985
6,gap_numeric:gap_distance[T.null],-0.294,1.68,-0.175,0.861,-3.587,2.998
7,gap_numeric:gap_distance[T.short],-0.086,1.68,-0.051,0.959,-3.379,3.206
8,wh_numeric,2.627,1.188,2.211,0.027,0.299,4.955
9,wh_numeric:gap_distance[T.medium],0.121,1.68,0.072,0.942,-3.171,3.414



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,269.039,8.707


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1537642b95b0>

Embedded Clause - PP/Goal Gap Position

In [40]:
# filter for PP gap
embed_ppgap_df = sentence_df[sentence_df['gap_position'] == 'IO']
embed_ppgap_df

Unnamed: 0,item_id,condition,filler,gap_distance,gap,DO_gap,IO_gap,prefix,licensor,subj,...,prep,goal,temp_mod,gap_position,end,wh_numeric,gap_numeric,sentence,surprisals,embed_surprisal
4,1,5,0,,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",97.979562
5,1,6,0,short,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern who you a...,"[(The, 0.0), (manager, 11.191156387329102), (p...",133.317649
6,1,7,0,medium,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern who you w...,"[(The, 0.0), (manager, 11.191156387329102), (p...",159.217719
7,1,8,0,long,0,0,0,The manager predicts,that,the intern,...,to,the client,earlier this morning,IO,.,-1,-1,The manager predicts that the intern who you r...,"[(The, 0.0), (manager, 11.191156387329102), (p...",181.743787
12,1,13,0,,1,0,1,The manager predicts,that,the intern,...,to,,earlier this morning,IO,.,-1,1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",92.284581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,20,28,2,long,0,0,0,It was reported,who,the official,...,to,the emergency responders,following the storm,IO,.,1,-1,It was reported who the official who coordinat...,"[(It, 0.0), (was, 3.0059664249420166), (report...",296.047738
636,20,29,2,,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official allocated add...,"[(It, 0.0), (was, 3.0059664249420166), (report...",95.482954
637,20,30,2,short,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official who oversaw s...,"[(It, 0.0), (was, 3.0059664249420166), (report...",160.610425
638,20,31,2,medium,1,0,1,It was reported,who,the official,...,to,,following the storm,IO,.,1,1,It was reported who the official who briefed t...,"[(It, 0.0), (was, 3.0059664249420166), (report...",182.147220


In [41]:
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance']
mixed_effects_linear_regression(embed_ppgap_df, iv_list, 'embed_surprisal', 'gap distance: embedded clause pp/goal gap position')


=== GAP DISTANCE: EMBEDDED CLAUSE PP/GOAL GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 124.7630
Min. group size: 16
Log-Likelihood: -1232.4252
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,221.824,3.602,61.591,0.0,214.765,228.883
1,gap_distance[T.medium],-52.105,1.766,-29.503,0.0,-55.566,-48.643
2,gap_distance[T.null],-127.357,1.766,-72.112,0.0,-130.818,-123.896
3,gap_distance[T.short],-79.032,1.766,-44.75,0.0,-82.493,-75.57
4,gap_numeric,-6.733,1.249,-5.391,0.0,-9.181,-4.285
5,gap_numeric:gap_distance[T.medium],-0.033,1.766,-0.019,0.985,-3.495,3.428
6,gap_numeric:gap_distance[T.null],-0.391,1.766,-0.221,0.825,-3.852,3.071
7,gap_numeric:gap_distance[T.short],-0.363,1.766,-0.206,0.837,-3.825,3.098
8,wh_numeric,6.508,1.249,5.211,0.0,4.06,8.956
9,wh_numeric:gap_distance[T.medium],0.216,1.766,0.122,0.903,-3.245,3.678



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,228.238,7.081


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x153764a3cee0>

#### Double Gap Construction

In [25]:
sentence_df = pd.read_csv('test_sentences/Double Gap.csv')
sentence_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard.
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard.
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard.
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard.
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard.
...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam.
156,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam.
157,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam.
158,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam.


In [26]:
sentence_df[['modifier', 'end']] = sentence_df['modifier'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
syntactic_parts = ['prefix', 'licensor', 'subj', 'verb', 'object', 'modifier']
sentence_df['sentence'] = sentence_df.apply(lambda x: merge_sentence(x, syntactic_parts), axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier,end,wh_numeric,gap_numeric,sentence,surprisals
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard,.,-1,-1,James realized that the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),..."
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard,.,-1,1,James realized that chased the cat through the...,"[(James, 0.0), (realized, 15.564455032348633),..."
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard,.,-1,1,James realized that the dog chased through the...,"[(James, 0.0), (realized, 15.564455032348633),..."
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard,.,-1,1,James realized that chased through the yard.,"[(James, 0.0), (realized, 15.564455032348633),..."
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard,.,1,-1,James realized what the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam,.,-1,1,The principal knows that helped after the exam.,"[(The, 0.0), (principal, 11.270773887634277), ..."
156,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam,.,1,-1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ..."
157,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam,.,1,1,The principal knows what helped the student af...,"[(The, 0.0), (principal, 11.270773887634277), ..."
158,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam,.,1,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ..."


In [18]:
#sentence_df.to_csv(('test_sentences/Double Gap.csv'), index = False)
#sentence_df = pd.read_csv('test_sentences/Double Gap.csv')

Modifier

In [27]:
sentence_df['region_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, ['modifier']), axis = 1)
sentence_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier,end,wh_numeric,gap_numeric,sentence,surprisals,region_surprisal
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard,.,-1,-1,James realized that the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",22.018002
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard,.,-1,1,James realized that chased the cat through the...,"[(James, 0.0), (realized, 15.564455032348633),...",19.303180
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard,.,-1,1,James realized that the dog chased through the...,"[(James, 0.0), (realized, 15.564455032348633),...",18.409940
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard,.,-1,1,James realized that chased through the yard.,"[(James, 0.0), (realized, 15.564455032348633),...",18.516019
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard,.,1,-1,James realized what the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",24.675975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam,.,-1,1,The principal knows that helped after the exam.,"[(The, 0.0), (principal, 11.270773887634277), ...",26.060441
156,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam,.,1,-1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",26.442219
157,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam,.,1,1,The principal knows what helped the student af...,"[(The, 0.0), (principal, 11.270773887634277), ...",23.145472
158,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam,.,1,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",26.938232


In [28]:
iv_list = ['subject_gap', 'object_gap', 'wh_numeric']
mixed_effects_linear_regression(sentence_df, iv_list, 'region_surprisal', 'double gap modifier')


=== DOUBLE GAP MODIFIER ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 3.6259
Min. group size: 8
Log-Likelihood: -366.2980
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,27.211,1.141,23.854,0.0,24.975,29.447
1,subject_gap,-3.785,0.426,-8.889,0.0,-4.619,-2.95
2,object_gap,-1.953,0.426,-4.587,0.0,-2.788,-1.119
3,subject_gap:object_gap,2.422,0.602,4.023,0.0,1.242,3.603
4,wh_numeric,1.287,0.301,4.273,0.0,0.696,1.877
5,subject_gap:wh_numeric,-1.16,0.426,-2.724,0.006,-1.994,-0.325
6,object_gap:wh_numeric,-1.384,0.426,-3.251,0.001,-2.219,-0.55
7,subject_gap:object_gap:wh_numeric,1.414,0.602,2.348,0.019,0.234,2.594



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,24.212,4.492


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x153764add460>

Embedded Clause

In [29]:
embed = ['subj', 'object', 'verb', 'modifier', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

Unnamed: 0,item_id,condition,filler,gap,subject_gap,object_gap,prefix,licensor,subj,verb,object,modifier,end,wh_numeric,gap_numeric,sentence,surprisals,region_surprisal,embed_surprisal
0,1,a,0,0,0,0,James realized,that,the dog,chased,the cat,through the yard,.,-1,-1,James realized that the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",22.018002,56.395520
1,1,b,0,1,1,0,James realized,that,,chased,the cat,through the yard,.,-1,1,James realized that chased the cat through the...,"[(James, 0.0), (realized, 15.564455032348633),...",19.303180,54.383725
2,1,c,0,1,0,1,James realized,that,the dog,chased,,through the yard,.,-1,1,James realized that the dog chased through the...,"[(James, 0.0), (realized, 15.564455032348633),...",18.409940,46.729862
3,1,d,0,1,1,1,James realized,that,,chased,,through the yard,.,-1,1,James realized that chased through the yard.,"[(James, 0.0), (realized, 15.564455032348633),...",18.516019,43.208366
4,1,e,1,0,0,0,James realized,what,the dog,chased,the cat,through the yard,.,1,-1,James realized what the dog chased the cat thr...,"[(James, 0.0), (realized, 15.564455032348633),...",24.675975,59.184489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,d,0,1,1,1,The principal knows,that,,helped,,after the exam,.,-1,1,The principal knows that helped after the exam.,"[(The, 0.0), (principal, 11.270773887634277), ...",26.060441,45.421412
156,20,e,1,0,0,0,The principal knows,what,the counselor,helped,the student,after the exam,.,1,-1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",26.442219,66.763715
157,20,f,1,1,1,0,The principal knows,what,,helped,the student,after the exam,.,1,1,The principal knows what helped the student af...,"[(The, 0.0), (principal, 11.270773887634277), ...",23.145472,50.321641
158,20,g,1,1,0,1,The principal knows,what,the counselor,helped,,after the exam,.,1,1,The principal knows what the counselor helped ...,"[(The, 0.0), (principal, 11.270773887634277), ...",26.938232,58.851973


In [30]:
iv_list = ['subject_gap', 'object_gap', 'wh_numeric']
result = mixed_effects_linear_regression(sentence_df, iv_list, 'embed_surprisal', 'double gap embed')
result


=== DOUBLE GAP EMBED ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 10.8595
Min. group size: 8
Log-Likelihood: -446.0910
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,65.619,1.655,39.651,0.0,62.375,68.863
1,subject_gap,-10.011,0.737,-13.585,0.0,-11.455,-8.566
2,object_gap,-12.531,0.737,-17.006,0.0,-13.975,-11.087
3,subject_gap:object_gap,1.283,1.042,1.231,0.218,-0.759,3.326
4,wh_numeric,1.733,0.521,3.326,0.001,0.712,2.754
5,subject_gap:wh_numeric,-1.92,0.737,-2.606,0.009,-3.365,-0.476
6,object_gap:wh_numeric,-1.51,0.737,-2.049,0.041,-2.954,-0.065
7,subject_gap:object_gap:wh_numeric,1.57,1.042,1.506,0.132,-0.473,3.612



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,49.346,5.337


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x153764ace400>

#### Wh-Islands

In [None]:
sentence_df = pd.read_csv('test_sentences/Wh-Islands Construction.csv')

In [None]:
sentence_df[['continuation', 'end']] = sentence_df['continuation'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Continuation

In [None]:
sentence_df['region_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, ['continuation']), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(sentence_df, iv_list, 'region_surprisal', 'wh-islands continuation')

Embedded Clause

In [None]:
embed = ['compl', 'embed 1', 'whether', 'subj 2', 'vp 2', 'obj_2', 'continuation', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(sentence_df, iv_list, 'embed_surprisal', 'adjunct islands embed')

#### Adjunct Islands 

In [None]:
sentence_df = pd.read_csv('test_sentences/Adjunct Islands Construction.csv')

In [None]:
sentence_df[['continuation', 'end']] = sentence_df['continuation'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Continuation

In [None]:
sentence_df['region_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, ['continuation']), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(sentence_df, iv_list, 'region_surprisal', 'adjunct islands continuation')

Embedded Clause

In [None]:
embed = ['adjunct setup', 'subject', 'modifier', 'verb', 'object', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(sentence_df, iv_list, 'embed_surprisal', 'adjunct islands embed')

#### Complex NP Islands

In [None]:
sentence_df = pd.read_csv('test_sentences/Complex NP Islands Construction.csv')

In [None]:
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Subject Condition

In [None]:
subject_df = sentence_df.copy()

In [None]:
subject_df['region_surprisal'] = subject_df.apply(lambda x: sum_region_surprisal(x, ['subj_setup']), axis = 1)
subject_df

In [None]:
# filter for subject condition
subject_df = subject_df[subject_df['subj_obj'] == 'subject']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(subject_df, iv_list, 'region_surprisal', 'complex np islands: subject condition')

Object Condition

In [None]:
object_df = sentence_df.copy()

In [None]:
object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['end']), axis = 1)
object_df

In [None]:
# filter for object gap
object_df = object_df[object_df['subj_obj'] == 'object']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(object_df, iv_list, 'region_surprisal', 'complex np islands: object condition')

Embedded Clause

In [None]:
embed = ['subj', 'that_rc', 'what_rc', 'rc_np', 'prep', 'prep_np', 'subj_setup', 'obj_setup', 'end']
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed), axis = 1)
sentence_df

Embedded Clause - Subject Condition

In [None]:
# filter for subject condition
subject_df = sentence_df[sentence_df['subj_obj'] == 'subject']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(subject_df, iv_list, 'region_surprisal', 'complex np islands: embedded clause subject condition')

Embedded Clause - Object Condition

In [None]:
# filter for object gap
object_df = sentence_df[sentence_df['subj_obj'] == 'object']

In [None]:
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(object_df, iv_list, 'region_surprisal', 'complex np islands: embedded clause object condition')