In [23]:
# Minicons Installation
# Introduction can be found https://kanishka.xyz/post/minicons-running-large-scale-behavioral-analyses-on-transformer-lms/
# Tutorial and code can be found https://github.com/kanishkamisra/minicons/blob/master/examples/surprisals.md
#!pip install minicons

from minicons import scorer
import pandas as pd
import numpy as np
import json
import csv
import re
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

#### Resizing Model Embeddings (50257) to Match the Tokenizer Vocabulary Size (50258)

In [6]:
'''
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

model_path = "gpt2-small/checkpoint-trainedtokenizer_100M"

tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# print mismatch
print("Tokenizer vocab size:", len(tokenizer))
print("Model vocab size:", model.config.vocab_size)

# resize model embeddings to match tokenizer
if len(tokenizer) != model.config.vocab_size:
    print(f"Resizing model embeddings from {model.config.vocab_size} → {len(tokenizer)}")
    model.resize_token_embeddings(len(tokenizer))
    model.save_pretrained(model_path)
    print("Saved updated model.")
'''

Tokenizer vocab size: 50258
Model vocab size: 50257
Resizing model embeddings from 50257 → 50258
Saved updated model.
Special tokens map: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


In [2]:
# Checkpoint selection: exactly one `model_path` assignment is left active;
# the commented alternatives are the other checkpoint directories.
#model_path = "gpt2-small/checkpoint-pretrainedtokenizer_10M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_10M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_10M_whitespace"
#model_path = "gpt2-small/checkpoint-pretrainedtokenizer_100M"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_100M"
model_path = "gpt2-small/checkpoint-trainedtokenizer_100M_whitespace"

# standalone model and tokenizer objects loaded from the checkpoint directory
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)

# wrap with minicons scorer
# (the scorer is built from the path, not from the `model` object above; scoring runs on the CPU)
lm_scorer = scorer.IncrementalLMScorer(model_path, device = "cpu")

In [9]:
# Inspect which special tokens (and ids) the loaded tokenizer defines
print("Special tokens:", tokenizer.all_special_tokens)
print("Special token IDs:", tokenizer.all_special_ids)
print("Special tokens map:", tokenizer.special_tokens_map)

Special tokens: ['<|endoftext|>']
Special token IDs: [50257]
Special tokens map: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


In [12]:
# NOTE(review): `surprisals` is not assigned in any cell visible here, so this
# cell presumably relies on state from a deleted/other cell and would fail on a
# fresh kernel — confirm and restore the defining call.
surprisals

[[('<pad>', 0.0),
  ('ĠTheĠ', 6.771759033203125),
  ('balloon', 12.995538711547852),
  ('Ġwa', 3.036536693572998),
  ('s', 0.0033515978138893843),
  ('Ġinf', 14.873947143554688),
  ('lat', 3.8473756313323975),
  ('ingĠfor', 9.053845405578613),
  ('Ġ10', 10.562509536743164),
  ('Ġminute', 3.032411813735962),
  ('s', 0.0010208890307694674)]]

In [37]:
"""'
def calculate_surprisal(sentence):
    '''
    Takes in a sentence, and outputs surprisal values for each word.
    '''
    
    input_sentence = sentence # process per sentence, never in batches to avoid padding
    # token_score() function of Minicons takes in several parameters
    # if surprisal = True, the output value is surprisal instead of log likelihood
    # if base_two = True, the log likelihood will be in base 2
    # see Minicons documentations for details
    # score tokens
    token_surprisals = lm_scorer.token_score(input_sentence, surprisal = True, base_two = True)[0]
    #print(token_surprisals)

    # matching tokens manually back to words using offset mapping
    # tokenizer setup
    encoding = tokenizer(sentence, return_offsets_mapping = True, add_special_tokens = False)
    offsets = encoding['offset_mapping']
    token_ids = encoding['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # filter out special token surprisals (like <pad>) *not needed if we set add_special_tokens to False, but just to be safe
    special_tokens = set(tokenizer.all_special_tokens + ['<pad>'])
    filtered = [
        (token, score, span)
        for (token, score), span in zip(token_surprisals, offsets)
        if token not in special_tokens
    ]

    # prepare: group surprisals by words based on character spans
    words = re.findall(r"\S+", sentence)
    word_spans = []
    i = 0
    for match in re.finditer(r"\S+", sentence):
        start, end = match.span()
        word_spans.append((i, start, end))
        i += 1

    # assign tokens to words based on character alignment (needed since BPE tokenizers break words down into subwords/tokens)
    word_surprisals = []
    word_index = 0
    word_start, word_end = word_spans[word_index][1:3] # previously: word_spans.append((i, start, end)) [0, 1, 2]
    current_surprisal = 0.0
    
    for token, score, (start, end) in filtered:
        if start >= word_end:
            word_surprisals.append((words[word_index], current_surprisal))
            word_index += 1
            if word_index >= len(word_spans):
                break
            word_start, word_end = word_spans[word_index][1:3]
            current_surprisal = 0.0
        current_surprisal += score

    # append final word
    if word_index < len(words):
        word_surprisals.append((words[word_index], current_surprisal))

    return word_surprisals


sentence = 'The teacher realized what the storm rolled in while the student in the first year was studying for the test with great enthusiasm'
calculate_surprisal(sentence)
"""

[('The', 0.0),
 ('teacher', 11.844358444213867),
 ('realized', 13.042094230651855),
 ('what', 6.120745658874512),
 ('the', 3.2175395488739014),
 ('storm', 15.96143913269043),
 ('rolled', 12.999858856201172),
 ('in', 5.186878204345703),
 ('while', 10.483074188232422),
 ('the', 1.5718594789505005),
 ('student', 10.762317657470703),
 ('in', 7.2503581047058105),
 ('the', 1.2104510068893433),
 ('first', 7.770709991455078),
 ('year', 5.073796272277832),
 ('was', 1.570701003074646),
 ('studying', 10.941052436828613),
 ('for', 5.377425670623779),
 ('the', 1.5096884965896606),
 ('test', 7.936253547668457),
 ('with', 9.095281600952148),
 ('great', 9.63071346282959),
 ('enthusiasm', 6.821305751800537)]

In [3]:
def calculate_surprisal(sentence):
    '''
    Takes in a sentence, and outputs surprisal values for each word.

    The sentence is scored token by token with the notebook-level minicons
    scorer `lm_scorer`; token surprisals (in bits) are then summed back into
    "words", where a word is either a run of word characters or a single
    punctuation mark.

    Parameters
    ----------
    sentence : str
        The sentence to score (one sentence per call).

    Returns
    -------
    list of (str, float)
        One (word, surprisal) pair per word/punctuation unit of `sentence`,
        in sentence order.
    '''

    # token_score() function of Minicons takes in several parameters
    # if surprisal = True, the output value is surprisal instead of log likelihood
    # if base_two = True, the log likelihood will be in base 2
    # [0]: token_score() returns one list per input sentence
    token_surprisals = lm_scorer.token_score(sentence, surprisal = True, base_two = True)[0]

    # Special tokens (e.g. a leading '<pad>' placeholder) never match any text in
    # the sentence; if they were kept, the first word could never be matched and
    # would absorb the surprisal of every token.
    special_tokens = set(tokenizer.all_special_tokens + ['<pad>'])

    # expand tokens that contain multiple words ('Ġ' is the tokenizer's space marker)
    expanded = []
    for token, score in token_surprisals:
        if token in special_tokens:
            continue
        pieces = [piece for piece in token.split('Ġ') if piece]
        if not pieces:
            # token consisted of space markers only: keep its score, it adds no text
            expanded.append(('', score))
            continue
        share = score / len(pieces)  # split surprisal equally across the words in the token
        expanded.extend((piece, share) for piece in pieces)

    # use regex to split into words and punctuation
    words = re.findall(r'\w+|[^\w\s]', sentence)
    results = []

    token_pointer = 0

    for word in words:
        accumulated = ''
        word_surprisal = 0.0

        # consume tokens until their concatenation spells the word
        # (or the tokens run out, in which case the word keeps what was accumulated)
        while token_pointer < len(expanded):
            token, surprisal = expanded[token_pointer]
            accumulated += token
            word_surprisal += surprisal
            token_pointer += 1

            if accumulated == word:
                break

        results.append((word, word_surprisal))

    return results


# quick check on a single example sentence; the cell displays the returned (word, surprisal) pairs
sentence = 'I know that your friend gave a baguette to Mary last weekend.'
calculate_surprisal(sentence)

[('I', 0.0),
 ('know', 4.365269184112549),
 ('that', 3.4667656421661377),
 ('your', 9.112505912780762),
 ('friend', 6.371501922607422),
 ('gave', 10.674038887023926),
 ('a', 4.927915096282959),
 ('baguette', 38.19276142120361),
 ('to', 5.820535182952881),
 ('Mary', 13.028478622436523),
 ('last', 11.46209716796875),
 ('weekend', 7.374884128570557),
 ('.', 2.0132205486297607)]

In [4]:
def calculate_sentence_surprisal(word_surprisals):
    '''
    Returns total surprisal and average surprisal per word.

    Parameters
    ----------
    word_surprisals : list of (str, float)
        (word, surprisal) pairs, e.g. the output of calculate_surprisal().

    Returns
    -------
    (float, float)
        (total, average). For an empty list the total is 0.0 and the average
        is NaN, since a mean over zero words is undefined.
    '''

    scores = [score for word, score in word_surprisals]
    if not scores:
        # avoid ZeroDivisionError on an empty sentence
        return 0.0, float('nan')
    total = sum(scores)
    avg = total / len(scores)
    return total, avg

"""
def sum_region_surprisal(row, region_list):
    '''Sums surprisals of a specified sentence region, extracting the relevant part from sentence_surprisals using index information.'''

    sentence_surprisals = row['surprisals']
    
    # retrieve all sentence column(from 'prefix' till 'end')
    all_columns = row.index.tolist()  # get the list of column names
    prefix_index = all_columns.index('prefix')
    eos_index = all_columns.index('end')
    
    # reconstruct the full sentence from 'prefix' to 'end' (exclude NaN values)
    full_sentence = ' '.join([str(row[col]) if pd.notna(row.get(col)) else '' for col in all_columns[prefix_index:eos_index + 1]])
    
    # use regex to split the full sentence into words & punctuation
    sentence_units = re.findall(r'\w+|[^\w\s]', full_sentence)
    
    # get the start and end indices for the region_list within the full sentence
    first_region_column = region_list[0]
    first_region_start_index = all_columns.index(first_region_column)
    
    last_region_column = region_list[-1]
    last_region_end_index = all_columns.index(last_region_column)
    
    # extract the relevant slice of sentence_surprisals that corresponds to the region_list
    # calculate the starting and ending index for the region_list part
    start_index = sum([
        len(re.findall(r'\w+|[^\w\s]', str(row[col]))) for col in all_columns[prefix_index:first_region_start_index]
        if pd.notna(row[col]) 
    ])
    end_index = sum([
        len(re.findall(r'\w+|[^\w\s]', str(row[col]))) for col in all_columns[prefix_index:last_region_end_index + 1]
        if pd.notna(row[col]) 
    ])

    # extract the slice of sentence_surprisals corresponding to the region_list part
    relevant_surprisals = sentence_surprisals[start_index:end_index]
    region_surprisal = sum([score for token, score in relevant_surprisals])
    
    return region_surprisal
"""

def _count_sentence_units(row, columns):
    '''Counts the word/punctuation units in the non-missing cells of the given columns.'''
    return sum(
        len(re.findall(r'\w+|[^\w\s]', str(row[col])))
        for col in columns
        if pd.notna(row[col])
    )


def _span_surprisal(row, all_columns, prefix_index, first_index, last_index):
    '''Sums the word surprisals belonging to the columns first_index..last_index (inclusive).'''
    # positions in row['surprisals'] are found by counting the units of every
    # column from 'prefix' up to the span start / span end
    start_index = _count_sentence_units(row, all_columns[prefix_index:first_index])
    end_index = _count_sentence_units(row, all_columns[prefix_index:last_index + 1])
    return sum(score for token, score in row['surprisals'][start_index:end_index])


def sum_region_surprisal(row, region_list, priority_region = None, normalize = False):
    '''
    Sums surprisals of a specified sentence region with optional priority region handling.

    The row is expected to hold the sentence split over the columns from
    'prefix' to 'end' (in column order) plus a 'surprisals' column with one
    (word, surprisal) pair per word/punctuation unit. Every column between
    'prefix' and 'end' is counted as sentence text when locating a region.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row (as passed by DataFrame.apply(..., axis = 1)).
    region_list : list of str
        Column names of the region. Only the first and the last entry are
        used: the region spans every column from the first to the last one.
    priority_region : str, optional
        Column that takes precedence: if its cell is not missing and its summed
        surprisal is non-zero, that sum is returned instead of the region sum.
        NOTE(review): this shortcut returns the raw sum even when `normalize`
        is set — confirm that is intended.
    normalize : bool
        If truthy, the region sum is divided by the number of word/punctuation
        units in the columns 'prefix'..'end'.

    Returns
    -------
    float
        The (optionally normalized) summed surprisal.

    Raises
    ------
    ValueError
        If 'prefix', 'end' or a requested region column is not in the row.
    '''

    # retrieve all sentence columns (from 'prefix' till 'end')
    all_columns = row.index.tolist()
    prefix_index = all_columns.index('prefix')
    eos_index = all_columns.index('end')

    if priority_region and pd.notna(row[priority_region]):
        priority_index = all_columns.index(priority_region)
        priority_surprisal = _span_surprisal(row, all_columns, prefix_index, priority_index, priority_index)

        # return priority region's surprisal score if it is non-zero
        if priority_surprisal != 0:
            return priority_surprisal

    # priority region not provided, missing, or zero: fall back to the region_list span
    first_region_index = all_columns.index(region_list[0])
    last_region_index = all_columns.index(region_list[-1])
    region_surprisal = _span_surprisal(row, all_columns, prefix_index, first_region_index, last_region_index)

    # truthiness test: the former `== False` / `== True` chain returned None
    # for any other value of `normalize`
    if not normalize:
        return region_surprisal

    sentence_length = _count_sentence_units(row, all_columns[prefix_index:eos_index + 1])
    return region_surprisal / sentence_length

In [5]:
def split_ends(ends):
    '''
    Splits off sentence-final punctuation.

    Parameters
    ----------
    ends : str (or a missing value such as NaN/None)
        Text of the last sentence part.

    Returns
    -------
    pandas.Series
        Two elements: the text without its final '.', '!' or '?' and that
        punctuation mark. If there is no final punctuation, or the cell is
        not a string, the input is returned unchanged with '' as punctuation.
    '''

    # missing cells (NaN/None) have no .strip(); pass them through untouched
    if not isinstance(ends, str):
        return pd.Series([ends, ''])

    match = re.match(r'^(.*?)([.!?])$', ends.strip())
    if match:
        return pd.Series([match.group(1), match.group(2)])
    return pd.Series([ends, ''])  # no end punctuation
    
def encode_wh_licensor(df):
    '''
    Writes a sum-coded `wh_numeric` column into the DataFrame and returns it.

    Each value of the `filler` column is mapped to
    - 1 when it is greater than zero (wh-licensor present),
    - -1 in every other case (licensor is "that" or absent).

    The column is added to `df` itself; the same object is handed back.
    '''
    def _contrast_code(filler_value):
        if filler_value > 0:
            return 1
        return -1

    wh_codes = df['filler'].apply(_contrast_code)
    df['wh_numeric'] = wh_codes
    return df

def merge_sentence(row, syntactic_parts):
    '''
    Merges sentence columns back into full sentence.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row holding the sentence parts and an 'end' column.
    syntactic_parts : list of str
        Column names to join, in sentence order.

    Returns
    -------
    str
        The non-empty parts joined by single spaces, followed directly by the
        value of row['end'] (nothing if that cell is missing).
    '''

    non_empty_parts = []
    for part in (row[name] for name in syntactic_parts):
        # filter out NaN parts
        if pd.isna(part):
            continue
        # strip before joining so stray whitespace in a cell cannot leak into
        # the sentence as doubled spaces or a space before the end punctuation
        text = str(part).strip()
        if text:
            non_empty_parts.append(text)

    # join with spaces and add end punctuation
    end_punctuation = row['end'] if pd.notna(row['end']) else ''
    return ' '.join(non_empty_parts) + end_punctuation

#### Statistical Analysis: Mixed-Effects Linear Regression Model

In [6]:
from IPython.display import display
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

def print_summary(result):
    '''
    Pretty-prints a fitted model by parsing the text of `result.summary()`.

    Three things are emitted, in this order:
    1. the "key: value" pairs of the summary header (everything above the
       coefficient table),
    2. the fixed-effects coefficient table as a pandas DataFrame,
    3. the random-effects / variance-covariance rows as a pandas DataFrame.

    All table cells are kept as the strings found in the summary text.
    ConvergenceWarning is silenced while the summary is built and printed.
    '''
    def _is_blank_or_rule(text):
        # empty lines and '====' / '----' separator lines carry no data
        stripped = text.strip()
        return not stripped or set(stripped) in [{'='}, {'-'}]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ConvergenceWarning)

        lines = str(result.summary()).split('\n')

        # PART 1: HEADER (stops at the coefficient table header)
        print("\n=== MODEL SUMMARY ===")
        for header_line in lines:
            if 'Coef.' in header_line:
                break
            for key, value in re.findall(r'(\S[^:]*):\s+([^\s][^:]*?)(?=\s{2,}|$)', header_line):
                print(f"{key.strip()}: {value.strip()}")

        # PART 2a: FIXED EFFECTS TABLE
        fixed_rows = []
        random_start_idx = None
        seen_coef_header = False

        for idx, text in enumerate(lines):
            if 'Coef.' in text:
                seen_coef_header = True
                continue
            if not seen_coef_header or _is_blank_or_rule(text):
                continue
            fields = text.split()
            if len(fields) < 7:
                # first row that is too short to be a coefficient row:
                # the random-effects part starts here
                random_start_idx = idx
                break
            # parameter name followed by its six numeric columns
            fixed_rows.append(fields[:7])

        if fixed_rows:
            df_fixed = pd.DataFrame(fixed_rows, columns = [
                'Parameter', 'Coef.', 'Std.Err.', 'z', 'P>|z|', '[0.025', '0.975]'
            ])
            print("\n=== FIXED EFFECTS COEFFICIENTS ===")
            display(df_fixed)
        else:
            print("\nNo fixed-effects data found.")

        # PART 2b: RANDOM EFFECTS / VARIANCE COMPONENTS TABLE
        random_rows = []
        if random_start_idx:
            for text in lines[random_start_idx:]:
                if _is_blank_or_rule(text):
                    continue
                fields = text.split()
                # the parameter label ends at the last field ending in Var or Cov
                label_end = next(
                    (pos for pos in range(len(fields) - 1, -1, -1)
                     if fields[pos].endswith('Var') or fields[pos].endswith('Cov')),
                    None,
                )
                if label_end is None:
                    continue
                label = ' '.join(fields[:label_end + 1])
                values = fields[label_end + 1:]
                if len(values) == 2:
                    coef, std_err = values
                elif len(values) == 1:
                    coef, std_err = values[0], ''
                else:
                    coef, std_err = '', ''
                random_rows.append([label, coef, std_err])

        if random_rows:
            df_random = pd.DataFrame(random_rows, columns = ['Parameter', 'Coef.', 'Std.Err.'])
            print("\n=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===")
            display(df_random)
        else:
            print("\nNo random-effects data found.")


In [7]:
# columns: item_id (indicates sentence set), wh_licensor (0/1), gap (0/1), island_type, surprisal

# Silence statsmodels ConvergenceWarning from here on: this installs a
# process-wide filter (it is not scoped to this cell or to a single model fit).
warnings.simplefilter("ignore", ConvergenceWarning)

def mixed_effects_linear_regression(df, iv_list, surprisal, label):
    '''
    Fits a mixed-effects linear regression and prints a formatted summary.

    The fixed-effects formula crosses all predictors in `iv_list` with `*`
    (main effects plus every interaction); observations are grouped by the
    `item_id` column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the predictor columns, the dependent column and `item_id`.
    iv_list : list of str
        Names of the predictor columns.
    surprisal : str
        Name of the dependent-variable column.
    label : str
        Title printed (upper-cased) above the summary.

    Returns
    -------
    The fitted result object returned by `model.fit()`.
    '''

    interaction_terms = ' * '.join(iv_list)

    model = smf.mixedlm(
        f"{surprisal} ~ {interaction_terms}",
        df,
        groups = df["item_id"],
        # random intercept per item only; for random slopes over the predictors
        # use re_formula = "~" + " + ".join(iv_list) instead
        re_formula = "1"
        )

    result = model.fit()

    print(f"\n=== {label.upper()} ===")
    print_summary(result)

    return result

#interaction = mixed_effects_linear_regression(df, "construction_type") # label name to be changed according to construction type

#### Gap Position

In [None]:
# load the gap-position test items (path is relative to the notebook's working directory)
sentence_df = pd.read_csv('test_sentences/Gap Position.csv')

In [None]:
# sum-code the two factors: wh-licensor (1 / -1) and gap (1 / -1)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)

# merge_sentence() needs the list of columns to join; calling it through
# apply() without that argument raises TypeError.
# NOTE(review): assumes every column from 'prefix' up to (not including) 'end'
# is a sentence part — the same layout sum_region_surprisal() relies on.
# Confirm against the CSV and list the columns explicitly if it differs.
gap_position_columns = sentence_df.columns.tolist()
syntactic_parts = gap_position_columns[gap_position_columns.index('prefix'):gap_position_columns.index('end')]
sentence_df['sentence'] = sentence_df.apply(lambda x: merge_sentence(x, syntactic_parts), axis = 1)

# per-word surprisals for every sentence
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df

Subject Position

In [None]:
# work on a copy so the columns added for this analysis do not end up in sentence_df
subject_df = sentence_df.copy()

In [None]:
# critical region for the subject-gap analysis: the 'verb' column
subject_df['region_surprisal'] = subject_df.apply(lambda x: sum_region_surprisal(x, ['verb']), axis = 1)
subject_df

In [None]:
# filter for subject gap: keep only the rows whose gap_position is 'subject'
subject_df = subject_df[subject_df['gap_position'] == 'subject']

In [None]:
# wh-licensor x gap model on the subject-gap rows
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(subject_df, iv_list, 'region_surprisal', 'gap position: subject')

Object Position

In [None]:
# work on a copy so the columns added for this analysis do not end up in sentence_df
object_df = sentence_df.copy()

In [None]:
# critical region for the object-gap analysis: the 'prep' column
object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['prep']), axis = 1)
object_df

In [None]:
# filter for object gap: keep only the rows whose gap_position is 'object'
object_df = object_df[object_df['gap_position'] == 'object']

In [None]:
# wh-licensor x gap model on the object-gap rows
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(object_df, iv_list, 'region_surprisal', 'gap position: object')

PP/Goal Position

In [None]:
# work on a copy so the columns added for this analysis do not end up in sentence_df
pp_df = sentence_df.copy()

In [None]:
# critical region for the PP-gap analysis: the 'end' column
pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['end']), axis = 1)
pp_df

In [None]:
# filter for pp gap: keep only the rows whose gap_position is 'PP'
pp_df = pp_df[pp_df['gap_position'] == 'PP']

In [None]:
# wh-licensor x gap model on the PP-gap rows
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(pp_df, iv_list, 'region_surprisal', 'gap position: pp')

Embedded Clause

In [None]:
# Embedded-clause region. sum_region_surprisal() only looks at the first and
# last entry of the list, i.e. it sums every column from 'apositive' through 'end'.
embed = ['apositive', 'NP1', 'verb', 'NP2', 'prep', 'NP3', 'end']
# normalize = True divides the region sum by the number of word/punctuation
# units counted over the columns 'prefix'..'end' (the whole sentence, not the region)
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed, normalize = True), axis = 1)
sentence_df

In [None]:
# Embedded-clause model: fit on the full data set and on the normalized
# embedded-clause surprisal computed into sentence_df['embed_surprisal'].
# (pp_df holds only the PP-gap rows, so gap_position would have a single level
# there, and its 'region_surprisal' column is the PP critical region.)
iv_list = ['wh_numeric', 'gap_numeric', 'gap_position']
mixed_effects_linear_regression(sentence_df, iv_list, 'embed_surprisal', 'gap position: embedded clause')

#### Gap Distance - Categorical/Continuous

In [8]:
# load the gap-distance test items (this rebinds sentence_df, replacing the gap-position data)
sentence_df = pd.read_csv('test_sentences/Gap Distance.csv')
# missing gap_distance values become the literal label 'null', which is later
# declared as the first level of the ordered categorical
sentence_df['gap_distance'] = sentence_df['gap_distance'].fillna('null')
#sentence_df['gap_distance'].dropna(inplace = True)
#sentence_df = sentence_df[sentence_df['gap_distance'].replace(['nan', 'NaN'], np.nan).notna()]

In [9]:
# split the sentence-final punctuation off 'temp_mod' into its own 'end' column
# NOTE(review): re-running this cell on already-split data sets 'end' to '' — run once per load
sentence_df[['temp_mod', 'end']] = sentence_df['temp_mod'].apply(split_ends)
# sum-code the two factors: wh-licensor (1 / -1) and gap (1 / -1)
sentence_df = encode_wh_licensor(sentence_df)
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
# sentence parts in sentence order; merge_sentence() appends 'end' itself
syntactic_parts = ['prefix', 'licensor', 'subj', 'modifier', 'verb', 'object', 'prep', 'goal', 'temp_mod']
sentence_df['sentence'] = sentence_df.apply(lambda x: merge_sentence(x, syntactic_parts), axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
# continuous distance measure: 0 when there is no modifier
# NOTE(review): len() of a modifier string counts characters, not words — confirm this is the intended unit
sentence_df['gap_distance_length'] = sentence_df['modifier'].apply(lambda x: len(x) if pd.notna(x) else 0)
# ordered categorical distance measure; 'null' (no modifier) is the first level
sentence_df['gap_distance'] = pd.Categorical(sentence_df['gap_distance'], categories = ['null', 'short', 'medium', 'long'], ordered = True)

Gap Position == Object

In [None]:
# modifier: short_mod, med_mod, long_mod

In [44]:
"""
# region surprisals are kept separate, in long format
# create long-format rows
region_rows = []
for region in ['prep', 'goal']:
    region_df = sentence_df.copy()
    region_df['region'] = region 
    region_df['region_surprisal'] = region_df.apply(lambda x: sum_region_surprisal(x, [region]), axis = 1)
    region_rows.append(region_df)

# concatenate long-format DataFrame
long_df = pd.concat(region_rows, ignore_index = True)

# filter for object gap
objectgap_df = long_df[long_df['gap_position'] == 'DO']
objectgap_df
"""

Unnamed: 0,item_id,condition,filler,gap_distance,gap,DO_gap,IO_gap,prefix,licensor,subj,...,goal,temp_mod,gap_position,end,wh_numeric,gap_numeric,sentence,surprisals,region,region_surprisal
0,1,1,0,,0,0,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.627493
1,1,2,0,short,0,0,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you a...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.458865
2,1,3,0,medium,0,0,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you w...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.568919
3,1,4,0,long,0,0,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,-1,The manager predicts that the intern who you r...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.287273
8,1,9,0,,1,1,0,The manager predicts,that,the intern,...,the client,earlier this morning,DO,.,-1,1,The manager predicts that the intern forwarded...,"[(The, 0.0), (manager, 11.191156387329102), (p...",prep,2.033572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,20,20,1,long,0,0,0,It was reported,what,the official,...,the emergency responders,following the storm,DO,.,1,-1,It was reported what the official who coordina...,"[(It, 0.0), (was, 3.0059664249420166), (report...",goal,42.282419
1268,20,21,1,,1,1,0,It was reported,what,the official,...,the emergency responders,following the storm,DO,.,1,1,It was reported what the official allocated to...,"[(It, 0.0), (was, 3.0059664249420166), (report...",goal,42.558905
1269,20,22,1,short,1,1,0,It was reported,what,the official,...,the emergency responders,following the storm,DO,.,1,1,It was reported what the official who oversaw ...,"[(It, 0.0), (was, 3.0059664249420166), (report...",goal,39.784677
1270,20,23,1,medium,1,1,0,It was reported,what,the official,...,the emergency responders,following the storm,DO,.,1,1,It was reported what the official who briefed ...,"[(It, 0.0), (was, 3.0059664249420166), (report...",goal,43.953444


In [10]:
# work on a copy so the region column does not end up in sentence_df
object_df = sentence_df.copy()
#object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['prep', 'goal']), axis = 1)
# 'object' is the priority region: when that cell is filled and its surprisal is
# non-zero it is used on its own; otherwise the span 'prep'..'goal' is summed
object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['prep', 'goal'], 'object'), axis = 1)

# filter for object gap: keep only the rows whose gap_position is 'DO'
objectgap_df = object_df[object_df['gap_position'] == 'DO']

In [11]:
# categorical: distance enters the model as the ordered factor gap_distance
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance']
mixed_effects_linear_regression(objectgap_df, iv_list, 'region_surprisal', 'gap distance: gap position at object')


=== GAP DISTANCE: GAP POSITION AT OBJECT ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 23.7384
Min. group size: 16
Log-Likelihood: -970.7107
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,23.533,1.024,22.979,0.0,21.526,25.54
1,gap_distance[T.short],-0.923,0.77,-1.198,0.231,-2.432,0.587
2,gap_distance[T.medium],-0.992,0.77,-1.287,0.198,-2.502,0.518
3,gap_distance[T.long],-1.468,0.77,-1.905,0.057,-2.978,0.042
4,wh_numeric,0.675,0.545,1.24,0.215,-0.392,1.743
5,wh_numeric:gap_distance[T.short],-0.36,0.77,-0.467,0.64,-1.87,1.15
6,wh_numeric:gap_distance[T.medium],-0.5,0.77,-0.65,0.516,-2.01,1.009
7,wh_numeric:gap_distance[T.long],-0.55,0.77,-0.713,0.476,-2.059,0.96
8,gap_numeric,-2.029,0.545,-3.724,0.0,-3.096,-0.961
9,gap_numeric:gap_distance[T.short],-0.086,0.77,-0.111,0.911,-1.596,1.424



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,15.042,1.137


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f113760>

In [12]:
# continuous: distance enters the model as the numeric gap_distance_length
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance_length']
mixed_effects_linear_regression(objectgap_df, iv_list, 'region_surprisal', 'gap distance: gap position at object')


=== GAP DISTANCE: GAP POSITION AT OBJECT ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 23.1863
Min. group size: 16
Log-Likelihood: -992.7386
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,23.303,0.97,24.019,0.0,21.402,25.205
1,wh_numeric,0.562,0.428,1.312,0.19,-0.278,1.401
2,gap_numeric,-2.467,0.428,-5.762,0.0,-3.306,-1.628
3,wh_numeric:gap_numeric,-0.353,0.428,-0.824,0.41,-1.192,0.486
4,gap_distance_length,-0.017,0.009,-1.835,0.066,-0.036,0.001
5,wh_numeric:gap_distance_length,-0.007,0.009,-0.717,0.473,-0.025,0.012
6,gap_numeric:gap_distance_length,0.005,0.009,0.496,0.62,-0.014,0.023
7,wh_numeric:gap_numeric:gap_distance_length,0.005,0.009,0.536,0.592,-0.013,0.023



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,15.127,1.153


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f6b1dc0>

Gap Position == PP/Goal

In [13]:
# work on a copy so the region column does not end up in sentence_df
pp_df = sentence_df.copy()
#pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['temp_mod']), axis = 1)
# 'goal' is the priority region: when that cell is filled and its surprisal is
# non-zero it is used on its own; otherwise the 'temp_mod' column is summed
pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['temp_mod'], 'goal'), axis = 1)

# filter for PP gap: keep only the rows whose gap_position is 'IO'
ppgap_df = pp_df[pp_df['gap_position'] == 'IO']

In [14]:
# Categorical predictor: gap_distance enters the model as a factor.
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance']
mixed_effects_linear_regression(
    ppgap_df,
    iv_list,
    'region_surprisal',
    'gap distance: gap position at pp/goal',
)


=== GAP DISTANCE: GAP POSITION AT PP/GOAL ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 36.0144
Min. group size: 16
Log-Likelihood: -1035.8511
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,22.152,1.36,16.286,0.0,19.486,24.818
1,gap_distance[T.short],-0.389,0.949,-0.41,0.682,-2.249,1.471
2,gap_distance[T.medium],-0.226,0.949,-0.239,0.811,-2.086,1.633
3,gap_distance[T.long],-0.533,0.949,-0.561,0.575,-2.392,1.327
4,wh_numeric,-0.006,0.671,-0.009,0.993,-1.321,1.309
5,wh_numeric:gap_distance[T.short],0.04,0.949,0.042,0.967,-1.82,1.899
6,wh_numeric:gap_distance[T.medium],0.035,0.949,0.037,0.971,-1.825,1.895
7,wh_numeric:gap_distance[T.long],0.007,0.949,0.007,0.994,-1.853,1.866
8,gap_numeric,4.894,0.671,7.294,0.0,3.579,6.209
9,gap_numeric:gap_distance[T.short],-0.294,0.949,-0.31,0.757,-2.154,1.566



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,27.997,1.689


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f17b6a0>

In [15]:
# Continuous predictor: gap_distance_length is used in place of the
# categorical gap_distance factor.
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance_length']
mixed_effects_linear_regression(
    ppgap_df,
    iv_list,
    'region_surprisal',
    'gap distance: gap position at pp/goal',
)


=== GAP DISTANCE: GAP POSITION AT PP/GOAL ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 35.0812
Min. group size: 16
Log-Likelihood: -1059.1465
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,22.044,1.299,16.972,0.0,19.499,24.59
1,wh_numeric,0.015,0.527,0.028,0.978,-1.017,1.047
2,gap_numeric,4.935,0.527,9.37,0.0,3.902,5.967
3,wh_numeric:gap_numeric,-0.169,0.527,-0.321,0.748,-1.201,0.863
4,gap_distance_length,-0.005,0.012,-0.434,0.664,-0.028,0.018
5,wh_numeric:gap_distance_length,-0.0,0.012,-0.001,0.999,-0.023,0.023
6,gap_numeric:gap_distance_length,-0.001,0.012,-0.088,0.93,-0.024,0.022
7,wh_numeric:gap_numeric:gap_distance_length,0.001,0.012,0.118,0.906,-0.021,0.024



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,28.144,1.715


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f0fe430>

Embedded Clause

In [16]:
# Region columns that together form the embedded clause.
embed = [
    'subj', 'modifier', 'verb', 'object',
    'prep', 'goal', 'temp_mod', 'end',
]
# Row-wise surprisal over those regions, with normalize=True passed to the helper.
sentence_df['embed_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, embed, normalize=True),
    axis=1,
)

Embedded Clause - Object Gap Position

In [17]:
# Keep only the object-gap rows (gap_position == 'DO').
embed_objectgap_df = sentence_df.loc[sentence_df['gap_position'].eq('DO')]

In [18]:
# Categorical predictor: gap_distance enters the model as a factor.
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance']
mixed_effects_linear_regression(
    embed_objectgap_df,
    iv_list,
    'embed_surprisal',
    'gap distance: embedded clause object gap position',
)


=== GAP DISTANCE: EMBEDDED CLAUSE OBJECT GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 0.1739
Min. group size: 16
Log-Likelihood: -236.1197
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,5.772,0.157,36.78,0.0,5.465,6.08
1,gap_distance[T.short],1.101,0.066,16.704,0.0,0.972,1.231
2,gap_distance[T.medium],1.187,0.066,18.0,0.0,1.058,1.316
3,gap_distance[T.long],2.012,0.066,30.506,0.0,1.882,2.141
4,wh_numeric,0.102,0.047,2.186,0.029,0.011,0.193
5,wh_numeric:gap_distance[T.short],0.031,0.066,0.472,0.637,-0.098,0.16
6,wh_numeric:gap_distance[T.medium],0.019,0.066,0.286,0.775,-0.11,0.148
7,wh_numeric:gap_distance[T.long],-0.005,0.066,-0.08,0.937,-0.134,0.124
8,gap_numeric,-0.318,0.047,-6.831,0.0,-0.41,-0.227
9,gap_numeric:gap_distance[T.short],0.153,0.066,2.315,0.021,0.023,0.282



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.449,0.37


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f149c40>

In [19]:
# Continuous predictor: gap_distance_length is used in place of the
# categorical gap_distance factor.
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance_length']
mixed_effects_linear_regression(
    embed_objectgap_df,
    iv_list,
    'embed_surprisal',
    'gap distance: embedded clause object gap position',
)


=== GAP DISTANCE: EMBEDDED CLAUSE OBJECT GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 0.2180
Min. group size: 16
Log-Likelihood: -274.6082
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,5.994,0.152,39.473,0.0,5.697,6.292
1,wh_numeric,0.12,0.042,2.883,0.004,0.038,0.201
2,gap_numeric,-0.283,0.042,-6.809,0.0,-0.364,-0.201
3,wh_numeric:gap_numeric,-0.029,0.042,-0.693,0.488,-0.11,0.053
4,gap_distance_length,0.024,0.001,26.209,0.0,0.022,0.026
5,wh_numeric:gap_distance_length,-0.0,0.001,-0.205,0.838,-0.002,0.002
6,gap_numeric:gap_distance_length,0.003,0.001,3.293,0.001,0.001,0.005
7,wh_numeric:gap_numeric:gap_distance_length,0.0,0.001,0.531,0.595,-0.001,0.002



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.426,0.316


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f159be0>

Embedded Clause - PP/Goal Gap Position

In [20]:
# Keep only the PP-gap rows (gap_position == 'IO').
embed_ppgap_df = sentence_df.loc[sentence_df['gap_position'].eq('IO')]

In [21]:
# Categorical predictor: gap_distance enters the model as a factor.
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance']
mixed_effects_linear_regression(
    embed_ppgap_df,
    iv_list,
    'embed_surprisal',
    'gap distance: embedded clause pp/goal gap position',
)


=== GAP DISTANCE: EMBEDDED CLAUSE PP/GOAL GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 0.1840
Min. group size: 16
Log-Likelihood: -242.4718
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.022,0.145,41.579,0.0,5.738,6.306
1,gap_distance[T.short],1.007,0.068,14.847,0.0,0.874,1.14
2,gap_distance[T.medium],1.098,0.068,16.195,0.0,0.966,1.231
3,gap_distance[T.long],1.884,0.068,27.776,0.0,1.751,2.017
4,wh_numeric,0.081,0.048,1.694,0.09,-0.013,0.175
5,wh_numeric:gap_distance[T.short],-0.006,0.068,-0.091,0.927,-0.139,0.127
6,wh_numeric:gap_distance[T.medium],-0.007,0.068,-0.11,0.912,-0.14,0.125
7,wh_numeric:gap_distance[T.long],-0.027,0.068,-0.399,0.69,-0.16,0.106
8,gap_numeric,-0.015,0.048,-0.321,0.748,-0.109,0.079
9,gap_numeric:gap_distance[T.short],0.064,0.068,0.944,0.345,-0.069,0.197



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.374,0.301


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f171ca0>

In [22]:
# Continuous predictor: gap_distance_length is used in place of the
# categorical gap_distance factor.
iv_list = ['wh_numeric', 'gap_numeric', 'gap_distance_length']
mixed_effects_linear_regression(
    embed_ppgap_df,
    iv_list,
    'embed_surprisal',
    'gap distance: embedded clause pp/goal gap position',
)


=== GAP DISTANCE: EMBEDDED CLAUSE PP/GOAL GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 0.2162
Min. group size: 16
Log-Likelihood: -271.9102
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.217,0.141,44.026,0.0,5.94,6.494
1,wh_numeric,0.083,0.041,1.997,0.046,0.002,0.164
2,gap_numeric,0.008,0.041,0.194,0.847,-0.073,0.089
3,wh_numeric:gap_numeric,-0.008,0.041,-0.184,0.854,-0.089,0.073
4,gap_distance_length,0.023,0.001,24.768,0.0,0.021,0.024
5,wh_numeric:gap_distance_length,-0.0,0.001,-0.358,0.721,-0.002,0.001
6,gap_numeric:gap_distance_length,0.001,0.001,1.165,0.244,-0.001,0.003
7,wh_numeric:gap_numeric:gap_distance_length,0.0,0.001,0.11,0.912,-0.002,0.002



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.364,0.272


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f173070>

#### Double Gaps

In [24]:
# Load the Double Gaps test items. This rebinds sentence_df, so the frame from
# the previous section is no longer reachable under this name.
sentence_df = pd.read_csv('test_sentences/Double Gaps.csv')

In [25]:
# The output of split_ends is written back as two columns: 'modifier' and 'end'.
sentence_df[['modifier', 'end']] = sentence_df['modifier'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)

# Recode gap so that 0 becomes -1; every other value is left as it is.
sentence_df['gap_numeric'] = sentence_df['gap'].replace(to_replace=0, value=-1)

# Region columns handed to merge_sentence to build the full sentence string.
# NOTE(review): 'end' is not listed here although it was split off above —
# presumably merge_sentence accounts for it; verify.
syntactic_parts = ['prefix', 'licensor', 'subj', 'verb', 'object', 'modifier']
sentence_df['sentence'] = sentence_df.apply(
    lambda row: merge_sentence(row, syntactic_parts),
    axis=1,
)

# Score each merged sentence with calculate_surprisal.
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)

In [52]:
# Disabled cache: write the scored frame to disk / read it back.
# NOTE(review): the save path and the load path below name different files
# ('Double Gap Construction.csv' vs 'Double Gap.csv'); make them match before
# re-enabling, otherwise the load will not pick up the saved frame.
#sentence_df.to_csv(('test_sentences/Double Gap Construction.csv'), index = False)
#sentence_df = pd.read_csv('test_sentences/Double Gap.csv')

Post-Gap Region - Modifier

In [30]:
# Row-wise surprisal for the 'modifier' region. The extra 'object' argument is
# forwarded to sum_region_surprisal; an earlier version of this call omitted it:
#   sentence_df.apply(lambda x: sum_region_surprisal(x, ['modifier']), axis = 1)
sentence_df['region_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, ['modifier'], 'object'),
    axis=1,
)

In [41]:
# Predictors: subject gap, object gap and wh-licensor coding.
iv_list = ['subject_gap', 'object_gap', 'wh_numeric']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'region_surprisal',
    'double gap modifier',
)


=== DOUBLE GAP MODIFIER ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 15.1930
Min. group size: 8
Log-Likelihood: -452.8108
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,13.775,0.88,15.647,0.0,12.049,15.5
1,subject_gap,0.531,0.872,0.609,0.542,-1.177,2.239
2,object_gap,7.992,0.872,9.169,0.0,6.283,9.7
3,subject_gap:object_gap,1.598,1.233,1.296,0.195,-0.818,4.014
4,wh_numeric,0.841,0.616,1.364,0.173,-0.367,2.049
5,subject_gap:wh_numeric,-1.004,0.872,-1.152,0.249,-2.712,0.704
6,object_gap:wh_numeric,-1.156,0.872,-1.327,0.185,-2.865,0.552
7,subject_gap:object_gap:wh_numeric,1.476,1.233,1.198,0.231,-0.94,3.892



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,7.905,0.872


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f19c880>

Subject Gap Position

In [31]:
# Subset of conditions used for the subject-gap analysis.
subjectgap_df = sentence_df.loc[
    sentence_df['condition'].isin(['a', 'b', 'd', 'e', 'f', 'h'])
]

In [42]:
# Predictors: wh-licensor coding and gap coding.
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(
    subjectgap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier: subject gap position',
)


=== DOUBLE GAP MODIFIER ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 33.3281
Min. group size: 6
Log-Likelihood: -379.2084
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,16.438,0.608,27.031,0.0,15.246,17.63
1,wh_numeric,0.419,0.559,0.749,0.454,-0.677,1.514
2,gap_numeric,2.663,0.559,4.764,0.0,1.567,3.758
3,wh_numeric:gap_numeric,-0.422,0.559,-0.755,0.45,-1.518,0.674



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,1.147,0.412


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f1f5b80>

Object Gap Position

In [43]:
# Subset of conditions used for the object-gap analysis.
objectgap_df = sentence_df.loc[
    sentence_df['condition'].isin(['a', 'c', 'd', 'e', 'g', 'h'])
]

In [44]:
# Predictors: wh-licensor coding and gap coding.
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(
    objectgap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier: object gap position',
)


=== DOUBLE GAP MODIFIER: OBJECT GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 14.7677
Min. group size: 6
Log-Likelihood: -348.0215
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,18.303,0.904,20.246,0.0,16.531,20.075
1,wh_numeric,0.381,0.372,1.023,0.306,-0.349,1.11
2,gap_numeric,4.528,0.372,12.169,0.0,3.799,5.257
3,wh_numeric:gap_numeric,-0.46,0.372,-1.236,0.216,-1.189,0.269



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,13.577,1.481


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f1e9d60>

Embedded Clause

In [45]:
# Region columns that together form the embedded clause.
# NOTE(review): 'object' precedes 'verb' here, whereas syntactic_parts (used to
# build the sentence) lists 'verb' before 'object' — confirm that
# sum_region_surprisal does not depend on region order.
embed = ['subj', 'object', 'verb', 'modifier', 'end']
# Row-wise surprisal over those regions, with normalize=True passed to the helper.
sentence_df['embed_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, embed, normalize=True),
    axis=1,
)

In [20]:
# Predictors: subject gap, object gap and wh-licensor coding.
iv_list = ['subject_gap', 'object_gap', 'wh_numeric']
# Keep the returned results object in `result` and show it as the cell output.
result = mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'embed_surprisal',
    'double gap embed',
)
result


=== DOUBLE GAP EMBED ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 0.0990
Min. group size: 8
Log-Likelihood: -85.8060
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,5.051,0.135,37.395,0.0,4.787,5.316
1,subject_gap,0.025,0.07,0.349,0.727,-0.113,0.162
2,object_gap,-0.225,0.07,-3.2,0.001,-0.363,-0.087
3,subject_gap:object_gap,0.105,0.1,1.06,0.289,-0.09,0.3
4,wh_numeric,0.133,0.05,2.681,0.007,0.036,0.231
5,subject_gap:wh_numeric,-0.152,0.07,-2.157,0.031,-0.29,-0.014
6,object_gap:wh_numeric,-0.113,0.07,-1.602,0.109,-0.251,0.025
7,subject_gap:object_gap:wh_numeric,0.116,0.1,1.164,0.245,-0.079,0.311



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.315,0.361


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x145b3f64f790>

Subject Gap Position

In [46]:
# Re-derive the subject-gap subset from the current sentence_df.
subjectgap_df = sentence_df.loc[
    sentence_df['condition'].isin(['a', 'b', 'd', 'e', 'f', 'h'])
]

In [47]:
# Predictors: wh-licensor coding and gap coding.
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(
    subjectgap_df,
    iv_list,
    'embed_surprisal',
    'double gap embedded clause: subject gap position',
)


=== DOUBLE GAP EMBEDDED CLAUSE: SUBJECT GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.1074
Min. group size: 6
Log-Likelihood: -72.5608
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,5.034,0.13,38.615,0.0,4.778,5.289
1,wh_numeric,0.058,0.032,1.837,0.066,-0.004,0.12
2,gap_numeric,-0.018,0.032,-0.555,0.579,-0.08,0.045
3,wh_numeric:gap_numeric,-0.075,0.032,-2.366,0.018,-0.137,-0.013



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.32,0.365


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f1a6b80>

Object Gap Position

In [48]:
# Re-derive the object-gap subset from the current sentence_df.
objectgap_df = sentence_df.loc[
    sentence_df['condition'].isin(['a', 'c', 'd', 'e', 'g', 'h'])
]

In [49]:
# Predictors: wh-licensor coding and gap coding.
iv_list = ['wh_numeric', 'gap_numeric']
mixed_effects_linear_regression(
    objectgap_df,
    iv_list,
    'embed_surprisal',
    'double gap embedded clause: object gap position',
)


=== DOUBLE GAP EMBEDDED CLAUSE: OBJECT GAP POSITION ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.1096
Min. group size: 6
Log-Likelihood: -73.5937
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,4.971,0.131,38.019,0.0,4.715,5.228
1,wh_numeric,0.068,0.032,2.123,0.034,0.005,0.131
2,gap_numeric,-0.08,0.032,-2.497,0.013,-0.143,-0.017
3,wh_numeric:gap_numeric,-0.065,0.032,-2.038,0.042,-0.128,-0.003



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.321,0.364


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x152a5f188f40>

#### Wh-Islands

In [None]:
# Load the Wh-Islands test items. This rebinds sentence_df, so the frame from
# the previous section is no longer reachable under this name.
sentence_df = pd.read_csv('test_sentences/Wh-Islands.csv')

In [None]:
# Prepare the loaded items and score every sentence.
# The output of split_ends is written back as two columns: 'continuation' and 'end'.
sentence_df[['continuation', 'end']] = sentence_df['continuation'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
# Recode gap so that 0 becomes -1; every other value is left as it is.
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
# NOTE(review): the Double Gaps section calls merge_sentence(row, syntactic_parts);
# here only the row is passed — confirm merge_sentence has a default region list.
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
# Preview the first rows instead of rendering the whole frame.
sentence_df.head()

Post-Gap Region - Continuation

In [None]:
# Row-wise surprisal for the 'continuation' region.
sentence_df['region_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, ['continuation']),
    axis=1,
)
# Preview the first rows instead of rendering the whole frame.
sentence_df.head()

In [None]:
# Predictors: wh-licensor coding, gap coding and island type.
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'region_surprisal',
    'wh-islands continuation',
)

Embedded Clause

In [None]:
# Region columns that together form the embedded clause.
# NOTE(review): 'obj_2' uses an underscore while 'subj 2' and 'vp 2' use a
# space — confirm the key matches the column name in the CSV.
embed = ['compl', 'embed 1', 'whether', 'subj 2', 'vp 2', 'obj_2', 'continuation', 'end']
# Row-wise surprisal over those regions, with normalize=True passed to the helper.
sentence_df['embed_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, embed, normalize=True),
    axis=1,
)
# Preview the first rows instead of rendering the whole frame.
sentence_df.head()

In [None]:
# Predictors: wh-licensor coding, gap coding and island type.
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'embed_surprisal',
    'wh-islands embed',
)

#### Adjunct Islands 

In [None]:
# Load the Adjunct Islands test items. This rebinds sentence_df, so the frame
# from the previous section is no longer reachable under this name.
sentence_df = pd.read_csv('test_sentences/Adjunct Islands.csv')

In [None]:
# Prepare the loaded items and score every sentence.
# The output of split_ends is written back as two columns: 'continuation' and 'end'.
sentence_df[['continuation', 'end']] = sentence_df['continuation'].apply(split_ends)
sentence_df = encode_wh_licensor(sentence_df)
# Recode gap so that 0 becomes -1; every other value is left as it is.
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
# NOTE(review): the Double Gaps section calls merge_sentence(row, syntactic_parts);
# here only the row is passed — confirm merge_sentence has a default region list.
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
# Preview the first rows instead of rendering the whole frame.
sentence_df.head()

Post-Gap Region - Continuation

In [None]:
# Row-wise surprisal for the 'continuation' region.
sentence_df['region_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, ['continuation']),
    axis=1,
)
# Preview the first rows instead of rendering the whole frame.
sentence_df.head()

In [None]:
# Predictors: wh-licensor coding, gap coding and island type.
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'region_surprisal',
    'adjunct islands continuation',
)

Embedded Clause

In [None]:
# Region columns that together form the embedded clause.
embed = ['adjunct setup', 'subject', 'modifier', 'verb', 'object', 'end']
# Row-wise surprisal over those regions, with normalize=True passed to the helper.
sentence_df['embed_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, embed, normalize=True),
    axis=1,
)
# Preview the first rows instead of rendering the whole frame.
sentence_df.head()

In [None]:
# Predictors: wh-licensor coding, gap coding and island type.
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'embed_surprisal',
    'adjunct islands embed',
)

#### Complex NP Islands

In [None]:
# Load the Complex NP Islands test items. This rebinds sentence_df, so the
# frame from the previous section is no longer reachable under this name.
sentence_df = pd.read_csv('test_sentences/Complex NP Islands.csv')

In [None]:
# Prepare the loaded items and score every sentence.
sentence_df = encode_wh_licensor(sentence_df)
# Recode gap so that 0 becomes -1; every other value is left as it is.
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
# NOTE(review): the Double Gaps section calls merge_sentence(row, syntactic_parts);
# here only the row is passed — confirm merge_sentence has a default region list.
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis = 1)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
# Preview the first rows instead of rendering the whole frame.
sentence_df.head()

Subject Condition

In [None]:
# Copy sentence_df so columns added for the subject condition do not modify it.
subject_df = sentence_df.copy()

In [None]:
# Row-wise surprisal for the 'subj_setup' region.
subject_df['region_surprisal'] = subject_df.apply(
    lambda row: sum_region_surprisal(row, ['subj_setup']),
    axis=1,
)
# Preview the first rows instead of rendering the whole frame.
subject_df.head()

In [None]:
# Keep only the subject-condition rows (subj_obj == 'subject').
subject_df = subject_df.loc[subject_df['subj_obj'].eq('subject')]

In [None]:
# Predictors: wh-licensor coding, gap coding and island type.
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(
    subject_df,
    iv_list,
    'region_surprisal',
    'complex np islands: subject condition',
)

Object Condition

In [None]:
# Copy sentence_df so columns added for the object condition do not modify it.
object_df = sentence_df.copy()

In [None]:
# Row-wise surprisal for the 'end' region.
object_df['region_surprisal'] = object_df.apply(
    lambda row: sum_region_surprisal(row, ['end']),
    axis=1,
)
# Preview the first rows instead of rendering the whole frame.
object_df.head()

In [None]:
# Keep only the object-condition rows (subj_obj == 'object').
object_df = object_df.loc[object_df['subj_obj'].eq('object')]

In [None]:
# Predictors: wh-licensor coding, gap coding and island type.
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(
    object_df,
    iv_list,
    'region_surprisal',
    'complex np islands: object condition',
)

Embedded Clause

In [None]:
# Region columns that together form the embedded clause.
embed = ['subj', 'that_rc', 'what_rc', 'rc_np', 'prep', 'prep_np', 'subj_setup', 'obj_setup', 'end']
# Row-wise surprisal over those regions, with normalize=True passed to the helper.
sentence_df['embed_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, embed, normalize=True),
    axis=1,
)
# Preview the first rows instead of rendering the whole frame.
sentence_df.head()

Embedded Clause - Subject Condition

In [None]:
# Keep only the subject-condition rows (subj_obj == 'subject').
subject_df = sentence_df.loc[sentence_df['subj_obj'].eq('subject')]

In [None]:
# Predictors: wh-licensor coding, gap coding and island type.
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(
    subject_df,
    iv_list,
    'embed_surprisal',
    'complex np islands: embedded clause subject condition',
)

Embedded Clause - Object Condition

In [None]:
# Keep only the object-condition rows (subj_obj == 'object').
object_df = sentence_df.loc[sentence_df['subj_obj'].eq('object')]

In [None]:
# Predictors: wh-licensor coding, gap coding and island type.
iv_list = ['wh_numeric', 'gap_numeric', 'island_type']
mixed_effects_linear_regression(
    object_df,
    iv_list,
    'embed_surprisal',
    'complex np islands: embedded clause object condition',
)