In [1]:
# Minicons Installation
# Introduction can be found https://kanishka.xyz/post/minicons-running-large-scale-behavioral-analyses-on-transformer-lms/
# Tutorial and code can be found https://github.com/kanishkamisra/minicons/blob/master/examples/surprisals.md
#!pip install minicons

from minicons import scorer
import pandas as pd
import numpy as np
import json
import csv
import re
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was too old on your system - pyarrow 10.0.1 is the current minimum supported version as of this release.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Checkpoint directory to evaluate; exactly one of the assignments below should be active.
model_path = "gpt2-small/checkpoint-trainedtokenizer_10M_whitespace"
#model_path = "gpt2-small/checkpoint-trainedtokenizer_100M_whitespace"
#model_path = "gpt3-small/checkpoint-trainedtokenizer_10M_whitespace"
#model_path = "gpt3-small/checkpoint-trainedtokenizer_100M_whitespace"

# Load the model and tokenizer directly through transformers.
# NOTE(review): the scorer below is built from `model_path`, not from these objects, and in the
# visible cells `tokenizer` is only referenced inside disabled code — confirm both are still needed.
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)

# wrap with minicons scorer (constructed from the checkpoint path, placed on the CPU)
lm_scorer = scorer.IncrementalLMScorer(model_path, device = "cpu")

In [3]:
def calculate_surprisal(sentence):
    '''
    Scores a sentence with the global `lm_scorer` and returns one
    (word, surprisal) tuple per word or punctuation mark.

    Token-level surprisals are requested from Minicons with
    surprisal = True and base_two = True (see the Minicons documentation
    for the exact semantics), then re-aligned to the units produced by
    the regex r'\\w+|[^\\w\\s]' applied to the raw sentence.
    '''

    # token-level (token, surprisal) pairs for the single input sentence
    scored_tokens = lm_scorer.token_score(sentence, surprisal = True, base_two = True)[0]

    # Special-token filtering (e.g. <pad>) used to happen here; it is disabled,
    # so every scored token goes through the expansion step below.

    # Step 1: drop the 'Ġ' space marker and break up tokens spanning several words.
    pieces = []
    for raw_token, raw_score in scored_tokens:
        stripped = raw_token.strip('Ġ')
        if 'Ġ' not in stripped:
            pieces.append((stripped, raw_score))
            continue
        # several words inside one token: share the surprisal equally among them
        sub_words = [part for part in stripped.split('Ġ') if part]
        share = raw_score / len(sub_words)
        pieces.extend((part, share) for part in sub_words)

    # Step 2: walk the pieces left to right, gluing them together until they spell
    # the current word; the word's surprisal is the sum over the pieces consumed.
    results = []
    cursor = 0
    n_pieces = len(pieces)
    for word in re.findall(r'\w+|[^\w\s]', sentence):
        built = ''
        total = 0.0
        while cursor < n_pieces:
            piece, piece_score = pieces[cursor]
            cursor += 1
            built += piece
            total += piece_score
            if built == word:
                break
        # Appended whether or not the pieces matched: if they never spell the word,
        # every remaining piece has been consumed and later words receive 0.0.
        results.append((word, total))

    return results


sentence = 'I know that your friend gave a baguette to Mary last weekend.'
calculate_surprisal(sentence)

[('I', 0.0),
 ('know', 5.239012718200684),
 ('that', 3.954333782196045),
 ('your', 10.374737739562988),
 ('friend', 6.853007793426514),
 ('gave', 12.485967636108398),
 ('a', 4.920528888702393),
 ('baguette', 40.79525184631348),
 ('to', 8.777816772460938),
 ('Mary', 12.327611923217773),
 ('last', 11.225496292114258),
 ('weekend', 8.15101146697998),
 ('.', 10.069524765014648)]

In [4]:
def calculate_sentence_surprisal(word_surprisals):
    '''Returns total surprisal and average surprisal per word.

    Parameters:
        word_surprisals: iterable of (word, surprisal) pairs, e.g. the
            output of calculate_surprisal().

    Returns:
        (total, average) tuple. For an empty input the total is 0.0 and the
        average is NaN (it is undefined), rather than raising
        ZeroDivisionError.
    '''

    scores = [score for _, score in word_surprisals]
    if not scores:
        # no words: the mean is undefined, so report NaN instead of dividing by zero
        return 0.0, float('nan')

    total = sum(scores)
    return total, total / len(scores)

"""
def sum_region_surprisal(row, region_list):
    '''Sums surprisals of a specified sentence region, extracting the relevant part from sentence_surprisals using index information.'''

    sentence_surprisals = row['surprisals']
    
    # retrieve all sentence column(from 'prefix' till 'end')
    all_columns = row.index.tolist()  # get the list of column names
    prefix_index = all_columns.index('prefix')
    eos_index = all_columns.index('end')
    
    # reconstruct the full sentence from 'prefix' to 'end' (exclude NaN values)
    full_sentence = ' '.join([str(row[col]) if pd.notna(row.get(col)) else '' for col in all_columns[prefix_index:eos_index + 1]])
    
    # use regex to split the full sentence into words & punctuation
    sentence_units = re.findall(r'\w+|[^\w\s]', full_sentence)
    
    # get the start and end indices for the region_list within the full sentence
    first_region_column = region_list[0]
    first_region_start_index = all_columns.index(first_region_column)
    
    last_region_column = region_list[-1]
    last_region_end_index = all_columns.index(last_region_column)
    
    # extract the relevant slice of sentence_surprisals that corresponds to the region_list
    # calculate the starting and ending index for the region_list part
    start_index = sum([
        len(re.findall(r'\w+|[^\w\s]', str(row[col]))) for col in all_columns[prefix_index:first_region_start_index]
        if pd.notna(row[col]) 
    ])
    end_index = sum([
        len(re.findall(r'\w+|[^\w\s]', str(row[col]))) for col in all_columns[prefix_index:last_region_end_index + 1]
        if pd.notna(row[col]) 
    ])

    # extract the slice of sentence_surprisals corresponding to the region_list part
    relevant_surprisals = sentence_surprisals[start_index:end_index]
    region_surprisal = sum([score for token, score in relevant_surprisals])
    
    return region_surprisal
"""

def _count_sentence_units(row, columns):
    '''Counts the word/punctuation units found in the non-missing cells of `columns`.'''
    return sum(
        len(re.findall(r'\w+|[^\w\s]', str(row[col])))
        for col in columns
        if pd.notna(row[col])
    )


def sum_region_surprisal(row, region_list, priority_region = None, normalize = False):
    '''Sums surprisals of a specified sentence region with optional priority region handling.

    Parameters:
        row: DataFrame row holding the sentence columns (from 'prefix' up to
            'end', in sentence order) and a 'surprisals' entry with the
            per-unit (word, surprisal) list for the same sentence.
        region_list: column names of the region; the span runs from the first
            listed column through the last listed column.
        priority_region: optional column name. If that cell is not missing and
            its summed surprisal is non-zero, it is used instead of
            `region_list`.
        normalize: if truthy, the selected surprisal is divided by the number
            of word/punctuation units in the whole sentence ('prefix' through
            'end'). This applies to the priority region as well.

    Returns:
        The (optionally normalised) summed surprisal of the selected region.

    Raises:
        ValueError: if 'prefix', a region column, or (when normalising) 'end'
            is not among the row's columns.
    '''

    all_columns = row.index.tolist()
    prefix_index = all_columns.index('prefix')
    sentence_surprisals = row['surprisals']

    def span_surprisal(first_column, last_column):
        # Unit offsets are found by counting regex units in every non-missing
        # sentence column that precedes (start) or is covered by (end) the span.
        first_index = all_columns.index(first_column)
        last_index = all_columns.index(last_column)
        start_index = _count_sentence_units(row, all_columns[prefix_index:first_index])
        end_index = _count_sentence_units(row, all_columns[prefix_index:last_index + 1])
        return sum(score for _, score in sentence_surprisals[start_index:end_index])

    region_surprisal = None
    if priority_region and pd.notna(row[priority_region]):
        priority_surprisal = span_surprisal(priority_region, priority_region)
        # only a non-zero priority score overrides the regular region
        if priority_surprisal != 0:
            region_surprisal = priority_surprisal

    if region_surprisal is None:
        region_surprisal = span_surprisal(region_list[0], region_list[-1])

    if not normalize:
        return region_surprisal

    # Sentence length in units; counting per column gives the same number as
    # tokenising the space-joined sentence, because cells are space-separated.
    eos_index = all_columns.index('end')
    n_units = _count_sentence_units(row, all_columns[prefix_index:eos_index + 1])
    if n_units == 0:
        # empty sentence: nothing to normalise by
        return region_surprisal
    return region_surprisal / n_units

In [5]:
def split_ends(ends):
    '''Splits off sentence-final punctuation.

    Parameters:
        ends: cell value holding the last sentence chunk. Non-string values
            (e.g. NaN for an empty cell) are passed through untouched.

    Returns:
        pd.Series of two elements: the text without its final '.', '!' or '?'
        and that punctuation mark ('' when there is none).
    '''

    if not isinstance(ends, str):
        # missing/non-text cell: nothing to split, and .strip() would raise
        return pd.Series([ends, ''])

    match = re.match(r'^(.*?)([.!?])$', ends.strip())
    if match:
        return pd.Series([match.group(1), match.group(2)])
    return pd.Series([ends, ''])  # no end punctuation
    
def encode_wh_licensor(df):
    '''
    Writes a 'wh_numeric' column into df (in place) and returns df.

    Each value is derived from the 'filler' column:
    - 1 when filler is greater than 0
    - -1 in every other case
    '''
    def _contrast_code(filler_value):
        if filler_value > 0:
            return 1
        return -1

    df['wh_numeric'] = df['filler'].apply(_contrast_code)
    return df
    
def merge_sentence(row, syntactic_parts):
    '''
    Rebuilds the full sentence from its column-wise parts.

    The cells named in syntactic_parts are stripped and joined with single
    spaces, skipping missing or blank cells; the value of row['end'] (if not
    missing) is then appended without a separating space.
    '''

    kept = []
    for name in syntactic_parts:
        value = row[name]
        if pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            kept.append(text)

    closing = row['end']
    if pd.isna(closing):
        closing = ''

    return ' '.join(kept) + closing

def assign_grammaticality(row):
    '''
    Labels a row 'gram+' or 'gram-' from its 'filler' and 'gap' values.

    A filler (> 0) with a gap (1), or no filler (0) with no gap (0), is
    'gram+'; the two mismatching combinations are 'gram-'. Any other
    combination of values yields None.
    '''
    filler = row['filler']
    if filler > 0:
        has_filler = True
    elif filler == 0:
        has_filler = False
    else:
        return None

    gap = row['gap']
    if gap == 1:
        has_gap = True
    elif gap == 0:
        has_gap = False
    else:
        return None

    # grammatical exactly when filler and gap are both present or both absent
    return 'gram+' if has_filler == has_gap else 'gram-'

#### Statistical Analysis: Mixed-Effects Linear Regression Model

In [6]:
from IPython.display import display
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

def print_summary(result):
    '''
    Prints the descriptive header from a statsmodels MixedLMResults summary,
    and displays both the fixed-effects and random-effects components as pandas DataFrames.

    The summary is parsed from its text rendering (str(result.summary())):
    - header: every "key: value" pair found before the coefficient table;
      values are assumed to be single whitespace-free tokens
    - fixed effects: rows after the 'Coef.' line with at least 7 tokens
    - random effects: remaining rows whose label ends in 'Var' or 'Cov'

    Parameters:
        result: fitted model result exposing a summary() method.
    '''
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ConvergenceWarning)

        summary_str = str(result.summary())
        lines = summary_str.split('\n')

        # PART 1: HEADER
        print("\n=== MODEL SUMMARY ===")
        for line in lines:
            if 'Coef.' in line:
                break  # stop when reaching the coefficient table
            # A key is everything up to the next colon and a value is the next token.
            # Unlike a "two or more spaces" separator, this also handles rows where
            # the two header columns are separated by a single space.
            pairs = re.findall(r'([^:]+?):\s+(\S+)', line)
            for key, value in pairs:
                print(f"{key.strip()}: {value.strip()}")

        # PART 2a: FIXED EFFECTS TABLE
        fixed_table = []
        in_fixed_table = False
        random_start_idx = None

        for i, line in enumerate(lines):
            if 'Coef.' in line:
                in_fixed_table = True
                continue
            if in_fixed_table:
                if not line.strip() or set(line.strip()) in [{'='}, {'-'}]:
                    continue  # skip empty/separator lines
                parts = line.strip().split()
                if len(parts) >= 7:
                    # parameter name followed by the six numeric columns
                    fixed_table.append([parts[0]] + parts[1:7])
                else:
                    # first shorter row: the variance/covariance rows start here
                    random_start_idx = i
                    break

        if fixed_table:
            df_fixed = pd.DataFrame(fixed_table, columns = [
                'Parameter', 'Coef.', 'Std.Err.', 'z', 'P>|z|', '[0.025', '0.975]'
            ])
            print("\n=== FIXED EFFECTS COEFFICIENTS ===")
            display(df_fixed)
        else:
            print("\nNo fixed-effects data found.")

        # PART 2b: RANDOM EFFECTS / VARIANCE COMPONENTS TABLE
        random_table = []
        if random_start_idx is not None:
            for line in lines[random_start_idx:]:
                if not line.strip() or set(line.strip()) in [{'='}, {'-'}]:
                    continue
                parts = line.strip().split()
                # Look for last token ending in Var or Cov
                varcov_idx = None
                for j in reversed(range(len(parts))):
                    if parts[j].endswith('Var') or parts[j].endswith('Cov'):
                        varcov_idx = j
                        break
                if varcov_idx is not None:
                    param = ' '.join(parts[:varcov_idx + 1])
                    numeric_parts = parts[varcov_idx + 1:]
                    if len(numeric_parts) == 2:
                        val1, val2 = numeric_parts
                    elif len(numeric_parts) == 1:
                        val1, val2 = numeric_parts[0], ''
                    else:
                        val1, val2 = '', ''
                    random_table.append([param, val1, val2])

        if random_table:
            df_random = pd.DataFrame(random_table, columns = ['Parameter', 'Coef.', 'Std.Err.'])
            print("\n=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===")
            display(df_random)
        else:
            print("\nNo random-effects data found.")


In [7]:
# columns: item_id (indicates sentence set), wh_licensor (0/1), gap (0/1), island_type, surprisal

warnings.simplefilter("ignore", ConvergenceWarning)

def mixed_effects_linear_regression(df, iv_list, surprisal, label):
    '''
    Fits a mixed-effects linear model and prints its parsed summary.

    The model is "<surprisal> ~ <iv_1> * <iv_2> * ..." with a random
    intercept per 'item_id' group.

    Parameters:
        df: DataFrame containing 'item_id', the column named by `surprisal`
            and every column in `iv_list`. It is not modified.
        iv_list: names of the independent variables. 'wh_numeric',
            'gap_numeric' and 'gap_distance_length' are kept numeric; any
            other variable is cast to a categorical on a copy of df.
        surprisal: name of the dependent-variable column.
        label: title printed (upper-cased) above the summary.

    Returns:
        The fitted statsmodels result object.
    '''

    numeric_fields = ('wh_numeric', 'gap_numeric', 'gap_distance_length')

    # Membership test: the previous chained `==  ... or '...'` condition was always
    # truthy, so the categorical cast never ran.
    categorical_fields = [field for field in iv_list if field not in numeric_fields]
    if categorical_fields:
        # astype returns a new frame, so the caller's (possibly sliced) frame is untouched
        df = df.astype({field: 'category' for field in categorical_fields})

    interaction_terms = ' * '.join(iv_list)

    model = smf.mixedlm(
        f"{surprisal} ~ {interaction_terms}",
        df,
        groups = df["item_id"],
        # random intercept only; by-item random slopes for the IVs are not fitted
        re_formula = "1"
        )

    result = model.fit()

    print(f"\n=== {label.upper()} ===")
    print_summary(result)

    return result

#interaction = mixed_effects_linear_regression(df, "construction_type") # label name to be changed according to construction type

#### Gap Distance - Categorical/Continuous

In [8]:
# Load the Gap Distance stimuli; the path is relative to the notebook's working directory.
sentence_df = pd.read_csv('test_sentences/Gap Distance.csv')
# Missing gap_distance values are replaced with the literal string 'null' (not NaN),
# so downstream code sees a regular category level instead of a missing value.
sentence_df['gap_distance'] = sentence_df['gap_distance'].fillna('null')
#sentence_df['gap_distance'].dropna(inplace = True)
#sentence_df = sentence_df[sentence_df['gap_distance'].replace(['nan', 'NaN'], np.nan).notna()]

In [9]:
# Preprocess the Gap Distance sentences: rebuild each sentence, score it, and add model covariates.
# NOTE: this cell is not idempotent. After one run 'temp_mod' no longer ends in punctuation,
# so running it again makes split_ends return '' and overwrites 'end' with empty strings.
# Reload the CSV before re-running.

# move the sentence-final punctuation of 'temp_mod' into a separate 'end' column
sentence_df[['temp_mod', 'end']] = sentence_df['temp_mod'].apply(split_ends)
# wh_numeric: 1 if filler > 0, else -1
sentence_df = encode_wh_licensor(sentence_df)
# gap_numeric: same as 'gap' but with 0 recoded to -1
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
# sentence columns in surface order ('end' is appended by merge_sentence itself)
syntactic_parts = ['prefix', 'licensor', 'subj', 'modifier', 'verb', 'object', 'prep', 'goal', 'temp_mod']
sentence_df['sentence'] = sentence_df.apply(lambda x: merge_sentence(x, syntactic_parts), axis = 1)
# per-word (word, surprisal) list for every sentence
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
# NOTE(review): len(x) on a string cell is its character count, not its word count —
# confirm this is the intended measure of gap distance.
sentence_df['gap_distance_length'] = sentence_df['modifier'].apply(lambda x: len(x) if pd.notna(x) else 0)
sentence_df['grammaticality'] = sentence_df.apply(assign_grammaticality, axis = 1)

Gap Position == Object

In [10]:
# modifier: short_mod, med_mod, long_mod

In [11]:
# Work on a copy so the region column does not leak into sentence_df.
object_df = sentence_df.copy()
#object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['prep', 'goal']), axis = 1)
# Region of interest: 'object' is the priority region, so its summed surprisal is used when the
# cell is present and non-zero; otherwise the span from 'prep' through 'goal' is summed.
object_df['region_surprisal'] = object_df.apply(lambda x: sum_region_surprisal(x, ['prep', 'goal'], 'object'), axis = 1)

# filter for object gap (rows whose gap_position is 'DO')
objectgap_df = object_df[object_df['gap_position'] == 'DO']

In [12]:
# flip test: gap-
no_gap_df = objectgap_df[objectgap_df['gap_numeric'] == -1]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(no_gap_df, iv_list, 'region_surprisal', 'object gap position: gap-')


=== OBJECT GAP POSITION: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 1.3652
Min. group size: 8
Log-Likelihood: -307.7916
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,33.173,1.579,21.015,0.0,30.079,36.267
1,wh_numeric,0.11,0.092,1.191,0.234,-0.071,0.291



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,49.667,14.747


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e55b070>

In [13]:
# flip test: gap+
gap_df = objectgap_df[objectgap_df['gap_numeric'] == 1]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(gap_df, iv_list, 'region_surprisal', 'object gap position: gap+')


=== OBJECT GAP POSITION: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 1.5233
Min. group size: 8
Log-Likelihood: -319.2730
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,29.858,1.935,15.434,0.0,26.067,33.65
1,wh_numeric,0.056,0.098,0.572,0.567,-0.135,0.247



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,74.662,20.978


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382eae9a30>

In [14]:
# division by grammaticality
iv_list = ['grammaticality']
mixed_effects_linear_regression(objectgap_df, iv_list, 'region_surprisal', 'object gap position: division by grammaticality')


=== OBJECT GAP POSITION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 38.5686
Min. group size: 16
Log-Likelihood: -1060.5478
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,31.489,1.246,25.271,0.0,29.047,33.931
1,grammaticality[T.gram-],0.054,0.694,0.078,0.938,-1.307,1.415



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,26.23,1.543


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382eae9580>

Gap Position == PP/Goal

In [15]:
# Work on a copy so the region column does not leak into sentence_df.
pp_df = sentence_df.copy()
#pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['temp_mod']), axis = 1)
# Region of interest: 'goal' is the priority region, so its summed surprisal is used when the
# cell is present and non-zero; otherwise the surprisal of 'temp_mod' is summed.
pp_df['region_surprisal'] = pp_df.apply(lambda x: sum_region_surprisal(x, ['temp_mod'], 'goal'), axis = 1)

# filter for PP gap (rows whose gap_position is 'IO')
ppgap_df = pp_df[pp_df['gap_position'] == 'IO']

In [16]:
# flip test: gap-
no_gap_df = ppgap_df[ppgap_df['gap_numeric'] == -1]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(no_gap_df, iv_list, 'region_surprisal', 'pp gap position: gap-')


=== PP GAP POSITION: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 0.4557
Min. group size: 8
Log-Likelihood: -235.3753
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,23.351,1.932,12.087,0.0,19.564,27.138
1,wh_numeric,-0.056,0.053,-1.051,0.293,-0.161,0.049



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,74.593,38.245


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5bfdf0>

In [17]:
# flip test: gap+
gap_df = ppgap_df[ppgap_df['gap_numeric'] == 1]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(gap_df, iv_list, 'region_surprisal', 'pp gap position: gap+')


=== PP GAP POSITION: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 0.8238
Min. group size: 8
Log-Likelihood: -283.4059
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,30.313,2.776,10.922,0.0,24.873,35.753
1,wh_numeric,-0.087,0.072,-1.21,0.226,-0.227,0.054



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,153.97,58.702


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5bcd30>

In [18]:
# division by grammaticality
iv_list = ['grammaticality']
mixed_effects_linear_regression(ppgap_df, iv_list, 'region_surprisal', 'pp gap position: division by grammaticality')


=== PP GAP POSITION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 79.7470
Min. group size: 16
Log-Likelihood: -1174.3073
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,26.817,1.648,16.277,0.0,23.588,30.046
1,grammaticality[T.gram-],0.031,0.998,0.031,0.975,-1.926,1.988



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,44.318,1.847


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e54d970>

Embedded Clause

In [19]:
# Embedded clause: the span from 'subj' through 'end' (sum_region_surprisal only uses the
# first and last names of the list as span boundaries).
embed = ['subj', 'modifier', 'verb', 'object', 'prep', 'goal', 'temp_mod', 'end']
# normalize = True divides the summed surprisal by the number of word/punctuation units
# in the whole sentence ('prefix' through 'end'), not by the length of the embedded clause.
sentence_df['embed_surprisal'] = sentence_df.apply(lambda x: sum_region_surprisal(x, embed, normalize = True), axis = 1)

Embedded Clause - Object Gap Position

In [20]:
# filter for object gap
embed_objectgap_df = sentence_df[sentence_df['gap_position'] == 'DO']

In [21]:
# flip test: gap-
no_gap_df = embed_objectgap_df[embed_objectgap_df['gap_numeric'] == -1]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(no_gap_df, iv_list, 'embed_surprisal', 'object gap position: gap-')


=== OBJECT GAP POSITION: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 0.5327
Min. group size: 8
Log-Likelihood: -198.5210
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,9.197,0.157,58.605,0.0,8.889,9.505
1,wh_numeric,-0.005,0.058,-0.095,0.924,-0.119,0.108



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.426,0.233


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5bce50>

In [22]:
# flip test: gap+
gap_df = embed_objectgap_df[embed_objectgap_df['gap_numeric'] == 1]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(gap_df, iv_list, 'embed_surprisal', 'object gap position: gap+')


=== OBJECT GAP POSITION: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 0.8560
Min. group size: 8
Log-Likelihood: -234.8796
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,8.722,0.188,46.498,0.0,8.354,9.089
1,wh_numeric,-0.016,0.073,-0.215,0.83,-0.159,0.128



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.597,0.263


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5b6df0>

In [23]:
# division by grammaticality
iv_list = ['grammaticality']
mixed_effects_linear_regression(embed_objectgap_df, iv_list, 'embed_surprisal', 'object gap position: division by grammaticality')


=== OBJECT GAP POSITION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 0.7308
Min. group size: 16
Log-Likelihood: -430.4806
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,8.954,0.176,50.879,0.0,8.609,9.299
1,grammaticality[T.gram-],0.01,0.096,0.107,0.915,-0.177,0.198



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.528,0.225


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5aefd0>

Embedded Clause - PP/Goal Gap Position

In [24]:
# filter for PP gap
embed_ppgap_df = sentence_df[sentence_df['gap_position'] == 'IO']

In [25]:
# flip test: gap-
no_gap_df = embed_ppgap_df[embed_ppgap_df['gap_numeric'] == -1]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(no_gap_df, iv_list, 'embed_surprisal', 'pp gap position: gap-')


=== PP GAP POSITION: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 0.5028
Min. group size: 8
Log-Likelihood: -194.0825
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,9.223,0.153,60.124,0.0,8.922,9.523
1,wh_numeric,0.02,0.056,0.362,0.717,-0.09,0.13



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.408,0.23


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5a42b0>

In [26]:
# flip test: gap+
gap_df = embed_ppgap_df[embed_ppgap_df['gap_numeric'] == 1]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(gap_df, iv_list, 'embed_surprisal', 'pp gap position: gap+')


=== PP GAP POSITION: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 0.6453
Min. group size: 8
Log-Likelihood: -208.4854
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,9.165,0.131,69.711,0.0,8.908,9.423
1,wh_numeric,0.026,0.064,0.41,0.682,-0.098,0.15



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.265,0.149


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5a0550>

In [27]:
# division by grammaticality
iv_list = ['grammaticality']
mixed_effects_linear_regression(embed_ppgap_df, iv_list, 'embed_surprisal', 'pp gap position: division by grammaticality')


=== PP GAP POSITION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 320
Method: REML
No. Groups: 20
Scale: 0.5657
Min. group size: 16
Log-Likelihood: -388.2291
Max. group size: 16
Converged: Yes
Mean group size: 16.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,9.197,0.144,63.959,0.0,8.915,9.479
1,grammaticality[T.gram-],-0.006,0.084,-0.068,0.946,-0.171,0.159



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.343,0.168


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e58c1c0>

#### Double Gaps

In [28]:
sentence_df = pd.read_csv('test_sentences/Double Gaps.csv')

In [29]:
# Preprocess the Double Gaps sentences; here the sentence-final chunk is 'modifier'.
# NOTE: this cell is not idempotent. After one run 'modifier' no longer ends in punctuation,
# so running it again overwrites 'end' with empty strings. Reload the CSV before re-running.

# move the sentence-final punctuation of 'modifier' into a separate 'end' column
sentence_df[['modifier', 'end']] = sentence_df['modifier'].apply(split_ends)
# wh_numeric: 1 if filler > 0, else -1
sentence_df = encode_wh_licensor(sentence_df)
# gap_numeric: same as 'gap' but with 0 recoded to -1
sentence_df['gap_numeric'] = sentence_df['gap'].replace(0, -1)
# sentence columns in surface order ('end' is appended by merge_sentence itself)
syntactic_parts = ['prefix', 'licensor', 'subj', 'verb', 'object', 'modifier']
sentence_df['sentence'] = sentence_df.apply(lambda x: merge_sentence(x, syntactic_parts), axis = 1)
# per-word (word, surprisal) list for every sentence
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df['grammaticality'] = sentence_df.apply(assign_grammaticality, axis = 1)

Post Gap Region - Modifier

In [30]:
# Per-row surprisal of the post-gap `modifier` region; 'object' is passed as
# the third positional argument of sum_region_surprisal.
sentence_df['region_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, ['modifier'], 'object'),
    axis=1,
)

In [31]:
# Flip test on the gap- rows (gap_numeric == -1): wh_numeric as the only
# predictor of the modifier-region surprisal.
no_gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier: gap-',
)


=== DOUBLE GAP MODIFIER: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 40
Method: REML
No. Groups: 20
Scale: 0.1026
Min. group size: 2
Log-Likelihood: -71.4704
Max. group size: 2
Converged: Yes
Mean group size: 2.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,15.003,1.024,14.654,0.0,12.997,17.01
1,wh_numeric,-0.038,0.051,-0.745,0.456,-0.137,0.062



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,20.914,30.031


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5d3c70>

In [32]:
# Flip test on the gap+ rows (gap_numeric == 1): wh_numeric as the only
# predictor of the modifier-region surprisal.
gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier: gap+',
)


=== DOUBLE GAP MODIFIER: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 43.3834
Min. group size: 6
Log-Likelihood: -406.9352
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,23.569,1.147,20.541,0.0,21.32,25.818
1,wh_numeric,-0.013,0.601,-0.022,0.983,-1.192,1.165



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,19.101,1.416


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5db4f0>

In [33]:
# Division by grammaticality: modifier-region surprisal regressed on the
# grammaticality label over every Double Gaps row.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'region_surprisal',
    'double gap modifier: division by grammaticality',
)


=== DOUBLE GAP MODIFIER: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 54.1955
Min. group size: 8
Log-Likelihood: -553.3675
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,21.427,1.117,19.182,0.0,19.238,23.616
1,grammaticality[T.gram-],0.001,1.164,0.001,0.999,-2.281,2.282



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,11.406,0.854


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5d3be0>

Subject Gap Position

In [34]:
# Subject-gap subset: keep only conditions a, b, d, e, f and h.
subjectgap_df = sentence_df.loc[
    sentence_df['condition'].isin(['a', 'b', 'd', 'e', 'f', 'h'])
]

In [35]:
# Flip test on the gap- rows of the subject-gap subset.
# NOTE(review): the recorded summary for this fit matches the unfiltered
# gap- fit (40 observations, same coefficients), so the condition filter
# appears to keep every gap- row — confirm this is intended.
no_gap_df = subjectgap_df.loc[subjectgap_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier subject gap position: gap-',
)


=== DOUBLE GAP MODIFIER SUBJECT GAP POSITION: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 40
Method: REML
No. Groups: 20
Scale: 0.1026
Min. group size: 2
Log-Likelihood: -71.4704
Max. group size: 2
Converged: Yes
Mean group size: 2.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,15.003,1.024,14.654,0.0,12.997,17.01
1,wh_numeric,-0.038,0.051,-0.745,0.456,-0.137,0.062



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,20.914,30.031


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5f9490>

In [36]:
# Flip test on the gap+ rows of the subject-gap subset.
gap_df = subjectgap_df.loc[subjectgap_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier subject gap position: gap+',
)


=== DOUBLE GAP MODIFIER SUBJECT GAP POSITION: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 80
Method: REML
No. Groups: 20
Scale: 54.6571
Min. group size: 4
Log-Likelihood: -273.1666
Max. group size: 4
Converged: Yes
Mean group size: 4.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,22.07,0.921,23.95,0.0,20.263,23.876
1,wh_numeric,-0.043,0.827,-0.052,0.959,-1.663,1.577



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,3.318,0.857


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e582190>

In [37]:
# Division by grammaticality within the subject-gap subset.
# NOTE: the recorded run of this fit reports `Converged: No`, so its
# estimates should be read with caution.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    subjectgap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier subject gap position: division by grammaticality',
)


=== DOUBLE GAP MODIFIER SUBJECT GAP POSITION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 51.7067
Min. group size: 6
Log-Likelihood: -408.4591
Max. group size: 6
Converged: No
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,19.698,1.047,18.806,0.0,17.645,21.751
1,grammaticality[T.gram-],0.032,1.313,0.024,0.981,-2.541,2.605



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,4.707,0.647


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e582310>

Object Gap Position

In [38]:
# Object-gap subset: keep only conditions a, c, d, e, g and h.
objectgap_df = sentence_df.loc[
    sentence_df['condition'].isin(['a', 'c', 'd', 'e', 'g', 'h'])
]

In [39]:
# Flip test on the gap- rows of the object-gap subset.
# NOTE(review): the recorded summary for this fit matches the unfiltered
# gap- fit (40 observations, same coefficients), so the condition filter
# appears to keep every gap- row — confirm this is intended.
no_gap_df = objectgap_df.loc[objectgap_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier object gap position: gap-',
)


=== DOUBLE GAP MODIFIER OBJECT GAP POSITION: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 40
Method: REML
No. Groups: 20
Scale: 0.1026
Min. group size: 2
Log-Likelihood: -71.4704
Max. group size: 2
Converged: Yes
Mean group size: 2.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,15.003,1.024,14.654,0.0,12.997,17.01
1,wh_numeric,-0.038,0.051,-0.745,0.456,-0.137,0.062



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,20.914,30.031


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e55dc10>

In [40]:
# Flip test on the gap+ rows of the object-gap subset.
gap_df = objectgap_df.loc[objectgap_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier object gap position: gap+',
)


=== DOUBLE GAP MODIFIER OBJECT GAP POSITION: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: region_surprisal
No. Observations: 80
Method: REML
No. Groups: 20
Scale: 1.1734
Min. group size: 4
Log-Likelihood: -171.4212
Max. group size: 4
Converged: Yes
Mean group size: 4.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,26.702,1.694,15.763,0.0,23.382,30.022
1,wh_numeric,-0.002,0.121,-0.015,0.988,-0.239,0.235



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,57.097,19.763


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e589eb0>

In [41]:
# Division by grammaticality within the object-gap subset.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    objectgap_df,
    iv_list,
    'region_surprisal',
    'double gap modifier object gap position: division by grammaticality',
)


=== DOUBLE GAP MODIFIER OBJECT GAP POSITION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 57.5070
Min. group size: 6
Log-Likelihood: -420.7590
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,22.814,1.37,16.653,0.0,20.129,25.499
1,grammaticality[T.gram-],-0.023,1.385,-0.016,0.987,-2.736,2.691



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,18.365,1.305


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e59cf40>

Embedded Clause

In [42]:
# Columns that make up the embedded clause for the Double Gaps items.
embed = ['subj', 'object', 'verb', 'modifier', 'end']

# Row-wise embedded-clause surprisal; `normalize=True` is forwarded to
# sum_region_surprisal through DataFrame.apply's keyword pass-through.
sentence_df['embed_surprisal'] = sentence_df.apply(
    sum_region_surprisal,
    axis=1,
    args=(embed,),
    normalize=True,
)

In [43]:
# Flip test on the gap- rows (gap_numeric == -1): wh_numeric as the only
# predictor of the embedded-clause surprisal.
no_gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'embed_surprisal',
    'double gap embed: gap-',
)


=== DOUBLE GAP EMBED: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 40
Method: REML
No. Groups: 20
Scale: 0.0098
Min. group size: 2
Log-Likelihood: -9.6069
Max. group size: 2
Converged: Yes
Mean group size: 2.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.599,0.128,51.675,0.0,6.349,6.85
1,wh_numeric,-0.01,0.016,-0.669,0.504,-0.041,0.02



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.321,1.512


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5ea580>

In [44]:
# Flip test on the gap+ rows (gap_numeric == 1): wh_numeric as the only
# predictor of the embedded-clause surprisal.
gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'embed_surprisal',
    'double gap embed: gap+',
)


=== DOUBLE GAP EMBED: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.1797
Min. group size: 6
Log-Likelihood: -95.4713
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.272,0.141,44.612,0.0,5.996,6.547
1,wh_numeric,0.027,0.039,0.697,0.486,-0.049,0.103



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.365,0.33


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5db820>

In [45]:
# Division by grammaticality: embedded-clause surprisal regressed on the
# grammaticality label over every Double Gaps row.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'embed_surprisal',
    'double gap embed: division by grammaticality',
)


=== DOUBLE GAP EMBED: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 160
Method: REML
No. Groups: 20
Scale: 0.1842
Min. group size: 8
Log-Likelihood: -120.7632
Max. group size: 8
Converged: Yes
Mean group size: 8.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.377,0.136,46.75,0.0,6.109,6.644
1,grammaticality[T.gram-],-0.046,0.068,-0.673,0.501,-0.179,0.087



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.326,0.281


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5cb100>

Subject Gap Position

In [46]:
# Rebuild the subject-gap subset (conditions a, b, d, e, f, h) from the
# current sentence_df so that it carries the `embed_surprisal` column.
subjectgap_df = sentence_df.loc[
    sentence_df['condition'].isin(['a', 'b', 'd', 'e', 'f', 'h'])
]

In [47]:
# Flip test on the gap- rows of the subject-gap subset (embedded clause).
no_gap_df = subjectgap_df.loc[subjectgap_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'embed_surprisal',
    'double gap embed subject gap position: gap-',
)


=== DOUBLE GAP EMBED SUBJECT GAP POSITION: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 40
Method: REML
No. Groups: 20
Scale: 0.0098
Min. group size: 2
Log-Likelihood: -9.6069
Max. group size: 2
Converged: Yes
Mean group size: 2.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.599,0.128,51.675,0.0,6.349,6.85
1,wh_numeric,-0.01,0.016,-0.669,0.504,-0.041,0.02



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.321,1.512


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5dec70>

In [48]:
# Flip test on the gap+ rows of the subject-gap subset (embedded clause).
gap_df = subjectgap_df.loc[subjectgap_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'embed_surprisal',
    'double gap embed subject gap position: gap+',
)


=== DOUBLE GAP EMBED SUBJECT GAP POSITION: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 80
Method: REML
No. Groups: 20
Scale: 0.1512
Min. group size: 4
Log-Likelihood: -63.8504
Max. group size: 4
Converged: Yes
Mean group size: 4.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.179,0.142,43.584,0.0,5.901,6.457
1,wh_numeric,0.05,0.043,1.145,0.252,-0.035,0.135



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.364,0.386


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5850a0>

In [49]:
# Division by grammaticality within the subject-gap subset (embedded clause).
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    subjectgap_df,
    iv_list,
    'embed_surprisal',
    'double gap embed subject gap position: division by grammaticality',
)


=== DOUBLE GAP EMBED SUBJECT GAP POSITION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.2031
Min. group size: 6
Log-Likelihood: -98.8679
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.356,0.133,47.649,0.0,6.094,6.617
1,grammaticality[T.gram-],-0.073,0.082,-0.891,0.373,-0.235,0.088



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.288,0.253


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e5a8880>

Object Gap Position

In [50]:
# Rebuild the object-gap subset (conditions a, c, d, e, g, h) from the
# current sentence_df so that it carries the `embed_surprisal` column.
objectgap_df = sentence_df.loc[
    sentence_df['condition'].isin(['a', 'c', 'd', 'e', 'g', 'h'])
]

In [51]:
# Flip test on the gap- rows of the object-gap subset (embedded clause).
no_gap_df = objectgap_df.loc[objectgap_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'embed_surprisal',
    'double gap embed object gap position: gap-',
)


=== DOUBLE GAP EMBED OBJECT GAP POSITION: GAP- ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 40
Method: REML
No. Groups: 20
Scale: 0.0098
Min. group size: 2
Log-Likelihood: -9.6069
Max. group size: 2
Converged: Yes
Mean group size: 2.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.599,0.128,51.675,0.0,6.349,6.85
1,wh_numeric,-0.01,0.016,-0.669,0.504,-0.041,0.02



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.321,1.512


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e629b20>

In [52]:
# Flip test on the gap+ rows of the object-gap subset (embedded clause).
gap_df = objectgap_df.loc[objectgap_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'embed_surprisal',
    'double gap embed object gap position: gap+',
)


=== DOUBLE GAP EMBED OBJECT GAP POSITION: GAP+ ===

=== MODEL SUMMARY ===
:            MixedLM Dependent Variable: embed_surprisal
No. Observations: 80
Method: REML
No. Groups: 20
Scale: 0.1782
Min. group size: 4
Log-Likelihood: -70.8933
Max. group size: 4
Converged: Yes
Mean group size: 4.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.195,0.159,38.897,0.0,5.883,6.508
1,wh_numeric,0.013,0.047,0.272,0.785,-0.08,0.105



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.463,0.448


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e625ca0>

In [53]:
# Division by grammaticality within the object-gap subset (embedded clause).
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    objectgap_df,
    iv_list,
    'embed_surprisal',
    'double gap embed object gap position: division by grammaticality',
)


=== DOUBLE GAP EMBED OBJECT GAP POSITION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.2016
Min. group size: 6
Log-Likelihood: -100.6706
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,6.342,0.148,42.879,0.0,6.052,6.632
1,grammaticality[T.gram-],-0.024,0.082,-0.294,0.769,-0.185,0.137



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.37,0.319


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x15382e62d8b0>

#### Wh-Islands

In [8]:
# Load the Wh-Islands items; a missing `comp_type` is stored as the literal
# string 'null' so it can serve as a category level later on.
sentence_df = pd.read_csv('test_sentences/Wh-Islands.csv').assign(
    comp_type=lambda frame: frame['comp_type'].fillna('null')
)

In [9]:
# split_ends turns the original `cont` text into two columns: the
# (rewritten) `cont` and a new `end`.
sentence_df[['cont', 'end']] = sentence_df['cont'].apply(split_ends)

# presumably adds the `wh_numeric` column used by the regressions below —
# confirm against encode_wh_licensor.
sentence_df = encode_wh_licensor(sentence_df)

# Recode the gap indicator so that 0 becomes -1 (other values are untouched).
sentence_df['gap_numeric'] = sentence_df['gap'].replace(to_replace=0, value=-1)

# Columns handed to merge_sentence to build the full sentence string.
syntactic_parts = [
    'prefix', 'filler_word', 'subj1', 'verb1', 'comp',
    'subj2', 'embed_verb', 'obj', 'cont',
]
sentence_df['sentence'] = sentence_df.apply(
    merge_sentence,
    axis=1,
    args=(syntactic_parts,),
)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)

# Ordered categorical with 'null' as the first level; the recorded regression
# output reports the other two levels relative to it.
sentence_df['comp_type'] = sentence_df['comp_type'].astype(
    pd.CategoricalDtype(categories=['null', 'that', 'whether'], ordered=True)
)
sentence_df['grammaticality'] = sentence_df.apply(assign_grammaticality, axis=1)

Post-Gap Region - Continuation

In [10]:
# Per-row surprisal of the post-gap `cont` region; 'obj' is passed as the
# third positional argument of sum_region_surprisal.
sentence_df['region_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, ['cont'], 'obj'),
    axis=1,
)

In [11]:
# Flip test on the gap- rows: wh_numeric and comp_type as predictors of the
# continuation-region surprisal (the recorded fit includes their interaction).
no_gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric', 'comp_type']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'region_surprisal',
    'wh-islands continuation: gap-',
)


=== WH-ISLANDS CONTINUATION: GAP- ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.2581
Min. group size: 6
Log-Likelihood: -161.0382
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,21.455,1.452,14.779,0.0,18.61,24.301
1,comp_type[T.that],-0.226,0.114,-1.991,0.047,-0.449,-0.003
2,comp_type[T.whether],-0.097,0.114,-0.853,0.394,-0.32,0.126
3,wh_numeric,0.101,0.08,1.261,0.207,-0.056,0.259
4,wh_numeric:comp_type[T.that],-0.298,0.114,-2.626,0.009,-0.521,-0.076
5,wh_numeric:comp_type[T.whether],-0.346,0.114,-3.049,0.002,-0.569,-0.124



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,42.023,29.427


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521eea4ad60>

In [12]:
# Flip test on the gap+ rows: wh_numeric and comp_type as predictors of the
# continuation-region surprisal (the recorded fit includes their interaction).
gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric', 'comp_type']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'region_surprisal',
    'wh-islands continuation: gap+',
)


=== WH-ISLANDS CONTINUATION: GAP+ ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.3763
Min. group size: 6
Log-Likelihood: -184.4054
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,31.405,1.935,16.227,0.0,27.612,35.198
1,comp_type[T.that],0.013,0.137,0.091,0.927,-0.256,0.281
2,comp_type[T.whether],-0.203,0.137,-1.477,0.14,-0.471,0.066
3,wh_numeric,-0.098,0.097,-1.012,0.312,-0.288,0.092
4,wh_numeric:comp_type[T.that],0.182,0.137,1.328,0.184,-0.087,0.451
5,wh_numeric:comp_type[T.whether],0.166,0.137,1.213,0.225,-0.102,0.435



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,74.721,43.329


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521ee9c1670>

In [13]:
# Division by grammaticality: continuation-region surprisal regressed on the
# grammaticality label over every Wh-Islands row.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'region_surprisal',
    'wh-islands continuation: division by grammaticality',
)


=== WH-ISLANDS CONTINUATION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 240
Method: REML
No. Groups: 20
Scale: 63.8355
Min. group size: 12
Log-Likelihood: -851.2849
Max. group size: 12
Converged: Yes
Mean group size: 12.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,26.41,1.204,21.929,0.0,24.05,28.771
1,grammaticality[T.gram-],-0.132,1.031,-0.128,0.898,-2.153,1.89



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,18.369,1.003


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521ee9bdb80>

Embedded Clause

In [14]:
# Columns that make up the embedded clause for the Wh-Islands items.
embed = [
    'subj1', 'verb1', 'comp', 'subj2',
    'embed_verb', 'obj', 'cont', 'end',
]

# Row-wise embedded-clause surprisal; `normalize=True` is forwarded to
# sum_region_surprisal through DataFrame.apply's keyword pass-through.
sentence_df['embed_surprisal'] = sentence_df.apply(
    sum_region_surprisal,
    axis=1,
    args=(embed,),
    normalize=True,
)

In [15]:
# Flip test on the gap- rows: wh_numeric and comp_type as predictors of the
# embedded-clause surprisal.
no_gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric', 'comp_type']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'embed_surprisal',
    'wh-islands embed: gap-',
)


=== WH-ISLANDS EMBED: GAP- ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.0160
Min. group size: 6
Log-Likelihood: 8.0954
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,8.807,0.207,42.6,0.0,8.401,9.212
1,comp_type[T.that],-0.046,0.028,-1.63,0.103,-0.102,0.009
2,comp_type[T.whether],0.084,0.028,2.974,0.003,0.029,0.14
3,wh_numeric,-0.135,0.02,-6.761,0.0,-0.174,-0.096
4,wh_numeric:comp_type[T.that],-0.023,0.028,-0.802,0.423,-0.078,0.033
5,wh_numeric:comp_type[T.whether],-0.046,0.028,-1.614,0.107,-0.101,0.01



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.847,2.385


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521ee9c94f0>

In [16]:
# Flip test on the gap+ rows: wh_numeric and comp_type as predictors of the
# embedded-clause surprisal.
gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric', 'comp_type']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'embed_surprisal',
    'wh-islands embed: gap+',
)


=== WH-ISLANDS EMBED: GAP+ ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.0207
Min. group size: 6
Log-Likelihood: -5.5394
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,8.794,0.224,39.338,0.0,8.355,9.232
1,comp_type[T.that],-0.069,0.032,-2.151,0.031,-0.132,-0.006
2,comp_type[T.whether],0.059,0.032,1.832,0.067,-0.004,0.122
3,wh_numeric,-0.202,0.023,-8.886,0.0,-0.247,-0.157
4,wh_numeric:comp_type[T.that],0.027,0.032,0.845,0.398,-0.036,0.09
5,wh_numeric:comp_type[T.whether],-0.002,0.032,-0.076,0.939,-0.065,0.061



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.989,2.452


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521eea1e7f0>

In [17]:
# Division by grammaticality: embedded-clause surprisal regressed on the
# grammaticality label over every Wh-Islands row.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'embed_surprisal',
    'wh-islands embed: division by grammaticality',
)


=== WH-ISLANDS EMBED: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 240
Method: REML
No. Groups: 20
Scale: 0.0728
Min. group size: 12
Log-Likelihood: -78.2575
Max. group size: 12
Converged: Yes
Mean group size: 12.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,8.787,0.213,41.222,0.0,8.369,9.205
1,grammaticality[T.gram-],0.036,0.035,1.026,0.305,-0.033,0.104



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,0.897,1.131


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521eea1cf70>

#### Adjunct Islands 

In [18]:
# Load the Adjunct Islands items; a missing `adjunct_type` is stored as the
# literal string 'null' so it can serve as a category level later on.
sentence_df = pd.read_csv('test_sentences/Adjunct Islands.csv').assign(
    adjunct_type=lambda frame: frame['adjunct_type'].fillna('null')
)

In [19]:
# split_ends turns the original `cont` text into two columns: the
# (rewritten) `cont` and a new `end`.
sentence_df[['cont', 'end']] = sentence_df['cont'].apply(split_ends)

# Split the trigger by adjunct position: rows whose adjunct_type is 'front'
# keep their trigger in `adjunct_front_trigger` (NaN elsewhere) ...
sentence_df['adjunct_front_trigger'] = sentence_df['adjunct_trigger'].where(
    sentence_df['adjunct_type'] == 'front'
)
# ... and every other row keeps it in the original column (NaN for 'front'),
# which is then renamed to `adjunct_back_trigger`.
sentence_df['adjunct_trigger'] = sentence_df['adjunct_trigger'].mask(
    sentence_df['adjunct_type'] == 'front'
)
sentence_df = sentence_df.rename(columns={'adjunct_trigger': 'adjunct_back_trigger'})

# Move `end` to the last column position.
sentence_df = sentence_df[
    [name for name in sentence_df.columns if name != 'end'] + ['end']
]

In [20]:
# presumably adds the `wh_numeric` column used by the regressions below —
# confirm against encode_wh_licensor.
sentence_df = encode_wh_licensor(sentence_df)

# Recode the gap indicator so that 0 becomes -1 (other values are untouched).
sentence_df['gap_numeric'] = sentence_df['gap'].replace(to_replace=0, value=-1)

# Columns handed to merge_sentence to build the full sentence string.
syntactic_parts = [
    'prefix', 'filler_word', 'adjunct_back_trigger', 'prep', 'subj',
    'mod', 'verb', 'obj', 'cont', 'adjunct_front_trigger',
]
sentence_df['sentence'] = sentence_df.apply(
    merge_sentence,
    axis=1,
    args=(syntactic_parts,),
)
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)

# Ordered categorical with 'null' as the first level; the recorded regression
# output reports the other two levels relative to it.
sentence_df['adjunct_type'] = sentence_df['adjunct_type'].astype(
    pd.CategoricalDtype(categories=['null', 'back', 'front'], ordered=True)
)
sentence_df['grammaticality'] = sentence_df.apply(assign_grammaticality, axis=1)

Post-Gap Region - Continuation

In [21]:
# Per-row surprisal of the post-gap `cont` region; 'obj' is passed as the
# third positional argument of sum_region_surprisal.
sentence_df['region_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, ['cont'], 'obj'),
    axis=1,
)

In [22]:
# Flip test on the gap- rows: wh_numeric and adjunct_type as predictors of the
# continuation-region surprisal (the recorded fit includes their interaction).
no_gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric', 'adjunct_type']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'region_surprisal',
    'adjunct islands continuation: gap-',
)


=== ADJUNCT ISLANDS CONTINUATION: GAP- ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.4314
Min. group size: 6
Log-Likelihood: -194.4080
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,32.665,2.327,14.037,0.0,28.104,37.226
1,adjunct_type[T.back],-0.059,0.147,-0.4,0.689,-0.347,0.229
2,adjunct_type[T.front],0.11,0.147,0.752,0.452,-0.177,0.398
3,wh_numeric,0.081,0.104,0.784,0.433,-0.122,0.285
4,wh_numeric:adjunct_type[T.back],-0.052,0.147,-0.355,0.722,-0.34,0.236
5,wh_numeric:adjunct_type[T.front],-0.049,0.147,-0.334,0.738,-0.337,0.239



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,108.092,58.52


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521eea171f0>

In [23]:
# Flip test on the gap+ rows: wh_numeric and adjunct_type as predictors of the
# continuation-region surprisal (the recorded fit includes their interaction).
gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric', 'adjunct_type']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'region_surprisal',
    'adjunct islands continuation: gap+',
)


=== ADJUNCT ISLANDS CONTINUATION: GAP+ ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.4485
Min. group size: 6
Log-Likelihood: -192.4646
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,42.796,1.907,22.439,0.0,39.058,46.534
1,adjunct_type[T.back],-0.042,0.15,-0.282,0.778,-0.336,0.251
2,adjunct_type[T.front],0.063,0.15,0.419,0.675,-0.231,0.356
3,wh_numeric,0.01,0.106,0.093,0.926,-0.198,0.217
4,wh_numeric:adjunct_type[T.back],-0.007,0.15,-0.049,0.961,-0.301,0.286
5,wh_numeric:adjunct_type[T.front],0.061,0.15,0.406,0.685,-0.233,0.354



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,72.522,38.527


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521ee9a9a90>

In [24]:
# Division by grammaticality: continuation-region surprisal regressed on the
# grammaticality label over every Adjunct Islands row.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'region_surprisal',
    'adjunct islands continuation: division by grammaticality',
)


=== ADJUNCT ISLANDS CONTINUATION: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: region_surprisal
No. Observations: 240
Method: REML
No. Groups: 20
Scale: 67.3039
Min. group size: 12
Log-Likelihood: -864.7407
Max. group size: 12
Converged: Yes
Mean group size: 12.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,37.732,1.713,22.029,0.0,34.375,41.089
1,grammaticality[T.gram-],0.02,1.059,0.019,0.985,-2.056,2.096



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,47.459,2.188


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521ec292130>

Embedded Clause

In [25]:
# Columns that make up the embedded clause for the Adjunct Islands items.
embed = [
    'adjunct_back_trigger', 'prep', 'subj', 'mod', 'verb',
    'obj', 'cont', 'adjunct_front_trigger', 'end',
]

# Row-wise embedded-clause surprisal; `normalize=True` is forwarded to
# sum_region_surprisal through DataFrame.apply's keyword pass-through.
sentence_df['embed_surprisal'] = sentence_df.apply(
    sum_region_surprisal,
    axis=1,
    args=(embed,),
    normalize=True,
)

In [26]:
# Flip test on the gap- rows: wh_numeric and adjunct_type as predictors of the
# embedded-clause surprisal.
no_gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(-1)]
iv_list = ['wh_numeric', 'adjunct_type']
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'embed_surprisal',
    'adjunct islands embed: gap-',
)


=== ADJUNCT ISLANDS EMBED: GAP- ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.0549
Min. group size: 6
Log-Likelihood: -53.2948
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,8.967,0.242,37.129,0.0,8.493,9.44
1,adjunct_type[T.back],0.613,0.052,11.697,0.0,0.51,0.715
2,adjunct_type[T.front],0.679,0.052,12.955,0.0,0.576,0.781
3,wh_numeric,0.015,0.037,0.399,0.69,-0.058,0.087
4,wh_numeric:adjunct_type[T.back],0.017,0.052,0.322,0.747,-0.086,0.12
5,wh_numeric:adjunct_type[T.front],0.089,0.052,1.706,0.088,-0.013,0.192



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,1.139,1.742


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521ec308190>

In [27]:
# Flip test on the gap+ rows: wh_numeric and adjunct_type as predictors of the
# embedded-clause surprisal.
gap_df = sentence_df.loc[sentence_df['gap_numeric'].eq(1)]
iv_list = ['wh_numeric', 'adjunct_type']
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'embed_surprisal',
    'adjunct islands embed: gap+',
)


=== ADJUNCT ISLANDS EMBED: GAP+ ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 120
Method: REML
No. Groups: 20
Scale: 0.0662
Min. group size: 6
Log-Likelihood: -61.4419
Max. group size: 6
Converged: Yes
Mean group size: 6.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,8.345,0.233,35.799,0.0,7.888,8.802
1,adjunct_type[T.back],0.866,0.058,15.063,0.0,0.754,0.979
2,adjunct_type[T.front],0.947,0.058,16.459,0.0,0.834,1.059
3,wh_numeric,0.01,0.041,0.253,0.801,-0.069,0.09
4,wh_numeric:adjunct_type[T.back],0.021,0.058,0.358,0.72,-0.092,0.133
5,wh_numeric:adjunct_type[T.front],0.102,0.058,1.766,0.077,-0.011,0.214



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,1.054,1.471


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521ec300100>

In [28]:
# Grammaticality contrast on the full frame (no gap filtering):
# `grammaticality` is the only predictor of `embed_surprisal`.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    sentence_df,
    iv_list,
    'embed_surprisal',
    'adjunct islands embed: division by grammaticality',
)


=== ADJUNCT ISLANDS EMBED: DIVISION BY GRAMMATICALITY ===

=== MODEL SUMMARY ===
Model: MixedLM
Dependent Variable: embed_surprisal
No. Observations: 240
Method: REML
No. Groups: 20
Scale: 0.3265
Min. group size: 12
Log-Likelihood: -243.9673
Max. group size: 12
Converged: Yes
Mean group size: 12.0

=== FIXED EFFECTS COEFFICIENTS ===


Unnamed: 0,Parameter,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
0,Intercept,9.174,0.232,39.595,0.0,8.72,9.628
1,grammaticality[T.gram-],-0.001,0.074,-0.011,0.992,-0.145,0.144



=== RANDOM EFFECTS / VARIANCE-COVARIANCE ===


Unnamed: 0,Parameter,Coef.,Std.Err.
0,Group Var,1.019,0.619


<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x1521ec2fb9a0>

#### Complex NP Islands

In [None]:
# Load the Complex NP Islands test sentences. This rebinds `sentence_df`, replacing the
# frame used by the preceding section; the cells below all work from this frame.
sentence_df = pd.read_csv(filepath_or_buffer='test_sentences/Complex NP Islands.csv')

In [None]:
# Derive the analysis columns on the freshly loaded frame.
# NOTE(review): encode_wh_licensor is defined elsewhere; presumably it adds the
# `wh_numeric` column used by the regressions below — confirm against its definition.
sentence_df = encode_wh_licensor(sentence_df)

# Recode the gap indicator: 0 becomes -1, every other value is left as is.
sentence_df['gap_numeric'] = sentence_df['gap'].replace(to_replace=0, value=-1)

# Row-wise helpers: merge_sentence fills `sentence`, which calculate_surprisal then scores;
# assign_grammaticality labels each row.
sentence_df['sentence'] = sentence_df.apply(merge_sentence, axis='columns')
sentence_df['surprisals'] = sentence_df['sentence'].apply(calculate_surprisal)
sentence_df['grammaticality'] = sentence_df.apply(assign_grammaticality, axis='columns')

Subject Condition

In [None]:
# Independent working copy for the subject-condition analysis, so that columns added to
# it do not show up on sentence_df.
subject_df = sentence_df.copy(deep=True)

In [None]:
# Per-row surprisal for the 'subj_setup' region, stored as `region_surprisal`.
# NOTE(review): sum_region_surprisal is defined elsewhere; presumably it adds up the
# row's surprisals over the listed regions — confirm.
subject_df['region_surprisal'] = subject_df.apply(
    lambda row: sum_region_surprisal(row, ['subj_setup']),
    axis='columns',
)
subject_df

In [None]:
# Keep only the rows whose `subj_obj` value is 'subject'.
subject_df = subject_df.loc[subject_df['subj_obj'].eq('subject')]

In [None]:
# Flip test, gap- half (subject condition): rows coded gap_numeric == -1,
# with wh_numeric as the only predictor of `region_surprisal`.
iv_list = ['wh_numeric']
no_gap_df = subject_df.loc[subject_df['gap_numeric'].eq(-1)]
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'region_surprisal',
    'complex np islands subject condition: gap-',
)

In [None]:
# Flip test, gap+ half (subject condition): rows coded gap_numeric == 1,
# with wh_numeric as the only predictor of `region_surprisal`.
iv_list = ['wh_numeric']
gap_df = subject_df.loc[subject_df['gap_numeric'].eq(1)]
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'region_surprisal',
    'complex np islands subject condition: gap+',
)

In [None]:
# Grammaticality contrast over all subject-condition rows (both gap codings):
# `grammaticality` is the only predictor of `region_surprisal`.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    subject_df,
    iv_list,
    'region_surprisal',
    'complex np islands subject condition: division by grammaticality',
)

Object Condition

In [None]:
# Independent working copy for the object-condition analysis, so that columns added to
# it do not show up on sentence_df.
object_df = sentence_df.copy(deep=True)

In [None]:
# Per-row surprisal for the 'end' region, stored as `region_surprisal`.
# NOTE(review): sum_region_surprisal is defined elsewhere; presumably it adds up the
# row's surprisals over the listed regions — confirm.
object_df['region_surprisal'] = object_df.apply(
    lambda row: sum_region_surprisal(row, ['end']),
    axis='columns',
)
object_df

In [None]:
# Keep only the rows whose `subj_obj` value is 'object'.
object_df = object_df.loc[object_df['subj_obj'].eq('object')]

In [None]:
# Flip test, gap- half (object condition): rows coded gap_numeric == -1,
# with wh_numeric as the only predictor of `region_surprisal`.
iv_list = ['wh_numeric']
no_gap_df = object_df.loc[object_df['gap_numeric'].eq(-1)]
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'region_surprisal',
    'complex np islands object condition: gap-',
)

In [None]:
# Flip test, gap+ half (object condition): rows coded gap_numeric == 1,
# with wh_numeric as the only predictor of `region_surprisal`.
iv_list = ['wh_numeric']
gap_df = object_df.loc[object_df['gap_numeric'].eq(1)]
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'region_surprisal',
    'complex np islands object condition: gap+',
)

In [None]:
# Grammaticality contrast over all object-condition rows (both gap codings):
# `grammaticality` is the only predictor of `region_surprisal`.
iv_list = ['grammaticality']
mixed_effects_linear_regression(
    object_df,
    iv_list,
    'region_surprisal',
    'complex np islands object condition: division by grammaticality',
)

Embedded Clause

In [None]:
# Regions passed to sum_region_surprisal for the embedded-clause measure.
embed = [
    'subj',
    'that_rc',
    'what_rc',
    'rc_np',
    'prep',
    'prep_np',
    'subj_setup',
    'obj_setup',
    'end',
]
# NOTE(review): `normalize=True` is forwarded to sum_region_surprisal, which is defined
# elsewhere; what it normalizes by is not visible here — confirm.
sentence_df['embed_surprisal'] = sentence_df.apply(
    lambda row: sum_region_surprisal(row, embed, normalize=True),
    axis='columns',
)
sentence_df

Embedded Clause - Subject Condition

In [None]:
# Subject-condition rows, taken from sentence_df (which carries `embed_surprisal`).
# NOTE(review): this rebinds `subject_df`; as far as the visible cells show, the new frame
# lacks the `region_surprisal` column that was added to the earlier subject_df copy.
subject_df = sentence_df.loc[sentence_df['subj_obj'].eq('subject')]

In [None]:
# Flip test, gap- half (embedded clause, subject condition): rows coded gap_numeric == -1,
# with wh_numeric as the only predictor of `embed_surprisal`.
no_gap_df = subject_df[subject_df['gap_numeric'] == -1]
iv_list = ['wh_numeric']
# The title carries 'embed' (as the adjunct-islands embed runs do) so this printed banner
# is distinguishable from the region_surprisal run for the same condition.
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'embed_surprisal',
    'complex np islands embed subject condition: gap-',
)

In [None]:
# Flip test, gap+ half (embedded clause, subject condition): rows coded gap_numeric == 1,
# with wh_numeric as the only predictor of `embed_surprisal`.
gap_df = subject_df[subject_df['gap_numeric'] == 1]
iv_list = ['wh_numeric']
# The title carries 'embed' (as the adjunct-islands embed runs do) so this printed banner
# is distinguishable from the region_surprisal run for the same condition.
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'embed_surprisal',
    'complex np islands embed subject condition: gap+',
)

In [None]:
# Grammaticality contrast over all subject-condition rows (both gap codings):
# `grammaticality` is the only predictor of `embed_surprisal`.
iv_list = ['grammaticality']
# The title carries 'embed' (as the adjunct-islands embed runs do) so this printed banner
# is distinguishable from the region_surprisal run for the same condition.
mixed_effects_linear_regression(
    subject_df,
    iv_list,
    'embed_surprisal',
    'complex np islands embed subject condition: division by grammaticality',
)

Embedded Clause - Object Condition

In [None]:
# Object-condition rows, taken from sentence_df (which carries `embed_surprisal`).
# NOTE(review): this rebinds `object_df`; as far as the visible cells show, the new frame
# lacks the `region_surprisal` column that was added to the earlier object_df copy.
object_df = sentence_df.loc[sentence_df['subj_obj'].eq('object')]

In [None]:
# Flip test, gap- half (embedded clause, object condition): rows coded gap_numeric == -1,
# with wh_numeric as the only predictor of `embed_surprisal`.
no_gap_df = object_df[object_df['gap_numeric'] == -1]
iv_list = ['wh_numeric']
# The title carries 'embed' (as the adjunct-islands embed runs do) so this printed banner
# is distinguishable from the region_surprisal run for the same condition.
mixed_effects_linear_regression(
    no_gap_df,
    iv_list,
    'embed_surprisal',
    'complex np islands embed object condition: gap-',
)

In [None]:
# Flip test, gap+ half (embedded clause, object condition): rows coded gap_numeric == 1,
# with wh_numeric as the only predictor of `embed_surprisal`.
gap_df = object_df[object_df['gap_numeric'] == 1]
iv_list = ['wh_numeric']
# The title carries 'embed' (as the adjunct-islands embed runs do) so this printed banner
# is distinguishable from the region_surprisal run for the same condition.
mixed_effects_linear_regression(
    gap_df,
    iv_list,
    'embed_surprisal',
    'complex np islands embed object condition: gap+',
)

In [None]:
# Grammaticality contrast over all object-condition rows (both gap codings):
# `grammaticality` is the only predictor of `embed_surprisal`.
iv_list = ['grammaticality']
# The title carries 'embed' (as the adjunct-islands embed runs do) so this printed banner
# is distinguishable from the region_surprisal run for the same condition.
mixed_effects_linear_regression(
    object_df,
    iv_list,
    'embed_surprisal',
    'complex np islands embed object condition: division by grammaticality',
)