In [1]:
# Imports and global configuration for the notebook.
import pandas as pd
from tqdm import tqdm
import os
import openai
import numpy as np
import pickle as pk
import seaborn as sns
import matplotlib.pyplot as plt
import json
# API keys are read from a local keys.json and mirrored into the process environment.
# NOTE(review): the handle returned by open() here is never explicitly closed.
keys = json.load(open("keys.json"))
os.environ["OPENAI_API_KEY"]=keys["OPENAI_API_KEY"]
openai.api_key = os.environ.get("OPENAI_API_KEY")
import together
os.environ["TOGETHER_API_KEY"]=keys["TOGETHER_API_KEY"]
together.api_key = os.environ.get("TOGETHER_API_KEY")
# pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
# Opt in to pandas' future behaviour so replace() does not silently downcast object columns.
pd.set_option('future.no_silent_downcasting', True)
from scipy.stats import chisquare

In [2]:
# Load the four response tables. Later plotting cells label data3/data4/data5 as
# "noconstraints"/"similar"/"divergent", matching the file names used here.
data2 = pd.read_csv("../csvs/hills.csv")
data3 = pd.read_csv("../csvs/noconstraints.csv")
data4 = pd.read_csv("../csvs/similar.csv")
data5 = pd.read_csv("../csvs/divergent.csv")

In [12]:
# Exploratory load of the pickled feature dictionary into a frame (one row per key).
# NOTE(review): get_featuredf() below repeats this load and additionally maps 'TRUE' -> 1;
# this cell does not, and it leaves the file handle unclosed.
featuredict = pk.load(open(f"../files/vf_features.pk", "rb"))
featuredf = pd.DataFrame.from_dict(featuredict, orient='index')
# Normalise the textual True/False variants to 1/0.
featuredf = featuredf.replace({'True': 1, 'True.': 1, 'False': 0, 'False.': 0})

In [None]:
# Keep only the rows that contain at least one non-int entry (the inverse of the filter in
# get_featuredf). This overwrites `featuredf` with just those rows.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases.
featuredf = featuredf[~featuredf.applymap(lambda x: isinstance(x, int)).all(axis=1)]

## Analyse features

In [None]:
def get_featuredf(path="../files/vf_features.pk"):
    """Load the pickled feature dictionary and keep only fully integer-coded rows.

    Parameters
    ----------
    path : str, optional
        Location of the pickle holding ``{item: {feature_name: value}}``.
        Defaults to the file the notebook has always used.

    Returns
    -------
    tuple
        ``(featuredf, feature_columns)``: the filtered frame (one row per item) and the
        list of its column names.

    Textual truth values ('True', 'True.', 'TRUE', 'False', 'False.') are mapped to 1/0
    first; any row that still holds a non-int entry afterwards is dropped. The number of
    items loaded and the number kept are printed so silent drops are visible.
    """
    # NOTE(review): pickle can execute arbitrary code on load; only open trusted files.
    with open(path, "rb") as handle:
        featuredict = pk.load(handle)
    featuredf = pd.DataFrame.from_dict(featuredict, orient='index')
    featuredf = featuredf.replace({'True': 1, 'True.': 1, 'TRUE': 1, 'False': 0, 'False.': 0})
    # Single row-wise validity pass (the former applymap-based pass did the same check
    # and relied on a deprecated API).
    is_int_row = featuredf.apply(lambda row: row.map(lambda x: isinstance(x, int)).all(), axis=1)
    featuredf = featuredf[is_int_row]

    # featuredf["feature_Is reptile"] = (featuredf["feature_Is reptile"] | featuredf["feature_Is amphibian"]).astype(int)
    # featuredf.rename(columns={"feature_Is reptile": "feature_Is reptile or amphibian"}, inplace=True)
    # featuredf.drop(columns=["feature_Is amphibian"], inplace=True)
    print(len(featuredict), len(featuredf))
    # assert len(featuredict) == len(featuredf)
    return featuredf, featuredf.columns.tolist()

# Validated feature table plus the list of its feature column names.
vf_featuredf, vf_featurecols = get_featuredf()

In [None]:
# Display the validated feature table.
vf_featuredf

In [None]:
def correlation_matrix(df):
    """Plot the lower triangle of ``df``'s pairwise correlations and return the full matrix.

    The heatmap uses a fixed colour scale of [-1, 1]. The upper triangle and the diagonal
    are hidden in the figure; the returned frame is the complete ``df.corr()`` result.
    """
    pairwise = df.corr()
    # Blank the strict upper triangle of the values handed to the heatmap.
    strict_upper = np.triu(np.ones(pairwise.shape), k=1)
    lower_part = pairwise.where(strict_upper == 0)
    plt.figure(figsize=(15, 10))
    # The plotting mask additionally hides the diagonal.
    hidden_cells = np.triu(np.ones_like(lower_part, dtype=bool))
    sns.heatmap(lower_part, annot=False, cmap='RdBu', fmt=".1f", mask=hidden_cells, vmin=-1, vmax=1)
    plt.show()
    return pairwise

# Feature-by-feature correlation matrix (also draws the heatmap).
vf_featuredf_corr = correlation_matrix(vf_featuredf)

In [None]:
# Fraction of distinct feature pairs that are strongly correlated.
# Only the strict lower triangle is used, so the diagonal (self-correlation of 1) is
# excluded and each pair is counted once.
corr_values = vf_featuredf_corr.to_numpy()
pair_corrs = corr_values[np.tril_indices_from(corr_values, k=-1)]

# % of feature pairs with correlation greater than 0.5
print(np.mean(pair_corrs > 0.5))

# % of feature pairs with correlation less than -0.5
print(np.mean(pair_corrs < -0.5))

In [None]:
def get_highly_correlated_columns(corr, threshold=0.75):
    """Map each column to the first earlier column it is strongly correlated with.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix whose index and columns carry the same labels.
    threshold : float
        Minimum absolute correlation for a pair to count as redundant.

    Returns
    -------
    dict
        ``{column: earlier_column}`` for every column whose absolute correlation with
        some column to its left is at least ``threshold``. Only the first such earlier
        column is recorded, and each match is printed as it is found.
    """
    # Possible refinement (not implemented): prefer keeping the column that is least
    # correlated with all the others.
    no_match = object()
    labels = list(corr.columns)
    redundant = {}

    for position, label in enumerate(labels):
        partner = next(
            (earlier for earlier in labels[:position] if abs(corr.loc[label, earlier]) >= threshold),
            no_match,
        )
        if partner is no_match:
            continue
        redundant[label] = partner
        print(label, partner)

    return redundant

# Columns that duplicate an earlier column at the default 0.75 threshold.
high_corr_columns_vf = get_highly_correlated_columns(vf_featuredf_corr)

In [21]:
# Start the removal list with the redundant (highly correlated) columns.
remove_columns_vf = list(high_corr_columns_vf.keys())

## Add features to responses

In [22]:
def add_features_to_responsedf(df, featuredf=None, featurecols=None):
    """Attach each response's feature vector to the response table.

    Parameters
    ----------
    df : pandas.DataFrame
        Response table with a ``'response'`` column used to look up features.
    featuredf : pandas.DataFrame, optional
        Feature table indexed by response text. Defaults to the notebook-level
        ``vf_featuredf``.
    featurecols : list of str, optional
        Feature columns that must be present (non-missing) on every kept row.
        Defaults to the notebook-level ``vf_featurecols``.

    Returns
    -------
    tuple
        ``(df_with_features, dropped_rows)``. Rows whose response has no entry in the
        feature table (or has a missing feature) are removed from the first frame and
        returned in the second. Feature columns of the kept rows are cast to ``int``.
    """
    if featuredf is None:
        featuredf = vf_featuredf
    if featurecols is None:
        featurecols = vf_featurecols

    featuredict = featuredf.to_dict(orient='index')
    mapped_features = df['response'].map(featuredict)
    # Unknown responses map to NaN; turn them into empty records so they become NaN rows.
    records = [entry if isinstance(entry, dict) else {} for entry in mapped_features]
    # Reuse df's index so the column-wise concat lines rows up even when df does not
    # carry a default RangeIndex (e.g. after filtering).
    fc = pd.DataFrame(records, index=df.index)
    # Guarantee every feature column exists even if no response matched at all.
    fc = fc.reindex(columns=list(featuredf.columns))
    df = pd.concat([df, fc], axis=1)
    df = df.replace({'True': 1, 'True.': 1, 'False': 0, 'False.': 0})
    dropped_rows = df[df[featurecols].isna().any(axis=1)]
    # astype on the dropna result returns a new frame, avoiding assignment on a slice.
    df = df.dropna(subset=featurecols).astype({col: int for col in featurecols})
    return df, dropped_rows

# Attach feature columns to every response table; rows without features are kept aside.
# NOTE(review): each dataN is overwritten, so re-running this cell concatenates the
# feature columns a second time.
data2, dropped_rows2 = add_features_to_responsedf(data2)
data3, dropped_rows3 = add_features_to_responsedf(data3)
data4, dropped_rows4 = add_features_to_responsedf(data4)
data5, dropped_rows5 = add_features_to_responsedf(data5)

In [28]:
# Mean of every "feature_"-prefixed column across data2's responses.
feature_means = data2[[col for col in data2.columns if col.startswith("feature_")]].mean()

In [None]:
# Display the per-feature means as a plain list.
feature_means.tolist()

In [10]:
# Also remove near-constant features: those present in more than 95% or fewer than 5% of
# data2's responses. 'feature_Is insect' is always kept.
# The exemption is written as a separate condition because the former
# `A or B and C` form only exempted the low-mean branch (`and` binds tighter than `or`).
# Columns already queued for removal are skipped so re-running the cell adds no duplicates.
response_feature_means = data2[vf_featurecols].mean()
near_constant_columns = [
    col for col in vf_featurecols
    if col != 'feature_Is insect'
    and (response_feature_means[col] > 0.95 or response_feature_means[col] < 0.05)
    and col not in remove_columns_vf
]
remove_columns_vf.extend(near_constant_columns)

In [None]:
# Display the columns queued for removal.
remove_columns_vf

In [12]:
# Drop the queued columns from the feature table.
# NOTE(review): vf_featurecols is not updated here, and running this cell twice raises
# KeyError because the columns are already gone.
vf_featuredf = vf_featuredf.drop(columns = remove_columns_vf)

In [None]:
# Display the reduced feature table.
vf_featuredf

In [14]:
# Back to {row label: {feature: value}} form for pickling.
featuredict = vf_featuredf.to_dict(orient='index')

In [18]:
# Persist the reduced feature dictionary; the context manager closes (and flushes) the file.
with open("../files/vf_features_updated.pk", "wb") as handle:
    pk.dump(featuredict, handle)

In [None]:
def plot_means(df, featuredf, featurecols):
    """Draw side-by-side histograms of per-feature means.

    The left panel uses the column means of ``featuredf``; the right panel uses the
    column means of ``df[featurecols]``.
    """
    _, (left, right) = plt.subplots(1, 2, figsize=(6,2))

    table_means = featuredf.mean(axis=0).values
    left.hist(table_means)
    left.set_xlabel("P(feature) = 1")
    left.set_ylabel("Number of features")

    response_means = df[featurecols].mean(axis=0).values
    right.hist(response_means)
    right.set_xlabel("P(feature) = 1 in responses")

# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook —
# confirm it is defined before this cell runs.
plot_means(data_vf, vf_featuredf, vf_featurecols)

In [None]:
def find_consecutive_ones(df, feature_col):
    """Count runs of consecutive 1s in ``feature_col``, pooled over participants.

    Runs are measured separately within each ``pid`` (rows of different participants
    never join into one run) and then tallied together.

    Returns
    -------
    dict
        ``{run_length: number_of_runs_with_that_length}``.
    """
    run_tally = {}

    def close_run(length):
        # A zero-length "run" means no 1s were pending; nothing to record.
        if length > 0:
            run_tally[length] = run_tally.get(length, 0) + 1

    for subject in df["pid"].unique():
        streak = 0
        for value in df.loc[df["pid"] == subject, feature_col].values:
            if value == 1:
                streak += 1
            else:
                close_run(streak)
                streak = 0
        # A participant's sequence may end while a run is still open.
        close_run(streak)

    return run_tally

def make_persistance_plots(df, featuredf, featurecols):
    """Plot, per feature, the observed run-length distribution against a geometric baseline.

    For every column in ``featurecols`` the function prints the column name with its mean
    in the responses (``p``) and in the feature table, then plots:

    * "Data"   - the share of runs of consecutive 1s having each observed length;
    * "Random" - ``p**(k-1) * (1-p)`` for run lengths ``k = 1..max observed``, i.e. the
      length distribution expected if each response carried the feature independently
      with probability ``p``.

    Features with no run of 1s at all are reported but not plotted.
    """
    # Means do not change inside the loop, so compute them once.
    response_means = df[featurecols].mean(axis=0)
    table_means = featuredf.mean(axis=0)

    for col in featurecols:
        p = response_means.loc[col]   # response df
        p_ = table_means.loc[col]     # feature df

        print(col, p, p_)

        fco = dict(sorted(find_consecutive_ones(df, col).items()))
        if not fco:
            # No 1s for this feature: there is no run-length distribution to draw.
            continue

        run_lengths = np.array(list(fco.keys()))
        run_counts = np.array(list(fco.values()))

        plt.figure(figsize = (2,2))
        plt.plot(run_lengths, run_counts / np.sum(run_counts), label = "Data")

        x = np.arange(1, run_lengths[-1] + 1)
        # Run lengths start at 1, so a run of length k needs k-1 further successes
        # followed by one failure (same form as make_persistance_plots_conditional).
        geometric_pdf = (p ** (x - 1)) * (1 - p)
        plt.plot(x, geometric_pdf, label = "Random")

        plt.legend()
        plt.show()

def make_persistance_plots_conditional(df, featuredf, featurecols):
    """Compare observed run lengths with a geometric model and flag features that fit it.

    For each feature the observed run-length shares and the geometric probabilities
    ``p**(k-1) * (1-p)`` (evaluated only at the observed lengths) are plotted, and a
    chi-square test compares the observed counts with the geometric expectation rescaled
    to the same total.

    Returns
    -------
    list
        Features whose chi-square p-value exceeds 0.01.
    """
    indistinguishable = []
    for col in featurecols:
        plt.figure(figsize = (2,2))
        p = df[featurecols].mean(axis=0).loc[col]   # response df
        p_ = featuredf.mean(axis=0).loc[col]        # feature df

        print(col, p, p_)

        fco = dict(sorted(find_consecutive_ones(df, col).items()))
        print(fco)
        run_lengths = np.array(list(fco.keys()))
        run_counts = np.array(list(fco.values()))
        total_runs = np.sum(run_counts)

        plt.plot(run_lengths, run_counts / total_runs, label = "Data")

        geometric_pdf = (p ** (run_lengths - 1)) * (1 - p)
        plt.plot(run_lengths, geometric_pdf, label = "Random")

        # chisquare needs expected counts that sum to the observed total.
        expected_counts = geometric_pdf / np.sum(geometric_pdf) * total_runs
        chi2_stat, p_value = chisquare(run_counts, f_exp=expected_counts)
        print(f"Chi-Square Statistic: {chi2_stat}, p-value: {p_value}")
        if p_value > 0.01:
            indistinguishable.append(col)

        plt.legend()
        plt.show()
    return indistinguishable

def make_persistance_plots_hazard(df, featuredf, featurecols):
    """Plot, per feature, the empirical hazard of a run ending against a constant baseline.

    The hazard at run length ``k`` is the number of runs of exactly length ``k`` divided
    by the number of runs at least that long. The "Random" line is the constant ``1 - p``,
    with ``p`` the feature's mean in the responses.
    """
    for col in featurecols:
        plt.figure(figsize = (2,2))
        p = df[featurecols].mean(axis=0).loc[col]   # response df
        p_ = featuredf.mean(axis=0).loc[col]        # feature df

        print(col, p, p_)

        fco = dict(sorted(find_consecutive_ones(df, col).items()))
        run_lengths = list(fco.keys())

        still_running = sum(fco.values())
        hazard = []
        for ended_here in fco.values():
            hazard.append(ended_here / still_running)
            still_running -= ended_here

        plt.plot(run_lengths, hazard, label = "Data")

        x = np.arange(1, run_lengths[-1] + 1)
        constant_hazard = [(1 - p)] * len(x)
        plt.plot(x, constant_hazard, label = "Random")

        plt.legend()
        plt.show()

In [None]:
# remove_columns_vf.extend(make_persistance_plots_conditional(data_vf, vf_featuredf, vf_featurecols))

In [None]:
# make_persistance_plots_hazard(data_vf, vf_featuredf, vf_featurecols)

In [None]:
# remove_columns_autbrick.extend(make_persistance_plots_conditional(data_autbrick, autbrick_featuredf, autbrick_featurecols))

In [None]:
# make_persistance_plots_hazard(data_autbrick, autbrick_featuredf, autbrick_featurecols)

In [None]:
# remove_columns_autpaperclip.extend(make_persistance_plots_conditional(data_autpaperclip, autpaperclip_featuredf, autpaperclip_featurecols))

In [None]:
# make_persistance_plots_hazard(data_autpaperclip, autpaperclip_featuredf, autpaperclip_featurecols)

# Remove features

In [None]:
# Remove the queued columns from the feature table, the response table and the column list.
# errors="ignore" keeps the cell runnable top-to-bottom: an earlier cell has already
# dropped these columns from vf_featuredf, and a plain drop would raise KeyError.
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
vf_featuredf = vf_featuredf.drop(columns = remove_columns_vf, errors="ignore")
data_vf = data_vf.drop(columns = remove_columns_vf, errors="ignore")
vf_featurecols = [item for item in vf_featurecols if item not in remove_columns_vf]

# autbrick_featuredf = autbrick_featuredf.drop(columns = remove_columns_autbrick)
# # add data line
# autbrick_featurecols = [item for item in autbrick_featurecols if item not in remove_columns_autbrick]

# autpaperclip_featuredf = autpaperclip_featuredf.drop(columns = remove_columns_autpaperclip)
# # add data line
# autpaperclip_featurecols = [item for item in autpaperclip_featurecols if item not in remove_columns_autpaperclip]

In [None]:
# Number of feature columns left after removal.
print(len(vf_featurecols)) #, len(autbrick_featurecols), len(autpaperclip_featurecols))

In [None]:
# Final feature list: the two reptile/amphibian columns are inserted after the first
# three surviving columns.
# NOTE(review): if either name is still present in vf_featurecols it will appear twice.
final_features = vf_featurecols[:3] + ["feature_Is reptile", "feature_Is amphibian"] + vf_featurecols[3:]
# The context manager closes (and flushes) the file once the dump is written.
with open("../files/vf_final_features.pk", "wb") as handle:
    pk.dump(final_features, handle)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'num_features_same'`` to ``df`` by running the 1-back comparison per ``pid``.

    A placeholder column is first written onto ``df`` itself (in place); the returned
    frame is the per-participant results stitched together without group keys.
    """
    df['num_features_same'] = None
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols = featurecols)

def calculate_num_features_same(group, featurecols):
    """Count, for each row, the features whose value matches the previous row.

    A feature counts as matching when both rows hold 1 or both hold 0. The first row has
    no predecessor and gets NaN, as does any row where either compared row has a missing
    feature. Returns a re-indexed copy of ``group`` with a ``'num_features_same'`` column.
    """
    ordered = group.reset_index(drop=True)
    shared_counts = [np.nan]  # first row: nothing to compare against

    for pos in range(1, len(ordered)):
        before = ordered.loc[pos - 1, featurecols]
        after = ordered.loc[pos, featurecols]

        if before.isna().any() or after.isna().any():
            shared_counts.append(np.nan)
            continue

        both_present = (before == 1) & (after == 1)
        both_absent = (before == 0) & (after == 0)
        shared_counts.append((both_present | both_absent).sum())

    ordered['num_features_same'] = shared_counts
    return ordered

# Add the 1-back 'num_features_same' column to every response table.
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook —
# confirm it is defined before this cell runs.
data_vf = get_num_features_same(data_vf, vf_featurecols)
data2 = get_num_features_same(data2, vf_featurecols)
data3 = get_num_features_same(data3, vf_featurecols)
data4 = get_num_features_same(data4, vf_featurecols)
data5 = get_num_features_same(data5, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_dot_product(df, featurecols):
    """Add ``'dot_product'`` to ``df`` by running ``calculate_dot_product`` per ``pid``.

    A placeholder column is first written onto ``df`` itself (in place); the returned
    frame is the per-participant results stitched together without group keys.
    """
    df['dot_product'] = None
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_dot_product, featurecols = featurecols)

def calculate_dot_product(group, featurecols):
    """Count features that stayed unchanged across two consecutive transitions.

    For row ``i`` the result is the dot product of two 0/1 vectors: which features are
    equal between rows ``i-2`` and ``i-1``, and which are equal between rows ``i-1`` and
    ``i``. The first two rows have no full history and get NaN, as does any row where one
    of the three rows involved has a missing feature. Groups shorter than three rows are
    returned with an all-NaN column instead of failing on a length mismatch.

    Returns a re-indexed copy of ``group`` with a ``'dot_product'`` column.
    """
    group = group.reset_index(drop=True)
    n = len(group)
    # At most two leading NaNs, and never more entries than rows.
    dot_product = [np.nan] * min(n, 2)

    for i in range(2, n):
        row1 = group.loc[i - 2, featurecols]
        row2 = group.loc[i - 1, featurecols]
        row3 = group.loc[i, featurecols]

        # All three rows take part in the comparison, so all three must be complete.
        if row1.isna().any() or row2.isna().any() or row3.isna().any():
            dot_product.append(np.nan)
        else:
            vec1 = (row1 == row2).astype(int)
            vec2 = (row2 == row3).astype(int)
            dot_product.append(np.dot(vec1, vec2))

    group['dot_product'] = dot_product
    return group

# Only data5 receives the 'dot_product' column in this cell; the other calls are disabled.
# data_vf = get_dot_product(data_vf, vf_featurecols)
# data2 = get_dot_product(data2, vf_featurecols)
# data3 = get_dot_product(data3, vf_featurecols)
# data4 = get_dot_product(data4, vf_featurecols)
data5 = get_dot_product(data5, vf_featurecols)

In [None]:
def calculate_dot_product(group, featurecols):
    """Count features that stayed unchanged across two consecutive transitions.

    For row ``i`` the result is the dot product of two 0/1 vectors: which features are
    equal between rows ``i-2`` and ``i-1``, and which are equal between rows ``i-1`` and
    ``i``. Groups with fewer than three rows get an all-NaN column. The first two rows
    always get NaN, as does any row where one of the three rows involved has a missing
    feature.

    Returns a re-indexed copy of ``group`` with a ``'dot_product'`` column.
    """
    group = group.reset_index(drop=True)
    n = len(group)

    if n < 3:
        group['dot_product'] = [np.nan] * n
        return group

    dot_product = [np.nan, np.nan]  # For first two rows

    for i in range(2, n):
        window = [group.loc[j, featurecols] for j in (i - 2, i - 1, i)]

        # All three rows take part in the comparison, so all three must be complete.
        if any(row.isna().any() for row in window):
            dot_product.append(np.nan)
        else:
            row1, row2, row3 = window
            vec1 = (row1 == row2).astype(int)
            vec2 = (row2 == row3).astype(int)
            dot_product.append(np.dot(vec1, vec2))

    group['dot_product'] = dot_product
    return group

# Add the 'dot_product' column to the response tables.
# data2 is included because later cells read data2["dot_product"] (Spearman correlations,
# per-participant means); without it those cells fail on a fresh run.
# data_vf = get_dot_product(data_vf, vf_featurecols)
data2 = get_dot_product(data2, vf_featurecols)
data3 = get_dot_product(data3, vf_featurecols)
data4 = get_dot_product(data4, vf_featurecols)
data5 = get_dot_product(data5, vf_featurecols)

In [None]:
# Overlaid histograms of 'num_features_same' per condition. Every condition is cut to the
# first n_shared rows so the three histograms are built from equally many rows.
n_shared = min(len(data3), len(data4), len(data5))
plt.figure(figsize=(6,5))
for condition_df, condition_label in ((data3, "noconstraints"), (data4, "similar"), (data5, "divergent")):
    plt.hist(condition_df["num_features_same"].tolist()[:n_shared], alpha=0.3, label=condition_label)
plt.legend();

In [None]:
# Overlaid histograms of 'dot_product' per condition, each cut to the first n_shared rows.
n_shared = min(len(data3), len(data4), len(data5))
plt.figure(figsize=(6,5))
for condition_df, condition_label in ((data3, "noconstraints"), (data4, "similar"), (data5, "divergent")):
    plt.hist(condition_df["dot_product"].tolist()[:n_shared], alpha=0.3, label=condition_label)
plt.legend();

In [None]:
# Overlaid histograms of 'RT' per condition (100 bins, x-axis clipped to 0..20000),
# each cut to the first n_shared rows.
n_shared = min(len(data3), len(data4), len(data5))
plt.figure(figsize=(6,5))
for condition_df, condition_label in ((data3, "noconstraints"), (data4, "similar"), (data5, "divergent")):
    plt.hist(condition_df["RT"].tolist()[:n_shared], alpha=0.3, label=condition_label, bins=100)
plt.xlim(0, 20000)
plt.legend();

In [None]:
# Overlaid histograms of the per-participant count of non-missing 'rid' values.
plt.figure(figsize=(6,5))
for condition_df, condition_label in ((data3, "noconstraints"), (data4, "similar"), (data5, "divergent")):
    rows_per_pid = condition_df.groupby("pid").count()["rid"].tolist()
    plt.hist(rows_per_pid, alpha=0.3, label=condition_label, bins=10)
plt.legend();

In [None]:
# Display the "noconstraints" response table with the derived columns.
data3

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation of RT with each similarity measure, per dataset.
# Rows missing any of the three columns are dropped first.
# (A Pearson variant via np.corrcoef(...)[0, 1] was previously kept here commented out.)
for heading, dataset in (
    ("data2 - Spearman:", data2),
    ("data3 - Spearman:", data3),
    ("data4 - Spearman:", data4),
    ("data5 - Spearman:", data5),
):
    subset = dataset[["num_features_same", "dot_product", "RT"]].dropna()
    print(heading,
          spearmanr(subset["num_features_same"], subset["RT"]).correlation,
          spearmanr(subset["dot_product"], subset["RT"]).correlation)


In [None]:
# Rows of data2 where both similarity measures are available.
# NOTE(review): `subset` is not used by the following cell, which works on data2 directly.
subset = data2[["num_features_same", "dot_product"]].dropna()

In [None]:
# Pearson correlation between the two similarity measures in data2.
data2['num_features_same'].corr(data2['dot_product'], method='pearson')

In [None]:
# Distribution over participants of their mean 'num_features_same' in data2.
# The x-limits are hard-coded; values outside them are not shown.
plt.figure(figsize=(3,3))
plt.hist(data2[["pid", "num_features_same", "dot_product"]].groupby("pid").mean()["num_features_same"].tolist())
plt.ylabel("Number of Ppts")
plt.xlabel("Mean num features same")
plt.xlim(85, 110);

# Same for the per-participant mean 'dot_product'.
plt.figure(figsize=(3,3))
plt.hist(data2[["pid", "num_features_same", "dot_product"]].groupby("pid").mean()["dot_product"].tolist())
plt.ylabel("Number of Ppts")
plt.xlabel("Mean dot product")
plt.xlim(65, 95);

In [None]:
# Display data2 with the derived columns.
data2

In [None]:
# shuffle within each pid
# A seeded generator makes the shuffled baseline reproducible across kernel restarts;
# one generator is shared so participants do not all receive the same permutation.
shuffle_rng = np.random.default_rng(0)
shuffled_data2 = data2.groupby("pid", group_keys=False).apply(
    lambda x: x.sample(frac=1, random_state=shuffle_rng).reset_index(drop=True)
)
# NOTE(review): this relies on get_num_features_same still being the 1-back definition;
# later cells redefine the same name.
shuffled_data2 = get_num_features_same(shuffled_data2, vf_featurecols)
shuffled_data2 = get_dot_product(shuffled_data2, vf_featurecols)

plt.figure(figsize=(3,3))
plt.hist(shuffled_data2[["pid", "num_features_same", "dot_product"]].groupby("pid").mean()["num_features_same"].tolist())
plt.ylabel("Number of Ppts")
plt.xlabel("Mean num features same")
plt.xlim(85, 110);

plt.figure(figsize=(3,3))
plt.hist(shuffled_data2[["pid", "num_features_same", "dot_product"]].groupby("pid").mean()["dot_product"].tolist())
plt.ylabel("Number of Ppts")
plt.xlabel("Mean dot product")
plt.xlim(65, 95);

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'num_features_same_2back'`` to ``df`` via the per-``pid`` 2-back comparison.

    The column is first created on ``df`` itself (in place) as NaN; the returned frame is
    the per-participant results stitched together without group keys. This definition
    replaces any earlier function of the same name.
    """
    df['num_features_same_2back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Count, for each row, the features whose value matches the row two steps earlier.

    A feature counts as matching when both rows hold 1 or both hold 0. Rows without a
    row two steps back, or where either compared row has a missing feature, get NaN.
    Returns a re-indexed copy of ``group`` with a ``'num_features_same_2back'`` column.
    """
    lag = 2
    ordered = group.reset_index(drop=True)
    shared_counts = np.full(len(ordered), np.nan)

    for pos in range(lag, len(ordered)):
        earlier = ordered.loc[pos - lag, featurecols]
        current = ordered.loc[pos, featurecols]
        if earlier.isna().any() or current.isna().any():
            continue  # slot already holds NaN
        both_present = (earlier == 1) & (current == 1)
        both_absent = (earlier == 0) & (current == 0)
        shared_counts[pos] = (both_present | both_absent).sum()

    ordered['num_features_same_2back'] = shared_counts
    return ordered

# Apply the function to your datasets (adds 'num_features_same_2back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'num_features_same_3back'`` to ``df`` via the per-``pid`` 3-back comparison.

    The column is first created on ``df`` itself (in place) as NaN; the returned frame is
    the per-participant results stitched together without group keys. This definition
    replaces any earlier function of the same name.
    """
    df['num_features_same_3back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Count, for each row, the features whose value matches the row three steps earlier.

    A feature counts as matching when both rows hold 1 or both hold 0. Rows without a
    row three steps back, or where either compared row has a missing feature, get NaN.
    Returns a re-indexed copy of ``group`` with a ``'num_features_same_3back'`` column.
    """
    lag = 3
    ordered = group.reset_index(drop=True)
    shared_counts = np.full(len(ordered), np.nan)

    for pos in range(lag, len(ordered)):
        earlier = ordered.loc[pos - lag, featurecols]
        current = ordered.loc[pos, featurecols]
        if earlier.isna().any() or current.isna().any():
            continue  # slot already holds NaN
        both_present = (earlier == 1) & (current == 1)
        both_absent = (earlier == 0) & (current == 0)
        shared_counts[pos] = (both_present | both_absent).sum()

    ordered['num_features_same_3back'] = shared_counts
    return ordered

# Apply the function to your datasets (adds 'num_features_same_3back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'num_features_same_4back'`` to ``df`` via the per-``pid`` 4-back comparison.

    The column is first created on ``df`` itself (in place) as NaN; the returned frame is
    the per-participant results stitched together without group keys. This definition
    replaces any earlier function of the same name.
    """
    df['num_features_same_4back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Count, for each row, the features whose value matches the row four steps earlier.

    A feature counts as matching when both rows hold 1 or both hold 0. Rows without a
    row four steps back, or where either compared row has a missing feature, get NaN.
    Returns a re-indexed copy of ``group`` with a ``'num_features_same_4back'`` column.
    """
    lag = 4
    ordered = group.reset_index(drop=True)
    shared_counts = np.full(len(ordered), np.nan)

    for pos in range(lag, len(ordered)):
        earlier = ordered.loc[pos - lag, featurecols]
        current = ordered.loc[pos, featurecols]
        if earlier.isna().any() or current.isna().any():
            continue  # slot already holds NaN
        both_present = (earlier == 1) & (current == 1)
        both_absent = (earlier == 0) & (current == 0)
        shared_counts[pos] = (both_present | both_absent).sum()

    ordered['num_features_same_4back'] = shared_counts
    return ordered

# Apply the function to your datasets (adds 'num_features_same_4back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'num_features_same_5back'`` to ``df`` via the per-``pid`` 5-back comparison.

    The column is first created on ``df`` itself (in place) as NaN; the returned frame is
    the per-participant results stitched together without group keys. This definition
    replaces any earlier function of the same name.
    """
    df['num_features_same_5back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Count, for each row, the features whose value matches the row five steps earlier.

    A feature counts as matching when both rows hold 1 or both hold 0. Rows without a
    row five steps back, or where either compared row has a missing feature, get NaN.
    Returns a re-indexed copy of ``group`` with a ``'num_features_same_5back'`` column.
    """
    lag = 5
    ordered = group.reset_index(drop=True)
    shared_counts = np.full(len(ordered), np.nan)

    for pos in range(lag, len(ordered)):
        earlier = ordered.loc[pos - lag, featurecols]
        current = ordered.loc[pos, featurecols]
        if earlier.isna().any() or current.isna().any():
            continue  # slot already holds NaN
        both_present = (earlier == 1) & (current == 1)
        both_absent = (earlier == 0) & (current == 0)
        shared_counts[pos] = (both_present | both_absent).sum()

    ordered['num_features_same_5back'] = shared_counts
    return ordered

# Apply the function to your datasets (adds 'num_features_same_5back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
# Calculate means and standard errors of the feature-overlap columns, ordered from the
# 5-back comparison to the 1-back comparison.
lag_columns = [
    "num_features_same_5back",
    "num_features_same_4back",
    "num_features_same_3back",
    "num_features_same_2back",
    "num_features_same",
]

means = [np.mean(data_vf[column]) for column in lag_columns]

# Standard error: sample standard deviation over the square root of the non-missing count.
std_errors = [
    np.std(data_vf[column], ddof=1) / np.sqrt(len(data_vf[column].dropna()))
    for column in lag_columns
]

x_labels = [-5, -4, -3, -2, -1]

# Plot bar chart with error bars
plt.figure(figsize=(4,3))
plt.bar(x_labels, means, yerr=std_errors, capsize=5, alpha=0.7, color='mediumpurple')
plt.xlabel("Back Steps")
plt.ylabel("Mean Number of Features Same")
plt.title("Mean Number of Features Same")
plt.ylim(50, 65)
plt.show()

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'RT_diff_1back'`` to ``df`` by applying the per-``pid`` RT difference.

    Despite the (reused) name, this definition produces an RT difference column. The
    column is first created on ``df`` itself (in place) as NaN; the returned frame is the
    per-participant results stitched together without group keys.
    """
    df['RT_diff_1back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Add ``'RT_diff_1back'``: each row's RT minus the RT one row earlier.

    Despite the (reused) name, no feature columns are involved; ``featurecols`` is
    accepted only so the function fits the shared ``groupby().apply`` call. Rows without
    a predecessor, or where either RT is missing, get NaN. Returns a re-indexed copy of
    ``group``.
    """
    group = group.reset_index(drop=True)
    # Vectorised lagged difference; replaces a row-by-row .loc loop with the same result.
    rt = group['RT'].astype(float)
    group['RT_diff_1back'] = rt.diff(1)
    return group

# Apply the function to your datasets (adds 'RT_diff_1back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'RT_diff_2back'`` to ``df`` by applying the per-``pid`` RT difference.

    Despite the (reused) name, this definition produces an RT difference column. The
    column is first created on ``df`` itself (in place) as NaN; the returned frame is the
    per-participant results stitched together without group keys.
    """
    df['RT_diff_2back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Add ``'RT_diff_2back'``: each row's RT minus the RT two rows earlier.

    Despite the (reused) name, no feature columns are involved; ``featurecols`` is
    accepted only so the function fits the shared ``groupby().apply`` call. Rows without
    a row two steps back, or where either RT is missing, get NaN. Returns a re-indexed
    copy of ``group``.
    """
    group = group.reset_index(drop=True)
    # Vectorised lagged difference; replaces a row-by-row .loc loop with the same result.
    rt = group['RT'].astype(float)
    group['RT_diff_2back'] = rt.diff(2)
    return group

# Apply the function to your datasets (adds 'RT_diff_2back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'RT_diff_3back'`` to ``df`` by applying the per-``pid`` RT difference.

    Despite the (reused) name, this definition produces an RT difference column. The
    column is first created on ``df`` itself (in place) as NaN; the returned frame is the
    per-participant results stitched together without group keys.
    """
    df['RT_diff_3back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Add ``'RT_diff_3back'``: each row's RT minus the RT three rows earlier.

    Despite the (reused) name, no feature columns are involved; ``featurecols`` is
    accepted only so the function fits the shared ``groupby().apply`` call. Rows without
    a row three steps back, or where either RT is missing, get NaN. Returns a re-indexed
    copy of ``group``.
    """
    group = group.reset_index(drop=True)
    # Vectorised lagged difference; replaces a row-by-row .loc loop with the same result.
    rt = group['RT'].astype(float)
    group['RT_diff_3back'] = rt.diff(3)
    return group

# Apply the function to your datasets (adds 'RT_diff_3back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'RT_diff_4back'`` to ``df`` by applying the per-``pid`` RT difference.

    Despite the (reused) name, this definition produces an RT difference column. The
    column is first created on ``df`` itself (in place) as NaN; the returned frame is the
    per-participant results stitched together without group keys.
    """
    df['RT_diff_4back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Add ``'RT_diff_4back'``: each row's RT minus the RT four rows earlier.

    Despite the (reused) name, no feature columns are involved; ``featurecols`` is
    accepted only so the function fits the shared ``groupby().apply`` call. Rows without
    a row four steps back, or where either RT is missing, get NaN. Returns a re-indexed
    copy of ``group``.
    """
    group = group.reset_index(drop=True)
    # Vectorised lagged difference; replaces a row-by-row .loc loop with the same result.
    rt = group['RT'].astype(float)
    group['RT_diff_4back'] = rt.diff(4)
    return group

# Apply the function to your datasets (adds 'RT_diff_4back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'RT_diff_5back'`` to ``df`` by applying the per-``pid`` RT difference.

    Despite the (reused) name, this definition produces an RT difference column. The
    column is first created on ``df`` itself (in place) as NaN; the returned frame is the
    per-participant results stitched together without group keys.
    """
    df['RT_diff_5back'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Add ``'RT_diff_5back'``: each row's RT minus the RT five rows earlier.

    Despite the (reused) name, no feature columns are involved; ``featurecols`` is
    accepted only so the function fits the shared ``groupby().apply`` call. Rows without
    a row five steps back, or where either RT is missing, get NaN. Returns a re-indexed
    copy of ``group``.
    """
    group = group.reset_index(drop=True)
    # Vectorised lagged difference; replaces a row-by-row .loc loop with the same result.
    rt = group['RT'].astype(float)
    group['RT_diff_5back'] = rt.diff(5)
    return group

# Apply the function to your datasets (adds 'RT_diff_5back').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'RT_diff_0'`` to ``df`` by applying the per-``pid`` helper.

    Despite the (reused) name, this definition produces an RT-related column. The column
    is first created on ``df`` itself (in place) as NaN; the returned frame is the
    per-participant results stitched together without group keys.
    """
    df['RT_diff_0'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Add ``'RT_diff_0'``: 0 wherever RT is present and NaN wherever RT is missing.

    This is the zero-lag reference point for the lagged RT differences. Despite the
    (reused) name, no feature columns are involved; ``featurecols`` is accepted only so
    the function fits the shared ``groupby().apply`` call. Returns a re-indexed copy of
    ``group``.
    """
    group = group.reset_index(drop=True)
    # Vectorised form of the former row-by-row loop.
    group['RT_diff_0'] = np.where(group['RT'].isna(), np.nan, 0.0)
    return group

# Apply the function to your datasets (adds 'RT_diff_0').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Add ``'RT_diff_1ahead'`` to ``df`` by applying the per-``pid`` RT difference.

    Despite the (reused) name, this definition produces an RT difference column. The
    column is first created on ``df`` itself (in place) as NaN; the returned frame is the
    per-participant results stitched together without group keys.
    """
    df['RT_diff_1ahead'] = np.nan
    per_participant = df.groupby('pid', group_keys=False)
    return per_participant.apply(calculate_num_features_same, featurecols=featurecols)

def calculate_num_features_same(group, featurecols):
    """Add ``'RT_diff_1ahead'``: each row's RT minus the RT of the next row.

    Despite the (reused) name, no feature columns are involved; ``featurecols`` is
    accepted only so the function fits the shared ``groupby().apply`` call. The last row,
    and any row where either RT is missing, gets NaN. Returns a re-indexed copy of
    ``group``.
    """
    group = group.reset_index(drop=True)
    # diff with a negative period subtracts the following row: rt[i] - rt[i + 1].
    rt = group['RT'].astype(float)
    group['RT_diff_1ahead'] = rt.diff(-1)
    return group

# Apply the function to your datasets (adds 'RT_diff_1ahead').
# NOTE(review): data_vf is not assigned anywhere in the visible part of this notebook.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
def get_num_features_same(df, featurecols):
    """Return ``df`` with a per-participant ``RT_diff_2ahead`` column.

    The frame is grouped by ``pid`` and ``calculate_num_features_same`` is
    applied to each group. That helper is looked up at call time, so the
    definition most recently executed in the notebook is the one used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pid`` column plus whatever the helper reads.
    featurecols : list
        Forwarded unchanged to ``calculate_num_features_same``.

    Returns
    -------
    pandas.DataFrame
        Result of the grouped apply (``group_keys=False``).
    """
    # assign() builds a new frame, so the caller's DataFrame is no longer
    # modified in place when the NaN placeholder column is added.
    seeded = df.assign(RT_diff_2ahead=np.nan)
    # NOTE(review): recent pandas releases warn that groupby.apply operating on
    # the grouping column is deprecated - confirm 'pid' is still present in the
    # result on the pandas version in use.
    return seeded.groupby('pid', group_keys=False).apply(
        calculate_num_features_same, featurecols=featurecols
    )

def calculate_num_features_same(group, featurecols):
    """Add ``RT_diff_2ahead`` (this row's RT minus the RT two rows later).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group, in response order; must contain ``RT``.
    featurecols : list
        Unused; kept because the caller forwards it through ``groupby.apply``.

    Returns
    -------
    pandas.DataFrame
        A re-indexed (0..n-1) copy of ``group`` with ``RT_diff_2ahead`` added.
        The value is NaN for the last two rows and wherever either RT is missing.
    """
    rows = group.reset_index(drop=True)
    rt = rows["RT"]
    # shift(-2) aligns every row with the RT two positions later; NaN on either
    # side (including rows past the end) propagates through the subtraction.
    rows['RT_diff_2ahead'] = (rt - rt.shift(-2)).to_numpy(dtype=float)
    return rows

# Add the RT_diff_2ahead column to data_vf using the two helpers defined in this cell.
data_vf = get_num_features_same(data_vf, vf_featurecols)
# data_autbrick = get_num_features_same(data_autbrick, autbrick_featurecols)
# data_autpaperclip = get_num_features_same(data_autpaperclip, autpaperclip_featurecols)

In [None]:
# RT-difference columns in plotting order; each one pairs with the entry at
# the same position in x_labels.
rt_diff_cols = [
    "RT_diff_5back",
    "RT_diff_4back",
    "RT_diff_3back",
    "RT_diff_2back",
    "RT_diff_1back",
    "RT_diff_0",
    "RT_diff_1ahead",
    "RT_diff_2ahead",
]
x_labels = [-5, -4, -3, -2, -1, 0, 1, 2]

# Column means, and standard errors computed as the sample standard
# deviation (ddof=1) divided by sqrt(number of non-missing values).
means = [np.mean(data_vf[col]) for col in rt_diff_cols]
std_errors = [
    np.std(data_vf[col], ddof=1) / np.sqrt(data_vf[col].count())
    for col in rt_diff_cols
]

# Bar chart with error bars, one bar per lag.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x_labels, means, yerr=std_errors, capsize=5, alpha=0.7, color='mediumpurple')
ax.set_xlabel("Back Steps")
ax.set_ylabel("Mean RT")
ax.set_title("Mean RT")
plt.show()

In [None]:
def has_two_chunks(lst):
    """Return 1 if ``lst`` contains at least two separate chunks of ones, else 0.

    A "chunk" is a run of consecutive elements equal to 1 whose length is
    greater than 1; isolated single ones do not count.

    Parameters
    ----------
    lst : iterable
        Sequence of values (list, NumPy array, pandas Series, ...). Elements
        are visited in iteration order, so a Series is scanned by position
        regardless of its index labels.

    Returns
    -------
    int
        1 when two or more chunks are found, otherwise 0 (also for empty input).
    """
    chunks_found = 0
    run_length = 0
    # Iterating over the values (rather than indexing lst[i]) avoids
    # label-based lookups on a pandas Series whose index is not 0..n-1.
    for value in lst:
        if value == 1:
            run_length += 1
            # A run becomes a chunk the moment it reaches length 2; count it once.
            if run_length == 2:
                chunks_found += 1
                if chunks_found >= 2:
                    return 1
        else:
            run_length = 0
    return 0

# For every feature column, flag per pid (1/0) whether has_two_chunks finds
# two or more separate chunks of ones in that participant's rows.
pid_groups = data_vf.groupby("pid")
returned_to_same_feature = {
    col: pid_groups[col].apply(has_two_chunks) for col in vf_featurecols
}

# One row per pid, one column per feature.
returned_to_same_feature_df = pd.DataFrame(returned_to_same_feature)

# Share of participants flagged for each feature, ordered from smallest to largest.
n_participants = len(data_vf["pid"].unique())
share_returned = returned_to_same_feature_df.sum(axis=0) / n_participants
returned = dict(sorted(share_returned.to_dict().items(), key=lambda item: item[1]))

fig, ax = plt.subplots(figsize=(3, 14))
ax.barh(list(returned.keys()), list(returned.values()))
plt.xticks(rotation=90);

In [None]:
# Distribution of num_features_same over the rows of data_vf.
fig, ax = plt.subplots(figsize=(3, 2))
ax.hist(data_vf["num_features_same"], alpha=0.3, label="vf", color="mediumpurple");
# ax.hist(data_autbrick["num_features_same"], alpha=0.3, label="autbrick");
# ax.hist(data_autpaperclip["num_features_same"], alpha=0.3, label="autpaperclip");
# ax.legend();
ax.set_xlabel("Number of features same")
ax.set_ylabel("Num responses")

In [None]:
# Preview the first 30 rows of data_vf.
data_vf.head(30)

In [None]:
# Keep only rows whose "order" lies strictly between 0 and 20 (both ends excluded).
data_vf = data_vf[data_vf["order"].between(0, 20, inclusive="neither")]
# data_autbrick = data_autbrick[(data_autbrick["order"] > 0) & (data_autbrick["order"] < 20)]
# data_autpaperclip = data_autpaperclip[(data_autpaperclip["order"] > 0) & (data_autpaperclip["order"] < 20)]

In [None]:
# Load the human response table; the path is relative to the notebook's working directory.
data_humans = pd.read_csv("../csvs/data_humans.csv")

In [None]:
# Show the column names of data_vf.
data_vf.columns

In [None]:
# Left-join the task-1 rows of data_humans onto data_vf, matching on every
# column listed in merge_keys, then drop the responses listed in vf_to_remove.
merge_keys = [
    'pid', 'task', 'starttime', 'endtime', 'original_response_Dutch',
    'original_response', 'original_response_cleaned', 'response', 'invalid',
    'response_len', 'response_num_words', 'order', 'RT',
]
humans_task1 = data_humans[data_humans["task"] == 1].drop("Unnamed: 0", axis=1)
data_vf = pd.merge(data_vf, humans_task1, on=merge_keys, how='left')
data_vf = data_vf[~data_vf["response"].isin(vf_to_remove)]

# data_autbrick = pd.merge(data_autbrick, data_humans[data_humans["task"] == 2].drop("Unnamed: 0", axis=1), on=['pid', 'task', 'starttime', 'endtime', 'original_response_Dutch',
#        'original_response', 'original_response_cleaned', 'response', 'invalid', 'response_len', 'response_num_words', 'previous_original_response', 'previous_response', 'order', 'RT'], how='left')
# data_autpaperclip = pd.merge(data_autpaperclip, data_humans[data_humans["task"] == 3].drop("Unnamed: 0", axis=1), on=['pid', 'task', 'starttime', 'endtime', 'original_response_Dutch',
#        'original_response', 'original_response_cleaned', 'response', 'invalid', 'response_len', 'response_num_words', 'previous_original_response', 'previous_response', 'order', 'RT'], how='left')

In [None]:
# Scatter SS against num_features_same, using only rows where both are present,
# and print their correlation coefficient.
valid_indices = data_vf["SS"].notna() & data_vf["num_features_same"].notna()
filtered_SS = data_vf.loc[valid_indices, "SS"]
filtered_num_features_same = data_vf.loc[valid_indices, "num_features_same"]

fig, ax = plt.subplots(figsize=(4, 3))
ax.scatter(filtered_SS, filtered_num_features_same, alpha=0.3, c="mediumpurple")
print(np.corrcoef(filtered_SS, filtered_num_features_same)[0,1])
ax.set_xlabel("GTE Large SS")
ax.set_ylabel("Number of features same");

# plt.figure()
# plt.scatter(data_autbrick["SS"], data_autbrick["num_features_same"])
# plt.xlabel("SS")
# plt.ylabel("Number of features same");

# plt.figure()
# plt.scatter(data_autpaperclip["SS"], data_autpaperclip["num_features_same"])
# plt.xlabel("SS")
# plt.ylabel("Number of features same");

In [None]:
# For each row, store the response from the preceding row of the same pid
# (NaN for the first row of each pid).
# NOTE(review): data_vf was produced by boolean filtering in an earlier cell, so
# this column assignment may trigger a SettingWithCopyWarning - confirm.
data_vf['previous_response'] = data_vf.groupby('pid')['response'].shift(1)

In [None]:
# least similar: rows with num_features_same below 44 (first 50 distinct rows)
# print(data_vf[["response", "previous_response", "num_features_same", "SS", "jump"]][data_vf["num_features_same"] < 10]["SS"].mean())
(
    data_vf.loc[
        data_vf["num_features_same"] < 44,
        ["pid", "response", "previous_response", "num_features_same"],
    ]
    .drop_duplicates()
    .head(50)
)

In [None]:
# most similar: rows with num_features_same above 78, highest first (top 50 distinct rows)
# print(data_vf[["response", "previous_response", "num_features_same", "SS", "jump"]][data_vf["num_features_same"] > 80]["SS"].mean())
(
    data_vf.loc[
        data_vf["num_features_same"] > 78,
        ["response", "previous_response", "num_features_same"],
    ]
    .drop_duplicates()
    .sort_values(by=["num_features_same"], ascending=False)
    .head(50)
)

In [None]:
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Step 1: split vf_featuredf into a label column and the feature matrix.
# NOTE(review): this treats column 0 as the item names. The only reset_index()
# on vf_featuredf visible in this part of the notebook sits in a LATER cell, so
# on a fresh top-to-bottom run column 0 may be a feature rather than the names -
# confirm the intended execution order.
names = vf_featuredf.iloc[:, 0]            # First column = names
features = vf_featuredf.iloc[:, 1:]        # Rest = binary features

# Step 2: 2-D t-SNE using Hamming distance between rows; random_state is fixed
# so the embedding is repeatable.
tsne = TSNE(n_components=2, perplexity=50, random_state=42, metric='hamming')
embedding = tsne.fit_transform(features)

# Step 3: scatter plot of the two embedding dimensions
plt.figure(figsize=(10, 7))
plt.scatter(embedding[:, 0], embedding[:, 1], s=50, alpha=0.5, c="mediumpurple")

# Label every 4th point (presumably to reduce clutter), offset slightly from the marker
for i, name in enumerate(names):
    if i % 4 == 0:
        plt.text(embedding[i, 0] + 0.005, embedding[i, 1] + 0.005, str(name), fontsize=10)

plt.title("t-SNE of Binary Vectors")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.grid(False)
plt.show()

In [None]:
# Move the index of vf_featuredf into a regular column and renumber the rows.
# NOTE(review): not idempotent - every re-run of this cell inserts another
# index column; run it exactly once per kernel session.
vf_featuredf = vf_featuredf.reset_index()

In [None]:
# Per pid: mean num_features_same and max jump_profile, joined on pid.
mean_same_by_pid = data_vf[["pid", "num_features_same"]].groupby("pid").mean().reset_index()
max_jump_by_pid = data_vf[["pid", "jump_profile"]].groupby("pid").max().reset_index()
merged_by_pid = pd.merge(mean_same_by_pid, max_jump_by_pid, on=["pid"])

# temp: mean num_features_same per pid, ordered by that pid's max jump_profile.
temp = merged_by_pid.sort_values("jump_profile")["num_features_same"].to_numpy()

# arr: the same values sorted ascending with the final element dropped
# (NumPy sorts NaN to the end, so a single NaN would be the element removed).
# NOTE(review): sorting by value discards the jump_profile ordering of temp;
# confirm this is what the downstream correlation is meant to use.
arr = np.sort(temp)[:-1]

In [None]:
# plt.scatter(data_vf[["pid", "jump_profile"]].groupby("pid").max().reset_index()["jump_profile"].values, data_vf[["pid", "num_features_same"]].groupby("pid").mean()["num_features_same"].values)

# Mean num_features_same per pid, ordered by each pid's max jump_profile.
mean_same_sorted = pd.merge(
    data_vf[["pid", "num_features_same"]].groupby("pid").mean().reset_index(),
    data_vf[["pid", "jump_profile"]].groupby("pid").max().reset_index(),
    on=["pid"],
).sort_values("jump_profile")["num_features_same"]

plt.figure(figsize=(4,3))
# The x positions are ranks 0..n-1; n comes from the data instead of a
# hard-coded participant count, so the cell keeps working if pids are
# added or filtered out.
plt.scatter(np.arange(len(mean_same_sorted)), mean_same_sorted, c="mediumpurple")
# `arr` is built in the previous cell; its length is likewise read from the
# array rather than hard-coded.
print(np.corrcoef(np.arange(len(arr)), arr)[0,1])
plt.xlabel("Max jump profile")
plt.ylabel("Mean features same");

# plt.figure()
# t = pd.merge(data_autbrick[["pid", "num_features_same"]].groupby("pid").mean().reset_index(), data_autbrick[["pid", "jump_profile"]].groupby("pid").max().reset_index(), on=["pid"]).sort_values("jump_profile")["num_features_same"]
# plt.scatter(np.arange(len(t)), t)
# plt.xlabel("Max jump profile")
# plt.ylabel("Mean features same");

# plt.figure()
# t = pd.merge(data_autpaperclip[["pid", "num_features_same"]].groupby("pid").mean().reset_index(), data_autpaperclip[["pid", "jump_profile"]].groupby("pid").max().reset_index(), on=["pid"]).sort_values("jump_profile")["num_features_same"]
# plt.scatter(np.arange(len(t)), t)
# plt.xlabel("Max jump profile")
# plt.ylabel("Mean features same");

In [None]:
def plot_binary_features_heatmap(df, featurecols, max_pids=10):
    """Draw one feature heatmap per participant.

    Each figure has one row per response of that participant (labelled with
    the response text) and one column per entry of ``featurecols``, drawn in
    two colours (white / mediumpurple).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pid``, ``response`` and every column in ``featurecols``.
    featurecols : list of str
        Feature columns to display, in order.
    max_pids : int or None, default 10
        Maximum number of participants to plot, taken in order of first
        appearance in ``df``. ``None`` plots all of them.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``.
    """
    for pid in df["pid"].unique()[:max_pids]:
        # Filter once per participant and reuse for both labels and cell values.
        pid_rows = df[df["pid"] == pid]
        responses = pid_rows["response"].values
        pid_data = pid_rows[featurecols].reset_index(drop=True)

        # Figure height grows with the number of responses.
        plt.figure(figsize=(25, len(pid_data) * 0.25))
        sns.heatmap(
            pid_data,
            cmap=sns.color_palette(["white", "mediumpurple"]),
            cbar=False,
            linewidths=0.5,
            linecolor='black'
        )
        plt.title(f"Binary Features Heatmap for PID {pid}")
        plt.gca().xaxis.tick_top()
        # The +0.5 offset centres each tick label on its heatmap cell.
        plt.xticks(ticks=np.arange(len(featurecols)) + 0.5, labels=featurecols, rotation=90)
        plt.yticks(ticks=np.arange(len(responses)) + 0.5, labels=responses, rotation=0)
        plt.show()

In [None]:
# Display data_vf (notebook output).
data_vf

In [None]:
# Per-participant feature heatmaps for data_vf.
plot_binary_features_heatmap(data_vf, vf_featurecols)

In [None]:
# Display data_vf again (notebook output).
data_vf

In [None]:
# Per-participant feature heatmaps for data_autbrick.
# NOTE(review): every data_autbrick step in the nearby cells is commented out -
# confirm data_autbrick and autbrick_featurecols are still defined before running.
plot_binary_features_heatmap(data_autbrick, autbrick_featurecols)

In [None]:
# Per-participant feature heatmaps for data_autpaperclip.
# NOTE(review): every data_autpaperclip step in the nearby cells is commented out -
# confirm data_autpaperclip and autpaperclip_featurecols are still defined before running.
plot_binary_features_heatmap(data_autpaperclip, autpaperclip_featurecols)

In [None]:
# Few vs many dimensions
# How persistent are different dimensions? Compared to random
# Transitions between different dimensions? 