In [1]:
# load packages
import os
import sys
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

sys.path.append("code")
import ridge_utils

In [2]:
# define path to data
data_path = '/ocean/projects/mth240012p/shared/data'

In [3]:
# load wordseqs dictionary
with open(f'{data_path}/raw_text.pkl', 'rb') as file:
    wordseqs = pickle.load(file)

  wordseqs = pickle.load(file)


In [5]:
# extract story IDs from filenames
stories2 = [i[:-4] for i in os.listdir(f"{data_path}/subject2")]
stories3 = [i[:-4] for i in os.listdir(f"{data_path}/subject3")]

# check that story IDs match
assert len(stories2) == len(stories3)
assert set(stories2) == set(stories3)

stories = stories2
N = len(stories)

In [73]:
# create dataframe of word and char counts from wordseqs dictionary data
def words_data():
    rows = []
    for i in range(N):
        story = stories[i]
        words = wordseqs[story].data
        chars = " ".join(words)
        w = len(words)
        c = len(chars)
        row = {"words": w, "chars": c}
        rows.append({"story_index": i, "story_name": story, "words": w, "chars": c})
    df = pd.DataFrame(rows)
    return df

# create dataframe
df_words = words_data()
df_words

Unnamed: 0,story_index,story_name,words,chars
0,0,stumblinginthedark,2681,13350
1,1,singlewomanseekingmanwich,1486,7411
2,2,theclosetthatateeverything,1928,9646
3,3,jugglingandjesus,887,4552
4,4,threemonths,2062,9709
...,...,...,...,...
96,96,canplanetearthfeedtenbillionpeoplepart2,2532,15344
97,97,notontheusualtour,1431,7301
98,98,canplanetearthfeedtenbillionpeoplepart3,2066,12548
99,99,thatthingonmyarm,2073,10222


In [74]:
# save dataframe of word and char counts to file
df_words.to_csv("words.csv", index=False)

In [7]:
# function to load data for a given subject, story pair
def load_fMRI(subject, story, mmap_mode=None):
    data = np.load(f"{data_path}/subject{subject}/{story}.npy", mmap_mode=mmap_mode)
    return data

In [61]:
# create dataframe of summary stats for along every timepoint of each story
def summary_stats(subject, percentiles=[25, 50, 75]):
    dfs = []
    cols = ["story_index", "story_name", "time_index", "nans", "infs", "min", "mean", "max"] + [f"q_{p}" for p in percentiles]

    # process each .npy file
    for i in range(N):
        story = stories[i]
        data = load_fMRI(subject, story)
        nrow, ncol = data.shape

        minimum = np.round(np.nanmin(data, axis=1), 4)
        mean = np.round(np.nanmean(data, axis=1), 4)
        maximum = np.round(np.nanmax(data, axis=1), 4)
        percs = np.round(np.nanpercentile(data, percentiles, axis=1), 4)
        nans = np.sum(np.isnan(data), axis=1)
        infs = np.sum(np.isinf(data), axis=1)

        story_index = np.repeat(i, nrow)
        story_name = np.repeat(story, nrow)
        time_index = np.arange(nrow)

        summary = pd.DataFrame({
            "story_index": story_index,
            "story_name": story_name,
            "time_index": time_index,
            "nans": nans,
            "infs": infs,
            "min": minimum,
            "mean": mean,
            "max": maximum,
            **{f"q_{p}": percs[i] for i, p in enumerate(percentiles)}
        })
        
        #summary = np.column_stack([story_index, story_name, time_index, nans, infs, minimum, mean, maximum, percs.T])
        dfs.append(pd.DataFrame(summary, columns=cols))

    df = pd.concat(dfs, ignore_index=True)
    return df

In [62]:
# create summary stats dataframe for each subject
df2 = summary_stats(2)
df3 = summary_stats(3)

# merge summary stats dataframes
df = pd.merge(df2, df3, on=["story_index", "story_name", "time_index"], suffixes=("_2", "_3"))
df

Unnamed: 0,story_index,story_name,time_index,nans_2,infs_2,min_2,mean_2,max_2,q_25_2,q_50_2,q_75_2,nans_3,infs_3,min_3,mean_3,max_3,q_25_3,q_50_3,q_75_3
0,0,stumblinginthedark,0,0,0,-5.6165,0.3415,7.8767,-0.3829,0.3257,1.0498,0,0,-4.7914,0.4556,10.5450,-0.2859,0.4489,1.1824
1,0,stumblinginthedark,1,0,0,-4.5834,0.2229,7.0405,-0.5152,0.1990,0.9318,0,0,-6.0105,0.4199,6.6190,-0.3048,0.3975,1.1144
2,0,stumblinginthedark,2,0,0,-5.0584,0.1527,6.0400,-0.6186,0.1266,0.8940,0,0,-4.9083,0.4848,6.5773,-0.2341,0.4717,1.1850
3,0,stumblinginthedark,3,0,0,-5.3199,0.1726,5.4808,-0.5821,0.1446,0.9047,0,0,-4.5700,0.3828,6.6784,-0.3312,0.3642,1.0723
4,0,stumblinginthedark,4,0,0,-4.9265,0.1867,6.4712,-0.5677,0.1695,0.9296,0,0,-4.5083,0.2809,6.8675,-0.4344,0.2694,0.9894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34781,100,penpal,250,0,0,-5.6440,-0.1326,3.9081,-0.7674,-0.1108,0.5166,0,0,-4.2690,-0.2691,3.7679,-0.8948,-0.2641,0.3627
34782,100,penpal,251,0,0,-6.1651,-0.1562,6.5111,-0.8138,-0.1325,0.5217,0,0,-4.4948,-0.3581,3.8908,-0.9959,-0.3469,0.2854
34783,100,penpal,252,0,0,-4.1019,-0.2367,4.1310,-0.8543,-0.2215,0.3963,0,0,-5.7382,-0.2340,4.4498,-0.9703,-0.2188,0.5164
34784,100,penpal,253,0,0,-4.2388,-0.2209,3.4341,-0.8647,-0.1939,0.4447,0,0,-5.1718,-0.2464,4.8960,-0.9182,-0.2305,0.4355


In [67]:
# save merged summary stats dataframe to file
df.to_csv("clean.csv", index=False)

In [70]:
# calculate overall summary stats (aggregating all time points)
def overall_stats(subject, percentiles=[25, 50, 75]):
    dfs = []
    cols = ["story_index", "story_name", "nans", "infs", "min", "mean", "max"] + [f"q_{p}" for p in percentiles]
    
    # process each .npy file
    for i in range(N):
        story = stories[i]
        data = load_fMRI(subject, story)
        nrow, ncol = data.shape

        minimum = np.round(np.nanmin(data), 8)
        mean = np.round(np.nanmean(data), 8)
        maximum = np.round(np.nanmax(data), 8)
        percs = np.round(np.nanpercentile(data, percentiles), 8)
        nans = np.sum(np.isnan(data))
        infs = np.sum(np.isinf(data))

        story_index = np.repeat(i, 1)
        story_name = np.repeat(story, 1)

        summary = pd.DataFrame({
            "story_index": story_index,
            "story_name": story_name,
            "nans": nans,
            "infs": infs,
            "min": minimum,
            "mean": mean,
            "max": maximum,
            **{f"q_{p}": percs[i] for i, p in enumerate(percentiles)}
        })
        
        #summary = np.column_stack([story_index, story_name, nans, infs, minimum, mean, maximum, percs])
        dfs.append(pd.DataFrame(summary, columns=cols))

    df = pd.concat(dfs, ignore_index=True)
    return df

In [71]:
# create overall stats dataframe for both subjects
df2_overall = overall_stats(2)
df3_overall = overall_stats(3)

# merge overall stats into single dataframe
df_overall = pd.merge(df2_overall, df3_overall, on=["story_index", "story_name"], suffixes=("_2", "_3"))
df_overall

Unnamed: 0,story_index,story_name,nans_2,infs_2,min_2,mean_2,max_2,q_25_2,q_50_2,q_75_2,nans_3,infs_3,min_3,mean_3,max_3,q_25_3,q_50_3,q_75_3
0,0,stumblinginthedark,0,0,-7.797415,0.0,10.720424,-0.672603,-0.002977,0.669483,0,0,-10.202945,-0.0,11.830579,-0.669168,-0.003285,0.666823
1,1,singlewomanseekingmanwich,0,0,-10.165119,-0.0,11.129302,-0.675288,-0.002273,0.672181,0,0,-9.522151,-0.0,10.067260,-0.676426,-0.004457,0.671040
2,2,theclosetthatateeverything,0,0,-12.557192,-0.0,11.252288,-0.675878,-0.002415,0.672600,0,0,-10.266390,-0.0,9.621383,-0.669767,-0.008394,0.660569
3,3,jugglingandjesus,0,0,-8.647146,0.0,9.960285,-0.672259,-0.001794,0.670111,0,0,-9.583273,0.0,7.942452,-0.678315,-0.006730,0.671146
4,4,threemonths,0,0,-10.536001,-0.0,10.544405,-0.664547,0.003703,0.669049,0,0,-16.273522,-0.0,16.466826,-0.647958,-0.004240,0.644168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,canplanetearthfeedtenbillionpeoplepart2,0,0,-11.657321,0.0,14.316625,-0.666895,-0.005298,0.661540,0,0,-17.562331,-0.0,14.876905,-0.677230,-0.007590,0.670013
97,97,notontheusualtour,0,0,-12.333584,0.0,11.025450,-0.674377,-0.000557,0.673269,0,0,-13.892000,0.0,14.066072,-0.611448,0.001559,0.615835
98,98,canplanetearthfeedtenbillionpeoplepart3,3432,0,-12.880770,-0.0,13.367670,-0.669520,-0.000885,0.668725,0,0,-17.696964,-0.0,17.012423,-0.656134,-0.006122,0.649781
99,99,thatthingonmyarm,0,0,-8.493048,-0.0,10.991366,-0.674687,-0.003630,0.671195,0,0,-16.912429,-0.0,17.108951,-0.656380,-0.006571,0.650092


In [72]:
# save overall dataframe to file
df_overall.to_csv("overall.csv", index=False)