In [1]:
import os

import numpy as np
import pandas as pd
import scipy.io as sio
import torch
from transformers import GPT2Tokenizer, GPT2Model

In [11]:
  # Check and set up CUDA
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [14]:
# Load the CSV file at `file_path` into the DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific Windows path; point it at
# your local copy of the file before running the notebook.
file_path = "D:\\PythonProjs\\fNIRS_NLP\\Original_design_mat\\HH_design_mat.csv"  # Replace with your file path
df = pd.read_csv(file_path)
# df.head()  # uncomment to preview the first rows

In [None]:
# Make every value in the 'Text' column a string; missing values become ''.
df['Text'] = df['Text'].fillna('').astype(str)
# Drop the first 138 rows and renumber the index from 0.
# NOTE(review): this rebinds `df`, so re-running this cell without reloading the
# CSV removes another 138 rows each time — reload `df` before re-running.
df = df.iloc[138:].reset_index(drop=True)

# story_words = df['Text'].to_list()
# Concatenate the 'Text' column into one space-separated story string.
# Rows whose text is '' contribute no word after a later .split(), so the word
# count below can be smaller than the number of rows in `df`.
story_text = ' '.join(df['Text'])

# Print the whitespace-separated word count followed by the full story text.
print(f"total word: {len(story_text.split())}. Text: {story_text}")

total word: 1956. Text: henry left the gym on this particular wednesday he stopped to watch a man tear down a circus poster then with three nickels and one dime in his pocket he went to the corner drugstore to buy a chocolate ice cream cone he thought he would eat the ice cream cone get on the bus drop his dime in the slot and ride home that is not what happened he bought the ice cream cone and paid for it with one of his nickels on his way out of the drugstore he stopped to look at funny books it was a free look because he had only two nickels left he stood there licking his chocolate ice cream cone and reading one of the funny books when he heard a thump thump thump henry turned and there behind him was a dog the dog was scratching himself he wasn't any special kind of dog he was too small to be a big dog but on the other hand he was much too big to be a little dog he wasn't a white dog because parts of him were brown and other parts were black and in between there were yellowish pat

In [9]:
# Disabled draft of a per-word extractor, kept as a bare string literal so none
# of it executes (the cell only evaluates to this string).
# NOTE(review): in this draft the context slice ends at `start+idx`, so the
# current word is excluded from the model input unless the context is empty;
# presumably superseded by extract_word_activations — confirm before deleting.
"""def extract_layer_activations(text_list, model_name='gpt2'):
    # Check and set up CUDA
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Load pre-trained model and tokenizer to CUDA
    model = GPT2Model.from_pretrained(model_name).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    
    model.eval()
    
    activation_matrix = np.zeros((len(text_list), 768))
    
    batch_size = 100
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start+batch_size]
        
        for idx, word in enumerate(batch):
            context = text_list[max(0, start+idx-1024):start+idx]
            context_text = ' '.join(context)
            
            if not context_text.strip():
                context_text = word
            
            inputs = tokenizer(context_text, return_tensors='pt', 
                               max_length=1024, 
                               truncation=True, 
                               padding=True).to(device)
            
            with torch.no_grad():
                outputs = model(**inputs, output_hidden_states=True)
                layer_8_activations = outputs.hidden_states[7][0][-1].cpu()
            
            activation_matrix[start+idx] = layer_8_activations.numpy()
    
    return activation_matrix"""

In [17]:

def extract_word_activations(story, model_name='gpt2', hidden_state_index=7,
                             max_context_words=1024 // 2):
    """Return one GPT-2 hidden-state vector per whitespace-separated word of `story`.

    For word ``i`` the model is fed the preceding ``max_context_words`` words
    followed by the word itself, and the hidden state of the final token of
    that input is stored in row ``i``. When a word is split into several
    tokens, the vector therefore belongs to its last token.

    Args:
        story: The full text; it is split on whitespace to obtain the words.
        model_name: Hugging Face identifier of the GPT-2 checkpoint to load.
        hidden_state_index: Index into ``outputs.hidden_states``. Entry 0 is
            the embedding output, so the default 7 selects the output of the
            7th transformer block.
            NOTE(review): callers name the result "layer 8"; if the output of
            the 8th block is intended, pass ``hidden_state_index=8`` — confirm.
        max_context_words: Number of preceding words used as context.

    Returns:
        A CPU ``torch.Tensor`` of shape ``(number_of_words, hidden_width)``;
        ``hidden_width`` is 768 for the base ``'gpt2'`` checkpoint. An empty
        story yields a tensor with zero rows.
    """
    # Load pre-trained GPT-2 model and tokenizer onto the notebook-level `device`
    model = GPT2Model.from_pretrained(model_name).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # If a window ever exceeds the model's token limit, discard the OLDEST
    # tokens. The default (right-side) truncation would cut off the current
    # word, and the "last token" would then belong to an earlier word.
    tokenizer.truncation_side = 'left'
    model.eval()

    # Read the sizes from the checkpoint instead of hard-coding 768 / 1024 so
    # larger GPT-2 variants work as well.
    hidden_width = model.config.n_embd
    max_tokens = model.config.n_positions

    # Split story into words
    words = story.split()

    # Allocate the output on the same device as the model; a hard-coded
    # .cuda() would fail on machines where `device` fell back to the CPU.
    word_activations = torch.zeros(len(words), hidden_width, device=device)

    for i, word in enumerate(words):
        # Preceding words only; the current word is appended below.
        context = ' '.join(words[max(0, i - max_context_words):i])
        current_input = tokenizer(context + ' ' + word, return_tensors='pt',
                                  truncation=True, max_length=max_tokens).to(device)

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = model(**current_input, output_hidden_states=True)

        # Hidden state of the last input token, i.e. the end of the current word
        word_activations[i] = outputs.hidden_states[hidden_state_index][0, -1, :]

    return word_activations.cpu()  # Move back to CPU for further processing



In [18]:
# Extract one activation row per word of `story_text`, report the matrix shape,
# and keep a PyTorch-format copy on disk in the working directory.
# NOTE(review): the row count equals len(story_text.split()). The recorded
# output below this cell (2095 rows) does not match the 1956 words printed when
# `story_text` was built, so the cells appear to have been run out of order —
# restart the kernel and run all cells before trusting the saved files.
layer_8_full_activations = extract_word_activations(story_text)
print(f"Full story activation matrix shape: {layer_8_full_activations.shape}")
torch.save(layer_8_full_activations, 'full_story_layer_8_activations.pt')

Full story activation matrix shape: torch.Size([2095, 768])


In [20]:
# Number of rows in the activation matrix (its first dimension).
len(layer_8_full_activations)

2095

In [21]:
output_path = './activation/activation_baseline.mat'
# Create the target folder on a fresh checkout; savemat does not create
# missing directories and would otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Save to .mat file. Convert the tensor to a NumPy array explicitly rather than
# relying on scipy's implicit conversion of a torch.Tensor.
sio.savemat(output_path, {'activations': layer_8_full_activations.detach().cpu().numpy()})

In [22]:
def normalize_columns(activations):
    """Min-max scale every column of a 2-D array to the [0, 1] range.

    The minimum and maximum are taken down each column (``axis=0``), so each
    column is rescaled independently across rows. The previous implementation
    reduced over ``axis=1`` and therefore rescaled each ROW, contradicting the
    function's name and comment.

    Args:
        activations: Array-like of shape ``(n_rows, n_columns)``.

    Returns:
        A NumPy array of the same shape. A column whose values are all equal
        maps to 0 instead of NaN, and an empty input is returned as an empty
        copy.
    """
    activations = np.asarray(activations)
    if activations.size == 0:
        # min()/max() raise on zero-size arrays; there is nothing to scale.
        return activations.copy()

    col_min = activations.min(axis=0, keepdims=True)
    col_range = activations.max(axis=0, keepdims=True) - col_min
    # Avoid 0/0 for constant columns: the numerator is 0 there, so dividing by
    # 1 yields 0 for the whole column.
    safe_range = np.where(col_range == 0, 1, col_range)

    return (activations - col_min) / safe_range


In [23]:
# Path of the baseline activation file to read back.
mat_file_path = './activation/activation_baseline.mat'
# Load .mat file; loadmat returns a dict keyed by the saved variable names.
mat_contents = sio.loadmat(mat_file_path)

# Assuming the matrix is stored under 'activations' key
activations = mat_contents['activations']

# Rescale the loaded matrix with normalize_columns for the normalized export.
norm_activations = normalize_columns(activations)

In [24]:
output_path = './activation/norm_activation_baseline.mat'
# Create the target folder if it is missing; savemat does not create
# directories and would otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Save to .mat file
sio.savemat(output_path, {'activations': norm_activations})