In [1]:
import pickle
import sys

import numpy as np
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.model_selection import train_test_split

sys.path.append('code')
import ridge_utils
import preprocessing

In [2]:
# Fraction of the stories assigned to the training split; passed to
# train_test_split when the story names are partitioned.
train_size = 0.75

In [3]:
# Load the pickled word sequences. The keys are used as story names below.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('data/raw_text.pkl', 'rb') as file:
    wordseqs = pickle.load(file)

# Partition the story names into train/test lists. The fixed random_state
# keeps the split identical across kernel restarts.
stories = list(wordseqs.keys())
train, test = train_test_split(stories, train_size=train_size, random_state=4193332621)

  wordseqs = pickle.load(file)


In [4]:
class BagofWords:
    """Vocabulary-backed encoder producing count vectors and one-hot rows.

    Every word is lower-cased and stripped of surrounding whitespace before it
    is looked up. Words that were not seen by ``fit`` are assigned to one extra
    trailing column whose index equals ``vocab_size``.
    """

    def __init__(self):
        # Maps a normalised word to its column index, in first-seen order.
        self.word_to_index = {}

    @staticmethod
    def _normalize(word):
        """Return the canonical form of ``word`` used as the vocabulary key."""
        return word.lower().strip()

    def fit(self, texts):
        """Add every new word found in ``texts`` to the vocabulary.

        Args:
            texts: iterable of texts, each an iterable of word strings.

        Returns:
            This instance, so calls can be chained.
        """
        vocabulary = self.word_to_index
        for text in texts:
            for raw in text:
                # setdefault only inserts when the key is missing, so existing
                # words keep their index and new ones get the next free index.
                vocabulary.setdefault(self._normalize(raw), len(vocabulary))
        return self

    def transform(self, texts):
        """Count word occurrences per text.

        Args:
            texts: sized sequence of texts, each an iterable of word strings.

        Returns:
            Float array of shape ``(len(texts), vocab_size + 1)``; the last
            column counts out-of-vocabulary words.
        """
        unknown = self.vocab_size
        counts = np.zeros((len(texts), unknown + 1))
        for row, text in enumerate(texts):
            for raw in text:
                column = self.word_to_index.get(self._normalize(raw), unknown)
                counts[row, column] += 1
        return counts

    def transform_words(self, words):
        """One-hot encode each word individually.

        Args:
            words: iterable of word strings.

        Returns:
            Float array of shape ``(number of words, vocab_size + 1)`` with a
            single 1 per row; out-of-vocabulary words use the last column.
        """
        unknown = self.vocab_size
        columns = [
            self.word_to_index.get(self._normalize(raw), unknown)
            for raw in words
        ]
        encoded = np.zeros((len(columns), unknown + 1))
        # Set one entry per row: row i gets a 1 in column columns[i].
        encoded[np.arange(len(columns)), columns] = 1
        return encoded

    @property
    def vocab_size(self):
        """Number of distinct words currently in the vocabulary."""
        return len(self.word_to_index)

    def __repr__(self):
        return f'BagofWords(vocab_size={self.vocab_size})'
    
def trim_first_5_last_10(embeddings):
    """Return a copy of ``embeddings`` with each value trimmed at both ends.

    For every key the first 5 and the last 10 entries along the first axis
    are dropped (``value[5:-10]``). Values with 15 or fewer entries therefore
    become empty.

    The input mapping is left untouched: a new dict is returned. This keeps
    the function safe to call repeatedly on the same data (e.g. when a
    notebook cell is re-run), whereas trimming in place would shorten the
    stored values again on every call.

    Args:
        embeddings: mapping from key to a sliceable sequence or array.

    Returns:
        A new dict with the same keys and the trimmed values.
    """
    return {key: values[5:-10] for key, values in embeddings.items()}
    
def bow_embed(stories, wordseqs, trimmed=True, bow=None):
    """Build downsampled one-hot bag-of-words embeddings for ``stories``.

    Args:
        stories: iterable of story names (keys of ``wordseqs``).
        wordseqs: mapping from story name to an object whose ``.data``
            attribute is an iterable of word strings.
        trimmed: when true, pass the result through ``trim_first_5_last_10``.
        bow: optional, already fitted ``BagofWords``. When omitted, a new
            vocabulary is fitted on ``stories`` (the original behaviour).
            Pass the encoder fitted on the training stories when embedding
            held-out stories so both share the same columns and unseen words
            land in the out-of-vocabulary column.

    Returns:
        The result of ``preprocessing.downsample_word_vectors`` (optionally
        trimmed). NOTE(review): presumably a dict mapping story name to a 2-D
        array; confirm against the ``preprocessing`` module.
    """
    if bow is None:
        bow = BagofWords().fit([wordseqs[story].data for story in stories])
    # One one-hot row per word, keyed by story.
    word_vectors = {
        story: bow.transform_words(wordseqs[story].data) for story in stories
    }
    embeddings = preprocessing.downsample_word_vectors(stories, word_vectors, wordseqs)
    if trimmed:
        embeddings = trim_first_5_last_10(embeddings)
    return embeddings

class embeddings_aggregator:
    """Stack per-story embeddings into one matrix, optionally delayed and standardised.

    Args:
        delays: optional collection of delays handed to
            ``preprocessing.make_delayed``. ``None`` or an empty collection
            disables delaying. Lists, ranges and NumPy arrays are accepted.
        standardize: when true, features are scaled with a
            ``sklearn.preprocessing.StandardScaler``.
    """

    def __init__(self, delays=None, standardize=True):
        self.delays = delays
        self.standardize = standardize
        self.scaler = sklearn.preprocessing.StandardScaler() if standardize else None

    def _has_delays(self):
        """Return True when a non-empty set of delays was configured.

        A plain truthiness test (``if self.delays:``) raises ``ValueError``
        for NumPy arrays with more than one element, so the length is checked
        explicitly instead.
        """
        delays = self.delays
        if delays is None:
            return False
        try:
            return len(delays) > 0
        except TypeError:
            # Unsized objects: fall back to ordinary truthiness.
            return bool(delays)

    def _concatenate_embeddings(self, stories, embeddings):
        """Concatenate the embeddings of ``stories`` along axis 0.

        The stories are stacked in the given order; if delays are configured
        the stacked matrix is then passed to ``preprocessing.make_delayed``.
        """
        all_embeddings = np.concatenate(
            [embeddings[story] for story in stories], axis=0
        )
        if self._has_delays():
            all_embeddings = preprocessing.make_delayed(all_embeddings, self.delays)
        return all_embeddings

    def fit(self, stories, embeddings):
        """Fit a fresh scaler on the stacked embeddings (no-op without standardising).

        Returns:
            This instance, so calls can be chained.
        """
        if self.standardize:
            all_embeddings = self._concatenate_embeddings(stories, embeddings)
            self.scaler = sklearn.preprocessing.StandardScaler().fit(all_embeddings)
        return self

    def transform(self, stories, embeddings):
        """Return the stacked embeddings, scaled with the already fitted scaler."""
        all_embeddings = self._concatenate_embeddings(stories, embeddings)
        if self.standardize:
            all_embeddings = self.scaler.transform(all_embeddings)
        return all_embeddings

    def fit_transform(self, stories, embeddings):
        """Fit the scaler on the stacked embeddings and return them scaled."""
        all_embeddings = self._concatenate_embeddings(stories, embeddings)
        if self.standardize:
            all_embeddings = self.scaler.fit_transform(all_embeddings)
        return all_embeddings

# One-hot bag-of-words embeddings for every training story (trimmed by default).
train_embeddings = bow_embed(stories=train, wordseqs=wordseqs)

# Stack the selected story, apply delays 1 through 4 and standardise the columns.
aggregator = embeddings_aggregator(standardize=True, delays=range(1, 5))
train_X = aggregator.fit_transform(
    stories=['adollshouse'],
    embeddings=train_embeddings,
)

In [None]:
# Display the dimensions of the assembled feature matrix as a sanity check.
train_X.shape

(241, 44076)

In [6]:
# Load the regression targets from disk.
# NOTE(review): judging by the path, these are subject 3's recordings for the
# 'adollshouse' story, the same story used to build train_X; confirm the row
# counts match.
train_y = np.load('data/subject3/adollshouse.npy')

In [9]:
# Fit a ridge regression (default regularisation) from the features to the targets.
# sklearn.linear_model must be imported explicitly in the import cell: importing
# sklearn.preprocessing / sklearn.model_selection does not guarantee that the
# linear_model submodule is loaded, so a fresh kernel would otherwise fail here.
ridge = sklearn.linear_model.Ridge().fit(train_X, train_y)