In [1]:
import dataclasses

from sklearn.model_selection import train_test_split

from config import PreprocessConfig
from datasets import MeaningEmbeddingSentenceStyleDataset
from experiment import Experiment
from settings import EXPERIMENTS_DIR
from utils import load_pickle, save_json, load_json, load_embeddings, create_embeddings_matrix, extract_word_embeddings_style_dimensions
from vocab import Vocab

In [2]:
import pickle

In [3]:
def save_pickle(obj, filename):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    The target is opened in binary write mode, so any existing file at that
    path is replaced. Pickle's default protocol is used.

    Args:
        obj: Any picklable Python object.
        filename: Path (string or path-like) of the file to write.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(obj)

In [4]:
def save_dataset(exp, dataset_train, dataset_val, dataset_test, vocab, style_vocab, W_emb):
    """Pickle every preprocessing artifact into ``exp.experiment_dir``.

    Each object is written to its own file (an earlier revision bundled them
    as tuples in ``datasets.pkl`` and ``vocabs.pkl``):

    * ``datasets1.pkl`` / ``datasets2.pkl`` / ``datasets3.pkl`` -- the train,
      validation and test datasets, in that order
    * ``vocabs1.pkl`` / ``vocabs2.pkl`` -- the token and style vocabularies
    * ``W_emb.pkl`` -- the embedding matrix

    Args:
        exp: Object exposing ``experiment_dir`` with a ``joinpath`` method.
        dataset_train, dataset_val, dataset_test: Datasets to persist.
        vocab, style_vocab: Vocabularies to persist.
        W_emb: Embedding matrix to persist.
    """
    # (file name, object) pairs, in the order they are written.
    artifacts = (
        ('datasets1.pkl', dataset_train),
        ('datasets2.pkl', dataset_val),
        ('datasets3.pkl', dataset_test),
        ('vocabs1.pkl', vocab),
        ('vocabs2.pkl', style_vocab),
        ('W_emb.pkl', W_emb),
    )
    for file_name, artifact in artifacts:
        save_pickle(artifact, exp.experiment_dir.joinpath(file_name))

    print(f'Saved: {exp.experiment_dir}')

In [5]:

def create_dataset_reader(cfg):
    """Instantiate the dataset reader selected by the configuration.

    ``cfg.dataset_reader_class`` is called with every field of ``cfg`` (as
    produced by ``dataclasses.asdict``) passed as a keyword argument, so the
    reader's constructor has to accept all configuration fields.

    Args:
        cfg: A dataclass instance with a ``dataset_reader_class`` field.

    Returns:
        A new instance of ``cfg.dataset_reader_class``.
    """
    reader_cls = cfg.dataset_reader_class
    return reader_cls(**dataclasses.asdict(cfg))


def create_vocab(instances):
    """Build the token vocabulary and the style vocabulary.

    The token vocabulary is seeded with the PAD, START, END and UNK special
    tokens (in that order) and then fed every instance's ``'sentence'``.
    The style vocabulary starts empty and receives all ``'style'`` labels
    as one single document.

    Args:
        instances: Iterable of dicts with ``'sentence'`` and ``'style'`` keys.

    Returns:
        Tuple ``(vocab, style_vocab)``.
    """
    special_tokens = [Vocab.PAD_TOKEN, Vocab.START_TOKEN, Vocab.END_TOKEN, Vocab.UNK_TOKEN, ]
    token_vocab = Vocab(special_tokens)
    sentences = [inst['sentence'] for inst in instances]
    token_vocab.add_documents(sentences)

    label_vocab = Vocab()
    style_labels = [inst['style'] for inst in instances]
    label_vocab.add_document(style_labels)

    return token_vocab, label_vocab

In [6]:

def create_splits(cfg, instances):
    """Split ``instances`` into train, validation and test subsets.

    The test subset is carved out first, then the validation subset is taken
    from what remains. Both splits use fixed random seeds (42 and 0), so the
    result is reproducible for a given input.

    Args:
        cfg: Object with ``test_size`` and ``val_size`` attributes. Each value
            is forwarded as ``test_size`` to ``train_test_split``; a value of
            0 disables the corresponding split.
        instances: Sequence of instances to split.

    Returns:
        Tuple ``(instances_train, instances_val, instances_test)``. A disabled
        split yields an empty list for that subset.
    """
    if cfg.test_size != 0:
        instances_train_val, instances_test = train_test_split(
            instances, test_size=cfg.test_size, random_state=42
        )
    else:
        instances_train_val, instances_test = instances, []

    if cfg.val_size != 0:
        instances_train, instances_val = train_test_split(
            instances_train_val, test_size=cfg.val_size, random_state=0
        )
    else:
        # No validation split requested: everything not held out for testing
        # is training data. (Previously the training set was returned empty
        # in this case, silently discarding all remaining instances.)
        instances_train, instances_val = instances_train_val, []

    return instances_train, instances_val, instances_test

In [7]:
# Build the preprocessing configuration from its default field values.
cfg = PreprocessConfig()

In [43]:
# Display the resolved configuration.
cfg

PreprocessConfig(data_path=PosixPath('data/datasets/keigo/practice_500'), dataset_reader_class=<class 'datasets.KeigoDatasetReader'>, min_len=3, max_len=20, lowercase=True, word_embeddings='gensim', max_vocab_size=50000, nb_style_dims=50, nb_style_dims_sentences=50000, style_tokens_proportion=0.2, test_size=50, val_size=50)

In [44]:
# Scratch: create an Experiment directly (outside a `with` block) so it can be inspected.
e = Experiment(EXPERIMENTS_DIR, cfg, prefix='preprocess')

In [47]:
# Show the experiment's directory (the location save_dataset writes into).
e.experiment_dir

In [41]:
# Smoke test: open an experiment and print its id. The data-reading steps are
# left commented out in this cell.
with Experiment(EXPERIMENTS_DIR, cfg, prefix='preprocess') as exp:
    print(f'Experiment started: {exp.experiment_id}')

    # # read instances
    # dataset_reader = create_dataset_reader(exp.config)
    # print(f'Dataset reader: {dataset_reader.__class__.__name__}')
    # # calls the read method of KeigoDatasetReader
    # instances = dataset_reader.read(exp.config.data_path)
    # print(f'Instances: {len(instances)}')

Experiment started: preprocess.kniubegg


In [10]:
# Peek at the first instance: a dict with a token list under 'sentence' and a label under 'style'.
instances[0]

{'sentence': ['今回',
  'は',
  '、',
  'この',
  '方',
  'の',
  '転職',
  '先',
  'と',
  'お',
  '仕事',
  'が',
  'でき',
  'ない',
  'か',
  '、',
  'と',
  'いう',
  'こと'],
 'style': 'keigo'}

In [11]:
# Path the dataset reader is pointed at.
exp.config.data_path

PosixPath('data/datasets/keigo/practice_500')

In [12]:
# Reader class that create_dataset_reader will instantiate.
cfg.dataset_reader_class

datasets.KeigoDatasetReader

In [13]:
    # Main preprocessing run: read instances, build vocabularies, split the
    # data and load the word embeddings. The names bound here (instances,
    # vocab, style_vocab, instances_*, word_embeddings, exp) are used by
    # later cells.
    # NOTE(review): the whole cell is indented one level, apparently pasted
    # from a function body.
    with Experiment(EXPERIMENTS_DIR, cfg, prefix='preprocess') as exp:
        print(f'Experiment started: {exp.experiment_id}')

        # read instances
        dataset_reader = create_dataset_reader(exp.config)
        print(f'Dataset reader: {dataset_reader.__class__.__name__}')

        instances = dataset_reader.read(exp.config.data_path)
        print(f'Instances: {len(instances)}')

        # create vocabularies
        vocab, style_vocab = create_vocab(instances)
        # The size printed here is measured before any pruning below.
        print(f'Vocab: {len(vocab)}, style vocab: {style_vocab}')

        # max_vocab_size == 0 means "do not prune".
        if exp.config.max_vocab_size != 0:
            vocab.prune_vocab(exp.config.max_vocab_size)

        # create splits
        instances_train, instances_val, instances_test = create_splits(exp.config, instances)
        print(f'Train: {len(instances_train)}, val: {len(instances_val)}, test: {len(instances_test)}')

        # create embeddings
        # NOTE(review): receives the global `cfg` rather than `exp.config`
        # like the calls above -- presumably the same object; confirm.
        word_embeddings = load_embeddings(cfg)
        # The message reads "magnitude load complete".
        print("magnituideのロード完了")


Experiment started: preprocess.e9llwftm
Dataset reader: KeigoDatasetReader
Instances: 991
Vocab: 3884, style vocab: Vocab: 2 tokens
Train: 891, val: 50, test: 50
use gensim word embeddings.
magnituideのロード完了


In [14]:
# Build the embedding matrix for the vocabulary from the loaded embeddings.
W_emb = create_embeddings_matrix(word_embeddings, vocab)
print("matrix created")

# extract style dimensions (computed from the training instances only)
style_dimensions = extract_word_embeddings_style_dimensions(cfg, instances_train, vocab, style_vocab, W_emb)

# create datasets: one per split, in train / val / test order; all three
# share the same embeddings, style dimensions and vocabularies.
dataset_train, dataset_val, dataset_test = (
    MeaningEmbeddingSentenceStyleDataset(
        W_emb, style_dimensions, exp.config.style_tokens_proportion,
        split_instances, vocab, style_vocab
    )
    for split_instances in (instances_train, instances_val, instances_test)
)



matrix created
Styles instances: [444, 447]
Styles means: [(300,), (300,)]
Style dimensions: (50,)


In [66]:
# NOTE(review): `a` is not defined in any visible cell -- leftover scratch
# state that will raise NameError on a fresh kernel.
a
type(a)

list

In [15]:

# Scratch check: can the test dataset be pickled on its own?
with open("test.pk","wb") as f:
    pickle.dump(dataset_test,f)

In [76]:
# Scratch check: can the vocabulary be pickled? Reuses (and therefore
# overwrites) the "test.pk" file written by the dataset check.
# The recorded run failed with spaCy error E111 (tokens cannot be pickled).
with open("test.pk","wb") as f:
    pickle.dump(vocab,f)

NotImplementedError: [E111] Pickling a token is not supported, because tokens are only views of the parent Doc and can't exist on their own. A pickled token would always have to include its Doc and Vocab, which has practically no advantage over pickling the parent Doc directly. So instead of pickling the token, pickle the Doc it belongs to.

In [20]:
# Collect every key of the embedding store; iterating it yields (key, value)
# pairs and only the keys are kept.
# NOTE(review): import placed mid-notebook; a later cell rebinds `tqdm`.
from tqdm.notebook import tqdm
keys = [key for key,value in tqdm(word_embeddings)]

  0%|          | 0/482238 [00:00<?, ?it/s]

In [24]:
# Cache the collected embedding keys on disk so the iteration need not be repeated.
with open("data/magnitude_keys","wb") as f:
    pickle.dump(keys,f)

In [25]:
# Reload the cached embedding keys.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open("data/magnitude_keys","rb") as f:
    keys = pickle.load(f)

In [22]:
# Identifier of the experiment opened by the pipeline cell.
exp.experiment_id

'preprocess.e9llwftm'

In [17]:
# List the attributes of the test dataset.
# NOTE(review): the recorded NameError means this ran before the cell that
# builds `dataset_test` (out-of-order execution).
dir(dataset_test)

NameError: name 'dataset_test' is not defined

In [12]:
# Persist all preprocessing artifacts into the experiment directory.
# NOTE(review): the recorded run failed with spaCy error E111 -- presumably
# one of the pickled objects still holds spaCy Token objects instead of plain
# strings; TODO confirm and convert tokens to str before building the vocab.
save_dataset(exp, dataset_train, dataset_val, dataset_test, vocab, style_vocab, W_emb)

print(f'Experiment finished: {exp.experiment_id}')

NotImplementedError: [E111] Pickling a token is not supported, because tokens are only views of the parent Doc and can't exist on their own. A pickled token would always have to include its Doc and Vocab, which has practically no advantage over pickling the parent Doc directly. So instead of pickling the token, pickle the Doc it belongs to.

In [29]:
# Probe a private helper of the embedding object; in the recorded run it
# returned the input string unchanged.
word_embeddings._key_t("あああああ")

'あああああ'

In [31]:
# Measure how long a bare pass over the embedding store takes (the loop body
# does nothing; the recorded run was interrupted part-way through).
# NOTE(review): this rebinds `tqdm`, replacing the tqdm.notebook import from
# an earlier cell.
from tqdm import tqdm
for key,value in tqdm(word_embeddings):
    pass

 49%|████▉     | 236634/482238 [00:23<00:24, 9971.26it/s] 


KeyboardInterrupt: 

In [49]:
import numpy as np


In [50]:
%%time
# Try to convert the embedding store into a NumPy array in one go; the
# recorded run was interrupted before it finished.
n = np.array(word_embeddings)

  """Entry point for launching an IPython kernel.


KeyboardInterrupt: 