In [4]:
import os

import numpy as np

from preprocessing.transformers import *

from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

0 = all messages are logged (default behavior)
1 = INFO messages are not printed
2 = INFO and WARNING messages are not printed
3 = INFO, WARNING, and ERROR messages are not printed

In [5]:
train = pd.read_csv('../data/train.csv', header=0, index_col='id')
test = pd.read_csv('../data/test.csv', header=0, index_col='id')

In [6]:
X_train = train['comment_text']
Y_train = train.drop('comment_text', axis=1)

X_test = test.comment_text

In [7]:
preprocessing_pipeline = Pipeline(steps=[
    ('cleaner', Cleaner()),
    ('splitter', Splitter()),
    ('stopwords_remover', StopWordsRemover()),
    ('stemmer', Stemmer().with_stemmer('snowball', language='english')),
    ('lemmatizer', Lemmatizer()),
    ('tokenizer', Tokenizer(pad=True))
])

In [8]:
preprocessing_pipeline.fit_transform(['Hello is not to be the baby'])

array([[1, 2]], dtype=int32)

In [9]:
text = '''
But do cats eat bats, I wonder?' And here Alice began to get
rather sleepy, and went on saying to herself, in a dreamy sort of
way, `Do cats eat bats?  Do cats eat bats?' and sometimes, `Do
bats eat cats?' for, you see, as she couldn't answer either
question, it didn't much matter which way she put it.  She felt
that she was dozing off, and had just begun to dream that she
was walking hand in hand with Dinah, and saying to her very
earnestly, `Now, Dinah, tell me the truth:  did you ever eat a
bat?' when suddenly, thump! thump! down she came upon a heap of
sticks and dry leaves, and the fall was over.
'''

' '.join(map(str, preprocessing_pipeline.fit_transform([text])[0]))

'12 3 1 2 13 14 15 16 17 18 19 20 21 5 22 23 6 4 3 1 2 4 3 1 2 24 4 2 1 3 25 26 27 28 29 30 31 32 6 33 34 35 36 37 38 39 7 7 8 5 40 41 8 42 43 44 1 2 45 9 9 46 47 48 49 50 51 52'

In [10]:
preprocessing_pipeline = Pipeline(
    steps=[
        ('cleaner', Cleaner()),
        ('splitter', Splitter()),
        ('stopwords_remover', StopWordsRemover()),
        ('stemmer', Stemmer().with_stemmer('snowball', language='english')),
        ('lemmatizer', Lemmatizer()),
        ('tokenizer', Tokenizer(pad=True))
    ],
    verbose=True
)

In [11]:
preprocessing_pipeline.get_params()

{'memory': None,
 'steps': [('cleaner', Cleaner()),
  ('splitter', Splitter()),
  ('stopwords_remover', StopWordsRemover()),
  ('stemmer', Stemmer()),
  ('lemmatizer', Lemmatizer()),
  ('tokenizer',
   Tokenizer(pad=True,
             tokenizer=<keras_preprocessing.text.Tokenizer object at 0x7f0c9124e130>))],
 'verbose': True,
 'cleaner': Cleaner(),
 'splitter': Splitter(),
 'stopwords_remover': StopWordsRemover(),
 'stemmer': Stemmer(),
 'lemmatizer': Lemmatizer(),
 'tokenizer': Tokenizer(pad=True,
           tokenizer=<keras_preprocessing.text.Tokenizer object at 0x7f0c9124e130>),
 'cleaner__addresses': True,
 'cleaner__dates': True,
 'cleaner__emails': True,
 'cleaner__ips': True,
 'cleaner__links': True,
 'cleaner__punctuation': True,
 'stopwords_remover__language': 'english',
 'tokenizer__pad': True,
 'tokenizer__pad_len': None,
 'tokenizer__tokenizer': <keras_preprocessing.text.Tokenizer at 0x7f0c9124e130>}

In [12]:
preprocessing_pipeline.fit(X_train)
preprocessing_pipeline.fit(X_test)

X_train = preprocessing_pipeline.transform(X_train)

[Pipeline] ........... (step 1 of 6) Processing cleaner, total=  51.4s
[Pipeline] .......... (step 2 of 6) Processing splitter, total=   2.4s
[Pipeline] . (step 3 of 6) Processing stopwords_remover, total=   9.6s
[Pipeline] ........... (step 4 of 6) Processing stemmer, total=  38.0s
[Pipeline] ........ (step 5 of 6) Processing lemmatizer, total=  13.2s
[Pipeline] ......... (step 6 of 6) Processing tokenizer, total=   3.6s
[Pipeline] ........... (step 1 of 6) Processing cleaner, total=  45.6s
[Pipeline] .......... (step 2 of 6) Processing splitter, total=   2.1s
[Pipeline] . (step 3 of 6) Processing stopwords_remover, total=   8.5s
[Pipeline] ........... (step 4 of 6) Processing stemmer, total=  34.4s
[Pipeline] ........ (step 5 of 6) Processing lemmatizer, total=  11.9s
[Pipeline] ......... (step 6 of 6) Processing tokenizer, total=   4.1s


In [13]:
X_test = preprocessing_pipeline.transform(X_test)

In [14]:
np.savetxt(
    fname='../data/prepared_train.csv',
    X=X_train,
    delimiter=',',
    fmt='%i'
)
np.savetxt(
    fname='../data/prepared_test.csv',
    X=X_test,
    delimiter=',',
    fmt='%i'
)

In [15]:
with open('../data/tokenizer.json', 'w') as file:
    file.write(
        preprocessing_pipeline['tokenizer'].tokenizer.to_json()
    )

In [None]:
# todo add CBOW, Glove, FastText

word2vec_models = dict(
    light=Word2Vec( # Skip-gram
        sentences=np.concatenate([X_train, X_test], axis=0),
        sg=True,
        window=5,
        vector_size=100,
        min_count=1,
        negative=15
    ),
    heavy=Word2Vec( # Skip-gram
        sentences=np.concatenate([X_train, X_test], axis=0),
        sg=True,
        window=10,
        vector_size=300,
        min_count=1,
        negative=20
    ),
)

for name, model in word2vec_models:
    model.save(name + '_word2vec.model')

Exception in thread Thread-7:
Traceback (most recent call last):
  File "/usr/lib/python3.9/threading.py", line 954, in _bootstrap_inner
Exception in thread Thread-9:
Traceback (most recent call last):
  File "/usr/lib/python3.9/threading.py", line 954, in _bootstrap_inner
Exception in thread Thread-8:
Traceback (most recent call last):
  File "/usr/lib/python3.9/threading.py", line 954, in _bootstrap_inner
        self.run()
  File "/usr/lib/python3.9/threading.py", line 892, in run
self.run()
  File "/usr/lib/python3.9/threading.py", line 892, in run
    self._target(*self._args, **self._kwargs)
  File "/home/uiqkos/pyenvs/mint/lib/python3.9/site-packages/gensim/models/word2vec.py", line 1162, in _worker_loop
        tally, raw_tally = self._do_train_job(data_iterable, alpha, thread_private_mem)
self._target(*self._args, **self._kwargs)  File "/home/uiqkos/pyenvs/mint/lib/python3.9/site-packages/gensim/models/word2vec.py", line 951, in _do_train_job

  File "/home/uiqkos/pyenvs/mint/