# Infrastructure setup

## Initilizing the runtime (runtime)

In [3]:
# Google collab restarts the VMs after some time and drops all the files stored on VM,
# so some files would be persisted on mounted Google Drive to persist the progress
# between the sessions

GDRIVE_PATH = '/content/drive/My Drive/Colab/homework-10/'

# 1551 Dataset has lots of small files. Reading them from mounted Google Drive
# sometimes takes significant time, so I have decided to checkout it to VM
# DATA_1551_PATH = '/content/drive/My  Drive/Colab/homework-10/1551.gov.ua/raw/'
DATA_1551_PATH = '/content/1551.gov.ua/raw/'


#
# Working with word2vec models
#

models_path = '/content/drive/My Drive/Colab/homework-10/vec_models/'

# news.cased.tokenized.word2vec.300d
news_cased_tokenized_word2vec_300d_path = models_path +'news.cased.tokenized.word2vec.300d' 
# news.lowercased.tokenized.word2vec.300d
news_lowercased_tokenized_word2vec_300d_path = models_path +'news.lowercased.tokenized.word2vec.300d' 
# news.cased.lemmatized.word2vec.300d
news_cased_lemmatized_word2vec_300d_path = models_path +'news.cased.lemmatized.word2vec.300d' 
# news.lowercased.lemmatized.word2vec.300d
news_lowercased_lemmatized_word2vec_300d_path = models_path +'news.lowercased.lemmatized.word2vec.300d' 

# ubercorpus.cased.tokenized.word2vec.300d
ubercorpus_cased_tokenized_word2vec_300d_path = models_path + 'ubercorpus.cased.tokenized.word2vec.300d'

# ubercorpus.lowercased.tokenized.word2vec.300d
ubercorpus_lowercased_tokenized_word2vec_300d_path = models_path + 'ubercorpus.lowercased.tokenized.word2vec.300d'

# ubercorpus.cased.lemmatized.word2vec.300d
ubercorpus_cased_lemmatized_word2vec_300d_path = models_path + 'ubercorpus.cased.lemmatized.word2vec.300d'

# ubercorpus.lowercased.lemmatized.word2vec.300d
ubercorpus_lowercased_lemmatized_word2vec_300d_path = models_path + 'ubercorpus.lowercased.lemmatized.word2vec.300d'


#
# An amazing progress bar function
#

def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)

    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '')
    # Print New Line on Complete
    if iteration == total: 
        print()

printProgressBar(1, 10, prefix = 'Progress:', suffix = 'Complete', length = 50)        
printProgressBar(1, 100, prefix = 'Progress:', suffix = 'Complete', length = 50)   

Progress: |█████---------------------------------------------| 10.0% CompleteProgress: |--------------------------------------------------| 1.0% Complete

## Downloading the data (once)


### Donwloading 1551 data to VMs drive (runtime)

In [5]:
!git clone https://github.com/lang-uk/1551.gov.ua '/content/1551.gov.ua'

Cloning into '/content/1551.gov.ua'...
remote: Enumerating objects: 127370, done.[K
remote: Total 127370 (delta 0), reused 0 (delta 0), pack-reused 127370[K
Receiving objects: 100% (127370/127370), 108.40 MiB | 29.48 MiB/s, done.
Resolving deltas: 100% (43/43), done.
Checking out files: 100% (127332/127332), done.


### Downloading 1551 data to Google Drive (once)

In [0]:
!git clone https://github.com/lang-uk/1551.gov.ua '/content/drive/My Drive/Colab/homework-10/1551.gov.ua'

Cloning into '/content/drive/My Drive/Colab/homework-10/1551.gov.ua'...
remote: Enumerating objects: 127370, done.[K
remote: Total 127370 (delta 0), reused 0 (delta 0), pack-reused 127370[K
Receiving objects: 100% (127370/127370), 108.40 MiB | 3.41 MiB/s, done.
Resolving deltas: 100% (43/43), done.
Checking out files: 100% (127332/127332), done.


### Downloading **news** corpus to GDrive (once)

https://lang.org.ua/static/downloads/models/news.cased.tokenized.word2vec.300d.bz2

https://lang.org.ua/static/downloads/models/news.lowercased.tokenized.word2vec.300d.bz2

https://lang.org.ua/static/downloads/models/news.cased.lemmatized.word2vec.300d.bz2

https://lang.org.ua/static/downloads/models/news.lowercased.lemmatized.word2vec.300d.bz2

In [3]:
!wget -P '/content/drive/My Drive/Colab/homework-10/' https://lang.org.ua/static/downloads/models/news.cased.tokenized.word2vec.300d.bz2
!wget -P '/content/drive/My Drive/Colab/homework-10/' https://lang.org.ua/static/downloads/models/news.lowercased.tokenized.word2vec.300d.bz2
!wget -P '/content/drive/My Drive/Colab/homework-10/' https://lang.org.ua/static/downloads/models/news.cased.lemmatized.word2vec.300d.bz2
!wget -P '/content/drive/My Drive/Colab/homework-10/' https://lang.org.ua/static/downloads/models/news.lowercased.lemmatized.word2vec.300d.bz2

!bzip2 -d '/content/drive/My Drive/Colab/homework-10/news.cased.tokenized.word2vec.300d.bz2'
!bzip2 -d '/content/drive/My Drive/Colab/homework-10/news.lowercased.tokenized.word2vec.300d.bz2'
!bzip2 -d '/content/drive/My Drive/Colab/homework-10/news.cased.lemmatized.word2vec.300d.bz2'
!bzip2 -d '/content/drive/My Drive/Colab/homework-10/news.lowercased.lemmatized.word2vec.300d.bz2'

--2020-05-15 08:51:53--  https://lang.org.ua/static/downloads/models/news.cased.tokenized.word2vec.300d.bz2
Resolving lang.org.ua (lang.org.ua)... 95.216.74.77
Connecting to lang.org.ua (lang.org.ua)|95.216.74.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 343843428 (328M) [application/octet-stream]
Saving to: ‘/content/drive/My Drive/Colab/homework-10/news.cased.tokenized.word2vec.300d.bz2’


2020-05-15 08:52:31 (9.33 MB/s) - ‘/content/drive/My Drive/Colab/homework-10/news.cased.tokenized.word2vec.300d.bz2’ saved [343843428/343843428]

--2020-05-15 08:52:41--  https://lang.org.ua/static/downloads/models/news.lowercased.tokenized.word2vec.300d.bz2
Resolving lang.org.ua (lang.org.ua)... 95.216.74.77
Connecting to lang.org.ua (lang.org.ua)|95.216.74.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 310107678 (296M) [application/octet-stream]
Saving to: ‘/content/drive/My Drive/Colab/homework-10/news.lowercased.tokenized.word2vec.30

### Downloading **ubercorpus** to GDrive (once)

```
https://lang.org.ua/static/downloads/models/ubercorpus.cased.tokenized.word2vec.300d.bz2

https://lang.org.ua/static/downloads/models/ubercorpus.lowercased.tokenized.word2vec.300d.bz2

https://lang.org.ua/static/downloads/models/ubercorpus.cased.lemmatized.word2vec.300d.bz2

https://lang.org.ua/static/downloads/models/ubercorpus.lowercased.lemmatized.word2vec.300d.bz2

```

In [4]:
!wget -P '/content/drive/My Drive/Colab/homework-10/' https://lang.org.ua/static/downloads/models/ubercorpus.cased.tokenized.word2vec.300d.bz2
!wget -P '/content/drive/My Drive/Colab/homework-10/' https://lang.org.ua/static/downloads/models/ubercorpus.lowercased.tokenized.word2vec.300d.bz2
!wget -P '/content/drive/My Drive/Colab/homework-10/' https://lang.org.ua/static/downloads/models/ubercorpus.cased.lemmatized.word2vec.300d.bz2
!wget -P '/content/drive/My Drive/Colab/homework-10/' https://lang.org.ua/static/downloads/models/ubercorpus.lowercased.lemmatized.word2vec.300d.bz2

!bzip2 -d '/content/drive/My Drive/Colab/homework-10/ubercorpus.cased.tokenized.word2vec.300d.bz2'
!bzip2 -d '/content/drive/My Drive/Colab/homework-10/ubercorpus.lowercased.tokenized.word2vec.300d.bz2'
!bzip2 -d '/content/drive/My Drive/Colab/homework-10/ubercorpus.cased.lemmatized.word2vec.300d.bz2'
!bzip2 -d '/content/drive/My Drive/Colab/homework-10/ubercorpus.lowercased.lemmatized.word2vec.300d.bz2'


--2020-05-15 08:57:33--  https://lang.org.ua/static/downloads/models/ubercorpus.cased.tokenized.word2vec.300d.bz2
Resolving lang.org.ua (lang.org.ua)... 95.216.74.77
Connecting to lang.org.ua (lang.org.ua)|95.216.74.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 555203899 (529M) [application/octet-stream]
Saving to: ‘/content/drive/My Drive/Colab/homework-10/ubercorpus.cased.tokenized.word2vec.300d.bz2’


2020-05-15 08:58:30 (9.69 MB/s) - ‘/content/drive/My Drive/Colab/homework-10/ubercorpus.cased.tokenized.word2vec.300d.bz2’ saved [555203899/555203899]

--2020-05-15 08:58:31--  https://lang.org.ua/static/downloads/models/ubercorpus.lowercased.tokenized.word2vec.300d.bz2
Resolving lang.org.ua (lang.org.ua)... 95.216.74.77
Connecting to lang.org.ua (lang.org.ua)|95.216.74.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 502679218 (479M) [application/octet-stream]
Saving to: ‘/content/drive/My Drive/Colab/homework-10/ubercorpus.lo

## Initializing models on GDrive (once)

In [5]:
!rm -r '/content/drive/My Drive/Colab/homework-10/vec_models'
!mkdir '/content/drive/My Drive/Colab/homework-10/vec_models'

# news.cased.tokenized.word2vec.300d
!python -m spacy init-model uk '/content/drive/My Drive/Colab/homework-10/vec_models/news.cased.tokenized.word2vec.300d' --vectors-loc '/content/drive/My Drive/Colab/homework-10/news.cased.tokenized.word2vec.300d'
# news.lowercased.tokenized.word2vec.300d
!python -m spacy init-model uk '/content/drive/My Drive/Colab/homework-10/vec_models/news.lowercased.tokenized.word2vec.300d' --vectors-loc '/content/drive/My Drive/Colab/homework-10/news.lowercased.tokenized.word2vec.300d'
# news.cased.lemmatized.word2vec.300d
!python -m spacy init-model uk '/content/drive/My Drive/Colab/homework-10/vec_models/news.cased.lemmatized.word2vec.300d' --vectors-loc '/content/drive/My Drive/Colab/homework-10/news.cased.lemmatized.word2vec.300d'
# news.lowercased.lemmatized.word2vec.300d
!python -m spacy init-model uk '/content/drive/My Drive/Colab/homework-10/vec_models/news.lowercased.lemmatized.word2vec.300d' --vectors-loc '/content/drive/My Drive/Colab/homework-10/news.lowercased.lemmatized.word2vec.300d'

# ubercorpus.cased.tokenized.word2vec.300d
!python -m spacy init-model uk '/content/drive/My Drive/Colab/homework-10/vec_models/ubercorpus.cased.tokenized.word2vec.300d' --vectors-loc '/content/drive/My Drive/Colab/homework-10/ubercorpus.cased.tokenized.word2vec.300d'
# ubercorpus.lowercased.tokenized.word2vec.300d
!python -m spacy init-model uk '/content/drive/My Drive/Colab/homework-10/vec_models/ubercorpus.lowercased.tokenized.word2vec.300d' --vectors-loc '/content/drive/My Drive/Colab/homework-10/ubercorpus.lowercased.tokenized.word2vec.300d'
# ubercorpus.cased.lemmatized.word2vec.300d
!python -m spacy init-model uk '/content/drive/My Drive/Colab/homework-10/vec_models/ubercorpus.cased.lemmatized.word2vec.300d' --vectors-loc '/content/drive/My Drive/Colab/homework-10/ubercorpus.cased.lemmatized.word2vec.300d'
# ubercorpus.lowercased.lemmatized.word2vec.300d
!python -m spacy init-model uk '/content/drive/My Drive/Colab/homework-10/vec_models/ubercorpus.lowercased.lemmatized.word2vec.300d' --vectors-loc '/content/drive/My Drive/Colab/homework-10/ubercorpus.lowercased.lemmatized.word2vec.300d'


[2K[38;5;2m✔ Successfully created model[0m
365319it [00:38, 9528.11it/s] 
[2K[38;5;2m✔ Loaded vectors from /content/drive/My
Drive/Colab/homework-10/news.cased.tokenized.word2vec.300d[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
365495 entries, 365319 vectors
[2K[38;5;2m✔ Successfully created model[0m
328958it [00:35, 9198.26it/s] 
[2K[38;5;2m✔ Loaded vectors from /content/drive/My
Drive/Colab/homework-10/news.lowercased.tokenized.word2vec.300d[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
329134 entries, 328958 vectors
[2K[38;5;2m✔ Successfully created model[0m
178460it [00:19, 9054.59it/s] 
[2K[38;5;2m✔ Loaded vectors from /content/drive/My
Drive/Colab/homework-10/news.cased.lemmatized.word2vec.300d[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
178636 entries, 178460 vectors
[2K[38;5;2m✔ Successfully created model[0m
174311it [00:19, 9101.24it/s] 
[2K[38;5;2m✔ Loaded vectors from /content/drive/My
Drive/Colab/homework-10/news.lowercased.lemmatized.word2vec.300d[

## Installing pymorphy (runtime)

In [1]:
!pip uninstall pymorphy2-dicts-uk -y
!pip uninstall pymorphy2 -y
!pip install git+https://github.com/kmike/pymorphy2.git 
!pip install pymorphy2-dicts-uk

Uninstalling pymorphy2-dicts-uk-2.4.1.1.1460299261:
  Successfully uninstalled pymorphy2-dicts-uk-2.4.1.1.1460299261
Uninstalling pymorphy2-0.8:
  Successfully uninstalled pymorphy2-0.8
Collecting git+https://github.com/kmike/pymorphy2.git
  Cloning https://github.com/kmike/pymorphy2.git to /tmp/pip-req-build-pp19282g
  Running command git clone -q https://github.com/kmike/pymorphy2.git /tmp/pip-req-build-pp19282g
Building wheels for collected packages: pymorphy2
  Building wheel for pymorphy2 (setup.py) ... [?25l[?25hdone
  Created wheel for pymorphy2: filename=pymorphy2-0.8-cp36-none-any.whl size=54982 sha256=984f0976610fb1facf22b82c6d5a66114e20814d0dd7be70409ff7a4adc0c251
  Stored in directory: /tmp/pip-ephem-wheel-cache-ma2oixg7/wheels/0f/c8/2e/9d912793948be59c5fdd670697fe29a3bd1882eaa268eba90d
Successfully built pymorphy2
Installing collected packages: pymorphy2
Successfully installed pymorphy2-0.8
Collecting pymorphy2-dicts-uk
  Using cached https://files.pythonhosted.org/packa

### Testing if pymorphy models work

In [50]:
import spacy
news_cased_tokenized_model = spacy.load(news_cased_tokenized_word2vec_300d_path)

  "__main__", mod_spec)


In [51]:
w1 = news_cased_tokenized_model('кіт')
w2 = news_cased_tokenized_model('собака')
print(w1.similarity(w2))

0.7224378959711921


# Data

## Doc2Vec (runtime)

In [0]:
def vectorize_without_stopwords(text, model):
    words = model(text)
    filtered_words = []

    for w in words:
      if w not in spacy.lang.uk.stop_words.STOP_WORDS:
        filtered_words.append(w.text)

    words = model(' '.join(filtered_words))
    return words.vector

def vectorize_with_stopwords(text, model):
    words = model(text)
    return words.vector

## Loading the data in memory (runtime)

In [8]:
import gzip
import json
import glob

import time

def read_file(f):
    with gzip.open(f, 'rt', encoding='utf-8') as inf:
        j = json.load(inf)
        return j[0]

start = time.time()

data = []
files = []

for f in glob.glob(DATA_1551_PATH+'*/*'):
  files.append(f)

print('Files count', len(files))
counter = 0
for file in files:
  data.append(read_file(file))
  counter+=1
  if counter%100 ==0:
    #print (len(files) - counter,'/',len(files))
    printProgressBar(counter, len(files), prefix = 'Progress:', suffix = 'Complete', length = 50)

printProgressBar(counter, len(files), prefix = 'Progress:', suffix = 'Complete', length = 50)
print()
print ('Data loaded:',len(data))

end = time.time()
print('Time elapsed:', end - start, 'seconds')


Files count 127329
Progress: |██████████████████████████████████████████████████| 100.0% Complete

Data loaded: 127329
Time elapsed: 28.41733145713806 seconds


## Selecting a subset of data for fest processing

In [0]:
category_id = {}

def get_top_categories (n_categories):
  categories_count = {}
  counter = 0

  for data_item in data:
    if data_item['CallZType'] not in categories_count:
      categories_count[data_item['CallZType']] = 0
      category_id[data_item['CallZType']] = counter
      counter+=1
    categories_count[data_item['CallZType']]+=1


  def takeSecond(elem):
      return elem[1]


  category_statistics = []

  for category in categories_count:
    category_statistics.append((category, categories_count[category]))

  category_statistics.sort(key=takeSecond, reverse=True)

  selected_categories = {}

  for category in category_statistics[:n_categories]:
      selected_categories[category[0]] = []

  for data_item in data:
      if data_item['CallZType'] in selected_categories:
          selected_categories[data_item['CallZType']].append(data_item['CallZText'])
  
  return selected_categories

filtered_data = get_top_categories(10)


## Preparing the small corpus (runtime)

In [54]:
def get_vectorized_data(source_data, vectorizer, model):

  features = []
  labels = []

  for category in source_data:
    for data_item in source_data[category]:
      vectorized_data = vectorizer(data_item, model)
      if vectorized_data is not None:
        features.append(vectorized_data)
        labels.append(category_id[category])

  return features, labels

44895 44895


# Baseline

This is a prettified version of our classroom baseline KNeighborsClassifier is rather slow and it does not provide any verbose output. It is not quite fun to play with.

In [61]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report

filtered_data = get_top_categories(10)

X, y = get_vectorized_data(filtered_data, vectorizer = vectorize_with_stopwords, model = news_cased_tokenized_model)

print (len(X), len(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

neighbors = KNeighborsClassifier(n_neighbors=3, n_jobs = -1)
neighbors.fit(np.array(X_train), np.array(y_train))

y_pred = neighbors.predict(X_test)


print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print()

print(classification_report(y_test, y_pred))


44895 44895
Accuracy: 0.562702483801296

              precision    recall  f1-score   support

           1       0.58      0.82      0.68      4450
           3       0.48      0.49      0.49      1071
           4       0.43      0.43      0.43       761
           7       0.57      0.47      0.52      1946
          15       0.37      0.32      0.34       714
          16       0.58      0.48      0.53      2155
          18       0.63      0.58      0.60      1190
          23       0.43      0.26      0.33       820
          46       0.71      0.42      0.53       857
          57       0.69      0.43      0.53       852

    accuracy                           0.56     14816
   macro avg       0.55      0.47      0.50     14816
weighted avg       0.56      0.56      0.55     14816



# Iteration 1 (MLPClassifier + GridSearchCV)

In [0]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report

filtered_data = get_top_categories(10)

X, y = get_vectorized_data(filtered_data, vectorizer = vectorize_with_stopwords, model = news_cased_tokenized_model)

print (len(X), len(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

parameters = {'solver': ['lbfgs'], 'max_iter': [300,400,500], 'alpha': 10.0 ** -np.arange(1, 5), 'hidden_layer_sizes':np.arange(10, 15)}
clf = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1, verbose = 100)
clf.fit(np.array(X_train), np.array(y_train))

print(clf.score(X_train, y_train))
print(clf.best_params_)

y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print()

print(classification_report(y_test, y_pred))


44895 44895
Fitting 5 folds for each of 60 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
Memmapping (shape=(30079, 300), dtype=float32) to new file /dev/shm/joblib_memmapping_folder_390_1568765816/390-139852888724144-a0cdaac5b8e84678b01201776a4044f5.pkl
Pickling array (shape=(30079,), dtype=int64).
Pickling array (shape=(24063,), dtype=int64).
Pickling array (shape=(6016,), dtype=int64).
Memmapping (shape=(30079, 300), dtype=float32) to old file /dev/shm/joblib_memmapping_folder_390_1568765816/390-139852888724144-a0cdaac5b8e84678b01201776a4044f5.pkl
Pickling array (shape=(30079,), dtype=int64).
Pickling array (shape=(24063,), dtype=int64).
Pickling array (shape=(6016,), dtype=int64).
Memmapping (shape=(30079, 300), dtype=float32) to old file /dev/shm/joblib_memmapping_folder_390_1568765816/390-139852888724144-a0cdaac5b8e84678b01201776a4044f5.pkl
Pickling array (shape=(30079,), dtype=int64).
Pickling array (shape=(24063,), dty

# Iteration 2 (no stop words)

# Visualization

# Summary

1. I've learned how to use MLPClassifier, GridSearchCV, and KNeighborsClassifier
2. I've used various pre-trained Word2Vec models
3. I've learned how to use TSNE for data visualization 