In [0]:
import os
from os.path import exists, join, basename, splitext

# Detect whether the notebook runs on Google Colab and choose where the
# mongolian-bert project checkout is expected to live.
is_on_colab = True
project_path = 'mongolian-bert'
try:
  # Colab exposes its helpers as the `google.colab` package.
  import google.colab
except ImportError:
  try:
    # Fall back to the top-level module name this notebook originally probed for.
    import colab
  except ModuleNotFoundError:
    # Neither module is importable: not on Colab, use the relative path instead.
    is_on_colab = False
    project_path = '../../mongolian-bert'

import sys
# Make the project's modules importable; skip the append when the entry is
# already present so that re-running the cell does not duplicate it.
if project_path not in sys.path:
  sys.path.append(project_path)

In [2]:
if is_on_colab:
  # we are on Colab, clone our project
  if not exists(project_path):
    import getpass
    GITHUB_USERNAME = input("your GITHUB username: ")
    GITHUB_PASSWORD = getpass.getpass('your GITHUB password:')
    !git clone -q --recursive https://$GITHUB_USERNAME:$GITHUB_PASSWORD@github.com/tugstugi/mongolian-bert.git

your GITHUB username: tugstugi
your GITHUB password:··········


In [0]:
# Download and unpack the dataset unless eduge.csv already exists in the
# current working directory, so re-running the cell does not fetch it again.
if not exists("eduge.csv"):
  # Fetch the gzipped CSV quietly (-q) ...
  !wget -q https://github.com/tugstugi/mongolian-nlp/raw/master/datasets/eduge.csv.gz
  # ... and decompress it, which produces eduge.csv.
  !gunzip eduge.csv.gz

In [4]:
!pip install -q sentencepiece

[?25l[K    0% |▎                               | 10kB 16.2MB/s eta 0:00:01[K    1% |▋                               | 20kB 3.4MB/s eta 0:00:01[K    2% |█                               | 30kB 4.9MB/s eta 0:00:01[K    3% |█▎                              | 40kB 3.1MB/s eta 0:00:01[K    4% |█▋                              | 51kB 3.8MB/s eta 0:00:01[K    5% |██                              | 61kB 4.6MB/s eta 0:00:01[K    6% |██▏                             | 71kB 5.2MB/s eta 0:00:01[K    7% |██▌                             | 81kB 5.9MB/s eta 0:00:01[K    8% |██▉                             | 92kB 6.6MB/s eta 0:00:01[K    9% |███▏                            | 102kB 5.1MB/s eta 0:00:01[K    10% |███▌                            | 112kB 5.2MB/s eta 0:00:01[K    11% |███▉                            | 122kB 7.0MB/s eta 0:00:01[K    12% |████                            | 133kB 6.9MB/s eta 0:00:01[K    13% |████▍                           | 143kB 12.7MB/s eta 0:00:01[

In [5]:
import sentencepiece as spm
import pandas as pd
import numpy as np
import time

from sklearn.feature_extraction.text import *
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Create the SentencePiece processor and load the model file
# model-32k/mn_cased.model from the project directory.
sp = spm.SentencePieceProcessor()
sp.Load(join(project_path, 'model-32k/mn_cased.model'))
def sp_tokenize(w):
  """Tokenize the text `w` into pieces with the module-level `sp` processor.

  Returns whatever `sp.EncodeAsPieces(w)` yields (a list of piece strings).
  """
  pieces = sp.EncodeAsPieces(w)
  return pieces

# Quick check of the tokenizer on a sample sentence; as the last expression of
# the cell, the resulting list of pieces is displayed as the cell output.
sp_tokenize('Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй')

['▁Мөнгөө', '▁тушаа', 'чихсан', 'ыхаа', '▁дараа', '▁мэдэгд', 'ээрэй']

In [6]:
# Read the dataset and strip surrounding whitespace from every column name,
# so columns can be addressed by their clean names (e.g. 'label').
df = (
  pd.read_csv("eduge.csv")
    .rename(columns=lambda column_name: column_name.strip())
)
# Show the distinct values of the 'label' column as a plain Python list.
df['label'].unique().tolist()

['урлаг соёл',
 'эдийн засаг',
 'эрүүл мэнд',
 'хууль',
 'улс төр',
 'спорт',
 'технологи',
 'боловсрол',
 'байгал орчин']

In [0]:
# Hold out 10% of the rows as the test set, stratified on the label column.
# The fixed random_state makes this the same train/test split as used for BERT.
train, test = train_test_split(
  df,
  test_size=0.1,
  random_state=999,
  stratify=df['label'],
)

In [8]:
# Baseline classifier: SentencePiece tokens -> token counts -> TF-IDF weights
# -> linear model trained with SGD on the hinge loss.
text_clf = Pipeline([
  ('vect', CountVectorizer(tokenizer=sp_tokenize)),
  ('tfidf', TfidfTransformer()),
  # `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
  # `max_iter=5` together with `tol=None` keeps the original behaviour of
  # running exactly 5 passes over the training data without early stopping.
  ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4,
                        max_iter=5, tol=None, random_state=0)),
])

# Fit on the training split and report the wall-clock time it took.
t = time.time()
text_clf = text_clf.fit(train['news'], train['label'])
t = time.time()-t
print("Training time in seconds: ", t)

# Predict labels for the held-out split and report the wall-clock time.
t = time.time()
predicted = text_clf.predict(test['news'])
t = time.time()-t
print("Prediction time in seconds: ", t)

# Vocabulary size learned by the vectorizer, and the share of test rows whose
# predicted label equals the true label.
print("Feature count:", len(text_clf.named_steps['vect'].vocabulary_))
print("Classifier accuracy: ", np.mean(predicted == test['label']))



Training time in seconds:  130.03448629379272
Prediction time in seconds:  16.292343378067017
Feature count: 25384
Classifier accuracy:  0.91053257565746
