In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sk_stop_words


from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import classification_report as cls_report

from sklearn.base import BaseEstimator, TransformerMixin

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler as ROS

import re

import string
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords as sw
from nltk import WordNetLemmatizer
from nltk import pos_tag

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from sklearn.externals import joblib

np.random.seed(42)

[nltk_data] Downloading package punkt to /home/tsungmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tsungmin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tsungmin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tsungmin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Load dataset

In [2]:
def read_medical_data(fn, is_train=True):
    """Load a medical-text data file into a DataFrame.

    fn: path (or buffer) handed straight to ``pd.read_table``.
    is_train: when truthy, the file is parsed as tab-separated rows and
        the columns are named [label, doc]; otherwise the file is read
        with ``read_table``'s default separator into one column, [doc].

    Returns the resulting DataFrame.
    """
    if not is_train:
        return pd.read_table(fn, names=['doc'])

    return pd.read_table(fn, sep='\t', names=['label', 'doc'])


def save_predict_result(fn, pred):
    """Write ``pred`` to ``fn`` as CSV, one row per entry, without header or index."""
    pd.DataFrame(pred).to_csv(fn, index=None, header=None)
    
def subsample_idx(n_samples, labels):
    """Return positions of roughly ``n_samples`` documents per class.

    Each of the five classes gets a budget of ``n_samples`` plus a small
    random surplus (0-4 for non-negative ``n_samples``, after integer
    truncation). ``labels`` is then scanned in order and a position is
    kept while its class still has budget left.

    n_samples: base number of documents to keep per class.
    labels: array-like of integer class labels; the budget array has five
        slots indexed by ``label - 1``, so labels are expected to be 1..5.

    Returns an integer ndarray of the selected positions, in ascending order.
    """
    # Use the argument that was passed in; the previous version read an
    # undefined module-level `label_train` and ignored `labels`.
    labels = np.asarray(labels)

    count = (5 * np.random.rand(5) + n_samples).astype(int)

    sub_idx = []
    for idx, label in enumerate(labels):
        slot = label - 1  # labels are 1-based, the budget array is 0-based
        if count[slot] > 0:
            sub_idx.append(idx)
            count[slot] -= 1

    # dtype=int keeps the result usable as an index even when nothing is selected
    return np.array(sub_idx, dtype=int)

### Feature extraction

In [3]:
# Using NLTK to do lemmatization
# https://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html

def lemma_stem(lemmatizer, doc, stopwords):
    """Yield the lemma of every retained token in ``doc``.

    Digit runs are stripped from the text, which is then split into
    sentences, tokenized and POS-tagged. Each token is lowercased and
    trimmed; stopwords, tokens of two characters or fewer and
    punctuation-only tokens are skipped.

    lemmatizer: object passed through to ``lemmatize``.
    doc: raw document text.
    stopwords: container of tokens to drop (membership-tested).
    """
    digit_free = re.sub(r'\d+', '', doc)

    for sentence in sent_tokenize(digit_free):
        tagged_tokens = pos_tag(wordpunct_tokenize(sentence))

        for raw_token, pos in tagged_tokens:
            # normalise: lowercase, then trim whitespace, '_' and '*' in that order
            word = raw_token.lower().strip().strip('_').strip('*')

            # skip very short tokens and stopwords
            if len(word) <= 2 or word in stopwords:
                continue

            # skip tokens that contain nothing but punctuation
            if not any(ch not in string.punctuation for ch in word):
                continue

            yield lemmatize(lemmatizer, word, pos)
            
def lemmatize(lemmatizer, token, tag):
    """Lemmatize ``token`` using the WordNet POS implied by ``tag``.

    The first character of ``tag`` selects noun / verb / adverb /
    adjective; any other leading character falls back to noun.
    """
    pos_by_prefix = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}
    wordnet_pos = pos_by_prefix.get(tag[0], wn.NOUN)

    return lemmatizer.lemmatize(token, wordnet_pos)

In [4]:
# Load the labelled training file.
data = read_medical_data('train.dat', True)

# cls = [1, 2, 3, 4, 5]
# for c in cls:
#     data['is_cls{}'.format(c)] = \
#         np.array(data.label.values == c, dtype=np.int)
#data['doc_sent'] = data['doc'].apply(lambda doc : doc.count('.'))

# 'balanced' class weights, keyed by the actual label values.
# Keyword arguments are used because newer scikit-learn releases no longer
# accept these parameters positionally; zipping with the classes avoids
# assuming the labels are exactly 1..n.
label_values = data.label.values
label_classes = np.unique(label_values)
cls_weight = compute_class_weight(class_weight='balanced',
                                  classes=label_classes,
                                  y=label_values)
cls_weight = dict(zip(label_classes.tolist(), cls_weight))

In [5]:
# NLTK English stopwords and a WordNet lemmatizer, both consumed by lemma_stem.
stop_words = set(sw.words('english'))
lemmatizer = WordNetLemmatizer()

# Store every training document as a space-joined string of its lemmas.
data['lemma'] = data['doc'].map(
    lambda raw_doc: ' '.join(lemma_stem(lemmatizer, raw_doc, stop_words)))

# data['lemma_count'] = \
#     data['lemma'].apply(lambda text: len(text.split()))

In [6]:
# Persist the training frame (including the new 'lemma' column) as CSV.
data.to_csv(path_or_buf='tran_stemm.csv', index=False)

In [7]:
# Load the unlabelled test file (single 'doc' column).
test = read_medical_data('test.dat', is_train=False)
#test['doc_sent'] = test['doc'].apply(lambda doc : doc.count('.'))

In [8]:
# Same preprocessing as for the training frame, applied to the test documents.
stop_words = set(sw.words('english'))
lemmatizer = WordNetLemmatizer()

test['lemma'] = test['doc'].map(
    lambda raw_doc: ' '.join(lemma_stem(lemmatizer, raw_doc, stop_words)))

In [9]:
# Hold out 20% of the labelled data, stratified by label, with a fixed seed.
data_train, data_valid = train_test_split(
    data,
    random_state=42,
    stratify=data.label,
    test_size=0.2)

In [10]:
# Baseline: TF-IDF over 1- to 4-grams feeding an averaged elastic-net SGD classifier.
sgd_model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 4), stop_words=stop_words)),
    ('clf', SGDClassifier(penalty='elasticnet', l1_ratio=1e-4,
                          alpha=1e-4,
                          loss='modified_huber',
                          average=True,
                          tol=1e-4, max_iter=5000,
                          #random_state=42,
                          #class_weight=cls_weight #{1: 1, 2: 1, 3: 1, 4:1, 5:2}
                          )),
])

# Fit on the training split only.
sgd_model = sgd_model.fit(X=data_train.lemma, y=data_train.label)

In [11]:
# Predict both splits, then score them with macro-averaged F1.
sgd_pred_train = sgd_model.predict(data_train.lemma)
sgd_pred_valid = sgd_model.predict(data_valid.lemma)
sgd_score_train = f1_score(y_true=data_train.label, y_pred=sgd_pred_train,
                           average='macro')
sgd_score_valid = f1_score(y_true=data_valid.label, y_pred=sgd_pred_valid,
                           average='macro')

# Training-split report
print("=" * 80)
print('sgd macro f1 (training):', sgd_score_train)
print(cls_report(data_train.label, sgd_pred_train),
      confusion_matrix(data_train.label, sgd_pred_train),
      sep='\n')

# Validation-split report
print("=" * 80)

print('sgd macro f1:', sgd_score_valid)
print(cls_report(data_valid.label, sgd_pred_valid),
      confusion_matrix(data_valid.label, sgd_pred_valid),
      sep='\n')

sgd macro f1 (training): 0.8129281000366078
             precision    recall  f1-score   support

          1       0.83      0.92      0.87      2530
          2       0.79      0.74      0.76      1195
          3       0.83      0.76      0.80      1540
          4       0.82      0.89      0.85      2441
          5       0.81      0.75      0.78      3844

avg / total       0.82      0.82      0.82     11550

[[2325   35   36   27  107]
 [  92  882   16   22  183]
 [  75   16 1171   75  203]
 [  31    8   37 2172  193]
 [ 273  178  144  352 2897]]
sgd macro f1: 0.5014083852826295
             precision    recall  f1-score   support

          1       0.65      0.67      0.66       633
          2       0.43      0.46      0.44       299
          3       0.43      0.36      0.39       385
          4       0.59      0.62      0.61       610
          5       0.41      0.40      0.41       961

avg / total       0.50      0.51      0.51      2888

[[426  37  29  18 123]
 [ 31 137  

In [12]:
# Same pipeline as the baseline, refitted on the whole labelled set.
sgd_full_model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 4), stop_words=stop_words)),
    ('clf', SGDClassifier(penalty='elasticnet', l1_ratio=1e-4,
                          alpha=1e-4,
                          loss='modified_huber',
                          average=True,
                          tol=1e-4, max_iter=5000,
                          #random_state=42,
                          #class_weight=cls_weight #{1: 1, 2: 1, 3: 1, 4:1, 5:2}
                          )),
])

sgd_full_model = sgd_full_model.fit(X=data.lemma, y=data.label)

In [13]:
# NOTE(review): sgd_full_model was fitted on all of `data`, and data_valid was
# split out of `data`, so the "validation" figures printed here are in-sample
# and are not a held-out estimate.
sgd_pred_train = sgd_full_model.predict(data.lemma)
sgd_pred_valid = sgd_full_model.predict(data_valid.lemma)
sgd_score_train = f1_score(y_true=data.label, y_pred=sgd_pred_train,
                           average='macro')
sgd_score_valid = f1_score(y_true=data_valid.label, y_pred=sgd_pred_valid,
                           average='macro')

# Full-data report
print("=" * 80)
print('sgd full macro f1 (training):', sgd_score_train)
print(cls_report(data.label, sgd_pred_train),
      confusion_matrix(data.label, sgd_pred_train),
      sep='\n')

# data_valid report
print("=" * 80)

print('sgd full macro f1:', sgd_score_valid)
print(cls_report(data_valid.label, sgd_pred_valid),
      confusion_matrix(data_valid.label, sgd_pred_valid),
      sep='\n')

sgd full macro f1 (training): 0.7709761277722308
             precision    recall  f1-score   support

          1       0.79      0.91      0.85      3163
          2       0.75      0.67      0.70      1494
          3       0.78      0.73      0.75      1925
          4       0.79      0.86      0.82      3051
          5       0.76      0.69      0.73      4805

avg / total       0.78      0.78      0.77     14438

[[2868   47   56   35  157]
 [ 137  995   25   23  314]
 [ 116   20 1397  104  288]
 [  57   26   61 2633  274]
 [ 430  243  244  554 3334]]
sgd full macro f1: 0.7707513199518771
             precision    recall  f1-score   support

          1       0.80      0.91      0.85       633
          2       0.73      0.71      0.72       299
          3       0.77      0.72      0.75       385
          4       0.78      0.85      0.81       610
          5       0.77      0.68      0.72       961

avg / total       0.77      0.78      0.77      2888

[[578  10  14  10  21]
 

In [14]:
# Predict the test set, pickle the fitted pipeline, and write the predictions.
y_test_pred = sgd_full_model.predict(test.lemma)
joblib.dump(value=sgd_full_model, filename='sgd_full_model_v2.pkl', compress=1)
save_predict_result(fn='sgd_full_model_v2.dat', pred=y_test_pred)

### Oversample method

In [15]:
# Count 1- to 4-grams that occur in at least 5 training documents.
imb_vec = CountVectorizer(ngram_range=(1, 4), min_df=5, stop_words=stop_words)
X_vec = imb_vec.fit_transform(data_train.lemma)

# Oversample the minority classes in count-feature space.
smote = SMOTE()
X_vec_imb, y_imb = smote.fit_sample(X_vec, data_train.label)

# Rewrite the vocabulary so each n-gram becomes one '_'-joined token.
vocab = dict(('_'.join(term.split()), col)
             for term, col in imb_vec.vocabulary_.items())
imb_vec.vocabulary_ = vocab

# Turn every (resampled) feature row back into a space-joined token string.
X_vec_imb_inv = list(map(' '.join, imb_vec.inverse_transform(X_vec_imb)))

# TF-IDF over the rebuilt text.
imb_tfidf = TfidfVectorizer(stop_words=stop_words, min_df=5)
X_tfidf_imb = imb_tfidf.fit_transform(X_vec_imb_inv, y_imb)

In [16]:
# Same SGD configuration as the baseline, trained on the SMOTE-balanced TF-IDF matrix.
imb_model = SGDClassifier(penalty='elasticnet', l1_ratio=1e-4,
                          alpha=1e-4,
                          loss='modified_huber',
                          average=True,
                          tol=1e-4, max_iter=5000,
                          #random_state=42,
                          #class_weight=cls_weight #{1: 1, 2: 1, 3: 1, 4:1, 5:2}
                          )

imb_model = imb_model.fit(X=X_tfidf_imb, y=y_imb)

In [17]:
# NOTE(review): imb_tfidf was fitted on text rebuilt from the count matrix
# (n-grams joined with '_'), while data_valid.lemma is the raw lemma text, so
# the '_'-joined n-gram features presumably never fire on the validation
# documents - confirm this mismatch is intended.
imb_pred_train = imb_model.predict(X_tfidf_imb)
imb_pred_valid = imb_model.predict(imb_tfidf.transform(data_valid.lemma))
imb_score_train = f1_score(y_true=y_imb, y_pred=imb_pred_train,
                           average='macro')
imb_score_valid = f1_score(y_true=data_valid.label, y_pred=imb_pred_valid,
                           average='macro')

# Resampled training-set report
print("=" * 80)
print('imb macro f1 (training):', imb_score_train)
print(cls_report(y_imb, imb_pred_train),
      confusion_matrix(y_imb, imb_pred_train),
      sep='\n')

# Validation-split report
print("=" * 80)

print('imb macro f1:', imb_score_valid)
print(cls_report(data_valid.label, imb_pred_valid),
      confusion_matrix(data_valid.label, imb_pred_valid),
      sep='\n')

imb macro f1 (training): 0.8832197294190374
             precision    recall  f1-score   support

          1       0.89      0.93      0.91      3844
          2       0.89      0.97      0.93      3844
          3       0.89      0.95      0.92      3844
          4       0.88      0.93      0.91      3844
          5       0.90      0.65      0.75      3844

avg / total       0.89      0.89      0.88     19220

[[3573  100   85   26   60]
 [  43 3738   15    6   42]
 [  42   21 3670   46   65]
 [  33   30   74 3593  114]
 [ 321  331  285  417 2490]]
imb macro f1: 0.5912234448187933
             precision    recall  f1-score   support

          1       0.71      0.79      0.75       633
          2       0.49      0.57      0.53       299
          3       0.50      0.52      0.51       385
          4       0.67      0.72      0.70       610
          5       0.53      0.43      0.47       961

avg / total       0.59      0.60      0.59      2888

[[501  32  24  13  63]
 [ 27 171  

In [18]:
# Same oversampling recipe as for the training split, on the whole labelled set.
imb_vec = CountVectorizer(ngram_range=(1, 4), min_df=5, stop_words=stop_words)
X_vec = imb_vec.fit_transform(data.lemma)

# Oversample the minority classes in count-feature space (seeded).
smote = SMOTE(random_state=42)
X_vec_imb, y_imb = smote.fit_sample(X_vec, data.label)

# Rewrite the vocabulary so each n-gram becomes one '_'-joined token.
vocab = dict(('_'.join(term.split()), col)
             for term, col in imb_vec.vocabulary_.items())
imb_vec.vocabulary_ = vocab

# Turn every (resampled) feature row back into a space-joined token string.
X_vec_imb_inv = list(map(' '.join, imb_vec.inverse_transform(X_vec_imb)))

# TF-IDF over the rebuilt text.
imb_tfidf = TfidfVectorizer(stop_words=stop_words, min_df=5)
X_tfidf_imb = imb_tfidf.fit_transform(X_vec_imb_inv, y_imb)

In [19]:
# SGD classifier for the SMOTE-balanced TF-IDF matrix built from all labelled data.
imb_model_full = SGDClassifier(penalty='elasticnet', l1_ratio=1e-4,
                               alpha=1e-4,
                               loss='modified_huber',
                               average=True,
                               tol=1e-4, max_iter=5000,
                               #random_state=42,
                               #class_weight=cls_weight #{1: 1, 2: 1, 3: 1, 4:1, 5:2}
                               )

imb_model_full = imb_model_full.fit(X=X_tfidf_imb, y=y_imb)

In [20]:
# NOTE(review): the vectorizers and imb_model_full were fitted on all of
# `data`, which contains data_valid, so the "validation" figures printed here
# are in-sample rather than held-out.
imb_pred_train = imb_model_full.predict(X_tfidf_imb)
imb_pred_valid = imb_model_full.predict(imb_tfidf.transform(data_valid.lemma))
imb_score_train = f1_score(y_true=y_imb, y_pred=imb_pred_train,
                           average='macro')
imb_score_valid = f1_score(y_true=data_valid.label, y_pred=imb_pred_valid,
                           average='macro')

# Resampled full-data report
print("=" * 80)
print('imb full macro f1 (training):', imb_score_train)
print(cls_report(y_imb, imb_pred_train),
      confusion_matrix(y_imb, imb_pred_train),
      sep='\n')

# data_valid report
print("=" * 80)

print('imb full macro f1:', imb_score_valid)
print(cls_report(data_valid.label, imb_pred_valid),
      confusion_matrix(data_valid.label, imb_pred_valid),
      sep='\n')

imb full macro f1 (training): 0.8564096031470211
             precision    recall  f1-score   support

          1       0.87      0.92      0.89      4805
          2       0.86      0.96      0.91      4805
          3       0.87      0.94      0.90      4805
          4       0.85      0.92      0.89      4805
          5       0.86      0.58      0.69      4805

avg / total       0.86      0.86      0.86     24025

[[4410  141  118   43   93]
 [  70 4636   26    9   64]
 [  76   28 4501   81  119]
 [  53   52  109 4432  159]
 [ 461  509  430  641 2764]]
imb full macro f1: 0.7104570249444975
             precision    recall  f1-score   support

          1       0.76      0.88      0.82       633
          2       0.61      0.78      0.69       299
          3       0.64      0.70      0.67       385
          4       0.73      0.83      0.78       610
          5       0.74      0.51      0.60       961

avg / total       0.72      0.71      0.70      2888

[[555  20  19   9  30]
 

In [21]:
# Predict the test set with the SMOTE-trained classifier.
y_test_pred = imb_model_full.predict(imb_tfidf.transform(test.lemma))

# The classifier only accepts imb_tfidf features, so the fitted vectorizer is
# persisted next to it; with the classifier pickle alone the model could not
# be applied to new text after a kernel restart.
joblib.dump(imb_model_full, 'imb_model_full_v2.pkl', compress=1)
joblib.dump(imb_tfidf, 'imb_tfidf_v2.pkl', compress=1)
save_predict_result('imb_model_full_v2.dat', y_test_pred)

In [22]:
from scipy.stats import truncnorm
from collections import Counter

def genrate_syn_len(k, true_doc_len):
    """Draw ``k`` synthetic document lengths resembling ``true_doc_len``.

    Lengths are sampled from a normal distribution with the observed mean
    and standard deviation, truncated to the observed [min, max] range, and
    then truncated to integers.

    k: number of lengths to draw.
    true_doc_len: array-like of observed token counts.

    Returns an integer ndarray of length ``k``.
    """
    mu = np.mean(true_doc_len)
    sigma = np.std(true_doc_len)
    lower = np.min(true_doc_len)
    upper = np.max(true_doc_len)

    if sigma == 0:
        # All observed lengths are identical: the standardised truncnorm
        # bounds would be 0/0, so return that single length directly.
        return np.full(k, int(mu), dtype=int)

    X = truncnorm((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma)

    # `np.int` was only an alias of the builtin and no longer exists in
    # current NumPy; the builtin `int` yields the same dtype.
    return X.rvs(k).astype(int)


def generate_text(X, k_lens):
    """Build synthetic documents by sampling words from ``X``.

    X: iterable of whitespace-separated documents that supply the
        vocabulary and its relative word frequencies.
    k_lens: iterable with the number of words for each synthetic document.

    Returns a list with one space-joined string per entry of ``k_lens``.
    """
    # word frequencies over all source documents (first-seen order)
    token_counts = Counter(tok for doc in X for tok in doc.split())
    vocabs = list(token_counts)
    counts = np.array([token_counts[word] for word in vocabs])
    freqs = counts / counts.sum()

    # draw each synthetic document independently from that distribution
    return [' '.join(np.random.choice(vocabs, k_len, p=freqs))
            for k_len in k_lens]


def generate_syn_text(X, y):
    """Generate synthetic documents so every class reaches the majority count.

    For each class smaller than the largest one, synthetic document lengths
    are drawn with ``genrate_syn_len``, sorted, and processed in batches of
    16. For every batch, real documents of that class whose length lies near
    the batch's median length are collected and handed to ``generate_text``
    as the word source.

    X: array-like of documents (whitespace-separated tokens).
    y: array-like of non-negative integer labels, aligned with ``X``.

    Returns ``(syn_texts, syn_labels)`` as two ndarrays of equal length.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # find major class count
    cls_max_count = np.max(np.bincount(y))

    syn_texts = []
    syn_labels = []
    batch_size = 16

    for c in np.unique(y):
        cls_docs = X[y == c]
        n_cls_docs = len(cls_docs)
        if n_cls_docs == cls_max_count:
            continue

        # token count of every real document in this class
        doc_lens = np.array([len(doc.split()) for doc in cls_docs])

        # number of synthetic samples needed, and their (sorted) lengths
        k = cls_max_count - n_cls_docs
        k_lens = np.sort(genrate_syn_len(k, doc_lens))

        # Stepping by batch_size visits ceil(k / batch_size) batches. The
        # previous `k % n_batch` test divided by zero for k < batch_size and
        # could skip the final partial batch, leaving fewer texts than labels.
        for batch_sidx in range(0, k, batch_size):
            k_len_batch = k_lens[batch_sidx:batch_sidx + batch_size]

            k_len_med = np.median(k_len_batch) if len(k_len_batch) >= 3 else np.mean(k_len_batch)
            k_len_med = int(k_len_med)

            # Widen the length window around the median until it holds more
            # than batch_size source documents.
            X_batch = None
            range_relax = 1
            while X_batch is None or len(X_batch) <= batch_size:
                lower = doc_lens >= (k_len_med - range_relax)
                upper = doc_lens <= (k_len_med + range_relax)
                X_batch = cls_docs[lower & upper]
                if len(X_batch) == n_cls_docs:
                    # Window already covers the whole class; widening further
                    # cannot add documents (avoids looping forever when the
                    # class has batch_size or fewer documents).
                    break
                range_relax += 1

            syn_texts.extend(generate_text(X_batch, k_len_batch))

        syn_labels.extend([c] * k)

    return np.array(syn_texts), np.array(syn_labels)

In [23]:
# Count 1- to 4-grams that occur in at least 5 training documents.
syn_vec = CountVectorizer(ngram_range=(1, 4), min_df=5, stop_words=stop_words)
X_vec_syn = syn_vec.fit_transform(data_train.lemma)

# Rewrite the vocabulary so each n-gram becomes one '_'-joined token.
vocab = dict(('_'.join(term.split()), col)
             for term, col in syn_vec.vocabulary_.items())
syn_vec.vocabulary_ = vocab

# Turn every feature row back into a space-joined token string.
X_vec_syn_inv = list(map(' '.join, syn_vec.inverse_transform(X_vec_syn)))

y_train = data_train.label.values
X_train = np.array(X_vec_syn_inv)
#X_train = data_train.lemma.values

# Append synthetic documents so every class matches the majority class size.
syn_texts, syn_labels = generate_syn_text(X_train, y_train)
X_train_syn = np.hstack((X_train, syn_texts))
y_train_syn = np.hstack((y_train, syn_labels))

In [24]:
# TF-IDF + SGD pipeline trained on the real plus synthetic training documents.
syn_model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words, min_df=5)),
    ('clf', SGDClassifier(penalty='elasticnet', l1_ratio=1e-4,
                          alpha=1e-4,
                          loss='modified_huber',
                          average=True,
                          tol=1e-4, max_iter=5000,
                          #random_state=42,
                          #class_weight=cls_weight #{1: 1, 2: 1, 3: 1, 4:1, 5:2}
                          )),
])

syn_model = syn_model.fit(X=X_train_syn, y=y_train_syn)

In [25]:
# Predict the augmented training set and the validation split, then score both.
syn_pred_train = syn_model.predict(X_train_syn)
syn_pred_valid = syn_model.predict(data_valid.lemma)
syn_score_train = f1_score(y_true=y_train_syn, y_pred=syn_pred_train,
                           average='macro')
syn_score_valid = f1_score(y_true=data_valid.label, y_pred=syn_pred_valid,
                           average='macro')

# Augmented training-set report
print("=" * 80)
print('syn macro f1 (training):', syn_score_train)
print(cls_report(y_train_syn, syn_pred_train),
      confusion_matrix(y_train_syn, syn_pred_train),
      sep='\n')

# Validation-split report
print("=" * 80)

print('syn macro f1:', syn_score_valid)
print(cls_report(data_valid.label, syn_pred_valid),
      confusion_matrix(data_valid.label, syn_pred_valid),
      sep='\n')

syn macro f1 (training): 0.8809035365608768
             precision    recall  f1-score   support

          1       0.89      0.93      0.91      3844
          2       0.88      0.98      0.93      3844
          3       0.88      0.96      0.92      3844
          4       0.87      0.94      0.91      3844
          5       0.92      0.62      0.74      3844

avg / total       0.89      0.89      0.88     19220

[[3586   96   76   26   60]
 [  39 3769   14   10   12]
 [  50   23 3675   56   40]
 [  33   28   67 3608  108]
 [ 326  369  321  428 2400]]
syn macro f1: 0.5966456802627192
             precision    recall  f1-score   support

          1       0.71      0.80      0.75       633
          2       0.49      0.64      0.56       299
          3       0.49      0.56      0.52       385
          4       0.67      0.72      0.69       610
          5       0.55      0.40      0.46       961

avg / total       0.60      0.60      0.59      2888

[[504  33  27  11  58]
 [ 27 191  

In [26]:
# Same synthetic-text recipe as for the training split, on the whole labelled set.
syn_vec = CountVectorizer(ngram_range=(1, 4), min_df=5, stop_words=stop_words)
X_vec_syn = syn_vec.fit_transform(data.lemma)

# Rewrite the vocabulary so each n-gram becomes one '_'-joined token.
vocab = dict(('_'.join(term.split()), col)
             for term, col in syn_vec.vocabulary_.items())
syn_vec.vocabulary_ = vocab

# Turn every feature row back into a space-joined token string.
X_vec_syn_inv = list(map(' '.join, syn_vec.inverse_transform(X_vec_syn)))

y_train = data.label.values
X_train = np.array(X_vec_syn_inv)
#X_train = data.lemma.values

# Append synthetic documents so every class matches the majority class size.
syn_texts, syn_labels = generate_syn_text(X_train, y_train)
X_train_syn = np.hstack((X_train, syn_texts))
y_train_syn = np.hstack((y_train, syn_labels))

In [27]:
# TF-IDF + SGD pipeline trained on all real documents plus the synthetic ones.
syn_full_model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words, min_df=5)),
    ('clf', SGDClassifier(penalty='elasticnet', l1_ratio=1e-4,
                          alpha=1e-4,
                          loss='modified_huber',
                          average=True,
                          tol=1e-4, max_iter=5000,
                          #random_state=42,
                          #class_weight=cls_weight #{1: 1, 2: 1, 3: 1, 4:1, 5:2}
                          )),
])

syn_full_model = syn_full_model.fit(X=X_train_syn, y=y_train_syn)

In [28]:
# NOTE(review): syn_full_model was trained on text derived from all of `data`,
# which contains data_valid, so the "validation" figures printed here are
# in-sample rather than held-out.
syn_pred_train = syn_full_model.predict(X_train_syn)
syn_pred_valid = syn_full_model.predict(data_valid.lemma)
syn_score_train = f1_score(y_true=y_train_syn, y_pred=syn_pred_train,
                           average='macro')
syn_score_valid = f1_score(y_true=data_valid.label, y_pred=syn_pred_valid,
                           average='macro')

# Augmented full-data report
print("=" * 80)
print('syn full macro f1 (training):', syn_score_train)
print(cls_report(y_train_syn, syn_pred_train),
      confusion_matrix(y_train_syn, syn_pred_train),
      sep='\n')

# data_valid report
print("=" * 80)

print('syn full macro f1:', syn_score_valid)
print(cls_report(data_valid.label, syn_pred_valid),
      confusion_matrix(data_valid.label, syn_pred_valid),
      sep='\n')

syn full macro f1 (training): 0.854061843992743
             precision    recall  f1-score   support

          1       0.87      0.92      0.90      4805
          2       0.86      0.97      0.91      4805
          3       0.86      0.95      0.90      4805
          4       0.85      0.93      0.89      4805
          5       0.90      0.54      0.68      4805

avg / total       0.87      0.86      0.85     24025

[[4443  130  111   44   77]
 [  74 4673   25   13   20]
 [  78   31 4557   82   57]
 [  50   53  111 4462  129]
 [ 472  559  501  674 2599]]
syn full macro f1: 0.7134337314628192
             precision    recall  f1-score   support

          1       0.77      0.87      0.82       633
          2       0.58      0.82      0.68       299
          3       0.64      0.77      0.70       385
          4       0.73      0.83      0.78       610
          5       0.78      0.48      0.59       961

avg / total       0.73      0.71      0.70      2888

[[553  25  19  10  26]
 [

In [29]:
# Predict the test set, pickle the fitted pipeline, and write the predictions.
y_test_pred = syn_full_model.predict(test.lemma)
joblib.dump(value=syn_full_model, filename='syn_full_model_v2.pkl', compress=1)
save_predict_result(fn='syn_full_model_v2.dat', pred=y_test_pred)