# Assignment 4: Named entity recognition

Построить модель для обнаружения и классификации именованных сущностей (named entities) на базе корпуса CoNLL 2002.  

Используйте в своем решении ансамбли над решающими деревьями: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost).  
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 


Чем больше baseline'ов вы превзойдете, тем выше ваша оценка.  
Метрика качества — f1 (f1_macro): чем выше, тем лучше.
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.8122      word2vec cbow embedding + baseline 2 + svm    

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей   

bonus, think about:  
1. How can you exploit that words belong to some sentence?
2. Why did we select the f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

SEED=1337

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('ner_short.csv', index_col=0)
# Preview the first rows of the table.
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# number of sentences
df.sentence_idx.max()

1500.0

In [4]:
# class distribution
df.tag.value_counts(normalize=True)

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [5]:
# sentence length: number of rows with a non-null tag per sentence_idx
tokens_per_sentence = df.groupby('sentence_idx')['tag'].count()
tdf = df.set_index('sentence_idx')
# the assignment aligns the per-sentence counts on the sentence_idx index,
# so every row of a sentence receives that sentence's count
tdf['length'] = tokens_per_sentence
df = tdf.reset_index()

In [6]:
# encode categorical variables
# The same encoder object is refit on every column in turn, so the integer
# codes of one column are not comparable with those of another, and after
# the loop `le` only remembers the mapping of the last column.
le = LabelEncoder()
for pos_column in ('pos', 'next-pos', 'next-next-pos', 'prev-pos', 'prev-prev-pos'):
    df[pos_column] = le.fit_transform(df[pos_column])

In [7]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [8]:
# splitting
# Tags are label-encoded to integers and used both as the target and as the
# stratification key.
# NOTE(review): rows are split individually, so words of one sentence can end
# up in both the train and the test part.
y = LabelEncoder().fit_transform(df.tag)

split_options = dict(stratify=y, test_size=0.25, random_state=SEED, shuffle=True)
df_train, df_test, y_train, y_test = train_test_split(df, y, **split_options)

for part_name, part in (('train', df_train), ('test', df_test)):
    print(part_name, part.shape[0])

train 50155
test 16719


In [9]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    """Transformer-style wrapper around a gensim ``Word2Vec`` model.

    ``fit`` trains the model on whitespace-tokenised strings and ``transform``
    looks up one vector per word, substituting an all-zero vector for every
    word the trained model does not contain.
    """

    def __init__(self, window=5, negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        """Store the hyper-parameters that ``fit`` forwards to ``Word2Vec``.

        window, negative, size, iter: forwarded under the same keyword names.
        is_cbow: ``True`` trains with ``sg=0``, ``False`` with ``sg=1``.
        random_state: forwarded as ``seed``.
        """
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None  # set by fit()
        self.random_state = random_state

    def get_size(self):
        """Return the configured embedding dimensionality."""
        return self.size_

    def fit(self, X, y=None):
        """Train the embedding.

        X: list of strings; each string is split on whitespace into tokens.
        y: ignored, accepted for pipeline compatibility.

        Returns ``self``.
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list,
                            window=self.window_,
                            negative=self.negative_,
                            size=self.size_,
                            iter=self.iter_,
                            sg=int(not self.is_cbow_),
                            seed=self.random_state)

        return self

    def _vectors(self):
        """Return the object that answers ``word in ...`` and ``...[word]``.

        Raises ``Exception`` when the wrapper has not been fitted yet.
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        # Word lookups belong to the model's ``wv`` attribute; doing them on
        # the model object itself is deprecated. Fall back to the model only
        # when that attribute does not exist.
        return getattr(self.w2v, 'wv', self.w2v)

    def has(self, word):
        """Return whether ``word`` is known to the trained model.

        Raises ``Exception`` when the wrapper has not been fitted yet.
        """
        return word in self._vectors()

    def transform(self, X):
        """Map words to embedding vectors.

        X: a list of words.

        Returns an array with one row per word; unknown words map to a zero
        vector of length ``size``. Raises ``Exception`` when not fitted.
        """
        vectors = self._vectors()
        unknown = np.zeros(self.size_)
        return np.array([vectors[w] if w in vectors else unknown for w in X])

In [10]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)
# NOTE(review): sentences are approximated by joining every word with spaces
# and cutting the text at each '.' character; the sentence_idx column is not
# used here - confirm this is intended.

corpus_text = ' '.join(df.word)
sentences_list = [chunk.strip() for chunk in corpus_text.split('.')]

w2v_cbow = Word2VecWrapper(
    window=5,
    negative=5,
    size=300,
    iter=300,
    is_cbow=True,
    random_state=SEED,
)
w2v_cbow.fit(sentences_list)

CPU times: user 50.8 s, sys: 704 ms, total: 51.5 s
Wall time: 21.1 s


In [11]:
%%time
# baseline 1
# random labels

columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

# The strategy is pinned explicitly: the DummyClassifier default differs
# between scikit-learn releases, and only a sampling strategy yields the
# "random labels" baseline described above.
model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', DummyClassifier(strategy='stratified', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

# macro-averaged F1 on both parts of the split
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

In [12]:
%%time
# baseline 2
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

# multinomial logistic regression with built-in cross-validated selection
# of the regularisation strength, scored by macro F1
logreg_cv = LogisticRegressionCV(
    Cs=5,
    cv=5,
    n_jobs=-1,
    scoring='f1_macro',
    penalty='l2',
    solver='newton-cg',
    multi_class='multinomial',
    random_state=SEED,
)
model = Pipeline([('enc', OneHotEncoder()), ('est', logreg_cv)])

model.fit(df_train[columns], y_train)

train_f1 = metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro')
print('train', train_f1)
test_f1 = metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro')
print('test', test_f1)

In [13]:
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()

_WORD_COLUMNS = ['word', 'next-word', 'next-next-word', 'prev-word', 'prev-prev-word']
_POS_COLUMNS = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']


def _stack_features(frame, encode_pos):
    """Horizontally stack word embeddings, one-hot POS codes and sentence columns.

    frame: data frame holding the word, POS, 'length' and 'sentence_idx' columns.
    encode_pos: callable applied to the POS columns (fit_transform for the
        training part, transform for the test part).
    """
    parts = [embeding.transform(frame[name]) for name in _WORD_COLUMNS]
    parts.append(encode_pos(frame[_POS_COLUMNS]))
    parts.append(frame[['length', 'sentence_idx']])
    return sp.hstack(parts)


# the POS encoder is fitted on the training part only and reused for the test part
X_train = _stack_features(df_train, encoder_pos.fit_transform)
X_test = _stack_features(df_test, encoder_pos.transform)

In [20]:
%%time
# baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# GridSearchCV is referenced by its own name: no `model_selection` module
# object is imported in this notebook, so `model_selection.GridSearchCV`
# would raise a NameError.
# The regularisation strength C is searched over 1e-4 ... 1 with 3-fold CV,
# scored by macro F1.
model = GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED),
                     {'C': np.logspace(-4, 0, 5)},
                     cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

Подберем значение параметра ```boosting_type```:

In [88]:
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']


# boosting algorithms to compare
params = dict(boosting_type=['gbdt', 'dart', 'goss'])

gbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=17,
    verbose=2,
    save_binary=True,
    seed=SEED,
)

# POS codes plus the sentence length are the only features used in the search
search_features = columns + ['length']
grid = GridSearchCV(gbm, params, scoring='f1_macro', verbose=2, n_jobs=-1)
grid.fit(df_train[search_features], y_train)
grid.best_params_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:   51.6s remaining:   14.7s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  1.0min finished


{'boosting_type': 'dart'}

In [89]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_boosting_type,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,8.614552,3.465454,0.270666,0.359975,gbdt,{'boosting_type': 'gbdt'},3,0.325514,0.429477,0.183395,0.238551,0.303083,0.411895,0.035523,0.343368,0.062385,0.086159
1,23.707252,2.365002,0.529477,0.735086,dart,{'boosting_type': 'dart'},1,0.550501,0.718046,0.52087,0.775735,0.517053,0.711478,0.143868,0.078751,0.01495,0.028868
2,11.640343,3.404577,0.271942,0.370197,goss,{'boosting_type': 'goss'},2,0.398389,0.555278,0.300093,0.432263,0.117285,0.123048,0.346563,0.558897,0.116474,0.181833


Подберём значения параметров ```max_depth```, ```num_leaves``` и ```min_data_in_leaf``` (подбираем параметры отдельно от ```boosting_type```, поскольку перебор по сетке требует много ресурсов):

In [90]:
# tree-shape hyper-parameters searched jointly (3 x 3 x 3 candidates)
params = dict(
    max_depth=[-1, 10, 100],
    num_leaves=[31, 70, 150],
    min_data_in_leaf=[100, 500, 1000],
)

gbm = lgb.LGBMClassifier(
    boosting_type='dart',
    objective='multiclass',
    num_class=17,
    verbose=2,
    save_binary=True,
    seed=SEED,
)

search_features = columns + ['length']
grid = GridSearchCV(gbm, params, scoring='f1_macro', verbose=2, n_jobs=-1)
grid.fit(df_train[search_features], y_train)
grid.best_params_

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed: 17.9min finished


{'max_depth': -1, 'min_data_in_leaf': 100, 'num_leaves': 150}

In [91]:
grid.best_score_

0.39299675138382106

Обучим модель с полученными параметрами, увеличив значение ```num_boost_round```:

In [15]:
def get_pred(pred):
    """Return, for every row of ``pred``, the index of its largest value.

    pred: iterable of score rows (one row per sample).

    Returns a list with one ``np.argmax`` result per row.
    """
    return [np.argmax(row) for row in pred]

In [95]:
# LightGBM settings: dart boosting with the tree-shape values chosen above
params = dict(
    boosting_type='dart',
    objective='multiclass',
    num_class=17,
    num_leaves=150,
    min_data_in_leaf=100,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    metric='multi_logloss',
    seed=SEED,
    verbose=0,
)

print('Starting training...')

feature_names = columns + ['length']
train_set = lgb.Dataset(df_train[feature_names], y_train)
gbm = lgb.train(params, train_set, num_boost_round=1000)

# gbm.predict returns one score row per sample; get_pred takes the arg-max of each row
train_p = get_pred(gbm.predict(df_train[feature_names]))
test_p = get_pred(gbm.predict(df_test[feature_names]))

train_f1 = metrics.f1_score(y_train, train_p, average='macro')
test_f1 = metrics.f1_score(y_test, test_p, average='macro')
print('train', train_f1)
print('test', test_f1)

Starting training...
train 0.954533312953
test 0.769650105597


Baseline 2 побит!

Добавим в датасет ещё один признак -- номер предложения ```sentence_idx```:

In [21]:
%%time

# same LightGBM settings as before; only the feature set changes below
params1 = dict(
    boosting_type='dart',
    objective='multiclass',
    num_class=17,
    num_leaves=150,
    min_data_in_leaf=100,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    metric='multi_logloss',
    seed=SEED,
    verbose=0,
)

# POS codes + sentence length + sentence number
train_set1 = lgb.Dataset(df_train[columns + ['length', 'sentence_idx']], y_train)
gbm1 = lgb.train(params1, train_set1, num_boost_round=1000)

CPU times: user 1h 46min 40s, sys: 9.08 s, total: 1h 46min 49s
Wall time: 17min 50s


In [22]:
# evaluate gbm1 on the same feature columns it was trained on
features1 = columns + ['length', 'sentence_idx']
train_p1 = get_pred(gbm1.predict(df_train[features1]))
test_p1 = get_pred(gbm1.predict(df_test[features1]))

train_f1_1 = metrics.f1_score(y_train, train_p1, average='macro')
test_f1_1 = metrics.f1_score(y_test, test_p1, average='macro')
print('train', train_f1_1)
print('test', test_f1_1)

train 0.997000963031
test 0.847599357398


Baseline 3 побит!

Добавим ```word2vec cbow embedding```:

In [None]:
%%time

# same LightGBM settings again, with verbose output switched on
params2 = dict(
    boosting_type='dart',
    objective='multiclass',
    num_class=17,
    num_leaves=150,
    min_data_in_leaf=100,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    metric='multi_logloss',
    seed=SEED,
    verbose=1,
)

# X_train is the stacked matrix of word embeddings, one-hot POS codes and sentence columns
train_set2 = lgb.Dataset(X_train, y_train)
gbm2 = lgb.train(params2, train_set2, num_boost_round=1000)

In [16]:
# evaluate gbm2 on the stacked feature matrices
train_p2 = get_pred(gbm2.predict(X_train))
test_p2 = get_pred(gbm2.predict(X_test))

train_f1_2 = metrics.f1_score(y_train, train_p2, average='macro')
test_f1_2 = metrics.f1_score(y_test, test_p2, average='macro')
print('train', train_f1_2)
print('test', test_f1_2)

train 0.999591373653
test 0.886729641668


#### How can you exploit that words belong to some sentence?
Этот признак может помочь при определении именованных сущностей, состоящих из нескольких слов и, соответственно, находящихся рядом в одном предложении.

#### Why did we select the f1 score with macro averaging as our classification quality measure? What other metrics are suitable?
Используется метрика ```f1 score``` с макро-усреднением, поскольку классы сильно не сбалансированы (класс ```'O'``` составляет около 85% всех токенов, а класс ```I-nat```, напротив, всего лишь около 0.02%). Если будет использоваться метрика ```accuracy```, то значение метрики будет высоким, даже если всё будет предсказываться как ```'O'```. Мы могли бы также смотреть на ```precision``` и ```recall``` для каждого класса или использовать макро-усреднённые ```precision``` и ```recall```.