In [0]:
from IPython.display import clear_output

In [0]:
!pip install pycodestyle flake8 pycodestyle_magic
clear_output()

In [0]:
%load_ext pycodestyle_magic

In [0]:
!wget https://raw.githubusercontent.com/thedenaas/hse_seminars/master/2019/seminar_6/data.zip
clear_output()

In [0]:
!unzip data.zip
clear_output()

In [0]:
!rm data.zip

# Assignment 4: Named entity recognition

Create a model for Named Entity Recognition for dataset CoNLL 2002.  
Your quality metric = f1_macro

In your solution you should use: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost)   
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 

More baselines you beat - better your score
 
baseline 1 [3 points]: 0.0604      random labels  
baseline 2 [5 points]: 0.3966      PoS features + logistic regression  
baseline 3 [8 points]: 0.8122      word2vec cbow embedding + baseline 2 + svm    

[1 point] using feature engineering (creating features not presented in the baselines)

! Your results must be reproducible. You should explicitly set all seeds random_states in yout model.  
! Remember to use proper training pipeline.  

bonus, think about:  
1. [1 point] Why did we select f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [0]:
import nltk
import warnings
import pandas as pd
import numpy as np
import scipy.sparse as sp
from collections import defaultdict
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
np.random.seed(42)

In [0]:
warnings.filterwarnings('ignore')

In [0]:
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100,
                 iter=100, is_cbow=False, random_state=42):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state

    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self

    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_)
                         for w in X])

In [11]:
df = pd.read_csv('data/ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [12]:
# number of sentences
df.sentence_idx.max()

1500.0

In [13]:
# class distribution
df.tag.value_counts(normalize=True)

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [0]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

In [0]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [16]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


#Feature engineering
Удаляем ненужные столбцы.

In [0]:
del df['length']
del df['sentence_idx']

In [18]:
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag
0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O
1,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O
2,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O
3,9,through,32,marched,33,18,9,of,demonstrators,have,O
4,16,London,9,through,32,33,18,demonstrators,have,marched,O


Добавим 5 новых признаков – капитализацию слов. Будем учитывать только слова, не являющиеся первым словом в предложении.

In [0]:
prefices = ['prev-prev-', 'prev-', '', 'next-', 'next-next-']

In [0]:
for i in range(len(prefices)):
    df[prefices[i] + 'capitalized'] = df.apply(
        lambda row: 1 if row[prefices[i] + 'word'].istitle() and (
            i == 0 or not row[prefices[i - 1] + 'word'].startswith('__START'))
        else 0, axis=1)

In [21]:
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,prev-prev-capitalized,prev-capitalized,capitalized,next-capitalized,next-next-capitalized
0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,0,0,0,0,0
1,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,0,0,0,0,0
2,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,1,0,0,0,0
3,9,through,32,marched,33,18,9,of,demonstrators,have,O,0,0,0,0,0
4,16,London,9,through,32,33,18,demonstrators,have,marched,O,0,0,0,0,1


Лемматизируем и приведём к нижнему регистру (т. к., информация о регистре теперь содержится в столбцах *-capitalized*) слова в столбцах *-word*.

In [22]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
lemmatizer = WordNetLemmatizer()

In [0]:
def pos_tagger(word):
    tag = nltk.pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('V'):
        return wordnet.VERB
    return wordnet.NOUN

In [0]:
def lemmatize(word):
    return lemmatizer.lemmatize(word.lower(), pos=pos_tagger(word.lower()))

In [0]:
for prefix in prefices:
    df[prefix + 'word'] = df[prefix + 'word'].apply(lemmatize)

In [27]:
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,prev-prev-capitalized,prev-capitalized,capitalized,next-capitalized,next-next-capitalized
0,18,demonstrator,9,of,18,39,40,__start2__,__start1__,thousand,O,0,0,0,0,0
1,33,have,18,demonstrator,9,18,39,__start1__,thousand,of,O,0,0,0,0,0
2,32,march,33,have,18,9,18,thousand,of,demonstrator,O,1,0,0,0,0
3,9,through,32,march,33,18,9,of,demonstrator,have,O,0,0,0,0,0
4,16,london,9,through,32,33,18,demonstrator,have,march,O,0,0,0,0,1


In [0]:
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

embedding = Word2VecWrapper(window=5, negative=5, size=300, iter=300,
                           is_cbow=True, random_state=42)
embedding.fit(sentences_list)
clear_output()

In [29]:
# splitting
y = LabelEncoder().fit_transform(df.tag)
del df['tag']

df_train, df_test, y_train, y_test = train_test_split(df, y, stratify=y,
                                                      test_size=0.25,
                                                      random_state=42,
                                                      shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [0]:
encoder_pos = OneHotEncoder()

In [0]:
word_cols = [prefix + 'word' for prefix in prefices]
feature_cols = [prefix + feature for prefix in prefices
                for feature in ['pos', 'capitalized']]

In [32]:
encoder_pos.fit(df[feature_cols])

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [0]:
X_train = sp.hstack([embedding.transform(df_train[col]) for col in word_cols] +
                    [encoder_pos.transform(df_train[feature_cols])])
X_test = sp.hstack([embedding.transform(df_test[col]) for col in word_cols] +
                   [encoder_pos.transform(df_test[feature_cols])])

#Training

In [0]:
clf = RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)

In [35]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   36.9s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=42, verbose=1,
                       warm_start=False)

In [36]:
print('train', f1_score(y_train, clf.predict(X_train), average='macro'))
print('test', f1_score(y_test, clf.predict(X_test), average='macro'))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.7s finished


train 0.9908417833926381


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


test 0.9061251212486536


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.3s finished


Побиты все бэйзлайны, 0.90 на тестовых данных.
##Ответ на бонусный вопрос
Выбран f1-score, так как он учитывает и точность, и полноту классификации, причём с равным весом. Выбрано макро-усреднение, потому что при нём метрики подсчитываются отдельно для каждого класса, а затем считается среднее для них. Это лучше, чем микро-усредненение, но всё ещё не очень хорошая метрика для тех данных, в которых классы распределены неравномерно (а, как мы видим из распределения классов в ячейке 13, они не распределены равномерно – не-именованные сущности, что ожидаемо, встречаются чаще всех остальных классов). Поэтому лучше всего было бы использовать f1-score со взвешенным усреднением. <br>
Accuracy, precision и recall тоже можно использовать, но f1-score подходит лучше, т. к., учитывает и precision, и recall.