# sklearn-crfsuite

Memo
- [sklearn-crfsuite の公式 Tutorial](https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html)
- [Tutorial の IPython file](https://github.com/TeamHG-Memex/sklearn-crfsuite/blob/master/docs/CoNLL2002.ipynb)
- [入力データの内部形式](https://python-crfsuite.readthedocs.io/en/latest/pycrfsuite.html) 
- [CRFモデルで時系列データからある時点の状態を推定する](https://qiita.com/ruka38/items/c9212d827acfdd9d41a7)
- `AttributeError: 'CRF' object has no attribute 'keep_tempfiles'` が発生した場合 -> `sklearn_crfsuite.__file__` で表示されたパスにあるファイルを、[このバグ報告](https://github.com/TeamHG-Memex/sklearn-crfsuite/issues/60)の doctor-entropy のコメントの通りに修正する
- 上のエラーを[直したバージョン](https://github.com/TeamHG-Memex/sklearn-crfsuite/blob/master/docs/CoNLL2002.ipynb)もあるらしいけどうまくできなかった

# Example: Conll2002 dataset 

In [54]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Fetch the CoNLL-2002 corpus through NLTK and list the file ids it provides.
nltk.download('conll2002')
print(nltk.corpus.conll2002.fileids())
# Materialise the 'esp.train' and 'esp.testb' files as lists of sentences.
# Each sentence is a sequence of (token, postag, label) triples, which is the
# shape word2features / sent2labels rely on.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence
        Sentence whose items carry the word at index 0 and the POS tag at
        index 1.
    i : int
        Position of the token to describe.

    Returns
    -------
    features : dict
        Features of the token itself, followed by the features of the previous
        token (or ``'BOS'`` when there is none) and then those of the next
        token (or ``'EOS'`` when there is none).
    """
    current = sent[i]
    token, tag = current[0], current[1]

    # Features of the token itself.
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context: either the previous token or a beginning-of-sentence flag.
    if not i > 0:
        features['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        prev_tag = sent[i - 1][1]
        features['-1:word.lower()'] = prev_token.lower()
        features['-1:word.istitle()'] = prev_token.istitle()
        features['-1:word.isupper()'] = prev_token.isupper()
        features['-1:postag'] = prev_tag
        features['-1:postag[:2]'] = prev_tag[:2]

    # Right context: either the next token or an end-of-sentence flag.
    if not i < len(sent) - 1:
        features['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        next_tag = sent[i + 1][1]
        features['+1:word.lower()'] = next_token.lower()
        features['+1:word.istitle()'] = next_token.istitle()
        features['+1:word.isupper()'] = next_token.isupper()
        features['+1:postag'] = next_tag
        features['+1:postag[:2]'] = next_tag[:2]

    return features


def sent2features(sent):
    """Return the feature dicts of all tokens of ``sent``, in sentence order."""
    token_features = []
    for position in range(len(sent)):
        token_features.append(word2features(sent, position))
    return token_features

def sent2labels(sent):
    """Return the label of every (token, postag, label) triple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

# Convert every sentence into a sequence of feature dicts (X) and the matching
# sequence of labels (y), for the training and the test split.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

# Sanity check: features of the first token and labels of the first sentence.
print(X_train[0][0])
print(y_train[0])

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\tanakai\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']
{'bias': 1.0, 'word.lower()': 'melbourne', 'word[-3:]': 'rne', 'word[-2:]': 'ne', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'NP', 'postag[:2]': 'NP', 'BOS': True, '+1:word.lower()': '(', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'Fpa', '+1:postag[:2]': 'Fp'}
['B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']


In [2]:
# training
# Fit a linear-chain CRF with the L-BFGS trainer. Per the sklearn-crfsuite
# documentation, c1 / c2 are the L1 / L2 regularization coefficients and
# all_possible_transitions=True also generates transition features for label
# pairs that never occur in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

In [14]:
from collections import Counter

# Score only the entity labels: 'O' is dropped from the label list that is
# passed to the metrics below.
labels = list(crf.classes_)
labels.remove('O')
print(labels)
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

# details
# Order the labels by entity type first and by B/I prefix second
# (B-LOC, I-LOC, B-MISC, ...), so both tags of one entity are adjacent.
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
from sklearn.metrics import classification_report
# NOTE: metrics.flat_classification_report (kept below for reference) did not
# work in this environment, so sklearn's classification_report is applied to
# the flattened token-level sequences instead.
# print(metrics.flat_classification_report(
#     y_test, y_pred, labels=sorted_labels, digits=3
# ))
# chain.from_iterable concatenates the per-sentence label lists into one list.
print(classification_report(
    list(chain.from_iterable(y_test)), list(chain.from_iterable(y_pred)), labels=sorted_labels, digits=3
))


def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition feature.

    Parameters
    ----------
    trans_features : iterable
        Items of the form ``((label_from, label_to), weight)``.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for label_pair, weight in trans_features:
        source, target = label_pair
        print(row_format % (source, target, weight))
        
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    Parameters
    ----------
    state_features : iterable
        Items of the form ``((attr, label), weight)``.
    """
    row_format = "%0.6f %-8s %s"
    for attr_and_label, weight in state_features:
        attribute, tag = attr_and_label
        print(row_format % (weight, tag, attribute))

# Rank the learned transition weights: most_common(20) yields the 20 largest,
# most_common()[-20:] the 20 smallest.
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))
print("Top unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

# Same ranking for the state (attribute, label) weights, 30 from each end.
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))
print("Top negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']
0.7964686316443963
              precision    recall  f1-score   support

       B-LOC      0.810     0.784     0.797      1084
       I-LOC      0.690     0.637     0.662       325
      B-MISC      0.731     0.569     0.640       339
      I-MISC      0.699     0.589     0.639       557
       B-ORG      0.807     0.832     0.820      1400
       I-ORG      0.852     0.786     0.818      1104
       B-PER      0.850     0.884     0.867       735
       I-PER      0.893     0.943     0.917       634

   micro avg      0.813     0.787     0.799      6178
   macro avg      0.791     0.753     0.770      6178
weighted avg      0.809     0.787     0.796      6178

Top likely transitions:
B-ORG  -> I-ORG   7.500912
I-ORG  -> I-ORG   7.206322
B-MISC -> I-MISC  6.833142
I-MISC -> I-MISC  6.753222
B-PER  -> I-PER   6.404557
B-LOC  -> I-LOC   5.696274
I-LOC  -> I-LOC   4.877422
I-PER  -> I-PER   4.709231
O      -> O    

# sensor data

In [2]:
from itertools import chain

import sklearn
import sklearn_crfsuite
import scipy.stats
from pathlib import Path
from sklearn_crfsuite import scorers, metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# self-made
import activity_model
import analysis
import anomaly
import comparison
import floor_plan

import new_functions
import sensor_model

# Absolute path of the current working directory; the data folders used below
# are located relative to it.
working_path = Path().resolve()
layout_data_path = working_path / 'layout_data'

# Wandering

In [3]:
def data2features(data):
    """Turn a (time, sensor) matrix into one CRF feature dict per time step.

    Parameters
    ----------
    data : numpy.ndarray
        data.shape = (number of time steps, number of sensors).

    Returns
    -------
    features : list of dict
        ``features[i]`` maps ``"x_<j>"`` to ``data[i][j]`` for every sensor
        ``j``, carries ``'BOS'`` on the first step and ``'EOS'`` on the last
        one, and always ends with ``'bias': 1``.
    """
    n_steps = data.shape[0]
    n_sensors = data.shape[1]
    sensor_ids = range(n_sensors)
    final_step = n_steps - 1

    def step_features(t):
        """Feature dict of the single time step ``t``."""
        row = data[t]
        feature = {f"x_{j}": row[j] for j in sensor_ids}
        # Disabled experiments (previous-step values and a 60-step sum):
        # if t >= 1:
        #     feature.update({f"-1 x_{j}": data[t-1][j] for j in range(n_sensors)})
        # if t >= 60:
        #     feature['sum_60'] = np.sum(data[t-60:t])
        if t == 0:
            feature['BOS'] = True
        if t == final_step:
            feature['EOS'] = True
        feature['bias'] = 1
        return feature

    return [step_features(t) for t in range(n_steps)]

# Load the pickled matrices and column names of the training data set
# ('test_data_1'). reduced_SD_mat supplies the CRF features and reduced_AL_mat
# the labels in the training cell below.
# NOTE(review): new_functions.pickle_load(folder, name) presumably unpickles
# the object stored under `name` in `folder` — confirm in new_functions.
_type = 'raw'
data_folder_name = 'test_data_1'
path = layout_data_path / 'test_layout' / data_folder_name
reduced_SD_mat = new_functions.pickle_load(path / 'experiment1', f'reduced_SD_mat_{_type}_1')
reduced_AL_mat = new_functions.pickle_load(path / 'experiment1', f'reduced_AL_mat_{_type}_1')
SD_names = new_functions.pickle_load(path / 'experiment1', 'SD_names')
AL_names = new_functions.pickle_load(path / 'experiment1', 'AL_names')

In [4]:
# training

# Example of the expected input format (one sequence with two time steps):
# X_train = [[{'a': True, 'b':1.2}, {'a': False, 'b':2}]]
# y_train = [['True', 'False']]

# Upper bound on the number of trailing time steps used; the [-num:] slices
# below keep the last `num` rows (or all rows if there are fewer).
num = 1000000

# The whole recording is fed as ONE sequence: features come from the first 24
# columns of the sensor matrix, labels from column 3 of the label matrix,
# stringified for the CRF.
# NOTE(review): column 3 presumably is 'wandering' (index 3 of the AL_names
# list printed in the "Falls" section) — confirm.
X_train = [data2features(reduced_SD_mat[-num:, :24])]
y_train = [[str(b) for b in reduced_AL_mat[-num:, 3]]]
# Sum of the label column, i.e. the number of positive steps if it is 0/1.
print(np.sum(reduced_AL_mat[-num:, 3]))

c1, c2 = 0.1, 0.1
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs', 
    c1 = c1, 
    c2 = c2, 
    max_iterations = 100,
    all_possible_transitions = True
)
crf.fit(X_train, y_train)
# Store the fitted model under a name that encodes its regularization values.
new_functions.pickle_dump(path / 'experiment1', f"crf_c1_{c1}_c2_{c2}", crf)
# test = new_functions.pickle_load(path / 'experiment1', f"crf_c1_{c1}_c2_{c2}")

4078


In [21]:
from collections import Counter
from sklearn.metrics import classification_report

# evaluation
# Load the second data set ('test_data_2') and evaluate the fitted `crf` on
# it. `crf` and `num` are defined by the training cell, which must run first.
_type = 'raw'
data_folder_name = 'test_data_2'
path = layout_data_path / 'test_layout' / data_folder_name
test_SD = new_functions.pickle_load(path / 'experiment1', f'reduced_SD_mat_{_type}_1')
test_AL = new_functions.pickle_load(path / 'experiment1', f'reduced_AL_mat_{_type}_1')
# The f-prefix on 'experiment1' has no placeholders; it is a plain string.
test_SD_names = new_functions.pickle_load(path / f'experiment1', 'SD_names')
test_AL_names = new_functions.pickle_load(path / 'experiment1', 'AL_names')

# Same feature columns (first 24) and label column (3) as in training.
X_test = [data2features(test_SD[-num:, :24])]
y_test = [[str(b) for b in test_AL[-num:, 3]]]
y_pred = crf.predict(X_test)


labels = list(crf.classes_)
print(labels)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

# details
# Sort key carried over from the CoNLL example (text after the first
# character, then the first character); here it only fixes the report order.
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

# Flatten the single-sequence lists before handing them to scikit-learn.
print(classification_report(
    list(chain.from_iterable(y_test)), list(chain.from_iterable(y_pred)), labels=sorted_labels, digits=3
))


def print_transitions(trans_features):
    """Print every transition feature as an aligned ``from -> to weight`` line.

    Parameters
    ----------
    trans_features : iterable
        Items of the form ``((label_from, label_to), weight)``.
    """
    template = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        origin, destination = transition
        line = template % (origin, destination, weight)
        print(line)
        
def print_state_features(state_features):
    """Print every state feature as a ``weight label attribute`` line.

    Parameters
    ----------
    state_features : iterable
        Items of the form ``((attr, label), weight)``.
    """
    template = "%0.6f %-8s %s"
    for key, weight in state_features:
        attribute, tag = key
        line = template % (weight, tag, attribute)
        print(line)


# Dump all learned weights, largest first (most_common() without an argument
# returns every entry in descending order of weight).
print("transition features:")
print_transitions(Counter(crf.transition_features_).most_common())

print("state features:")
print_state_features(Counter(crf.state_features_).most_common())

['False', 'True']
0.999839561308744
              precision    recall  f1-score   support

       False      1.000     1.000     1.000   9950057
        True      0.978     0.990     0.984     49943

    accuracy                          1.000  10000000
   macro avg      0.989     0.995     0.992  10000000
weighted avg      1.000     1.000     1.000  10000000

transition features:
False  -> False   1.965917
True   -> True    1.218246
False  -> True    -7.926812
True   -> False   -8.001090
state features:
0.798157 True     x_1
0.782871 True     x_0
0.721518 True     x_6
0.675986 True     x_16
0.554948 True     x_12
0.552182 True     x_17
0.542444 True     x_4
0.521867 True     x_15
0.498995 True     x_22
0.476387 True     x_18
0.446110 True     x_20
0.411493 True     x_23
0.409099 True     x_13
0.406345 True     x_5
0.394719 True     x_9
0.390470 True     x_19
0.344259 True     x_21
0.294665 True     x_3
0.268331 True     x_14
0.201844 True     x_2
0.180941 False    x_10
0.152746 True  

# Falls

In [4]:
def data2features(data):
    """Build the per-time-step CRF feature dicts for a sensor matrix.

    Parameters
    ----------
    data : numpy.ndarray
        data.shape = (number of time steps, number of sensors).

    Returns
    -------
    features : list of dict
        One dict per time step: ``"x_<j>"`` holds the value of sensor ``j``,
        ``'BOS'`` / ``'EOS'`` flag the first / last step, and ``'bias'`` is
        always 1.
    """
    step_count = data.shape[0]
    sensor_count = data.shape[1]

    features = []
    for step in range(step_count):
        readings = data[step]
        feature = {}
        for j in range(sensor_count):
            feature[f"x_{j}"] = readings[j]
        # Disabled experiments (previous-step values and a 60-step sum):
        # if step >= 1:
        #     feature.update({f"-1 x_{j}": data[step-1][j] for j in range(sensor_count)})
        # if step >= 60:
        #     feature['sum_60'] = np.sum(data[step-60:step])
        is_first = step == 0
        is_last = step == step_count - 1
        if is_first:
            feature['BOS'] = True
        if is_last:
            feature['EOS'] = True
        feature['bias'] = 1
        features.append(feature)
    return features

# Load the pickled matrices and column names of the training data set
# ('test_data_1'); they feed the training cell of this section.
# NOTE(review): new_functions.pickle_load(folder, name) presumably unpickles
# the object stored under `name` in `folder` — confirm in new_functions.
_type = 'raw'
data_folder_name = 'test_data_1'
path = layout_data_path / 'test_layout' / data_folder_name
reduced_SD_mat = new_functions.pickle_load(path / 'experiment1', f'reduced_SD_mat_{_type}_1')
reduced_AL_mat = new_functions.pickle_load(path / 'experiment1', f'reduced_AL_mat_{_type}_1')
SD_names = new_functions.pickle_load(path / 'experiment1', 'SD_names')
AL_names = new_functions.pickle_load(path / 'experiment1', 'AL_names')
# Show the label names so the column indices used below can be looked up.
print(AL_names)

['being semi-bedridden', 'being housebound', 'forgetting', 'wandering', 'fall while walking', 'fall while standing']


In [None]:
# training

# Example of the expected input format (one sequence with two time steps):
# X_train = [[{'a': True, 'b':1.2}, {'a': False, 'b':2}]]
# y_train = [['True', 'False']]

# Upper bound on the number of trailing time steps used by the [-num:] slices.
num = 100000000

# One long sequence: features from the first 24 sensor columns, labels from
# column 4 of the label matrix ('fall while walking' according to the AL_names
# list printed by the previous cell), stringified for the CRF.
X_train = [data2features(reduced_SD_mat[-num:, :24])]
y_train = [[str(b) for b in reduced_AL_mat[-num:, 4]]]
# Sum of the label column, i.e. the number of positive steps if it is 0/1.
print(np.sum(reduced_AL_mat[-num:, 4]))

crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs', 
    c1 = 0.1, 
    c2 = 0.1, 
    max_iterations = 100,
    all_possible_transitions = True
)
crf.fit(X_train, y_train)

In [None]:
from collections import Counter
from sklearn.metrics import classification_report

# evaluation
# Load the second data set ('test_data_2') and evaluate the fitted `crf` on
# it. `crf` and `num` are defined by the training cell, which must run first.
_type = 'raw'
data_folder_name = 'test_data_2'
path = layout_data_path / 'test_layout' / data_folder_name
test_SD = new_functions.pickle_load(path / 'experiment1', f'reduced_SD_mat_{_type}_1')
test_AL = new_functions.pickle_load(path / 'experiment1', f'reduced_AL_mat_{_type}_1')
# The f-prefix on 'experiment1' has no placeholders; it is a plain string.
test_SD_names = new_functions.pickle_load(path / f'experiment1', 'SD_names')
test_AL_names = new_functions.pickle_load(path / 'experiment1', 'AL_names')

# Same feature columns (first 24) and label column (4) as in training.
X_test = [data2features(test_SD[-num:, :24])]
y_test = [[str(b) for b in test_AL[-num:, 4]]]
# Sum of the test label column, for comparison with the training count.
print(np.sum(test_AL[-num:, 4]))
y_pred = crf.predict(X_test)


labels = list(crf.classes_)
print(labels)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

# details
# Sort key carried over from the CoNLL example (text after the first
# character, then the first character); here it only fixes the report order.
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

# Flatten the single-sequence lists before handing them to scikit-learn.
print(classification_report(
    list(chain.from_iterable(y_test)), list(chain.from_iterable(y_pred)), labels=sorted_labels, digits=3
))


def print_transitions(trans_features):
    """Write the transition features to stdout, one aligned line per feature.

    Parameters
    ----------
    trans_features : iterable
        Items of the form ``((label_from, label_to), weight)``.
    """
    layout = "%-6s -> %-7s %0.6f"
    for endpoints, weight in trans_features:
        before, after = endpoints
        print(layout % (before, after, weight))
        
def print_state_features(state_features):
    """Write the state features to stdout as ``weight label attribute`` lines.

    Parameters
    ----------
    state_features : iterable
        Items of the form ``((attr, label), weight)``.
    """
    layout = "%0.6f %-8s %s"
    for feature_key, weight in state_features:
        attribute, tag = feature_key
        print(layout % (weight, tag, attribute))


# Dump all learned weights, largest first (most_common() without an argument
# returns every entry in descending order of weight).
print("transition features:")
print_transitions(Counter(crf.transition_features_).most_common())

print("state features:")
print_state_features(Counter(crf.state_features_).most_common())