<a href="https://colab.research.google.com/github/ufrpe-ensino/workshop-extracao-informacao/blob/main/notebooks/01_ExtracaoInformacao_CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NER com Conditional Random Fields

## Setup

In [1]:
!pip install -U 'scikit-learn<0.24' #fix a bug in CRF package
!pip install sklearn_crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np

## Lendo arquivo de entrada

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/ufrpe-ensino/workshop-extracao-informacao/main/data/ner_dataset.csv', encoding = "ISO-8859-1")
df = df[:10000] #apenas para processar mais rápido
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


Existem 457 sentenças contendo 2.746 palavras diferentes e 17 tags.

In [4]:
df = df.fillna(method='ffill')
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(457, 2746, 17)

Tags presentes no corpus

In [5]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-art,28
1,B-eve,10
2,B-geo,244
3,B-gpe,303
4,B-nat,5
5,B-org,176
6,B-per,160
7,B-tim,149
8,I-art,20
9,I-eve,10


# CRF

## Aux Functions

In [6]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

Função para recuperar sentenças com os POS e as tags

In [7]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None
getter = SentenceGetter(df)
sentences = getter.sentences

In [8]:
sentences

[[('Thousands', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('demonstrators', 'NNS', 'O'),
  ('have', 'VBP', 'O'),
  ('marched', 'VBN', 'O'),
  ('through', 'IN', 'O'),
  ('London', 'NNP', 'B-geo'),
  ('to', 'TO', 'O'),
  ('protest', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('war', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('Iraq', 'NNP', 'B-geo'),
  ('and', 'CC', 'O'),
  ('demand', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('withdrawal', 'NN', 'O'),
  ('of', 'IN', 'O'),
  ('British', 'JJ', 'B-gpe'),
  ('troops', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('that', 'DT', 'O'),
  ('country', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Iranian', 'JJ', 'B-gpe'),
  ('officials', 'NNS', 'O'),
  ('say', 'VBP', 'O'),
  ('they', 'PRP', 'O'),
  ('expect', 'VBP', 'O'),
  ('to', 'TO', 'O'),
  ('get', 'VB', 'O'),
  ('access', 'NN', 'O'),
  ('to', 'TO', 'O'),
  ('sealed', 'JJ', 'O'),
  ('sensitive', 'JJ', 'O'),
  ('parts', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('the', 'DT', 'O'),
  ('plant', 'NN', 'O'),
  ('Wednesday', 'NNP', 'B-tim'),
  ('

## Criando o formato de entrada do CRF (extração de características)

In [9]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]
    
def sent2tokens(sent):
    return [token for token, postag, label in sent]

# Divisão de treinamento e teste

In [15]:
from sklearn.model_selection import train_test_split

X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [16]:
X

[[{'bias': 1.0,
   'word.lower()': 'thousands',
   'word[-3:]': 'nds',
   'word[-2:]': 'ds',
   'word.isupper()': False,
   'word.istitle()': True,
   'word.isdigit()': False,
   'postag': 'NNS',
   'postag[:2]': 'NN',
   'BOS': True,
   '+1:word.lower()': 'of',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'IN',
   '+1:postag[:2]': 'IN'},
  {'bias': 1.0,
   'word.lower()': 'of',
   'word[-3:]': 'of',
   'word[-2:]': 'of',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'IN',
   'postag[:2]': 'IN',
   '-1:word.lower()': 'thousands',
   '-1:word.istitle()': True,
   '-1:word.isupper()': False,
   '-1:postag': 'NNS',
   '-1:postag[:2]': 'NN',
   '+1:word.lower()': 'demonstrators',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'NNS',
   '+1:postag[:2]': 'NN'},
  {'bias': 1.0,
   'word.lower()': 'demonstrators',
   'word[-3:]': 'ors',
   'word[-2:]': 'rs',
   'word.isupper()': F

In [17]:
y

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'I-geo',
  'O'],
 ['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'B-org', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-tim',
  'O'],
 ['O',
  'B-tim',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

# Treinando o modelo CRF

Documentação da biblioteca: https://sklearn-crfsuite.readthedocs.io/en/latest/

In [18]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [20]:
crf.classes_

['O',
 'B-tim',
 'B-gpe',
 'B-per',
 'I-per',
 'B-geo',
 'B-org',
 'I-org',
 'B-art',
 'I-art',
 'B-eve',
 'I-eve',
 'I-geo',
 'I-tim',
 'I-gpe',
 'B-nat']

# Avaliação

In [19]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-art       0.50      0.40      0.44         5
       B-eve       0.00      0.00      0.00         2
       B-geo       0.79      0.68      0.73        77
       B-gpe       0.75      0.88      0.81        91
       B-nat       0.00      0.00      0.00         2
       B-org       0.77      0.68      0.72        53
       B-per       0.85      0.92      0.88        61
       B-tim       0.95      0.89      0.92        45
       I-art       0.00      0.00      0.00         4
       I-eve       0.00      0.00      0.00         1
       I-geo       0.75      0.38      0.50        16
       I-gpe       0.67      0.57      0.62         7
       I-nat       0.00      0.00      0.00         2
       I-org       0.74      0.70      0.72        50
       I-per       0.87      0.97      0.92        75
       I-tim       0.33      1.00      0.50         1
           O       0.99      1.00      1.00      2884

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))


# Entendendo o funcionamento do algoritmo

In [21]:
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top-20 transições mais prováveis:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("Top-20 transições menos prováveis:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top-20 transições mais prováveis:
B-org  -> I-org   4.680201
B-per  -> I-per   4.242035
I-org  -> I-org   4.198692
I-per  -> I-per   3.991878
B-art  -> I-art   3.688363
B-eve  -> I-eve   3.636531
I-art  -> I-art   3.464745
B-gpe  -> I-gpe   3.365085
O      -> O       3.119727
B-geo  -> I-geo   2.966060
B-tim  -> I-tim   2.372497
I-tim  -> I-tim   2.276223
I-eve  -> I-eve   1.833767
O      -> B-tim   1.764286
I-geo  -> I-geo   1.733762
O      -> B-gpe   1.572374
O      -> B-geo   1.331913
B-gpe  -> B-per   1.262858
O      -> B-art   1.110531
O      -> B-org   1.102532
Top-20 transições menos prováveis:
B-gpe  -> I-art   -0.389697
I-org  -> B-geo   -0.457163
B-geo  -> I-gpe   -0.470893
B-gpe  -> I-org   -0.489994
B-eve  -> O       -0.505690
O      -> I-gpe   -0.537264
O      -> I-eve   -0.579971
O      -> I-geo   -0.604511
B-geo  -> I-org   -0.612946
B-gpe  -> B-gpe   -0.624199
B-tim  -> B-gpe   -0.627473
B-geo  -> I-art   -0.651209
B-org  -> I-per   -0.664598
B-geo  -> I-per   -0.781181

In [22]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top-20 positivo:")
print_state_features(Counter(crf.state_features_).most_common(20))

print("Top-20 negativo:")
print_state_features(Counter(crf.state_features_).most_common()[-20:])

Top-20 positivo:
5.860922 O        bias
3.722280 O        BOS
3.721897 B-tim    word[-3:]:day
3.690997 B-tim    word[-2:]:ay
3.149401 B-gpe    word.istitle()
3.047903 B-tim    word[-2:]:0s
2.978401 B-geo    -1:word.lower():in
2.502835 O        postag:NN
2.437516 B-tim    -1:word.lower():in
2.386781 B-gpe    postag:JJ
2.223646 B-org    word.isupper()
2.059408 B-gpe    word[-2:]:na
2.041420 B-org    word[-3:]:ban
2.033487 B-gpe    postag[:2]:JJ
1.980111 B-tim    word[-3:]:ber
1.962547 O        word[-2:]:ic
1.914448 B-tim    word.isdigit()
1.906829 I-tim    word.isdigit()
1.823769 B-gpe    -1:word.lower():with
1.813563 O        +1:word.lower():men
Top-20 negativo:
-1.070367 B-gpe    word[-3:]:can
-1.106395 O        word[-2:]:sh
-1.129731 I-tim    word.istitle()
-1.141161 B-gpe    -1:word.istitle()
-1.156629 O        word.isupper()
-1.177472 O        word[-3:]:ish
-1.192073 O        postag:NNPS
-1.203677 B-org    -1:postag:NNP
-1.217778 B-art    -1:word.istitle()
-1.394852 O        word[-2

In [23]:
!pip install eli5
import eli5
eli5.show_weights(crf, top=10)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 11.9 MB/s 
Collecting jinja2>=3.0.0
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 45.7 MB/s 
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107748 sha256=b4c7856c058382dc31a4ef00532e186d786e0c815679ea296c79fcfbec5a8b02
  Stored in directory: /root/.cache/pip/wheels/cc/3c/96/3ead31a8e6c20fc0f1a707fde2e05d49a80b1b4b30096573be
Successfully built eli5
Installing collected packages: jinja2, eli5
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 2.11.3
    Uninstalling Jinja2-2.11.3:
      Successfully uninstalled Jinja2-2.11.3
[31mERROR: pip's dependency resolver does not currently t



From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.12,1.111,-1.29,0.688,-0.58,1.332,-0.605,1.572,-0.537,0.0,1.103,-1.871,0.028,-0.97,1.764,-1.494
B-art,0.0,0.0,3.688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.011,0.0,-0.216,0.0,0.0
I-art,-0.058,0.0,3.465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.229,0.0,0.0
B-eve,-0.506,0.0,0.0,0.0,3.637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.074,0.0,0.0
I-eve,0.0,0.0,0.0,0.0,1.834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-geo,0.197,0.0,-0.651,0.0,-0.284,-0.182,2.966,0.687,-0.471,0.0,-0.17,-0.613,-0.224,-0.781,1.057,-0.217
I-geo,-0.246,0.0,0.0,0.0,0.0,0.0,1.734,0.0,0.0,0.0,0.0,0.0,0.0,-0.041,0.0,0.0
B-gpe,0.435,0.0,-0.39,0.0,-0.14,-0.206,-0.244,-0.624,3.365,0.0,0.785,-0.49,1.263,-0.828,-0.271,0.0
I-gpe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.934,0.0,0.0,0.0,0.0,-0.093,0.0,0.0
B-nat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15
+5.861,bias,,,,,,,,,,,,,,
+3.722,BOS,,,,,,,,,,,,,,
+2.503,postag:NN,,,,,,,,,,,,,,
+1.963,word[-2:]:ic,,,,,,,,,,,,,,
+1.814,+1:word.lower():men,,,,,,,,,,,,,,
+1.787,"-1:word.lower():8,000",,,,,,,,,,,,,,
… 201 more positive …,… 201 more positive …,,,,,,,,,,,,,,
… 107 more negative …,… 107 more negative …,,,,,,,,,,,,,,
-1.731,postag[:2]:NN,,,,,,,,,,,,,,
-3.106,word.isdigit(),,,,,,,,,,,,,,

Weight?,Feature
+5.861,bias
+3.722,BOS
+2.503,postag:NN
+1.963,word[-2:]:ic
+1.814,+1:word.lower():men
+1.787,"-1:word.lower():8,000"
… 201 more positive …,… 201 more positive …
… 107 more negative …,… 107 more negative …
-1.731,postag[:2]:NN
-3.106,word.isdigit()

Weight?,Feature
+1.296,word[-2:]:xx
+1.296,word[-3:]:oxx
+1.296,word.lower():vioxx
+1.234,word.lower():huygens
+1.231,word[-3:]:ens
+1.223,word[-2:]:ep
+1.223,word.lower():jeep
+1.223,word[-3:]:eep
+1.212,word.lower():dodge
… 110 more positive …,… 110 more positive …

Weight?,Feature
+1.154,-1:word.istitle()
+0.855,+1:postag[:2]:NN
+0.775,word[-2:]:le
+0.718,+1:word.lower():airport
+0.585,word.lower():mirror
+0.585,word[-3:]:ror
+0.582,-1:word.lower():daily
+0.525,word.lower():non-proliferation
+0.525,+1:word.lower():treaty
+0.525,-1:word.lower():nuclear

Weight?,Feature
+1.202,+1:word.lower():war
+0.921,word.lower():games
+0.918,word[-3:]:mes
+0.915,+1:word.lower():open
+0.859,+1:postag:NNP
+0.819,postag:NNPS
+0.746,word[-2:]:es
+0.599,-1:word.lower():the
+0.579,+1:word.istitle()
… 36 more positive …,… 36 more positive …

Weight?,Feature
+0.847,word[-3:]:War
+0.845,word.lower():war
+0.835,+1:word.lower():in
+0.820,word.lower():open
+0.820,word[-3:]:pen
+0.804,word[-2:]:ar
+0.704,-1:word.lower():australian
+0.697,word[-2:]:en
+0.619,-1:word.lower():war
+0.583,word[-2:]:II

Weight?,Feature
+2.978,-1:word.lower():in
+1.790,-1:word.lower():neighboring
+1.760,-1:word.lower():from
+1.722,word.lower():bali
+1.699,word[-3:]:sia
+1.662,postag:NNP
+1.567,-1:word.lower():to
+1.537,word.istitle()
+1.407,+1:word.lower():jury
… 239 more positive …,… 239 more positive …

Weight?,Feature
+1.241,-1:word.lower():new
+0.883,-1:word.lower():arab
+0.879,word.lower():emirates
+0.814,word.lower():arab
+0.814,word[-3:]:rab
+0.798,word[-2:]:ab
+0.622,word.lower():waziristan
+0.621,-1:word.lower():south
+0.612,-1:word.istitle()
+0.602,+1:word.lower():and

Weight?,Feature
+3.149,word.istitle()
+2.387,postag:JJ
+2.059,word[-2:]:na
+2.033,postag[:2]:JJ
+1.824,-1:word.lower():with
+1.636,-1:word.lower():recognize
+1.588,word.lower():u.s.
+1.588,word[-3:]:.S.
… 257 more positive …,… 257 more positive …
… 41 more negative …,… 41 more negative …

Weight?,Feature
+1.185,word[-3:]:can
+0.851,word.lower():states
+0.691,word.lower():republic
+0.691,+1:word.lower():under
+0.690,word[-3:]:lic
+0.669,postag:JJ
+0.636,-1:postag:NNP
+0.628,postag[:2]:JJ
+0.620,word[-3:]:tes
+0.611,-1:word.istitle()

Weight?,Feature
+1.725,word.lower():h5n1
+1.725,word[-3:]:5N1
+1.725,word[-2:]:N1
+1.113,word.isupper()
+0.513,-1:word.lower():the
+0.487,+1:postag:NN
+0.306,-1:postag:DT
+0.306,-1:postag[:2]:DT
+0.249,+1:word.lower():virus
… 3 more positive …,… 3 more positive …

Weight?,Feature
+2.224,word.isupper()
+2.041,word[-3:]:ban
+1.674,-1:word.lower():u.s.
+1.621,word[-3:]:rck
+1.621,word.lower():merck
+1.569,+1:word.lower():was
+1.567,word.lower():india
+1.558,word.lower():halliburton
+1.477,word[-3:]:dia
+1.458,word.lower():senate

Weight?,Feature
+1.178,word[-3:]:ion
+0.979,word.lower():nations
+0.951,word[-2:]:ry
+0.948,word[-3:]:ons
+0.916,-1:postag[:2]:IN
+0.916,-1:postag:IN
+0.908,-1:postag:NNP
+0.905,-1:word.lower():emirates
+0.904,word.lower():airlines
+0.901,word[-3:]:nes

Weight?,Feature
+1.762,word[-2:]:am
+1.645,word.lower():ramda
+1.645,word[-3:]:mda
+1.480,+1:word.lower():said
+1.385,+1:word.lower():administration
+1.345,word.lower():obama
+1.336,word[-3:]:yam
+1.336,word.lower():khayam
+1.317,word.lower():sperling
… 153 more positive …,… 153 more positive …

Weight?,Feature
+1.067,+1:postag[:2]:VB
+1.031,-1:postag:NNP
+0.984,postag:NNP
+0.960,-1:word.lower():mr.
+0.932,word[-2:]:er
+0.892,word[-2:]:in
+0.826,-1:word.lower():president
+0.779,-1:postag[:2]:NN
… 114 more positive …,… 114 more positive …
… 8 more negative …,… 8 more negative …

Weight?,Feature
+3.722,word[-3:]:day
+3.691,word[-2:]:ay
+3.048,word[-2:]:0s
+2.438,-1:word.lower():in
+1.980,word[-3:]:ber
+1.914,word.isdigit()
+1.581,word[-3:]:uly
+1.581,word.lower():july
+1.493,word.lower():august
+1.489,word[-3:]:ust

Weight?,Feature
+1.907,word.isdigit()
+1.439,postag[:2]:CD
+1.439,postag:CD
+0.769,-1:word.lower():july
+0.709,+1:postag:CD
+0.709,+1:postag[:2]:CD
+0.614,-1:word.lower():25
+0.593,+1:word.lower():1995
+0.570,-1:postag:NNP
… 35 more positive …,… 35 more positive …
