# Chuẩn bị dữ liệu 

In [13]:
import codecs
def _generate_examples(filepath):
    examples = []
    with codecs.open(filepath, "rb") as f:
        for id_, row in enumerate(f):
            # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space
            label, _, text = row.replace(b"\xf0",
                                         b" ").strip().decode().partition(" ")
            coarse_label, _, fine_label = label.partition(":")
            examples.append((id_, {
                "label-coarse": coarse_label,
                "label-fine": fine_label,
                "text": text,
            }))
    return examples 

In [14]:
train = _generate_examples("train_5500.label")
test = _generate_examples("TREC_10.label")

In [15]:
# lấy danh sách các nhãn trong dữ liệu huấn luyện
labels = [x['label-coarse'] for _, x in train]
set_labels = list(set(labels))
label2id = {x: i for i, x in enumerate(set_labels)}
id2label = {i: x for i, x in enumerate(set_labels)}

print("------")
print(len(labels))
print("------")
print(set_labels)
print("------")
print(label2id)
print("------")
print(id2label)

------
5452
------
['HUM', 'ENTY', 'NUM', 'ABBR', 'LOC', 'DESC']
------
{'HUM': 0, 'ENTY': 1, 'NUM': 2, 'ABBR': 3, 'LOC': 4, 'DESC': 5}
------
{0: 'HUM', 1: 'ENTY', 2: 'NUM', 3: 'ABBR', 4: 'LOC', 5: 'DESC'}


In [16]:
train_target = [label2id[x['label-coarse']] for _, x in train]
train_data = [x['text'] for _, x in train]

test_data = [x['text'] for _, x in test]
test_target = [label2id[x['label-coarse']] for _, x in test]


print("#training size", len(train))
print("#testing size", len(test))
print(train[0])
print(train[1])
print(test[0])
print(test[1])
print(train_data[0], train_target[0])
print(train_data[1], train_target[1])

#training size 5452
#testing size 500
(0, {'label-coarse': 'DESC', 'label-fine': 'manner', 'text': 'How did serfdom develop in and then leave Russia ?'})
(1, {'label-coarse': 'ENTY', 'label-fine': 'cremat', 'text': 'What films featured the character Popeye Doyle ?'})
(0, {'label-coarse': 'NUM', 'label-fine': 'dist', 'text': 'How far is it from Denver to Aspen ?'})
(1, {'label-coarse': 'LOC', 'label-fine': 'city', 'text': 'What county is Modesto , California in ?'})
How did serfdom develop in and then leave Russia ? 5
What films featured the character Popeye Doyle ? 1


# Pipeline 

In [17]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
ngram_range = (1,2)
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngram_range)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', svm.LinearSVC()),
])

In [18]:
text_clf.fit(train_data, train_target)

In [19]:
text_clf.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer(ngram_range=(1, 2))),
  ('tfidf', TfidfTransformer()),
  ('clf', LinearSVC())],
 'verbose': False,
 'vect': CountVectorizer(ngram_range=(1, 2)),
 'tfidf': TfidfTransformer(),
 'clf': LinearSVC(),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 2),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__C': 1.0,
 'clf__class_weight': None,
 'clf__dual': True,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__loss': 'squared_hinge',
 'clf__max_i

# Predict 

In [20]:
docs_new = ['what is computer', 
            'who is Newton', 
            'when is the Tet holiday ?']

predicted = text_clf.predict(docs_new)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, id2label[category]))

'what is computer' => DESC
'who is Newton' => HUM
'when is the Tet holiday ?' => NUM


In [21]:
# LinearSVC (1, 2) tfidf (5452, 32693)  ==> accurracy 0.9
predicted = text_clf.predict(test_data)
ncorrect = sum([y_pred == y for y_pred, y in zip (predicted, test_target)])
accurracy = ncorrect / len(test_target)
accurracy

0.886

In [22]:
text_clf.steps

[('vect', CountVectorizer(ngram_range=(1, 2))),
 ('tfidf', TfidfTransformer()),
 ('clf', LinearSVC())]

In [23]:

vect = text_clf.steps[0][1]
tfidf = text_clf.steps[1][1]
clf = text_clf.steps[2][1]

# y_pred = text_clf['clf'].predict(text_clf['tfidf'].transform(text_clf['vect'].transform(test_data)))
y_pred = clf.predict(tfidf.transform(vect.transform(test_data)))
ncorrect = sum([y_pred == y for y_pred, y in zip (predicted, test_target)])
accurracy = ncorrect / len(test_target)
accurracy

0.886

# Separated

In [12]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

ngram_range  = (1,1)
use_idf = False
# step 1
count_vect = CountVectorizer(ngram_range=ngram_range)
X_train_counts = count_vect.fit_transform(train_data)        
transformer = TfidfTransformer(use_idf=use_idf).fit(X_train_counts)
X_train = transformer.transform(X_train_counts) 
print(X_train.shape)
clf = svm.LinearSVC()

# step2: evaluation
clf.fit(X_train, train_target)

# step3: evaluation

print("Gold/Ground Truth Label:")
print(test_target[:30], "...")
print([id2label[x] for x in test_target[:10]], "...")

X_new_counts = count_vect.transform(test_data)
X_new = transformer.transform(X_new_counts)
predicted = clf.predict(X_new)
print("\nNumber Item Predicted:", len(predicted))
print("System / Predicted Label:")
print(list(predicted[:30]), "...")
ncorrect = sum([y_pred == y for y_pred, y in zip (predicted, test_target)])
accurracy = ncorrect / len(test_target)

print("\nResult:")
print(" ==> accurracy", accurracy)      

(5452, 8410)
Gold/Ground Truth Label:
[2, 4, 0, 5, 2, 2, 0, 1, 5, 5, 4, 0, 2, 0, 2, 2, 1, 0, 5, 2, 0, 5, 4, 5, 5, 0, 5, 4, 4, 4] ...
['NUM', 'LOC', 'HUM', 'DESC', 'NUM', 'NUM', 'HUM', 'ENTY', 'DESC', 'DESC'] ...

Number Item Predicted: 500
System / Predicted Label:
[2, 4, 0, 5, 2, 2, 0, 5, 5, 5, 4, 5, 2, 0, 2, 2, 4, 0, 5, 2, 0, 5, 4, 5, 5, 0, 5, 5, 4, 5] ...

Result:
 ==> accurracy 0.87
