In [1]:
# Move the kernel's working directory to the sibling src folder
# (presumably so the `model` package imported in later cells resolves -- confirm).
cd ../src

/Users/williamferreira/Dropbox/mscproject/src


In [2]:
import numpy as np

In [3]:
from model.classifiers.lr_predictors import LogitPredictor, CompoundPredictor
from model.utils import get_dataset, split_data, RunCV, run_test

from model.baseline.transforms import (
    RefutingWordsTransform,
    QuestionMarkTransform,
    HedgingWordsTransform,
    InteractionTransform,
    NegationOfRefutingWordsTransform,
    BoWTransform,
    PolarityTransform,
    BrownClusterPairTransform
)

from model.ext.transforms import (
    AlignedPPDBSemanticTransform,
    NegationAlignmentTransform,
    Word2VecSimilaritySemanticTransform,
    DependencyRootDistanceTransform,
    SVOTransform
)

In [4]:
# Registry of feature-set labels -> zero-argument factories.  Each value is
# invoked as `transforms[label]()` to obtain a fresh transform instance.
transforms = dict([
    ('BoW', lambda: BoWTransform()),
    ('BoW-Ref', RefutingWordsTransform),
    ('BoW-Hed', HedgingWordsTransform),
    ('Q', QuestionMarkTransform),
    ('I', InteractionTransform),
    ('Sim-Algn-W2V', Word2VecSimilaritySemanticTransform),
    ('Sim-Algn-PPDB', AlignedPPDBSemanticTransform),
    ('BoW-Neg-Ref', NegationOfRefutingWordsTransform),
    ('Neg-Algn', NegationAlignmentTransform),
    ('Root-Dist', DependencyRootDistanceTransform),
    ('SVO', SVOTransform),
    ('Pol', PolarityTransform),
    ('Brown', BrownClusterPairTransform),
])

In [5]:
# Labels of the feature sets used in this run, in the order their columns
# are assumed to be stacked.  Trailing notes: 1-based inclusive column span
# and width, consistent with the feature_boundaries output in this notebook.
inc_transforms = [
    'Q',              # 1-1          1
    'BoW-Hed',        # 2-36         35
    'BoW-Ref',        # 37-48        12
    'I',              # 49-1272      1,224
    'BoW',            # 1273-1772    500
    # 'Sim-Algn-W2V'  -- disabled for this run (would add 1 column here)
    'Sim-Algn-PPDB',  # 1773-1773    1
    'Root-Dist',      # 1774-1775    2
    'Neg-Algn',       # 1776-1778    3
    'SVO',            # 1779-1787    9
]

In [6]:
# Predictor class for this run; it is instantiated in a later cell with the
# list of selected transform factories.
predictor = LogitPredictor

In [7]:
# Load the training CSV and split it into X and y
# (presumably inputs and stance labels -- confirm against model.utils.split_data).
train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
X, y = split_data(train_data)
# The held-out CSV is kept unsplit; it is passed whole to run_test later.
test_data = get_dataset('url-versions-2015-06-14-clean-test.csv')

In [8]:
# Instantiate the predictor with the factories of the selected feature sets,
# in inc_transforms order.
p = predictor([transforms[t] for t in inc_transforms])
# Evaluate against the test data; display=True presumably produces the
# printed confusion matrix and measures -- confirm in model.utils.run_test.
test_score = run_test(X, y, test_data, p, display=True)

>> Running against test data <<

Confusion matrix:
           for  against  observing
for        198       11         39
against     10       72         11
observing   52       10        106

Measures:
accuracy: 0.7387

Per class:
            accuracy  precision     recall         F1
for        0.7799607  0.7615385  0.7983871  0.7795276
against    0.9174853  0.7741935  0.7741935  0.7741935
observing  0.7799607  0.6794872  0.6309524   0.654321


  if e in SVOTransform._entailment_map.keys() and x == w]


In [9]:
# Number of columns each selected feature set produces: a fresh transform is
# built, fitted on X and applied to X, and the output's second dimension is
# recorded.  Note this refits every transform independently of `p`.
feature_sizes = [transforms[t]().fit(X).transform(X).shape[1] for t in inc_transforms]

In [10]:
# Column offsets delimiting the feature sets: a leading 0 followed by the
# running total of the per-set widths.
boundaries = np.concatenate(([0], np.cumsum(feature_sizes)))

In [11]:
# Map each half-open (start, end) column range to the label of the feature
# set occupying it, pairing consecutive offsets with inc_transforms in order.
feature_boundaries = {
    (start, end): label
    for start, end, label in zip(boundaries[:-1], boundaries[1:], inc_transforms)
}

In [12]:
# Display the (start, end) -> feature-set label mapping for inspection.
feature_boundaries

{(0, 1): 'Q',
 (1, 36): 'BoW-Hed',
 (36, 48): 'BoW-Ref',
 (48, 1272): 'I',
 (1272, 1772): 'BoW',
 (1772, 1773): 'Sim-Algn-PPDB',
 (1773, 1775): 'Root-Dist',
 (1775, 1778): 'Neg-Algn',
 (1778, 1787): 'SVO'}

In [13]:
def in_range(x, r):
    """Tell whether ``x`` lies in the half-open interval ``[r[0], r[1])``.

    ``r`` is any indexable pair; the lower bound is inclusive and the upper
    bound is exclusive.
    """
    at_or_above_start = r[0] <= x
    # The upper bound is only consulted once the lower bound is satisfied.
    return at_or_above_start and x < r[1]

In [14]:
def map_important_features(f, boundaries=None):
    """Group feature column indices by the feature set they belong to.

    Parameters
    ----------
    f : iterable of int
        Column indices to classify.
    boundaries : dict, optional
        Maps half-open ``(start, end)`` column ranges to feature-set labels.
        When omitted, the notebook-level ``feature_boundaries`` mapping is
        used, so existing one-argument calls behave as before.

    Returns
    -------
    dict
        Feature-set label -> set of the indices from ``f`` that fall inside
        that label's range.  Labels with no matching index are absent, and
        indices outside every range are silently dropped.
    """
    if boundaries is None:
        # Resolved at call time so the function picks up the current
        # notebook mapping rather than a stale copy.
        boundaries = feature_boundaries
    grouped = {}
    for idx in f:
        for (start, end), label in boundaries.items():
            # Start is inclusive, end is exclusive.
            if start <= idx < end:
                grouped.setdefault(label, set()).add(idx)
    return grouped

In [15]:
# Show the class labels held by the fitted classifier; their order is what
# the row indices into coef_ in later cells are read against.
p.classifier.classes_

array(['against', 'for', 'observing'], dtype=object)

In [24]:
# Column indices whose coefficient is strictly positive in row 1 of coef_;
# per the classes_ output in this notebook, position 1 is the 'for' class.
important_features = np.flatnonzero(p.classifier.coef_[1, :] > 0)

In [25]:
# List which feature sets contribute at least one positively weighted column.
map_important_features(important_features).keys()

['SVO', 'Neg-Algn', 'I', 'BoW', 'Sim-Algn-PPDB', 'BoW-Hed', 'Root-Dist']

In [166]:
# Stand-alone bag-of-words transform, separate from the one inside `p`,
# created so its fitted state can be inspected directly.
bowt = BoWTransform()

In [168]:
# Fit on the training inputs and display the resulting feature matrix repr.
bowt.fit(X).transform(X)

<2086x500 sparse matrix of type '<type 'numpy.int64'>'
	with 15858 stored elements in Compressed Sparse Row format>

In [170]:
# Inspect the fitted term -> column index mapping
# (`cv` is presumably a scikit-learn CountVectorizer -- confirm in BoWTransform).
# NOTE(review): this dumps the whole mapping; the output is long.
bowt.cv.vocabulary_

{u'000': 0,
 u'10': 1,
 u'100': 2,
 u'12': 3,
 u'12 inch': 4,
 u'2015': 5,
 u'300': 6,
 u'500': 7,
 u'abdel': 8,
 u'about': 9,
 u'abu': 10,
 u'abu bakr': 11,
 u'accidentally': 12,
 u'afghan': 13,
 u'afghan soldiers': 14,
 u'after': 15,
 u'after he': 16,
 u'against': 17,
 u'air': 18,
 u'airport': 19,
 u'airstrike': 20,
 u'airstrikes': 21,
 u'al': 22,
 u'al baghdadi': 23,
 u'all': 24,
 u'alleged': 25,
 u'allegedly': 26,
 u'amazon': 27,
 u'american': 28,
 u'an': 29,
 u'and': 30,
 u'angry': 31,
 u'ankles': 32,
 u'app': 33,
 u'apple': 34,
 u'apple watch': 35,
 u'are': 36,
 u'arrest': 37,
 u'arrested': 38,
 u'as': 39,
 u'at': 40,
 u'at least': 41,
 u'attack': 42,
 u'audio': 43,
 u'baby': 44,
 u'back': 45,
 u'baghdadi': 46,
 u'bakr': 47,
 u'bakr al': 48,
 u'bale': 49,
 u'bank': 50,
 u'bank hank': 51,
 u'banksy': 52,
 u'batmobile': 53,
 u'battery': 54,
 u'battery life': 55,
 u'be': 56,
 u'bear': 57,
 u'bear attack': 58,
 u'beats': 59,
 u'beats music': 60,
 u'because': 61,
 u'been': 62,
 u'befo

In [181]:
# Total number of strictly positive coefficients over every class row of
# coef_.  Counting over the whole array avoids hard-coding the number of
# classes and the per-row Python loop.
int(np.count_nonzero(p.classifier.coef_ > 0))

353