In [97]:
from pathlib import Path 
from dataclasses import dataclass
from typing import List
from pprint import pprint

In [167]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
data_train_path = Path('SemEval-PIT2015-py3/data/train.data')
data_test_path = Path('SemEval-PIT2015-py3/data/test.data')

In [113]:
@dataclass
class Token:
    word: str
    person: str
    pos: str
    clause: str
    event: str
        
        
@dataclass
class Doc:
    tokens: List[Token]
        
    def __iter__(self):
        for t in self.tokens:
            yield t
            
    def _list_attrs(self, attr_name):
        return [getattr(t, attr_name) for t in self]
    
    def _string_attrs(self, attr_name):
        return ' '.join(self._list_attrs(attr_name))
    
    def _ner_words(self, ner_name):
        result = list()
        for word, ne in zip(self.words, self._list_attrs(ner_name)):
            if ne != 'O':
                result.append(word)
        return result
    
    @property
    def words(self):
        return self._list_attrs('word')
    
    @property
    def lemmas(self):
        return [x.lower() for x in self.words]
    
    @property
    def persons_tags(self):
        return self._list_attrs('person')
    
    @property
    def pos(self):
        return self._list_attrs('pos')
    
    @property
    def clauses(self):
        return self._list_attrs('clause')
    
    @property
    def events_tags(self):
        return self._list_attrs('event')
    
    @property
    def sent(self):
        return self._string_attrs('word')
    
    @property
    def persons(self):
        return self._ner_words('person')
    
    @property
    def events(self):
        return self._ner_words('event')

In [172]:
def split_tags(string):
    return [Token(*i.split("/")) for i in string.split()]


def read_train_data(filename):
    data = []
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = \
            line.split('\t')
        else:
            continue
        # ignoring the training data that has middle label 
        nYes = eval(judge)[0]        
        if nYes >= 3:
            amt_label = True
            data.append((Doc(split_tags(origsenttag)), Doc(split_tags(candsenttag)), amt_label))
        elif nYes <= 1:
            amt_label = False
            data.append((Doc(split_tags(origsenttag)), Doc(split_tags(candsenttag)), amt_label))
    return data


def read_test_data(filename):
    data = []
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = \
            line.split('\t')
        else:
            continue
        # ignoring the training data that has middle label 
        nYes = int(judge[0])
        if nYes >= 4:
            expert_label = True
        elif nYes <= 2:
            expert_label = False
        else:
            expert_label = None
        data.append((Doc(split_tags(origsenttag)), Doc(split_tags(candsenttag)), expert_label))
    return data


In [199]:
def data_label_sep(data):
    x, labels = list(), list()
    for d1, d2, label in data:
        x.append((d1, d2))
        labels.append(labels)
    return x, labels

In [181]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.base import TransformerMixin

In [201]:
def jaccard_similarity(s1, s2):
    return len(s1 & s2) / (len(s1 | s2) + 1e-5)


def char_ngrams(text, n=3):
    return [text[i:i + n] for i in range(len(text) - n + 1)]


def word_ngrams(words, n=2):
    return ['_'.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# =============================================================================


class TextFeatures(TransformerMixin):
    
    def fit(self, data, labels=None):
        return self
    
    def transform(self, data):
        return [self.compute_features_one(doc1, doc2) for doc1, doc2 in data]
        
    def compute_features_one(self, doc1, doc2):
        features = dict()
        features.update(self.jaccard_char_ngrams(doc1, doc2))
        features.update(self.jaccard_word_ngrams(doc1, doc2))
        
        features[f'persons_jsim'] = jaccard_similarity(set(doc1.persons), set(doc2.persons))
        features[f'events_jsim'] = jaccard_similarity(set(doc1.events), set(doc2.events))
    
        features['n_words_1'] = len(doc1.words)
        features['n_words_2'] = len(doc2.words)
        features['fraction_n_words'] = len(doc1.words) / len(doc2.words)
        return features
    
    def jaccard_char_ngrams(self, doc1, doc2):
        features = dict()
        for n in range(2, 5):
            ng1 = char_ngrams(doc1.sent.lower(), n=n)
            ng2 = char_ngrams(doc2.sent.lower(), n=2)
            features[f'char_jsim_{n}'] = jaccard_similarity(set(ng1), set(ng2))
        return features
    
    def jaccard_word_ngrams(self, doc1, doc2):
        features = dict()
        for n in range(1, 3):
            ng1 = word_ngrams(doc1.lemmas, n=n)
            ng2 = word_ngrams(doc2.lemmas, n=n)
            features[f'word_jsim_{n}'] = jaccard_similarity(set(ng1), set(ng2))
        return features

In [208]:
from nltk.corpus import wordnet as wn

In [209]:
doc1, doc2 = data_train[0]

In [205]:
feat_pipe = Pipeline([
    ('text', TextFeatures()),
    ('vectorizer', DictVectorizer()),
    ('polynomial', PolynomialFeatures())
])

model_pipe = Pipeline([
    ('features', feat_pipe),
    ('logit', LogisticRegression())
])

hyperparameters = {
    'features__polynomial__degree': [1],
    'logit__C': [10],
    'logit__class_weight': ['balanced'],
}

clf = GridSearchCV(model_pipe, hyperparameters, cv=3, scoring='f1', refit=True)

In [206]:
data_train = read_train_data(data_train_path)
data_test = read_test_data(data_test_path)

data_train, label_traint = data_label_sep(data_train)

In [207]:
clf.fit(data_train, train_labels)

print(f'Best Score: {clf.best_score_:.3f}')
print(f'\nBest Params: {clf.best_params_}')

Best Score: 0.702

Best Params: {'features__polynomial__degree': 1, 'logit__C': 10, 'logit__class_weight': 'balanced'}
