# Replicating SemEval Task 9 - Subtask A results 

In [1]:
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score as acc
import numpy
from nltk.tokenize import word_tokenize
import nltk
import re
from collections import Counter
from textblob import TextBlob
from spellchecker import SpellChecker
import string

### Read in the data

In [2]:
train_data = pd.read_csv('../SemEval_Task_9/Subtask-A-master/V1.4_Training.csv',
                         header=None, names=['id','sentence','label'])
dev_data = pd.read_csv('../SemEval_Task_9/Subtask-A-master/SubtaskA_Trial_Test_Labeled.csv', 
                       encoding='latin-1', header=0)
test_data = pd.read_csv('../SemEval_Task_9/Subtask-A-master/SubtaskA_EvaluationData_labeled.csv',
                        header=None, names=['id','sentence','label'])

sent_list = train_data.loc[:, ['sentence']].values.tolist()
gold_labels = train_data.loc[:,'label'].values.tolist()

test_sent_list = test_data.loc[:, 'sentence'].values.tolist()
test_gold_labels = test_data.loc[:,'label'].values.tolist()

dev_sent_list = dev_data.loc[:, 'sentence'].values.tolist()
dev_gold_labels = dev_data.loc[:,'label'].values.tolist()

In [3]:
print("1 is a suggestion, 0 is not.")
print("Distribution of Train set:", Counter(gold_labels))
print("Distribution of Dev set:", Counter(dev_gold_labels))
print("Distribution of Test set:", Counter(test_gold_labels))

1 is a suggestion, 0 is not.
Distribution of Train set: Counter({0: 6415, 1: 2085})
Distribution of Dev set: Counter({1: 296, 0: 296})
Distribution of Test set: Counter({0: 746, 1: 87})


### Baseline

In [4]:
def classify(sent_list):

    keywords = ["suggest","recommend","hopefully","go for","request","it would be nice","adding",
                "should come with","should be able","could come with", "i need" , "we need","needs", 
                "would like to","would love to","allow","add"]

    # Goldberg et al.
    pattern_strings = [r'.*would\slike.*if.*', r'.*i\swish.*', r'.*i\shope.*', r'.*i\swant.*', 
                       r'.*hopefully.*', r".*if\sonly.*", r".*would\sbe\sbetter\sif.*", r".*should.*", 
                       r".*would\sthat.*",r".*can't\sbelieve.*didn't.*", r".*don't\sbelieve.*didn't.*", 
                       r".*do\swant.*", r".*i\scan\shas.*"]

    compiled_patterns = []
    for patt in pattern_strings:
        compiled_patterns.append(re.compile(patt))

    label_list = []
    for sent in sent_list:
        tokenized_sent = word_tokenize(sent)
        tagged_sent = nltk.pos_tag(tokenized_sent)
        tags = [i[1] for i in tagged_sent]
        label = 0
        patt_matched = False
        for compiled_patt in compiled_patterns:
            joined_sent = " ".join(tokenized_sent)
            matches = compiled_patt.findall(joined_sent)
            if len(matches) > 0:
                patt_matched = True
        keyword_match = any(elem in keywords for elem in tokenized_sent)


        pos_match = any(elem in ['MD', 'VB'] for elem in tags)

        if patt_matched:
            label = 1
        elif keyword_match == True:
            label = 1
        elif pos_match == True:
            label = 1

        label_list.append(label)



    return label_list

In [5]:
test_pred_labels = classify(test_sent_list)
print("Baseline performance on test:", f1_score(test_gold_labels, test_pred_labels))

Baseline performance on test: 0.26755852842809363


In [6]:
dev_pred_labels = classify(dev_sent_list)
print("Baseline performance on dev:", f1_score(dev_gold_labels, dev_pred_labels))

Baseline performance on dev: 0.720626631853786


### Some observations

If the pos_match option is turned off in the baseline checker, F1 **goes up** to 0.38

**89%** accuracy can be obtained simply by predicting the majority class -- not advice

### NTUA-IS results

In [7]:
def gr_classify(sent_list, sk, P_ab=True, P_c=True, imperative=True, spelling=False):
    
    # words from above with other example words they included - P_a
    pattern_pa = ["suggest","recommend","hopefully","go for","request","it would be nice","adding",
                   "should come with","should be able","could come with", "i need" , "we need","needs", 
                   "would like to","would love to","allow","add", "helpful", "allow", "disallow", "idea",
                   "consider"]

    # Goldberg et al.
    pattern_pc = [r'.*would\slike.*if.*', r'.*i\swish.*', r'.*i\shope.*', r'.*i\swant.*', 
                  r'.*hopefully.*', r".*if\sonly.*", r".*would\sbe\sbetter\sif.*", r".*should.*",
                  r".*would\sthat.*", r".*can't\sbelieve.*didn't.*", r".*don't\sbelieve.*didn't.*", 
                  r".*do\swant.*", r".*i\scan\shas.*"]
    
    # pattern list P_c rules for subtask A
    pattern_pc += [r'.*should\s(not|be|take|include|start).*', r'.*be\sbetter.*', r'.*that\sway.*',
                   r'.*so\sthat.*', r'.*why\snot.*', r'.*suggestion\sis.*', r'.*good\ssolution.*',
                   r'.*the\sidea.*', r'.*to\sallow.*', r'.*would\smake.*', r'.*(will|would)\sbe.*',
                   r'.*(to|would|could)\senable\s(i|would|id)\s(like|prefer).*', r'.*am\sasking\sfor.*',
                   r'.*look\sinto.*', r'.*make\sit.*', r'.*at\sleast.*', r'.*we\sneed.*']
    compiled_pc = [re.compile(patt) for patt in pattern_pc]
    
    # pattern list P_b rules for subtask B (and possibly the same for subtask A)
    # pattern list P_b rules for subtask A
    pattern_pb = [r'.*do\snot.*', r'.*if\sonly.*', r'.*(so|before|can|for|if)\syou.*', 
                   r'.*you\s(will|need|can|may).*', r'.*(make|be)\ssure.*', r'.*watch\sout.*', 
                   r'.*(go|going|asking|wishing)\sfor.*', r'.*would\sadvise.*', 
                   r'.*(will|would|could)\sbe.*', r'.*be\s(prepared|careful|warned|forewarned).*',
                   r'.*(i/would/i\'d)\s(like|prefer).*', r'.*highly\srecommended.*', 
                   r'.*(look|looking)\s(into|for|up|around).*', r'.*why\snot.*', r'.*is\sthere.*',
                   r'.*we\sneed.*']
    compiled_pb = [re.compile(patt) for patt in pattern_pb]
        
    pos_pattern_strings = [r'^UH\sVBP.*', r'^MD\sRB\sPRP.*', r'^(VB|VBP).*', r'^MD.*', 
                           r'^(DT|RB|PRP|NN)\sVB.*']
    compiled_pos_patterns = [re.compile(patt) for patt in pos_pattern_strings]


    label_list = []
    for sent in sent_list:
        score = 0
        
        if len(sent.split()) < 5:
            score -=0.2
        
        clause_split = [a for a in re.split("[.,!?;]|(Please|please)", sent) if a not in [None, '', ' ', 'Please', 'please']]
        for clause in clause_split:
            clause_pos = TextBlob(clause).tags
            
            words = [i[0] for i in clause_pos]
            tags = [i[1] for i in clause_pos]
            
            # Correct misspells
            if spelling:
                words = [spell.correction(w) if w not in spell else w for w in words]
            
            if P_ab:            
                # Pattern P_a
                if any(elem in pattern_pa for elem in words):
                    score += 0.3

                # Pattern P_b
                for compiled_patt in compiled_pb:
                    joined_sent = " ".join(words)
                    matches = compiled_patt.findall(joined_sent)
                    if len(matches) > 0:
                        score += 0.1
            if P_c:
                # Pattern P_c
                for compiled_patt in compiled_pc:
                    joined_sent = " ".join(words)
                    matches = compiled_patt.findall(joined_sent)
                    if len(matches) > 0:
                        score += 0.25

            if imperative:
                # Imperative POS pattern check
                for compiled_pos_patt in compiled_pos_patterns:
                    joined_sent = " ".join(tags)
                    matches = compiled_pos_patt.findall(joined_sent)
                    if len(matches) > 0:
                        score += sk

        if score > 0.15:
            label_list.append(1)
        else:
            label_list.append(0)

    return label_list

## Preprocessing - lower case, spell check

In [8]:
sent_list = [a[0].lower() for a in sent_list]
test_sent_list = [a.lower() for a in test_sent_list]
dev_sent_list = [a.lower() for a in dev_sent_list]

misspell = 0
spell = SpellChecker()
spell.word_frequency.load_words([a for a in string.punctuation] + ['titles/subtitles', 'upload', 'simd'])
for sent in test_sent_list:
    for word in word_tokenize(sent):
        if word in spell:
            continue
        else:
#             print(word)
            misspell += 1
            
print(misspell)

929


In [9]:
test_pred_labels = gr_classify(test_sent_list, sk=0.25)
print("NTUA-IS rules performance on test:", f1_score(test_gold_labels, test_pred_labels))

NTUA-IS rules performance on test: 0.3234624145785877


In [10]:
dev_pred_labels = gr_classify(dev_sent_list, sk=0.25)
print("NTUA-IS rules performance on dev:", f1_score(dev_gold_labels, dev_pred_labels))

NTUA-IS rules performance on dev: 0.7147239263803681


### P_ab performance

In [11]:
test_pred_labels = gr_classify(test_sent_list, sk=0.0, P_ab=True, P_c=False, imperative=False)
print("NTUA-IS rules performance on test with p_ab:", f1_score(test_gold_labels, test_pred_labels))

NTUA-IS rules performance on test with p_ab: 0.3647798742138364


### P_c performance

In [12]:
test_pred_labels = gr_classify(test_sent_list, sk=0.0, P_ab=False, P_c=True, imperative=False)
print("NTUA-IS rules performance on test with p_c:", f1_score(test_gold_labels, test_pred_labels))

NTUA-IS rules performance on test with p_c: 0.44086021505376344


### Imperative performance

In [13]:
test_pred_labels = gr_classify(test_sent_list, sk=0.18, P_ab=False, P_c=False, imperative=True)
print("NTUA-IS rules performance on test with imperative:", f1_score(test_gold_labels, test_pred_labels))

NTUA-IS rules performance on test with imperative: 0.19760479041916168


# Subtask B Results

In [14]:
train_data_b = pd.read_csv('../SemEval_Task_9/Subtask-B-master/V1.4_Training.csv',
                         header=None, names=['id','sentence','label'])
dev_data_b = pd.read_csv('../SemEval_Task_9/Subtask-B-master/SubtaskB_Trial_Test_Labeled.csv', 
                       encoding='latin-1', header=0)
test_data_b = pd.read_csv('../SemEval_Task_9/Subtask-B-master/SubtaskB_EvaluationData_labeled.csv',
                        header=None, names=['id','sentence','label'])

sent_list_b = train_data_b.loc[:, ['sentence']].values.tolist()
gold_labels_b = train_data_b.loc[:,'label'].values.tolist()

test_sent_list_b = test_data_b.loc[:, 'sentence'].values.tolist()
test_gold_labels_b = test_data_b.loc[:,'label'].values.tolist()

dev_sent_list_b = dev_data_b.loc[:, 'sentence'].values.tolist()
dev_gold_labels_b = dev_data_b.loc[:,'label'].values.tolist()

In [15]:
print("1 is a suggestion, 0 is not.")
print("Distribution of Train set:", Counter(gold_labels_b))
print("Distribution of Dev set:", Counter(dev_gold_labels_b))
print("Distribution of Test set:", Counter(test_gold_labels_b))

1 is a suggestion, 0 is not.
Distribution of Train set: Counter({0: 6415, 1: 2085})
Distribution of Dev set: Counter({1: 404, 0: 404})
Distribution of Test set: Counter({0: 476, 1: 348})


## Baseline performance on test

In [16]:
test_pred_labels_b = classify(test_sent_list_b)
print("Baseline performance on test:", f1_score(test_gold_labels_b, test_pred_labels_b))

Baseline performance on test: 0.7321668909825034


In [26]:
def gr_classify_b(sent_list, sk, P_a=True, P_b=True, imperative=True, spelling=False):
    
    # words from above with other example words they included - P_a
#     keywords_pa = ["suggest","recommend","hopefully","go for","request","it would be nice","adding",
#                    "should come with","should be able","could come with", "i need" , "we need","needs", 
#                    "would like to","would love to","allow","add", "helpful", "allow", "disallow", "idea",
#                    "consider"]
    pattern_pa = ['avoid', 'beware', "don't", 'expect', 'remember', 'tip', 'advise', 'advice', 'recommended',
                  'recommendation', 'suggest', 'suggestion', 'ask', 'bring', 'pick', 'consider', 'spend', 
                  'expect', 'can', 'please', 'can', 'hopefully', 'enjoying', 'want', 'wanting', 'prefer']
    

#     # Goldberg et al.
#     pattern_pc = [r'.*would\slike.*if.*', r'.*i\swish.*', r'.*i\shope.*', r'.*i\swant.*', 
#                   r'.*hopefully.*', r".*if\sonly.*", r".*would\sbe\sbetter\sif.*", r".*should.*",
#                   r".*would\sthat.*", r".*can't\sbelieve.*didn't.*", r".*don't\sbelieve.*didn't.*", 
#                   r".*do\swant.*", r".*i\scan\shas.*"]
    
#     # pattern list P_c rules for subtask A
#     pattern_pc += [r'.*should\s(not|be|take|include|start).*', r'.*be\sbetter.*', r'.*that\sway.*',
#                    r'.*so\sthat.*', r'.*why\snot.*', r'.*suggestion\sis.*', r'.*good\ssolution.*',
#                    r'.*the\sidea.*', r'.*to\sallow.*', r'.*would\smake.*', r'.*(will|would)\sbe.*',
#                    r'.*(to|would|could)\senable\s(i|would|id)\s(like|prefer).*', r'.*am\sasking\sfor.*',
#                    r'.*look\sinto.*', r'.*make\sit.*', r'.*at\sleast.*', r'.*we\sneed.*']
#     compiled_pc = [re.compile(patt) for patt in pattern_pc]
    
    # pattern list P_b rules for subtask B (and possibly the same for subtask A)
    # pattern list P_b rules for subtask A
    pattern_pb = [r'.*do\snot.*', r'.*if\sonly.*', r'.*(so|before|can|for|if)\syou.*', 
                   r'.*you\s(will|need|can|may).*', r'.*(make|be)\ssure.*', r'.*watch\sout.*', 
                   r'.*(go|going|asking|wishing)\sfor.*', r'.*would\sadvise.*', 
                   r'.*(will|would|could)\sbe.*', r'.*be\s(prepared|careful|warned|forewarned).*',
                   r'.*(i/would/i\'d)\s(like|prefer).*', r'.*highly\srecommended.*', 
                   r'.*(look|looking)\s(into|for|up|around).*', r'.*why\snot.*', r'.*is\sthere.*',
                   r'.*we\sneed.*']
    compiled_pb = [re.compile(patt) for patt in pattern_pb]
        
    pos_pattern_strings = [r'^UH\sVBP.*', r'^MD\sRB\sPRP.*', r'^(VB|VBP).*', r'^MD.*', 
                           r'^(DT|RB|PRP|NN)\sVB.*']
    compiled_pos_patterns = [re.compile(patt) for patt in pos_pattern_strings]


    label_list = []
    for sent in sent_list:
        score = 0
        
        if len(sent.split()) < 5:
            score -=0.2
        
        clause_split = [a for a in re.split("[.,!?;]|(please)", sent) if a not in 
                        [None, '', ' ', 'please']]
        for clause in clause_split:
            clause_pos = TextBlob(clause).tags
            
            words = [i[0] for i in clause_pos]
            tags = [i[1] for i in clause_pos]
            
            # Correct misspells
            if spelling:
                words = [spell.correction(w) if w not in spell else w for w in words]
            
            if P_a:            
                # Pattern P_a
                if any(elem in pattern_pa for elem in words):
                    score += 0.25


            if P_b:
                # Pattern P_b
                for compiled_patt in compiled_pb:
                    joined_sent = " ".join(words)
                    matches = compiled_patt.findall(joined_sent)
                    if len(matches) > 0:
                        score += 0.25

            if imperative:
                # Imperative POS pattern check
                for compiled_pos_patt in compiled_pos_patterns:
                    joined_sent = " ".join(tags)
                    matches = compiled_pos_patt.findall(joined_sent)
                    if len(matches) > 0:
                        score += sk

        if score > 0.15:
            label_list.append(1)
        else:
            label_list.append(0)

    return label_list

## Preprocessing - lower case, spell check

In [18]:
sent_list_b = [a[0].lower() for a in sent_list_b]
test_sent_list_b = [a.lower() for a in test_sent_list_b]
dev_sent_list_b = [a.lower() for a in dev_sent_list_b]

from spellchecker import SpellChecker
import string

spell = SpellChecker(distance=1)
spell.word_frequency.load_words([a for a in string.punctuation])

In [27]:
test_pred_labels_b = gr_classify_b(test_sent_list_b, sk=0)
print("NTUA-IS rules performance on test:", f1_score(test_gold_labels_b, test_pred_labels_b))

NTUA-IS rules performance on test: 0.7296849087893864


In [20]:
dev_pred_labels_b = gr_classify_b(dev_sent_list_b, sk=0.25)
print("NTUA-IS rules performance on dev:", f1_score(dev_gold_labels_b, dev_pred_labels_b))

NTUA-IS rules performance on dev: 0.6313131313131313


### P_a performance

In [21]:
test_pred_labels_b = gr_classify_b(test_sent_list_b, sk=0.0, P_a=True, P_b=False, imperative=False)
print("NTUA-IS rules performance on test with P_a:", f1_score(test_gold_labels_b, test_pred_labels_b))

NTUA-IS rules performance on test with P_a: 0.4968944099378882


### P_b performance

In [22]:
test_pred_labels_b = gr_classify_b(test_sent_list_b, sk=0.0, P_a=False, P_b=True, imperative=False)
print("NTUA-IS rules performance on test with P_b:", f1_score(test_gold_labels_b, test_pred_labels_b))

NTUA-IS rules performance on test with P_b: 0.2035623409669211


### Imperative performance

In [23]:
test_pred_labels_b = gr_classify_b(test_sent_list_b, sk=0.15, P_a=False, P_b=False, imperative=True)
print("NTUA-IS rules performance on test with imperative:", f1_score(test_gold_labels_b, test_pred_labels_b))

NTUA-IS rules performance on test with imperative: 0.038674033149171276


### With spell correction on

In [24]:
test_pred_labels_b = gr_classify_b(test_sent_list_b, sk=0.0, spelling=True)
print("NTUA-IS rules performance on test:", f1_score(test_gold_labels_b, test_pred_labels_b))

NTUA-IS rules performance on test: 0.5625
