In [1]:
import pandas as pd
import numpy as np
from time import time
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
# needs to be installed first
from xgboost import XGBRegressor

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.metrics import jaccard_distance

from scipy.stats import pearsonr
from nltk.tag.perceptron import PerceptronTagger
from nltk import download
from nltk.corpus import treebank
download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Jim\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

### Initialize variables

In [2]:
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

train_data = treebank.tagged_sents()

# Perceptron tagger
per = PerceptronTagger(load='false')
per.train(train_data)

file_prefix = lambda stage: stage + '/STS'

### Read input data (train/test set + golden standards) and clean data

In [3]:
def getData(filenames, file_prefix, **removals):
    """
    Reads data and corresponding golden standard file as the true label
    """
    input_prefix = file_prefix + '.input.'
    gs_prefix = file_prefix + '.gs.'
    input = pd.DataFrame()
    for filename in filenames:
        sentences = pd.read_csv(input_prefix + filename + '.txt',
                                sep='\t', names=['sentence1', 'sentence2'],
                                quoting=3)
        golden_standards = pd.read_csv(gs_prefix + filename + '.txt',
                                       names=['golden_standard'])
        dfX = pd.concat([sentences, golden_standards], axis=1)
        input = pd.concat([input, dfX])

    input.reset_index(drop=True)
    # Remove punctuation (and numbers)
    input[['sentence1', 'sentence2']] = input[['sentence1', 'sentence2']].apply(lambda col: sentClean(col, **removals))
    # Tokenize (remove stop words)
    input[['sentence1', 'sentence2']] = input[['sentence1', 'sentence2']].apply(lambda col: sentTokenize(col, **removals))
    return input

def sentClean(sents, **removals):
    """
    Clean data from punctuations and numbers
    """
    new_sent = []
    for sent in sents:
        if removals.get('numbers'):
            mod_sent = re.sub(r'[^A-Z a-z]', ' ', sent)
        else:
            mod_sent = re.sub(r'[^\w]', ' ', sent)  # |\b\w\b -> to take out single chars
        clean_sent = re.sub(r'[ ]+', ' ', mod_sent.strip())
        if len(clean_sent) == 0:
            clean_sent = sent
        new_sent.append(clean_sent)
    return new_sent

def sentTokenize(sents, **removals):
    """
    Tokenize sentences and remove stop words if requested
    """
    new_sent = []
    for sent in sents:
        tokens = nltk.word_tokenize(sent)
        if removals.get('stop_words'):
            tokens = [token for token in tokens if token not in stop_words]
        new_sent.append(tokens)
    return new_sent

### Helper functions for transforming sentences and finding their similarity

In [4]:
""" more orthodox and robust implementation """
def dice_coefficient(s1, s2, n=2):
    """dice coefficient 2nt/na + nb."""
    if isinstance((s1), list):
        a = ' '.join(s1)
    elif isinstance(s1, str):
        a = s1
    if isinstance((s2), list):
        b = ' '.join(s2)
    elif isinstance(s2, str):
        b = s2
    if not len(a) or not len(b): return 0.0
    if len(a) == 1:  a = a + u'.'
    if len(b) == 1:  b = b + u'.'

    a_bigram_list = []
    for i in range(len(a) - 1):
        a_bigram_list.append(a[i:i + n])
    b_bigram_list = []
    for i in range(len(b) - 1):
        b_bigram_list.append(b[i:i + n])

    a_bigrams = set(a_bigram_list)
    b_bigrams = set(b_bigram_list)
    overlap = len(a_bigrams & b_bigrams)
    dice_coeff = overlap * 2.0 / (len(a_bigrams) + len(b_bigrams))
    return dice_coeff

def myWSD(pt_pair, context):
    '''
    Returns the synset if it's a noun, a verb, an adverb
    or an adjective or the lowered word
    '''
    wn_postag_map = {'N': 'n',
             'V': 'v',
             'J': 'a',
             'R': 'r'
            }

    pt = wn_postag_map.get(pt_pair[1][0])
    word = pt_pair[0]
    if pt:
        return mylesk(context, word, pt)
    else:
        return word

### Lexical and semantic transformations

In [5]:
def lemmatize(p):
    if p[1][0] in {'N','V'}:
        return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
    return p[0].lower()

def lemmas(sents):
    new_sent = []
    for sent in sents:
        pairs = per.tag(sent)
        new_sent.append([lemmatize(pair) for pair in pairs])
    return new_sent

def ngrams(sents, n, word=True):
    """
    Applying ngrams with default behaviour to apply it on words
    """
    new_sent = []
    n_old = n
    for sent in sents:
        n = n_old
        # Handle sentences smaller than n
        if len(sent) < n:
            n_old = n
            n = len(sent)
        if word:
            grams_lst = [w for w in nltk.ngrams(sent, n)]
            new_sent.append(grams_lst)
        else:
            sent_joined = ' '.join(sent)
            grams_lst = [''.join(ch) for ch in nltk.ngrams(sent_joined, n)]
            new_sent.append(grams_lst)
    return new_sent

def mylesk(context_sentence, ambiguous_word, pos=None, synsets=None):
    """
    Optimization of lesk for word disambiguation which uses the definitions
    and all the examples of all the synset and all of its hypernyms. It returns
    back the best matched hypernym.
    """
    context = set(context_sentence)
    hyper = lambda s: s.hypernyms()
    if synsets is None:
        synsets = wn.synsets(ambiguous_word)

    if pos:
        synsets = [ss for ss in synsets if str(ss.pos()) == pos]

    if not synsets:
        return ambiguous_word

    max_sense = []
    for ss in synsets:
        ss_lst = list(ss.closure(hyper))
        ss_lst.append(ss)
        for ss_hyper in ss_lst:
            max_sense.append(
                (len(context.intersection(ss_hyper.definition().split())), ss_hyper, ss)
                # (self.dice_coefficient(context, ss_hyper.definition()), ss_hyper, ss)
            )
            for ex in ss_hyper.examples():
                if ex:
                    max_sense.append(
                        (len(context.intersection(ex.split())), ss_hyper, ss)
                        # (self.dice_coefficient(context, ex), ss_hyper, ss)
                    )
    _, hyper, sense = max(max_sense)
    return hyper.lemmas()[0].name()

def lesk(sents):
    new_sent = []
    for sent in sents:
        pairs = per.tag(sent)
        new_sent.append([myWSD(pair,sent) for pair in pairs])
    return new_sent

### Model1 & Model2 construction

In [6]:
def modelFeatures(df_sents, files='', stage=''):
    """
    Constructing the features of the Model1 using lexical and semantic transformations
    """
    Xlesk = df_sents.apply(lesk)

    Mdice_lem = df_sents.apply(lambda x: dice_coefficient(x['sentence1'], x['sentence2']), axis=1)

    Mjac_lem = df_sents.apply(lambda x: jaccard_distance(set(x['sentence1']), set(x['sentence2'])), axis=1)

    Mlesk = Xlesk.apply(lambda x: dice_coefficient(x['sentence1'], x['sentence2']), axis=1)

    Xngram = df_sents.apply(lambda col: ngrams(col, 2))
    Mngram2 = Xngram.apply(lambda x: jaccard_distance(set(x['sentence1']), set(x['sentence2'])), axis=1)

    Xngram = df_sents.apply(lambda col: ngrams(col, 4))
    Mngram4 = Xngram.apply(lambda x: jaccard_distance(set(x['sentence1']), set(x['sentence2'])), axis=1)

    # Use of features without stop words
    removals = {
        'stop_words': True,
        'numbers': True
    }
    df_XtrainSW = getData(files, file_prefix(stage), **removals)

    df_XtrainSW = df_XtrainSW.drop(df_XtrainSW.columns[len(df_XtrainSW.columns) - 1], axis=1)

    Xlem = df_XtrainSW.apply(lemmas)

    Xngram = Xlem.apply(lambda col: ngrams(col, 1))
    Mngram1_sw = Xngram.apply(lambda x: jaccard_distance(set(x['sentence1']), set(x['sentence2'])), axis=1)

    Xngram = Xlem.apply(lambda col: ngrams(col, 3))
    Mngram2_sw = Xngram.apply(lambda x: jaccard_distance(set(x['sentence1']), set(x['sentence2'])), axis=1)
    
    df = pd.concat([Mdice_lem, Mjac_lem, Mlesk, Mngram2, Mngram4, Mngram1_sw, Mngram2_sw], axis=1)
    return df

def fit_model2(X):
    """
    The train model for the Model2 using Tf/Idf over bag of words 
    """
    bow = CountVectorizer(lowercase=False,
                          analyzer=lambda x: x)
    join_X = X['sentence1'] + X['sentence2']
    bow_Xtrn = bow.fit_transform(join_X)
    sents_tfidf = TfidfTransformer()
    tfidf_Xtrn = sents_tfidf.fit_transform(bow_Xtrn)
    return bow, sents_tfidf, tfidf_Xtrn

### Main function

In [7]:
filenames_train = ['MSRpar', 'MSRvid', 'SMTeuroparl']
filenames_test = ['MSRpar', 'MSRvid', 'SMTeuroparl',
                  'surprise.OnWN', 'surprise.SMTnews']

removals = {
            'stop_words': False,
            'numbers': True
            }

start_global = time()
# Tokenized raw train data
df_Xtrain = getData(filenames_train, file_prefix('train'), **removals)
labels_trn = df_Xtrain.iloc[:, -1]
df_Xtrain = df_Xtrain.drop(df_Xtrain.columns[len(df_Xtrain.columns) - 1], axis=1)

# Tokenized raw test data
df_Xtest = getData(filenames_test, file_prefix('test'), **removals)
labels_tst = df_Xtest.iloc[:, -1]
df_Xtest = df_Xtest.drop(df_Xtest.columns[len(df_Xtest.columns) - 1], axis=1)

# Base transformations of sentences
Xlem = df_Xtrain.apply(lemmas)
Xlem_tst = df_Xtest.apply(lemmas)

## Model1
start_model = time()
# Transform the train data
transformed_train = modelFeatures(Xlem, filenames_train, 'train')

# Model1 training algorithm
reg_model = AdaBoostRegressor()
# Construct feature of Model1 for ensemble model
kfold = model_selection.KFold(n_splits=10)
results_model1 = model_selection.cross_val_predict(reg_model, transformed_train, labels_trn, cv=kfold)

# Transformation of test data
transformed_test = modelFeatures(Xlem_tst, filenames_test, 'test')

# Make predictions over test data for Model1 (output of Model1)
pred_model = reg_model.fit(transformed_train, labels_trn).predict(transformed_test)
print()
print('Model1 Pearson Correlation:', pearsonr(pred_model.T.tolist(), labels_tst.tolist())[0])
duration_model = time() - start_model
print('Duration:', duration_model)

## Model2
start_model = time()
# Transform data 
bow, sents_tfidf, tfidf_Xtrn = fit_model2(Xlem)
# Model2 training algorithm
reg_bow = GradientBoostingRegressor()

# Construct feature of Model2 for ensemble model (output of Model2)
results_model2 = model_selection.cross_val_predict(reg_bow, tfidf_Xtrn, labels_trn, cv=kfold)
# join test sentences for applying bag of words with TF/IDF
join_Xlem_tst = Xlem_tst['sentence1'] + Xlem_tst['sentence2']
bow_Xtst = bow.transform(join_Xlem_tst)
tfidf_Xtst = sents_tfidf.transform(bow_Xtst)

# Make predictions over test data for Model2
pred_bow = reg_bow.fit(tfidf_Xtrn, labels_trn).predict(tfidf_Xtst)
print()
print('Model2 Pearson Correlation:', pearsonr(pred_bow.T.tolist(), labels_tst.tolist())[0])
duration_model = time() - start_model
print('Duration:', duration_model)

## Ensemble Model3
reg_final = XGBRegressor().fit(np.concatenate([results_model1.reshape(-1,1), results_model2.reshape(-1,1)], axis=1), labels_trn)
pred_final = reg_final.predict(np.concatenate([pred_model.reshape(-1,1), pred_bow.reshape(-1,1)], axis=1))
print()
print('Final Combined Model Pearson Correlation:', pearsonr(pred_final.T.tolist(), labels_tst.tolist())[0])
duration = time() - start_global
print('Global duration:', duration)


Model1 Pearson Correlation: 0.6318505133806376
Duration: 122.60720443725586

Model2 Pearson Correlation: 0.6255477067661559
Duration: 30.31155824661255

Final Combined Model Pearson Correlation: 0.7519449894980038
Global duration: 186.37055492401123


## Conclusions

The final correlation between the golden standard of the test sentences and our prediction model is 0.752. Overall, this shows that our model can find some textual similarities between sentences and give results quite close to the real ones.

We noticed through the pre-work that some simple features were giving good results like lemmas even when we were not adding more information (features). Synsets on the other hand were not giving good results alone, that's why we used them through our own optimization of lesk algorithm for disambiguating words. Furthermore, specific sizes of ngrams were proved to be quite informative as features of our model. Finally, the use of Tf/Idf metric over bag of words gave some unexpectedly good results because of similarities in the train and test set topics. We decided directly to keep this approach as another model because of the size of its feature space. Also, we didn't want to put in the same feature matrix so different kind of metrics.

At the end, we ended up using the following transformations on the data (sentences) before applying a similarity metric:
* lemmas (as a base transformation for the rest)
* n-grams
* word substitutions using word disambiguation with our optimization of lesk.
* bag of words

At this point, we need to specify that in order to get a better representation of a sentence we substituted words with the (first) lemma of the best matched hypernym synset. We thought that the hypernym between 2 words of different senteces can be the same more frequently than the synset of the word we wanted to disabiguate. Also, in order to avoid errors we return the same word in case there is no synset for a word.

The similarity metrics we used were:
* Dice coefficient
* Jaccard distance
* Tf/Idf (it's more like a statistic 

Dice coefficient values more the intersection of the sentences and uses bi-grams. It proved to give consistently better results, whenever we managed to use it, in comparison with jaccard distance.

Finally, we constructed 2 different models:
* Model1, which includes the following features:
    * Dice coefficient over lemmas
    * Dice coefficient over Lesk optimization
    * Jaccard distance over lemmas
    * Jaccard distance over lemmas without stopwords
    * Jaccard distance over lemmas of n-grams:
        * 2 words
        * 4 words
        * 3 words without stopwords
    
* Model2, which is the Tf/Idf statistic over bag of words.

Our problem is a regression problem. We tested different regressors and at the end we ended up using AdaBoostRegressor for Model1 and GradientBoostingRegressor for Model2.

In order to combine the models we applied an ensemble technique called stacking using K-folds, K=10, splitting our training set into 9 folds for training and 1 for prediction. In every fold we get a prediction over the prediction set; then we construct a vector with all the predictions which represents the testing error over the training folds. After doing this for both models, we train with the whole training dataset for getting the predictions/output of each model over the test set.

Then, after constructing a vector from the K-fold of each model, we use them as features for our ensemble model which will combine the results of the other 2 models we described. Like this, it would be like training over the (simulated) testing error. That technique boosted our model and made it much stronger since it combines the information from both and makes our final model learn some inaccuracies of the other 2 base models. For that model we used Extreme Gradient Boosting Regressor, which is a very famous algorithm used in most of AI challenges.

In general, we noticed that simple features we had tried during the course were giving better results than more complex ones; that's why at the end, our models don't use any kind of special features. Also, we took advantage of the fact that the source of the training and some of the test (3 out of 5) sets is the same and we used bag of words with Tf/Idf. Finally, the addition of similar transformations using the same similarity metric gave a rise to our final results.