In [None]:
import sys
sys.path.append('src')

***
## Module Imports

In [None]:
import nltk
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from nltk.metrics.distance import jaccard_distance
from collections.abc import Iterable
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import make_scorer

In [None]:
from data_utils import load_data
from dimension.lexical import *
from dimension.syntactical import *

***
## Data

In [None]:
# Load both splits from the data directory and report how many rows each has.
train_data, test_data = load_data('data/')
n_train, n_test = len(train_data), len(test_data)
print(f"train_data samples: {n_train}, test_data samples: {n_test}")

In [None]:
# Preview the first rows of the training split.
train_data.head()

In [None]:
# Preview the first rows of the test split.
test_data.head()

***
## Similarity functions

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of two collections, |A & B| / |A | B|.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
        s1: Iterable of hashable items (e.g. tokens or n-gram tuples).
        s2: Iterable of hashable items.

    Returns:
        ``1 - jaccard_distance(set(s1), set(s2))``. When both inputs are
        empty the union is empty and the ratio is undefined; 0.0 is returned
        by convention (no shared evidence) instead of failing on a division
        by zero.

    Raises:
        AssertionError: If either argument is not iterable.
    """
    assert isinstance(s1, Iterable), f"s1 must be an iterable, not {type(s1)}"
    assert isinstance(s2, Iterable), f"s2 must be an iterable, not {type(s2)}"
    set1, set2 = set(s1), set(s2)
    # An empty union makes the distance 0/0; short-circuit before delegating.
    if not set1 and not set2:
        return 0.0
    return 1 - jaccard_distance(set1, set2)

In [None]:
def overlap_similarity(s1, s2):
    """Return the overlap coefficient |A & B| / min(|A|, |B|) of two collections.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
        s1: Iterable of hashable items.
        s2: Iterable of hashable items.

    Returns:
        A float in [0, 1]. If either input is empty the denominator would be
        zero; 0.0 is returned by convention in that case.

    Raises:
        AssertionError: If either argument is not iterable.
    """
    assert isinstance(s1, Iterable), f"s1 must be an iterable, not {type(s1)}"
    assert isinstance(s2, Iterable), f"s2 must be an iterable, not {type(s2)}"
    set1, set2 = set(s1), set(s2)
    smaller_size = min(len(set1), len(set2))
    # Nothing can overlap with an empty set; also avoids dividing by zero.
    if smaller_size == 0:
        return 0.0
    return len(set1 & set2) / smaller_size

In [None]:
def cosine_similarity(s1, s2):
    """Return the set-based cosine similarity |A & B| / sqrt(|A| * |B|).

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored. Identical non-empty sets score exactly 1.0.

    Args:
        s1: Iterable of hashable items.
        s2: Iterable of hashable items.

    Returns:
        A float in [0, 1]. If either input is empty the denominator would be
        zero; 0.0 is returned by convention in that case.

    Raises:
        AssertionError: If either argument is not iterable.
    """
    assert isinstance(s1, Iterable), f"s1 must be an iterable, not {type(s1)}"
    assert isinstance(s2, Iterable), f"s2 must be an iterable, not {type(s2)}"
    set1, set2 = set(s1), set(s2)
    if not set1 or not set2:
        return 0.0
    # The normaliser is the geometric mean of the two set sizes (square root
    # of the product), not the product squared.
    return len(set1 & set2) / (len(set1) * len(set2)) ** 0.5

In [None]:
def dice_similarity(s1, s2):
    """Return the Dice coefficient 2 * |A & B| / (|A| + |B|) of two collections.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
        s1: Iterable of hashable items.
        s2: Iterable of hashable items.

    Returns:
        A float in [0, 1]. When both inputs are empty the denominator would
        be zero; 0.0 is returned by convention in that case.

    Raises:
        AssertionError: If either argument is not iterable.
    """
    assert isinstance(s1, Iterable), f"s1 must be an iterable, not {type(s1)}"
    assert isinstance(s2, Iterable), f"s2 must be an iterable, not {type(s2)}"
    set1, set2 = set(s1), set(s2)
    total_size = len(set1) + len(set2)
    # Both sets empty: the ratio is 0/0, so report "no similarity".
    if total_size == 0:
        return 0.0
    return 2 * len(set1 & set2) / total_size

***
## Feature loading

### feature vector builder for dataframe of sentence pairs

Declaration of the function that iterates over the dataframe containing the sentence pairs (any other columns are ignored). The sentence columns must be named `"S1"` and `"S2"`.

Returns a numpy array of shape `(n_sentence_pairs, n_features)`

In [None]:
def get_features(df: pd.DataFrame):
    """Build the similarity feature matrix for a dataframe of sentence pairs.

    For every row, the two sentences are passed through the preprocessing
    helpers (tokenization, lowercasing, stopword filtering, lemmatization,
    named entities, n-grams, Lesk, stemming). Each resulting pair of "views"
    is then compared with ``jaccard_similarity`` and ``dice_similarity``.

    Args:
        df: Dataframe holding the sentences in columns ``"S1"`` and ``"S2"``.
            Other columns are ignored. Rows are processed in order and
            addressed by position, so the dataframe index may be arbitrary.

    Returns:
        ``np.ndarray`` with one row per sentence pair and 30 columns (for a
        non-empty dataframe): the 15 Jaccard similarities followed by the 15
        Dice similarities, both in the order of ``views`` below.

    Raises:
        AssertionError: If ``"S1"`` or ``"S2"`` is missing from ``df``.
    """
    assert "S1" in df.columns, "S1 not in dataframe"
    assert "S2" in df.columns, "S2 not in dataframe"

    features = []

    # Iterate over the two columns directly: rows are collected by position,
    # never by index label (labels need not be 0..n-1, e.g. after filtering).
    for sentence1, sentence2 in zip(df["S1"], df["S2"]):
        # All words, original case and lowercase
        tokenized_1, tokenized_2 = get_tokenized_sentences(
            sentence1, sentence2, return_unique_words=False)
        tokenized_lc_1, tokenized_lc_2 = get_tokenized_sentences_lowercase(
            tokenized_1, tokenized_2, return_unique_words=False)

        # Words without stopwords
        no_stopwords_1, no_stopwords_2 = filter_stopwords(
            tokenized_1, tokenized_2, return_unique_words=False)
        no_stopwords_lc_1, no_stopwords_lc_2 = filter_stopwords(
            tokenized_lc_1, tokenized_lc_2, return_unique_words=False)

        # Lemmas
        lemmatized_1, lemmatized_2 = get_lemmas(
            tokenized_1, tokenized_2, return_unique_words=False)
        lemmatized_lc_1, lemmatized_lc_2 = get_lemmas(
            tokenized_lc_1, tokenized_lc_2, return_unique_words=False)

        # Named entities
        sentence_ne_1, sentence_ne_2 = get_named_entities(
            tokenized_1, tokenized_2)

        # Lemmas of the stopword-filtered tokens
        stopwords_and_lemmas1, stopwords_and_lemmas2 = get_lemmas(
            no_stopwords_1, no_stopwords_2, return_unique_words=False)
        stopwords_and_lemmas_lc_1, stopwords_and_lemmas_lc_2 = get_lemmas(
            no_stopwords_lc_1, no_stopwords_lc_2, return_unique_words=False)

        # Bigrams and trigrams over the stopword-filtered tokens
        bigrams_1, bigrams_2 = get_ngrams(no_stopwords_1, no_stopwords_2, n=2)
        trigrams_1, trigrams_2 = get_ngrams(no_stopwords_1, no_stopwords_2, n=3)

        # Bigrams and trigrams built from the raw sentences
        bigrams_sent_1, bigrams_sent_2 = get_ngrams_with_sent_tokenize(
            sentence1, sentence2, n=2)
        trigrams_sent_1, trigrams_sent_2 = get_ngrams_with_sent_tokenize(
            sentence1, sentence2, n=3)

        # Lesk
        lesk_1, lesk_2 = get_lesk_sentences(tokenized_1, tokenized_2)

        # Stemmer
        stemmed_1, stemmed_2 = get_stemmed_sentences(sentence1, sentence2)

        # The order of this list defines the column order of the output.
        views = [
            (tokenized_1, tokenized_2),
            (tokenized_lc_1, tokenized_lc_2),
            (no_stopwords_1, no_stopwords_2),
            (no_stopwords_lc_1, no_stopwords_lc_2),
            (lemmatized_1, lemmatized_2),
            (lemmatized_lc_1, lemmatized_lc_2),
            (sentence_ne_1, sentence_ne_2),
            (stopwords_and_lemmas1, stopwords_and_lemmas2),
            (stopwords_and_lemmas_lc_1, stopwords_and_lemmas_lc_2),
            (bigrams_1, bigrams_2),
            (trigrams_1, trigrams_2),
            (bigrams_sent_1, bigrams_sent_2),
            (trigrams_sent_1, trigrams_sent_2),
            (lesk_1, lesk_2),
            (stemmed_1, stemmed_2),
        ]

        # ALL features: every view under Jaccard first, then under Dice.
        features.append([
            similarity(view_1, view_2)
            for similarity in (jaccard_similarity, dice_similarity)
            for view_1, view_2 in views
        ])

        # A previously selected "best" feature subset, kept for reference:
        #   jaccard: tokenized, no_stopwords, lemmatized, bigrams, trigrams
        #   dice:    tokenized, no_stopwords, no_stopwords_lc, sentence_ne,
        #            bigrams, trigrams, stopwords_and_lemmas_lc, lesk

    return np.array(features)

In [None]:
# TEST CELL
def show_sentences_with_applications(train_head, results):
    """Print each sentence pair, its gold score and its processed versions.

    Args:
        train_head: Dataframe with columns ``"S1"``, ``"S2"`` and ``"Gs"``.
        results: ``None``, or a list of result lists. Each result list holds
            one ``(processed_s1, processed_s2)`` pair per row of
            ``train_head``, in row order.
    """
    # `index` is the dataframe label (shown to the reader); `position` is the
    # row number, which is what lines up with the entries of each result list.
    for position, (index, row) in enumerate(train_head.iterrows()):
        sentence1, sentence2 = row['S1'], row['S2']
        gs = row['Gs']
        print(f"-{index}- ({gs})")
        print(sentence1)
        print(sentence2)
        if results is not None:
            for result in results:
                print("-----------------------------")
                print(result[position][0])
                print(result[position][1])
        print("********************************************************")
        
def apply_function(data, method_to_apply, params = None):
    """Apply a pairwise function to every sentence pair.

    Args:
        data: Iterable of ``(sentence1, sentence2)`` pairs.
        method_to_apply: Callable taking the two sentences (plus any extra
            positional arguments) and returning a pair of results.
        params: Optional sequence of extra positional arguments forwarded to
            ``method_to_apply`` after the two sentences. ``None`` or an empty
            sequence forwards nothing; any length is supported.

    Returns:
        List of ``(result1, result2)`` tuples, one per input pair.
    """
    extra_args = () if params is None else tuple(params)
    results = []
    for sentence1, sentence2 in data:
        # Unpacking keeps the requirement that exactly two results come back.
        result1, result2 = method_to_apply(sentence1, sentence2, *extra_args)
        results.append((result1, result2))
    return results


train_head = train_data.head()
sentence_list = list(zip(train_head['S1'], train_head['S2']))

# Tokenize every pair, feed the tokens to get_lesk_sentences and display
# only that second stage next to the original sentences.
tokens = apply_function(sentence_list, get_tokenized_sentences)
lesk = apply_function(tokens, get_lesk_sentences)
results = [lesk]

show_sentences_with_applications(train_head, results)

In [None]:
# TEST cell don't delete it =D

# Hand-written sentence pair used to eyeball tokenization and named entities.
first = "My Bonnie White lies over the ocean, in Picadilli Circus at 3:00pm."
second = "My Bonnie lied over the sea! Over the sea..."

tokenized_1, tokenized_2 = get_tokenized_sentences(
    first, second, return_unique_words=False)
no_stopwords_1, no_stopwords_2 = filter_stopwords(
    tokenized_1, tokenized_2, return_unique_words=False)
sentence_ne_1, sentence_ne_2 = get_named_entities(no_stopwords_1, no_stopwords_2)

# One value per line: both token lists, then both named-entity results.
print(tokenized_1, tokenized_2, sentence_ne_1, sentence_ne_2, sep="\n")
#TEST cell

In [None]:
# TEST CELL
# NOTE(review): this helper is also defined in an earlier test cell of this
# notebook; after Run All, this later definition is the one in effect.
def show_sentences_with_applications(train_head, results):
    """Print each sentence pair, its gold score and its processed versions.

    Args:
        train_head: Dataframe with columns ``"S1"``, ``"S2"`` and ``"Gs"``.
        results: ``None``, or a list of result lists. Each result list holds
            one ``(processed_s1, processed_s2)`` pair per row of
            ``train_head``, in row order.
    """
    # `index` is the dataframe label (shown to the reader); `position` is the
    # row number, which is what lines up with the entries of each result list.
    for position, (index, row) in enumerate(train_head.iterrows()):
        sentence1, sentence2 = row['S1'], row['S2']
        gs = row['Gs']
        print(f"-{index}- ({gs})")
        print(sentence1)
        print(sentence2)
        if results is not None:
            for result in results:
                print("-----------------------------")
                print(result[position][0])
                print(result[position][1])
        print("********************************************************")
        
# NOTE(review): this helper is also defined in an earlier test cell of this
# notebook; after Run All, this later definition is the one in effect.
def apply_function(data, method_to_apply, params = None):
    """Apply a pairwise function to every sentence pair.

    Args:
        data: Iterable of ``(sentence1, sentence2)`` pairs.
        method_to_apply: Callable taking the two sentences (plus any extra
            positional arguments) and returning a pair of results.
        params: Optional sequence of extra positional arguments forwarded to
            ``method_to_apply`` after the two sentences. ``None`` or an empty
            sequence forwards nothing; any length is supported.

    Returns:
        List of ``(result1, result2)`` tuples, one per input pair.
    """
    extra_args = () if params is None else tuple(params)
    results = []
    for sentence1, sentence2 in data:
        # Unpacking keeps the requirement that exactly two results come back.
        result1, result2 = method_to_apply(sentence1, sentence2, *extra_args)
        results.append((result1, result2))
    return results


train_head = train_data.head()
sentence_list = list(zip(train_head['S1'], train_head['S2']))

# Tokenize every pair and display the tokens next to the original sentences.
tokens = apply_function(sentence_list, get_tokenized_sentences)
results = [tokens]

show_sentences_with_applications(train_head, results)

In [None]:
# TEST cell don't delete it =D

# NOTE(review): this cell repeats an earlier smoke-test cell of this notebook.
# Hand-written sentence pair used to eyeball tokenization and named entities.
first = "My Bonnie White lies over the ocean, in Picadilli Circus at 3:00pm."
second = "My Bonnie lied over the sea! Over the sea..."

tokenized_1, tokenized_2 = get_tokenized_sentences(
    first, second, return_unique_words=False)
no_stopwords_1, no_stopwords_2 = filter_stopwords(
    tokenized_1, tokenized_2, return_unique_words=False)
sentence_ne_1, sentence_ne_2 = get_named_entities(no_stopwords_1, no_stopwords_2)

# One value per line: both token lists, then both named-entity results.
print(tokenized_1, tokenized_2, sentence_ne_1, sentence_ne_2, sep="\n")
#TEST cell

### Features extraction

Using the function declared above, the features are extracted from the `train_data` dataframe. Also the Gold Standard is extracted from its column in the dataframe. The shapes for both numpy vectors are displayed. 

In [None]:
# Feature matrix and gold-standard scores for the training split.
train_features, train_gs = get_features(train_data), train_data['Gs'].to_numpy()
print(
    f"train_features.shape: {train_features.shape}",
    f"train_gs.shape: {train_gs.shape}",
    sep="\n",
)

In [None]:
# Feature matrix and gold-standard scores for the test split.
test_features = get_features(test_data)
test_gs = test_data['Gs'].to_numpy()
# The labels name the test arrays being printed (they previously said "train").
print(f"test_features.shape: {test_features.shape}")
print(f"test_gs.shape: {test_gs.shape}")

### Feature scaling

Features are scaled using scikit-learn's `StandardScaler`: for each feature the mean is subtracted and the result is divided by the feature's standard deviation, yielding a unified feature space with zero mean and unit variance.

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to both splits.
scaler = StandardScaler().fit(train_features)
train_features_scaled, test_features_scaled = (
    scaler.transform(train_features),
    scaler.transform(test_features),
)

### Split definition for GridSearch

In [None]:
# Stack both splits so a single PredefinedSplit can tell them apart.
all_data = np.concatenate([train_features_scaled, test_features_scaled])
all_labels = np.concatenate([train_gs, test_gs])

# One fold id per row: -1 for the training rows, 0 for the test rows, so the
# test rows form the single validation fold.
# NOTE(review): hyper-parameters are therefore selected on the test split, so
# the test correlation reported later is optimistic - confirm this is intended.
test_fold = np.repeat(
    [-1, 0],
    [train_features_scaled.shape[0], test_features_scaled.shape[0]],
)
print(all_data.shape, test_fold.shape)
ps = PredefinedSplit(test_fold)

### SVR Training

In [None]:
# Score candidates by the Pearson correlation between gold and predicted values.
pearson_scorer = make_scorer(lambda y, y_hat: pearsonr(y, y_hat)[0])

# Search grid: 6 gammas x 14 Cs x 10 epsilons.
gammas = np.logspace(-6, -1, 6)
Cs = np.array([0.5, 1, 2, 4, 8, 10, 15, 20, 50, 100, 200, 375, 500, 1000])
epsilons = np.linspace(0.1, 1, 10)
param = {"gamma": gammas, "C": Cs, "epsilon": epsilons}

# Exhaustive search over the grid, evaluated on the predefined split `ps`.
svr = SVR(kernel='rbf', tol=1)
gssvr = GridSearchCV(
    estimator=svr,
    param_grid=param,
    scoring=pearson_scorer,
    cv=ps,
    n_jobs=-1,
    verbose=1,
).fit(all_data, all_labels)

In [None]:
# Rebuild an SVR with the winning hyper-parameters and train it on the
# training split only, then predict that same split.
best_parameters = gssvr.best_params_
best_model = SVR(kernel='rbf', tol=1, **best_parameters)
best_model.fit(train_features_scaled, train_gs)
train_predictions = best_model.predict(train_features_scaled)

### Test Inference

In [None]:
# Predict the scaled test features with the model fitted on the training split.
test_predictions = best_model.predict(test_features_scaled)

### Evaluation metrics

In [None]:
# Pearson correlation between predictions and gold scores for each split;
# element [0] of pearsonr's result is the correlation coefficient.
train_correlation = pearsonr(train_predictions, train_gs)[0]
test_correlation = pearsonr(test_predictions, test_gs)[0]

In [None]:
# Report both correlations, then the hyper-parameters of the estimator that
# the grid search selected.
best_svr = gssvr.best_estimator_
print('Train pearsonr: ', train_correlation)
print('Test pearsonr: ', test_correlation)
print('The best value of gamma:', best_svr.gamma)
print('The best value of C:', best_svr.C)
print('The best value of epsilon:', best_svr.epsilon)

***

# Do not delete ;)

## Recorded Results
|SVR|jaccard|overlap|cosine|dice|jaccard+dice|no_stop_lc_dice|no_stop_lemmas_dice|no_stop_lemmas_lc_dice|forward_search_12_feats|mutual_info_sel_13_feats|pca_12_feats|forward_search_pca_6_feats|fs_pca_12_features|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|train correlation|0.6402|0.6233|0.3732|0.6397|0.6379|0.6017|0.6465|0.649|0.6544|0.6364|0.6454|0.6669|0.6485|
|test_correlation|0.6208|0.6198|0.1056|0.6483|0.6087|0.5905|0.6634|0.652|0.6843|0.6428|0.665|0.6755|0.6848|
|C|10.0|500.0|100.0|375.0|1000.0|2.0|100.0|500.0|15.0|20.0|500.0|20.0|2.0|
|gamma|0.01|0.01|0.1|0.0001|0.001|0.1|0.01|1e-5|0.01|0.001|0.001|0.01|0.01|
|epsilon|0.9|1.0|1.0|0.9|0.2|0.1|0.9|0.1|0.8|0.3|1.0|0.8|0.4|