In [None]:
## Preprocessing
# TODO = [] 
# TF-IDF [] -> Term Frequency - Inverse Document Frequency (TF-IDF)

## FEATURE ENGINEERING
# Lexical Similarities -> Levenshtein Distance, Jaccard Distance []
# FuzzyWuzzy []

## MODEL
# SVM 
# NN? CNN?

## Data Augmentation
# Back Translation => English --> Another Language --> English 
# AEDA


## Final Eval
# Pearson []
# Confidence Interval? []

# Semantic Textual Similarity (STS) Project

The objective of this project is to determine the similarity between two sentences. One sentence is said to be a "paraphrase" of another when the content (or message) is the same, but it uses different words and/or structure. 

An example from the trial set: 
 - The bird is bathing in the sink.

 - Birdie is washing itself in the water basin.

We are given training and test sets in which each sentence pair is labeled with a gold-standard score (the "gs") on a scale of 0-5. 

|label|	description|
| :-: | :-: |
|5	| They are completely equivalent, as they mean the same thing.|
|4	| They are mostly equivalent, but some unimportant details differ.|
|3	| They are roughly equivalent, but some important information differs/missing.|
|2	| They are not equivalent, but share some details.|
|1	| They are not equivalent, but are on the same topic.|
|0	| They are on different topics.|

We need to create the following: 
- Read in the sentences as a total dataframe  --> Either load all three dataframes and then append them into a bigger one. 
- append the corresponding GS to the dataframe  --> Add this one to the previous df 
- Create a utils file in which we have all the features we want to create
- Show which features were created and how/why 
- we can then create a pipeline
    - Takes in all the features from before and makes them into a feature array 
    - Standardizes the values 
    - Outputs a simple N-D array with all the processed / calculated features 

- We need to create 3 variations: 
    1. "Standard" distance similarities 
    2. "XTRa Train" --> With more training data doing back-translation and AEDA 


*STEPS:*
1. Preprocess textual data 
    - Read in sentence pairs --> DONE
    - Tokenize --> DONE
    - Pos Tag ---> DONE
    - Remove stopwords and punctuation  --> DONE

2. Extract Features 
    - Similarity measures 
    - Word frequency 
    - Tf-IDF ?
3. Generate Extra Data (?)

In [1]:
import string
import nltk
from nltk import pos_tag
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.stem import WordNetLemmatizer
from nltk.metrics import jaccard_distance
from nltk.probability import FreqDist
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.wsd import lesk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet
from nltk.corpus import wordnet_ic
# Fetch every NLTK resource this notebook needs, in a fixed order.
# NLTK reports packages that are already present as up-to-date.
for nltk_resource in (
    'maxent_ne_chunker',
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'stopwords',
    'words',
    'sentiwordnet',
    'wordnet_ic',
):
    nltk.download(nltk_resource)

# Load the 'ic-brown.dat' information-content file once and keep it for later use.
brown_ic = wordnet_ic.ic('ic-brown.dat')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Eric/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/Eric/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /Users/Eric/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloadin

In [2]:
# Data Loader file with two functions
from data_loader_ import *

# Preprocessing file with several preprocessing functions
from pre_processing import *

[nltk_data] Downloading package stopwords to /Users/Eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Training split: input sentence pairs and their "gs" score files
TRAIN_PATH = './data/train/input/'
TRAIN_GS_PATH = './data/train/gs/'

# Test split: same layout as the training split
TEST_PATH = 'data/test/input/'
TEST_GS_PATH = './data/test/gs/'

# Load each split in turn: sentences first, then the matching scores
X_train = load_sentences(TRAIN_PATH)
y_train = load_gs(TRAIN_GS_PATH)
X_test = load_sentences(TEST_PATH)
y_test = load_gs(TEST_GS_PATH)


In [4]:
# Sanity check on the loaded training data: report the number of rows in
# X_train (the message below treats each row as one sentence pair).
print(f"We have a total of {X_train.shape[0]} sentence pairs")

We have a total of 2234 sentence pairs


In [96]:
import numpy as np

# Walk the first training pair through every preprocessing stage and print the
# intermediate results. The first row is used (not a random one) so the cell
# output is reproducible. Each stage is computed once and the stored value is
# printed, instead of calling the helper a second time just for display.
sample_sentence = X_train.iloc[0,:].SentA
sample_sentence2 = X_train.iloc[0,:].SentB
# original sentence
print(sample_sentence)
# result of stripping the punctuation
print(strip_punctuation(sample_sentence))
# result of stripping stopwords and punctuation; kept for the next stages
toked_sent = strip_stopwords_punctuation(sample_sentence)
print(toked_sent)
# printed above *before* cleaning, matching the original display order
toked_sent = clean_replace_unwanted_chars(toked_sent)
# POS tags of the cleaned tokens
pos_tagged = get_pos_tag(toked_sent)
print(pos_tagged)
# lemmas derived from the POS-tagged tokens
toked_sent_lemm = get_lemmas(pos_tagged)
print(toked_sent_lemm)


def check_sent(sentA, sentB):
    """Print each preprocessing stage for a pair of sentences, side by side.

    For every stage the result for ``sentA`` is printed first, then the result
    for ``sentB``. The stages, in print order, are:

    1. ``strip_punctuation``
    2. ``strip_stopwords_punctuation``
    3. ``get_pos_tag`` applied to the output of stage 2 after it has gone
       through ``clean_replace_unwanted_chars``
    4. ``get_lemmas`` applied to the output of stage 3

    Each helper is evaluated once per sentence and per stage; the stored
    result is both printed and fed to the next stage.

    Args:
        sentA: first sentence of the pair.
        sentB: second sentence of the pair.

    Returns:
        None. The function only prints.
    """
    pair = (sentA, sentB)

    # Stage 1: punctuation stripped (display only, not reused downstream)
    for sentence in pair:
        print(strip_punctuation(sentence))

    # Stage 2: stopwords + punctuation stripped; print before cleaning so the
    # displayed value is the raw output of this stage.
    filtered = [strip_stopwords_punctuation(sentence) for sentence in pair]
    for tokens in filtered:
        print(tokens)

    # Stage 3: clean the tokens, then POS-tag them
    cleaned = [clean_replace_unwanted_chars(tokens) for tokens in filtered]
    tagged = [get_pos_tag(tokens) for tokens in cleaned]
    for tags in tagged:
        print(tags)

    # Stage 4: lemmatise using the POS tags
    lemmas = [get_lemmas(tags) for tags in tagged]
    for lemma_list in lemmas:
        print(lemma_list)

a man is riding a bicycle.
['a', 'man', 'is', 'riding', 'a', 'bicycle']
['man', 'riding', 'bicycle']
[('man', 'NN'), ('riding', 'VBG'), ('bicycle', 'NN')]
['man', 'rid', 'bicycle']


In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [152]:
# Exploratory check on one sentence pair:
#   1. build a TF-IDF matrix for the two sample sentences
#   2. put it in a DataFrame with one row per word and one column per sentence
#   3. 'delta' = weight in sentence 0 minus weight in sentence 1, so a positive
#      delta marks a word weighted higher in the first sentence and a negative
#      delta a word weighted higher in the second
#   4. take one such word from each side and print the Wu-Palmer similarity of
#      every pair of their WordNet synsets, in both directions
# TF-IDF is only used here to pick the differing words; the synset similarity
# values are the candidate feature.
vec = TfidfVectorizer()
resp = vec.fit_transform([sample_sentence, sample_sentence2])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back on older releases.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()

dense_list = resp.todense().tolist()

# the sentences we used
indices = ['SentA','SentB']
# final df with the TFIDF: rows are words, columns 0 and 1 are the two sentences
df = pd.DataFrame(dense_list, columns=feature_names).T


df['delta'] = df[0] - df[1]


# Extremes of the delta column
kk = min(df.delta)
ki = max(df.delta)

# Keep the result of the symmetry check; as a bare expression in the middle of
# the cell it was evaluated and thrown away.
delta_is_symmetric = abs(kk) == abs(ki)

from nltk.corpus import wordnet
# First word with a positive / negative delta.
# NOTE(review): .index[0] raises IndexError when no word has a positive (or
# negative) delta, e.g. for two sentences with identical vocabulary.
word1 = df[df['delta']>0].index[0]
word2 = df[df['delta']<0].index[0]

#synset sim 
syn1 = wordnet.synsets(word1)
syn2 = wordnet.synsets(word2)

for syns in syn1:
    for syns2 in syn2:
        # Both directions are computed because the recorded output shows cases
        # where only one of them yields a printable score.
        score = syns.wup_similarity(syns2)
        score2 = syns2.wup_similarity(syns)
        # Skip pairs with no score (None) or a zero score
        if score is not None and score != 0:
            print("(1)-->Score syn-syn2")
            print(syns, syns2, score)
        if score2 is not None and score2 != 0:
            print("(2)-->Score syn2-syn")
            print(syns2, syns, score2)



(1)-->Score syn-syn2
Synset('bicycle.n.01') Synset('motorcycle.n.01') 0.7272727272727273
(2)-->Score syn2-syn
Synset('motorcycle.n.01') Synset('bicycle.n.01') 0.7272727272727273
(1)-->Score syn-syn2
Synset('bicycle.n.01') Synset('bicycle.n.01') 1.0
(2)-->Score syn2-syn
Synset('bicycle.n.01') Synset('bicycle.n.01') 1.0
(2)-->Score syn2-syn
Synset('bicycle.v.01') Synset('bicycle.n.01') 0.13333333333333333
(1)-->Score syn-syn2
Synset('bicycle.v.01') Synset('motorcycle.n.01') 0.11764705882352941
(1)-->Score syn-syn2
Synset('bicycle.v.01') Synset('bicycle.n.01') 0.13333333333333333
(1)-->Score syn-syn2
Synset('bicycle.v.01') Synset('bicycle.v.01') 1.0
(2)-->Score syn2-syn
Synset('bicycle.v.01') Synset('bicycle.v.01') 1.0


In [139]:
# Print every preprocessing stage for both sentences of the sample pair
check_sent(sample_sentence, sample_sentence2)

['a', 'man', 'is', 'riding', 'a', 'bicycle']
['a', 'man', 'is', 'riding', 'a', 'bike']
['man', 'riding', 'bicycle']
['man', 'riding', 'bike']
[('man', 'NN'), ('riding', 'VBG'), ('bicycle', 'NN')]
[('man', 'NN'), ('riding', 'VBG'), ('bike', 'NN')]
['man', 'rid', 'bicycle']
['man', 'rid', 'bike']


True

([Synset('bicycle.n.01'), Synset('bicycle.v.01')],
 [Synset('motorcycle.n.01'), Synset('bicycle.n.01'), Synset('bicycle.v.01')])

(1)-->Score syn-syn2
Synset('bicycle.n.01') Synset('motorcycle.n.01') 0.7272727272727273
(2)-->Score syn2-syn
Synset('motorcycle.n.01') Synset('bicycle.n.01') 0.7272727272727273
(1)-->Score syn-syn2
Synset('bicycle.n.01') Synset('bicycle.n.01') 1.0
(2)-->Score syn2-syn
Synset('bicycle.n.01') Synset('bicycle.n.01') 1.0
(2)-->Score syn2-syn
Synset('bicycle.v.01') Synset('bicycle.n.01') 0.13333333333333333
(1)-->Score syn-syn2
Synset('bicycle.v.01') Synset('motorcycle.n.01') 0.11764705882352941
(1)-->Score syn-syn2
Synset('bicycle.v.01') Synset('bicycle.n.01') 0.13333333333333333
(1)-->Score syn-syn2
Synset('bicycle.v.01') Synset('bicycle.v.01') 1.0
(2)-->Score syn2-syn
Synset('bicycle.v.01') Synset('bicycle.v.01') 1.0


In [95]:
# Show the first rows of the transposed TF-IDF table.
# NOTE(review): where df is built in this notebook it is already transposed
# (words as rows) and carries a 'delta' column, so df.T puts the words in the
# columns. The output recorded for this cell shows words as rows and no
# 'delta', so it looks stale -- confirm whether df.head() is what is wanted.
df.T.head()

Unnamed: 0,0,1
bicycle,0.630099,0.0
bike,0.0,0.630099
is,0.448321,0.448321
man,0.448321,0.448321
riding,0.448321,0.448321


In [50]:
# Tokenise every sentence; the result holds one token list per sentence.
toked_sent_all = list(map(nltk.word_tokenize, all_sentences))
# Number of tokenised sentences (displayed as the cell output)
len(toked_sent_all)


2234