In [None]:
## Preprocessing
# TODO = [] 

## FEATURE ENGINEERING
## Lexical Similarities:
# FuzzyWuzzy []

## MODEL
# SVM 

## Data Augmentation
# Back Translation => English --> Another Language --> English 

## Final Eval
# Pearson []
# Confidence Interval? []

# Semantic Textual Similarity (STS) Project

The objective of this project is to be able to determine the similarity between two sentences. One sentence is said to be "parraphrased" when the content (or message) is the same, but uses different words and or structure. 

An example from the trial set: 
 - The bird is bathing in the sink.

 - Birdie is washing itself in the water basin.

Here we are given a set of training and testing sets in which they are labeled with the "gs", on a scale of 0-5. 

|label|	description|
| :-: | :-: |
|5	| They are completely equivalent, as they mean the same thing.|
|4	| They are mostly equivalent, but some unimportant details differ.|
|3	| They are roughly equivalent, but some important information differs/missing.|
|2	| They are not equivalent, but share some details.|
|1	| They are not equivalent, but are on the same topic.|
|0	| They are on different topics.|

We need to create the following: 
- Read in the sentences as a total dataframe  --> Either load all three dataframes and then append them into a bigger one. 
- append the corresponding GS to the dataframe  --> Add this one to the previous df 
- Create a utils file in which we have all the features we want to create
- Show which features were created and how/why 
- we can then create a pipeline
    - Takes in all the features from before and makes them into a feature array 
    - Standardizes the values 
    - Outputs a simple N-D array with all the processed / calculated features 

- We need to create 3 variations: 
    1. "Standard" distance similarities 
    2. "XTRa Train" --> With more training data doing back-translation and AEDA 


*STEPS:*
1. Preprocess textual data 
    - Read in sentence pairs --> DONE
    - Tokenize --> DONE
    - Pos Tag ---> DONE
    - Remove stopwords and punctuation  --> DONE

2. Extract Features 
    - Similarity measures 
    - Word frequency 
    - Tf-IDF ?
3. Generate Extra Data (?)

In [1]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.metrics import jaccard_distance
from fuzzywuzzy import fuzz
from nltk.probability import FreqDist
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.wsd import lesk
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet
from nltk.corpus import wordnet_ic
nltk.download('maxent_ne_chunker')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
nltk.download('sentiwordnet')
nltk.download('wordnet_ic')
# setting the wordnet_ic 
brown_ic = wordnet_ic.ic('ic-brown.dat')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Eric/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/Eric/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /Users/Eric/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloadin

In [2]:
# Data Loader file with two functions: load_sentences 
from data_loader_ import *

# Preprocessing file with several prerpocessing functions
from pre_processing import *

# feature extraction 
from feature_extractor import *

[nltk_data] Downloading package stopwords to /Users/Eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# TRAINING PATH
TRAIN_PATH = './data/train/input/'
TRAIN_GS_PATH = './data/train/gs/'

# TEST PATH
TEST_PATH = 'data/test/input/'
TEST_GS_PATH = './data/test/gs/'

# Loading the Data 
X_train, y_train, X_test, y_test = load_sentences(TRAIN_PATH), load_gs(TRAIN_GS_PATH),load_sentences(TEST_PATH), load_gs(TEST_GS_PATH)

#sepparating the sentences 
SA, SB = get_processed_sentences(X_train)

# Jaccard_Fuzzy_Lev
feature_df = jd_fuzz_lev(SA, SB, X_train)

In [4]:
# Lets look at how many total values we have
print(f"We have a total of {X_train.shape[0]} sentence pairs")

We have a total of 2234 sentence pairs


In [5]:
# Lexical similarities
# getting the synset similarity
# we use the lemmas of SA and SB

def get_similarities(word1, word2,brown_ic,ret_dict=None):

  '''
  input --> Word1, Word2
  Output --> dict of similarities 

  '''
  simil_dict = dict()

  def path_sim(word1,word2): 
    return word1.path_similarity(word2)
  def lch_sim(word1,word2): 
    return word1.lch_similarity(word2)
  def wup_sim(word1,word2): 
    return word1.wup_similarity(word2)
  def lin_sim(word1,word2,brown_ic): 
    '''
    needs information content (IC) of LCS (least common subsumer)
    '''
    return word1.lin_similarity(word2,brown_ic)

  simil_dict['PATH_SIMIL'] = path_sim(word1,word2)
  simil_dict['LCH_SIMIL'] = lch_sim(word1,word2)
  simil_dict['WUP_SIMIL'] = wup_sim(word1,word2)
  simil_dict['LIN_SIMIL'] = lin_sim(word1,word2,brown_ic)
  all_sims = [path_sim(word1,word2),lch_sim(word1,word2),wup_sim(word1,word2),lin_sim(word1,word2,brown_ic)]
  if ret_dict==True:
    return simil_dict
  elif ret_dict==False:
    return all_sims


def get_max_sim_synset(lemmaA, lemmaB,brown_ic):
  d = dict()
  # getting the synsets: 
  syn1 = wordnet.synsets(lemmaA)
  syn2 = wordnet.synsets(lemmaB)
  for asynset in syn1:
    for bsynset in syn2:  
      sims = get_similarities(asynset, bsynset,brown_ic)
      max_key = max(sims, key=sims.get)
      d[(lemmaA,lemmaB,max_key)] = sims[max_key]
      
      
      
      

  


In [6]:
sA = SA.SentA_lemmas[0]
sB = SB.SentB_lemmas[0]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

d = dict()
l = []
for idx, x in enumerate(sA):
    #print(x, wordnet.synsets(x))
    syn1 = wordnet.synsets(x)
    syn2 = wordnet.synsets(sB[idx])
    for xx in syn1:
        for yy in syn2:  
            try:  
                sims = get_similarities(xx, yy, brown_ic,ret_dict=False)
                l.append(sims)
            except:
                continue
            #max_val = max(sims, key=sims.get)
            d[xx, yy] = sims
            
                
                


In [None]:
l1 = np.array(sum(l, [])).reshape(-1,1)
scaler.fit(l1)
vals = scaler.transform(l1)
vals

df = pd.DataFrame(d)
scaler.fit(df.values)
vals = scaler.transform(df.values)

df1 = pd.DataFrame(vals)
df1.columns = df.columns
d = df1.to_dict()

In [None]:
d = {'a':5,'b':3,'c':8}
k = max(d, key=d.get)
print(k, d[k])

# get key-value max
b = max(d.items(), key=lambda x: x[1])
print(b[0])

In [None]:
SB.head()

In [7]:

scaler1 = StandardScaler()
scaler1.fit(feature_df.values)
X_scaled = scaler1.transform(feature_df.values)

SA1, SB1 = get_processed_sentences(X_test)
feature_df_test = jd_fuzz_lev(SA1, SB1, X_test)
scaler = StandardScaler()
scaler.fit(feature_df_test.values)
X_scaled_test = scaler.transform(feature_df_test.values)

In [8]:
import sklearn
from sklearn.svm import SVR
from scipy.stats import pearsonr
svr = SVR(kernel = 'rbf', gamma = 0.01, C = 200, epsilon = 0.50, tol = 0.25)
svr.fit(X_scaled, y_train.values.reshape(-1,))

# Predict
test_predict = svr.predict(X_scaled_test)

In [9]:
correlation = pearsonr(test_predict, y_test.values.reshape(-1,))[0]
print("Pearson correlation:", correlation)

Pearson correlation: 0.11811386277788924


In [10]:
#pass a gridsearch over this 
from sklearn.model_selection import GridSearchCV

param = {'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),'C' : [1,5,10],'degree' : [3,8],'coef0' : [0.01,10,0.5],'gamma' : ('auto','scale')},

modelsvr = SVR()

grids = GridSearchCV(modelsvr,param,cv=5,n_jobs=-1,verbose=2)

grids.fit(X_scaled, y_train.values.reshape(-1,))

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.3s


In [None]:
test_predict = grids.predict(X_scaled_test)
correlation = pearsonr(test_predict, y_test.values.reshape(-1,))[0]
print("Pearson correlation:", correlation)