In [None]:
## Preprocessing
# TODO = [] 

## FEATURE ENGINEERING
## Lexical Similarities:
# FuzzyWuzzy []

## MODEL
# SVM 

## Data Augmentation
# Back Translation => English --> Another Language --> English 

## Final Eval
# Pearson []
# Confidence Interval? []

# Semantic Textual Similarity (STS) Project

The objective of this project is to determine the similarity between two sentences. One sentence is said to be "paraphrased" when the content (or message) is the same, but the words and/or structure differ. 

An example from the trial set: 
 - The bird is bathing in the sink.

 - Birdie is washing itself in the water basin.

Here we are given training and test sets in which every sentence pair is labeled with a gold-standard score ("gs") on a scale of 0-5. 

|label|	description|
| :-: | :-: |
|5	| They are completely equivalent, as they mean the same thing.|
|4	| They are mostly equivalent, but some unimportant details differ.|
|3	| They are roughly equivalent, but some important information differs/missing.|
|2	| They are not equivalent, but share some details.|
|1	| They are not equivalent, but are on the same topic.|
|0	| They are on different topics.|

We need to create the following: 
- Read in the sentences as a single dataframe  --> Load all three dataframes and then append them into a bigger one. 
- append the corresponding GS to the dataframe  --> Add this one to the previous df 
- Create a utils file in which we have all the features we want to create
- Show which features were created and how/why 
- we can then create a pipeline
    - Takes in all the features from before and makes them into a feature array 
    - Standardizes the values 
    - Outputs a simple N-D array with all the processed / calculated features 

- We need to create 3 variations: 
    1. "Standard" distance similarities 
    2. "XTRa Train" --> With more training data doing back-translation and AEDA 


*STEPS:*
1. Preprocess textual data 
    - Read in sentence pairs --> DONE
    - Tokenize --> DONE
    - Pos Tag ---> DONE
    - Remove stopwords and punctuation  --> DONE

2. Extract Features 
    - Similarity measures 
    - Word frequency 
    - Tf-IDF ?
3. Generate Extra Data (?)

In [1]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.metrics import jaccard_distance
from fuzzywuzzy import fuzz
from nltk.probability import FreqDist
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.wsd import lesk
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet
from nltk.corpus import wordnet_ic
# Fetch every NLTK resource this notebook relies on, in a fixed order.
for _nltk_resource in (
    'maxent_ne_chunker',
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'stopwords',
    'words',
    'sentiwordnet',
    'wordnet_ic',
):
    nltk.download(_nltk_resource)

# Information-content table read from 'ic-brown.dat' (available once 'wordnet_ic' is downloaded)
brown_ic = wordnet_ic.ic('ic-brown.dat')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Eric/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/Eric/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /Users/Eric/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/Eric/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloadin

In [2]:
# Data Loader file with two functions
from data_loader_ import *

# Preprocessing file with several preprocessing functions
from pre_processing import *

# feature extraction 
from feature_extractor import *

[nltk_data] Downloading package stopwords to /Users/Eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Training split: input sentences and their "gs" labels
TRAIN_PATH = './data/train/input/'
TRAIN_GS_PATH = './data/train/gs/'

# Test split: same layout as the training split
TEST_PATH = 'data/test/input/'
TEST_GS_PATH = './data/test/gs/'

# Load each split, one call per line: sentences first, then the matching labels
X_train = load_sentences(TRAIN_PATH)
y_train = load_gs(TRAIN_GS_PATH)
X_test = load_sentences(TEST_PATH)
y_test = load_gs(TEST_GS_PATH)

# Separate the training pairs into their processed A-side and B-side frames
SA, SB = get_processed_sentences(X_train)

In [4]:
# Report how many rows (sentence pairs) the training frame holds
pair_count = X_train.shape[0]
print(f"We have a total of {pair_count} sentence pairs")

We have a total of 2234 sentence pairs


In [28]:
import Levenshtein as lev
from fuzzywuzzy import fuzz
# LEXICAL SIMILARITY FEATURES:
# For every training sentence pair, compute three `jd` scores on the
# preprocessed token columns (no punctuation / no punctuation or stopwords /
# lemmas) and three string-matching scores on the lowercased raw sentences.

# Start from the training frame without the raw sentence text. drop() returns
# a new frame, so X_train itself is left untouched.
X_features = X_train.drop(['SentA', 'SentB'], axis=1)

# Values are gathered per column and attached in a single assignment each.
# Writing them cell-by-cell with chained indexing (frame[col][row] = value)
# triggers SettingWithCopyWarning and is silently ignored when pandas
# Copy-on-Write is active; pre-filling the columns with '' would also leave
# them as object dtype instead of numeric.
feature_values = {
    'jacc_nopunc': [],
    'jacc_nopunc_stop': [],
    'jacc_lemmas': [],
    'fuzzy_ratio': [],
    'lev_ratio': [],
    'lev_distance': [],
}
for index in X_features.index:
    # Lowercase the raw sentences once; all three string metrics reuse them
    text_a = SA['SentA'][index].lower()
    text_b = SB['SentB'][index].lower()

    # no punctuation
    feature_values['jacc_nopunc'].append(
        jd(SA['SentA_nopunc'][index], SB['SentB_nopunc'][index], is_set=True))
    # no punctuation or stopwords
    feature_values['jacc_nopunc_stop'].append(
        jd(SA['SentA_nopunc_stop'][index], SB['SentB_nopunc_stop'][index], is_set=True))
    # lemmas
    feature_values['jacc_lemmas'].append(
        jd(SA['SentA_lemmas'][index], SB['SentB_lemmas'][index], is_set=True))
    # FuzzyWuzzy String Matching
    feature_values['fuzzy_ratio'].append(fuzz.ratio(text_a, text_b))
    # Levenshtein Ratio
    feature_values['lev_ratio'].append(lev.ratio(text_a, text_b))
    # Levenshtein Distance -> Number of edits for them to be the same
    feature_values['lev_distance'].append(lev.distance(text_a, text_b))

# The lists were filled in X_features.index order, so positional assignment
# lines every value up with its row.
for column, values in feature_values.items():
    X_features[column] = values

X_features

Unnamed: 0,jacc_nopunc,jacc_nopunc_stop,jacc_lemmas,fuzzy_ratio,lev_ratio,lev_distance
0,0.666667,0.5,0.5,90,0.897959,4
1,0.888889,1,1,89,0.894737,8
2,0.5,0.333333,0.333333,89,0.894737,3
3,0.7,0.5,0.5,89,0.894118,7
4,0.714286,0.5,0.5,89,0.892857,4
...,...,...,...,...,...,...
2229,0.318182,0.285714,0.285714,57,0.566667,56
2230,0.571429,0.714286,0.714286,69,0.692913,32
2231,0.5,0.4,0.4,67,0.666667,57
2232,0.263158,0.307692,0.416667,57,0.571429,62


In [15]:
# Extra features used (in addition to the three jacc_* columns): 
# FuzzyWuzzy String matching 
# Levenshtein Ratio 
# Levenshtein Distance 

X_features

Unnamed: 0,jacc_nopunc,jacc_nopunc_stop,jacc_lemmas,fuzzy_ratio,lev_ratio,lev_distance
0,0.666667,0.5,0.5,90,0.897959,4
1,0.888889,1,1,89,0.894737,8
2,0.5,0.333333,0.333333,89,0.894737,3
3,0.7,0.5,0.5,89,0.894118,7
4,0.714286,0.5,0.5,89,0.892857,4
...,...,...,...,...,...,...
2229,0.318182,0.285714,0.285714,57,0.566667,56
2230,0.571429,0.714286,0.714286,69,0.692913,32
2231,0.5,0.4,0.4,67,0.666667,57
2232,0.263158,0.307692,0.416667,57,0.571429,62
