# Importing dependencies

In [1]:
# Download spaCy's Deutsch trained pipelines
!python -m spacy download nl_core_news_sm

import json
import string
import nltk
import pandas as pd
import numpy as np
import spacy
import nl_core_news_sm
import en_core_web_sm

from enum import Enum
from nltk.corpus import stopwords
from scipy import spatial
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import clear_output

# Download NLTK's stopwords
nltk.download('stopwords')

# Download compressed assets from Google Drive
!gdown --id 1TTddIx7Bwwl2o3hYnMKDXxJEskiPMXde
!unrar x "assets.rar"

clear_output()
print('Successfuly downloaded dependencies')

Successfuly downloaded dependencies


# Enumerations

##### Enumerating vectorization techniques

In [2]:
class Vectorizer(Enum):
    TF_IDF_VECTORIZER = 'TfidfVectorizer'
    COUNT_VECTORIZER = 'CountVectorizer'
    HASHING_VECTORIZER = 'HashingVectorizer'

##### Enumerating similarity metrics

In [3]:
class Similarity(Enum):
    COSINE_SIMILARITY_PAIRWISE = 'cosine_similarity'
    COSINE_SIMILARITY = 'cosine'

##### Enumerating distance metrics

In [4]:
class Distance(Enum):
    EUCLIDEAN_DISTANCE = 'norm'
    MANHATTAN_DISTANCE = 'minkowski'
    MINKOWSKI_DISTANCE = 'abs_sum'

##### Enumerating correlation/accuracy metrics

In [5]:
class Correlation(Enum):
    PEARSON_CORRELATION = 'pearsonr'    
    SPEARMAN_CORRELATION = 'spearmanr'
    POINT_BISERIAL_CORRELATION = 'pointbiserialr'
    KENDALL_TAU_CORRELATION = 'kendalltau'

# Helper Methods

#### Mounting tables into a dictionary of dataframes

In [6]:
def get_raw_train_test() -> dict:
    training = pd.read_csv('assets/training_pairs.csv')
    testing = pd.read_csv('assets/testing_pairs.csv')

    return {'train': training, 'test': testing}

#### Fetching textual data from the residual json assets

In [7]:
def __get_json_text_by_id(file_id: str) -> str:
    try:
        file = open(f'assets/webpages/{file_id}.json')
        data = json.load(file)
        return data['text']
    except FileNotFoundError:
        return ''

#### Preprocessing dataframe

In [8]:
def preprocess_df(df: pd.DataFrame):
    # Retrieves textual data by pair_id
    df['Text1'] = df['pair_id'].apply(lambda cell: __get_json_text_by_id(cell.split('_')[0]))
    df['Text2'] = df['pair_id'].apply(lambda cell: __get_json_text_by_id(cell.split('_')[1]))

    # Remove unnecessary columns
    df.drop(df.columns.difference(['Text1', 'Text2', 'Overall']), axis=1, inplace=True)

    # Remove null & empty texts
    df['Text1'].replace('', None, inplace=True)
    df['Text2'].replace('', None, inplace=True)
    df.dropna(subset=['Text1', 'Text2'], inplace=True)

#### Implementing basic natural language processing procedures
* Removing Punctuation
* Removing stops words for both Deutsch and English
* Removing escape sequences
* Lowercasing all characters
* Lemmatizing words

In [9]:
def implement_nlp(df: pd.DataFrame, efficient: bool=True):
    remove_punctuation(df)
    remove_stop_words(df)
    remove_escape_sequences(df)
    lowercase_characters(df)
    if not efficient:
      lemmatize_words(df)

In [10]:
def remove_punctuation(df: pd.DataFrame, columns=None):
    if columns is None:
        columns = ['Text1', 'Text2']

    punctuation = list(string.punctuation)
    for column in columns:
        df[column] = df[column].apply(lambda row: ''.join([i for i in row if i not in punctuation]))

In [11]:
def remove_stop_words(df: pd.DataFrame):
    stop_words_de = stopwords.words('dutch')
    stop_words_en = stopwords.words('english')
    df['Text1'] = df['Text1'].apply(lambda row: ' '.join([i for i in row.split() if i not in stop_words_de]))
    df['Text2'] = df['Text2'].apply(lambda row: ' '.join([i for i in row.split() if i not in stop_words_en]))


In [12]:
def remove_escape_sequences(df: pd.DataFrame, columns=None):
    if columns is None:
        columns = ['Text1', 'Text2']

    escapes = ''.join([chr(char) for char in range(1, 32)])
    for column in columns:
        df[column] = df[column].apply(lambda row: ''.join([i for i in row if i not in escapes]))

In [13]:
def lowercase_characters(df: pd.DataFrame, columns=None):
    if columns is None:
        columns = ['Text1', 'Text2']

    for column in columns:
        df[column] = df[column].str.lower()

In [14]:
def lemmatize_words(df: pd.DataFrame):
    lemma_de = nl_core_news_sm.load()
    lemma_en = en_core_web_sm.load()
    df['Text1'] = df['Text1'].apply(lambda row: ' '.join([x.lemma_ for x in lemma_de(row)]))
    df['Text2'] = df['Text2'].apply(lambda row: ' '.join([x.lemma_ for x in lemma_en(row)]))

#### Converting textual data into an array of vectors based on a predefined vectorization technique

In [15]:
def vectorize_text(df: pd.DataFrame, columns=None, method: Vectorizer = Vectorizer.TF_IDF_VECTORIZER):
    if columns is None:
        columns = ['Text1', 'Text2']

    if method == Vectorizer.TF_IDF_VECTORIZER:
        vectorizer = TfidfVectorizer()
    elif method == Vectorizer.COUNT_VECTORIZER:
        vectorizer = CountVectorizer()
    else:
        vectorizer = HashingVectorizer()
        print('HashingVectorizer appears to have a much higher time complexity due to its high memory usage.')

    texts = []
    for column in columns:
        texts.extend(df[column])

    x = vectorizer.fit_transform(texts)
    texts_vectorized = x.toarray().tolist()

    count = 1
    for column in columns:
        df[f'Vector{count}'] = texts_vectorized[:len(df[column])]
        texts_vectorized = texts_vectorized[len(df[column]) - 1:]
        count = count + 1

#### Calculating similarity between two vectors

In [16]:
def calculate_similarity(df: pd.DataFrame, x='Vector1', y='Vector2',
                         method: Similarity = Similarity.COSINE_SIMILARITY_PAIRWISE) -> int:
    if method == Similarity.COSINE_SIMILARITY_PAIRWISE:
        similarity = cosine_similarity(df[x].tolist(), df[y].tolist()).diagonal().mean()
    else:
        similarities = np.array([])
        for index, row in df.iterrows():
            similarities = np.append(similarities, 1 - spatial.distance.cosine(row[x], row[y]))
        similarity = similarities.mean()

    return similarity

#### Calculating distance between two vectors

In [17]:
def calculate_distance(df: pd.DataFrame, x='Vector1', y='Vector2',
                       method: Distance = Distance.EUCLIDEAN_DISTANCE) -> int:
    if method == Distance.EUCLIDEAN_DISTANCE:
        distances = np.array([])
        for index, row in df.iterrows():
            distances = np.append(distances, np.linalg.norm(np.subtract(row[x], row[y])))
        distance = distances.mean()
    elif method == Distance.MINKOWSKI_DISTANCE:
        distances = np.array([])
        for index, row in df.iterrows():
            distances = np.append(distances, spatial.distance.minkowski(row[x], row[y], 3))
        distance = distances.mean()
    else:
        distances = np.array([])
        for index, row in df.iterrows():
            distances = np.append(distances, np.abs(np.subtract(row[x], row[y])).sum())
        distance = distances.mean()

    return distance

#### Calculating correlation between two vectors

In [18]:
def calculate_correlation(df: pd.DataFrame, x='Vector1', y='Vector2',
                         method: Correlation = Correlation.PEARSON_CORRELATION) -> int:
    if method == Correlation.PEARSON_CORRELATION:
        correlations = np.array([])
        for index, row in df.iterrows():
            correlations = np.append(correlations, stats.pearsonr(row[x], row[y])[0])
        correlation = correlations.mean()
    elif method == Correlation.SPEARMAN_CORRELATION:
        correlations = np.array([])
        for index, row in df.iterrows():
            correlations = np.append(correlations, stats.spearmanr(row[x], row[y])[0])
        correlation = correlations.mean()
    elif method == Correlation.POINT_BISERIAL_CORRELATION:
        correlations = np.array([])
        for index, row in df.iterrows():
            correlations = np.append(correlations, stats.pointbiserialr(row[x], row[y])[0])
        correlation = correlations.mean()
    else: 
        correlations = np.array([])
        for index, row in df.iterrows():
            correlations = np.append(correlations, stats.kendalltau(row[x], row[y])[0])
        correlation = correlations.mean()

    return correlation

# Methods invocation

In [19]:
train_test = get_raw_train_test()

train = train_test['train']
test = train_test['test']

preprocess_df(train)
preprocess_df(test)

implement_nlp(train, efficient=True)
implement_nlp(test, efficient=True)

vectorize_text(train)
vectorize_text(test)

print('Correlation')
print('-----------------------------------------------------------------------')
print(calculate_correlation(train, method=Correlation.PEARSON_CORRELATION))
print(calculate_correlation(train, method=Correlation.SPEARMAN_CORRELATION))
print(calculate_correlation(train, method=Correlation.POINT_BISERIAL_CORRELATION))
print(calculate_correlation(train, method=Correlation.KENDALL_TAU_CORRELATION))

print('\nSimilarity')
print('-----------------------------------------------------------------------')
print(calculate_similarity(train, method=Similarity.COSINE_SIMILARITY_PAIRWISE))
print(calculate_similarity(train, method=Similarity.COSINE_SIMILARITY))

print('\nDistance')
print('-----------------------------------------------------------------------')
print(calculate_distance(train, method=Distance.EUCLIDEAN_DISTANCE))
print(calculate_distance(train, method=Distance.MINKOWSKI_DISTANCE))
print(calculate_distance(train, method=Distance.MANHATTAN_DISTANCE))

Correlation
-----------------------------------------------------------------------
0.0006639005589506881
0.0039032062658923418
0.0006639005589506881
0.0038942992644011827

Similarity
-----------------------------------------------------------------------
0.002990701607000213
0.0029907016070002137

Distance
-----------------------------------------------------------------------
1.4120822060068667
0.6899844692837535
20.639931576108392
