In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from app.utils import get_config
from app.tupperware import tupperware
from app.elastic import ElasticHelper
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from functools import partial
from gensim.parsing.preprocessing import (
    preprocess_documents,
    preprocess_string,
    strip_tags,
    strip_punctuation,
    strip_multiple_whitespaces,
    remove_stopwords,
    stem_text
)
import pandas as pd
import re
import time

### Data Preprocessing and Prep
You have to download the linkso dataset from: https://sites.google.com/view/linkso

In [None]:
# Number of rows to process from each file
DATA_SIZE = 10000

In [None]:
preprocessor = partial(preprocess_string, filters=[strip_tags, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, stem_text])

In [None]:
pairs = dict(pd.read_csv('contractions.tsv', names=['contraction', 'split'], sep='\t').values)

In [None]:
def clean_text(string):
    string = str(string).lower()
    for contraction in pairs:
        try:
            string = re.sub(contraction, pairs[contraction], string)
        except TypeError as err:
            import pdb; pdb.set_trace()
    string = re.sub('\d+', 'number', string)
    string = preprocessor(string)
    return string

In [None]:
def get_freelancer_dataset():
    """
    :returns: (original_data, unprocessed_sentences, processed_sentences, tagged_documents)
    """
    config = tupperware(get_config())
    config

    db = ElasticHelper(config.elastic.hostname, config.elastic.port)
    db.connect()

    freelancer_posts = db.search(index_name='freelancer', size=10000)
    f_posts = [post['_source'] for post in freelancer_posts['hits']['hits']]
    sents = [post['title'] + ' ' + post['description'] for post in f_posts]
    
    tick = time.time()
    processed_sents = list(map(clean_text, sents))
    print(f"Time it took to clean text: {time.time() - tick}")
    
    tagged_docs = [TaggedDocument(string, [i]) for i, string in enumerate(processed_sents)]
    return freelancer_posts, sents, processed_sents, tagged_docs

In [None]:
def get_quora_data():
    """
    :returns: (original_data, unprocessed_sentences, processed_sentences, tagged_documents)
    """
    tick = time.time()
    quora_df = pd.read_csv('questions.csv', nrows=int(DATA_SIZE/2))
    quora_df = quora_df.rename({'is_duplicate': 'label'}, axis=1)
    print(f"Time taken to load quora text: {time.time() - tick}")
    
    questions = []
    questions += quora_df['question1'].tolist()
    questions += quora_df['question2'].tolist()
    
    tick = time.time()
    processed_qs = list(map(clean_text, questions))
    print(f"Time taken to clean quora text: {time.time() - tick}")
    
    tagged_questions = [TaggedDocument(string, [i]) for i, string in enumerate(processed_qs)]
    return quora_df, questions, processed_qs, tagged_questions

In [None]:
def get_linkso_data(java=False, python=False, javascript=False):
    """
    :returns: (original_data, unprocessed_sentences, processed_sentences, tagged_documents)
    """
    tick = time.time()
    if java:
        java_text = pd.read_csv('linkso/java/java_qid2all.txt', sep='\t', names=['qid', 'title', 'body', 'answer'], usecols=['qid', 'body'])
        java_relations = pd.read_csv('linkso/java/java_cosidf.txt', sep='\t', usecols=['qid1', 'qid2', 'label'])
        java_df = pd.merge(java_relations, java_text, left_on='qid1', right_on='qid', how='left')
        java_df = java_df.drop("qid", axis=1).rename({"body": "question1"}, axis=1)
        java_df = pd.merge(java_df, java_text, left_on='qid2', right_on='qid', how='left')
        java_df = java_df.drop("qid", axis=1).rename({"body": "question2"}, axis=1)
    else:
        java_df = pd.DataFrame(columns=['qid1', 'qid2', 'label', 'question1', 'question2'])

    if python:
        python_text = pd.read_csv('linkso/python/python_qid2all.txt', sep='\t', names=['qid', 'title', 'body', 'answer'], usecols=['qid', 'body'])
        python_relations = pd.read_csv('linkso/python/python_cosidf.txt', sep='\t', usecols=['qid1', 'qid2', 'label'])
        python_df = pd.merge(python_relations, python_text, left_on='qid1', right_on='qid', how='left')
        python_df = python_df.drop("qid", axis=1).rename({"body": "question1"}, axis=1)
        python_df = pd.merge(python_df, python_text, left_on='qid2', right_on='qid', how='left')
        python_df = python_df.drop("qid", axis=1).rename({"body": "question2"}, axis=1)
    else:
        python_df = pd.DataFrame(columns=['qid1', 'qid2', 'label', 'question1', 'question2'])

    if javascript:
        javascript_text = pd.read_csv('linkso/javascript/javascript_qid2all.txt', sep='\t', names=['qid', 'title', 'body', 'answer'], usecols=['qid', 'body'])
        javascript_relations = pd.read_csv('linkso/javascript/javascript_cosidf.txt', sep='\t', usecols=['qid1', 'qid2', 'label'])
        javascript_df = pd.merge(javascript_relations, javascript_text, left_on='qid1', right_on='qid', how='left')
        javascript_df = javascript_df.drop("qid", axis=1).rename({"body": "question1"}, axis=1)
        javascript_df = pd.merge(javascript_df, javascript_text, left_on='qid2', right_on='qid', how='left')
        javascript_df = javascript_df.drop("qid", axis=1).rename({"body": "question2"}, axis=1)
    else:
        javascript_df = pd.DataFrame(columns=['qid1', 'qid2', 'label', 'question1', 'question2'])
    
    linkso_data = pd.concat([java_df, python_df, javascript_df]).sample(int(DATA_SIZE/2))
    print(f"Time taken to load linkso data: {time.time() - tick}")
    
    questions = []
    questions += quora_df['question1'].tolist()
    questions += quora_df['question2'].tolist()
    
    tick = time.time()
    processed_qs = list(map(clean_text, questions))
    print(f"Time taken to clean linkso text: {time.time() - tick}")
    
    tagged_questions = [TaggedDocument(string, [i]) for i, string in enumerate(processed_qs)]
    return linkso_data, questions, processed_qs, tagged_questions

In [None]:
# f_data, f_qs, f_pqs, f_tqs = get_freelancer_dataset()

In [None]:
quora_df, quora_qs, quora_pqs, quora_tqs = get_quora_data()

In [None]:
linkso_df, linkso_qs, linkso_pqs, linkso_tqs = get_linkso_data(python=True)

In [None]:
linkso_df

### Model Training Doc2Vec

In [246]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt


class Doc2VecClassifier():
    
    def __init__(self, data_name, data_df, data_tqs):
        self.data_name = data_name
        self.data_df = data_df
        self.data_tqs = data_tqs
        self.model = doc2vec = Doc2Vec(dm=1, vector_size=32, window=4, min_count=2, epochs=30)
        self.model.clear_sims()
        
    def train(self):
        self.model.build_vocab(self.data_tqs)
        self.model.train(self.data_tqs, total_examples=self.model.corpus_count, epochs=self.model.epochs)
        
    def compute_similarity_scores(self):
        
        def compute_similarity(row):
            q1_vec = self.model.infer_vector(clean_text(row.question1)).reshape(1, -1)
            q2_vec = self.model.infer_vector(clean_text(row.question2)).reshape(1, -1)
            sim = cosine_similarity(q1_vec, q2_vec)[0][0]
            return sim

        self.data_df['scores'] = self.data_df.apply(compute_similarity, axis=1)
    
    def evaluate(self):
        print(self.data_name)
        # calculate the fpr and tpr for all thresholds of the classification
        fpr, tpr, threshold = metrics.roc_curve(self.data_df.label, self.data_df.scores)
        roc_auc = metrics.auc(fpr, tpr)

        # method I: plt
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        plt.legend(loc = 'lower right')
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()
        print(f'AUC: {roc_auc}')

In [None]:
quora_doc2vec = Doc2VecClassifier('quora', quora_df, quora_tqs)
quora_doc2vec.train()
quora_doc2vec.compute_similarity_scores()
quora_doc2vec.evaluate()

In [247]:
link_doc2vec = Doc2VecClassifier('link', linkso_df, linkso_tqs)
link_doc2vec.train()

'2020-09-27 01:49:46,402 - gensim.models.doc2vec - INFO - collecting all words and their counts'
'2020-09-27 01:49:46,404 - gensim.models.doc2vec - INFO - PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags'
'2020-09-27 01:49:46,441 - gensim.models.doc2vec - INFO - collected 7349 word types and 10000 unique tags from a corpus of 10000 examples and 51738 words'
'2020-09-27 01:49:46,442 - gensim.models.word2vec - INFO - Loading a fresh vocabulary'
'2020-09-27 01:49:46,461 - gensim.models.word2vec - INFO - effective_min_count=2 retains 4584 unique words (62% of original 7349, drops 2765)'
'2020-09-27 01:49:46,476 - gensim.models.word2vec - INFO - effective_min_count=2 leaves 48973 word corpus (94% of original 51738, drops 2765)'
'2020-09-27 01:49:46,528 - gensim.models.word2vec - INFO - deleting the raw counts dictionary of 7349 items'
'2020-09-27 01:49:46,530 - gensim.models.word2vec - INFO - sample=0.001 downsamples 32 most-common words'
'2020-09-27 01:49:46,531 - gen

In [267]:
link_doc2vec.compute_similarity_scores()

In [268]:
metrics.roc_curve(link_doc2vec.data_df.label, link_doc2vec.data_df.scores)

ValueError: unknown format is not supported

### Model Training TfIDF

In [None]:
tfidf = TfidfVectorizer()

In [None]:
tfidf.fit([' '.join(q) for q in quora_pqs])

In [None]:
def compute_similarity(row):
    q1_vec = tfidf.transform([' '.join(clean_text([row.question1]))]).reshape(1, -1)
    q2_vec = tfidf.transform([' '.join(clean_text([row.question2]))]).reshape(1, -1)
    sim = cosine_similarity(q1_vec, q2_vec)[0][0]
    return sim

quora_df['scores'] = quora_df.apply(compute_similarity, axis=1)

# calculate the fpr and tpr for all thresholds of the classification
fpr, tpr, threshold = metrics.roc_curve(quora_df.is_duplicate, quora_df.scores)
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
print(f'AUC: {roc_auc}')