# Training Doc2Vec Models

## Import the necessary libraries

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections
from collections import Counter
import re
import math
%matplotlib inline
%pprint

Pretty printing has been turned OFF


## Define functions

In [None]:
def read_csv(path):
    
    """This function reads a CSV file from a specified filepath, while preserving the data types of each variable.
    Source: https://stackoverflow.com/questions/50047237/how-to-preserve-dtypes-of-dataframes-when-using-to-csv/50051542#50051542"""
    
    # Read types first line of csv
    dtypes = {key:value for (key,value) in pd.read_csv(path, nrows=1).iloc[0].to_dict().items() if 'date' not in value}

    parse_dates = [key for (key,value) in pd.read_csv(path, 
                   nrows=1).iloc[0].to_dict().items() if 'date' in value]
    
    # Read the rest of the lines with the types from above
    return pd.read_csv(path, dtype=dtypes, parse_dates=parse_dates, skiprows=[1])

In [None]:
def cosine_similarity(A, B):
    
    """This function takes two vectors as numpy arrays and computes the cosine similarity between them."""

    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    cos = dot / (norma * normb)

    return cos

In [None]:
def sigmoid(x):
    
    """This function transforms a value using the sigmoid function."""
    
    return 1/(1+math.exp(-x))

In [None]:
def compute_sdq_norm(X1, X2, X3):

    """This function computes normalised SDQ for a given a set of three cosine similarity scores, where:
    1. X1 is the cosine similarity between Documents A and B;
    2. X2 is the cosine similarity between Documents B and C; and 
    3. X3 is the cosine similarity between Documents A and C.
    """
    
    sdq_abs = (sigmoid(X1) / (sigmoid(X2) * sigmoid(X3)))
    sdq_max = (sigmoid(1) / (sigmoid(-1) ** 2))
    sdq_min = (sigmoid(-1) / (sigmoid(1) ** 2))
    
    return  ((sdq_abs - sdq_min))/(sdq_max - sdq_min)

In [None]:
def train_doc2vec_model(corpus, vec_size, min_count, num_epochs, dm, alpha=0.025, min_alpha=0.00025):
    
    """This function trains Doc2Vec models."""
    
    # Instantiate the model
    model = Doc2Vec(vector_size=vec_size, 
                    min_count=min_count, 
                    epochs=num_epochs, 
                    dm=dm,
                    alpha=alpha, 
                    min_alpha=min_alpha) 
    
    # Build vocab given the corpus
    model.build_vocab(corpus)
    
    # Train model
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    
    return model

In [None]:
def average_sdq_norm(model, candidate_list):
    
    """This function computes the average SDQ norm across all triplets of candidate scam reports."""

    similarity_indices = []

    for i in range(len(candidate_list)):
        A = model.infer_vector(candidate_list[i][0].split())
        B = model.infer_vector(candidate_list[i][1].split())
        C = model.infer_vector(candidate_list[i][2].split())
        X1 = cosine_similarity(A, B)
        X2 = cosine_similarity(A, C)
        X3 = cosine_similarity(B, C)
        similarity_indices.append(compute_sdq_norm(X1, X2, X3))

    return round(np.mean(similarity_indices), 5)

In [None]:
def compute_self_similarity(model, corpus):
    
    """This function computes self-similarity index of a trained Doc2Vec model."""

    ranks = []

    for doc_id in range(len(corpus)): 

        # Use the trained model of each epoch to infer vectors for each document in the validation data
        inferred_vector = model.infer_vector(corpus[doc_id].words)

        # Using the inferred vector, compute similarities with documents in the training data (It should be most similar to itself).
        sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

        # Create a list of document IDs in order of similarity to the current document
        doc_list = [docid for docid, sim in sims]

        # Extract the index position of the current document
        rank = doc_list.index(doc_id)

        # Append rank to the list, 'ranks'
        ranks.append(rank)

    # Perform a count of numbers in the list, 'ranks'
    c = Counter(ranks)

    # Compute the proportion of inferred vectors which correspond to the same document
    perc_self_similar = [(i, c[i] / len(ranks)) for i in c][0][1]

    return round(perc_self_similar, 5)

## Load the dataset

In [None]:
df = read_csv("Data/scam_data_4.csv")[['submission_id', 'preprocessed_text', 'scam_type']]
df.head()

Unnamed: 0,submission_id,preprocessed_text,scam_type
0,20200717-fBLC6F,they call me by whatsapp it was strange for th...,Impersonation Scam
1,20200717-yOxIAl,it happened this morning hrs i received a phon...,Phishing Scam
2,20200717-Tz5TyW,i rceived a call from a lady claiming to be ca...,Phishing Scam
3,20200716-O79B6r,details i received a call from what seemed lik...,Impersonation Scam
4,20200716-yIa3LH,an impersonated junior technical staff called ...,Phishing Scam


In [None]:
corpus = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[i]) for i, _d in enumerate(list(df['preprocessed_text']))]

Example of how the first scam report in the corpus looks like...

In [None]:
corpus[0]

TaggedDocument(words=['they', 'call', 'me', 'by', 'whatsapp', 'it', 'was', 'strange', 'for', 'this', 'number', 'but', 'dbs', 'bank', 'logo', 'convinced', 'me', 'to', 'answer', '.', 'the', 'whatsapp', 'profile', 'was', 'dbslogo', ',', 'so', 'i', 'answer', ',', 'and', 'they', 'say', 'my', 'card', 'was', 'blocked', '.', 'then', 'they', 'ask', 'me', 'my', 'identity', 'number', '....', 'this', 'finally', 'explain', 'me', 'that', 'it', 'was', 'false', '.', 'so', 'i', 'replied', 'saying', 'that', 'i', 'would', 'go', 'to', 'the', 'branch', 'for', 'my', 'card', '.', 'they', 'triedto', 'convince', 'me', 'to', 'do', 'at', 'the', 'phone', ',', 'but', 'i', 'will', 'not', 'give', 'them', 'any', 'data', '.', 'they', 'then', 'ended', 'the', 'call', '.'], tags=[0])

## Preparing candidate text

This section finds triplets of three scam reports, where two scam reports are similar to each other and a third scam report is dissimilar to the first two. Eight triplets have been identified and pre-selected, and saved as the CSV file, `scam_candidate.csv`.

In [None]:
# search_word = "sex scam"
# idx_list = []
# data_list = list(df['preprocessed_text'])
# for i in range(len(data_list)):
#     idx_list.append(i) if re.search(search_word.lower(), data_list[i].lower()) else None
# print(idx_list)

In [None]:
# # Store the indices of pre-inspected candidate reports in a list
# candidate_text_idx = [157, 159, 2751, 404, 10, 1841, 602, 711, 2944, 1161, 815, 2922, 3602, 3896, 1497, \
#                       1755, 1753, 2843, 322, 421, 758, 435, 449, 305]

# # Slice the original dataframe to subset the candidate reports only
# candidate_text = df.iloc[candidate_text_idx, [1,2]]

# # Save as CSV file
# candidate_text.to_csv("Data/scam_candidate.csv")

## Load candidate text

In [None]:
# Load the CSV file as a dataframe
candidate_text = pd.read_csv("Data/scam_candidate.csv")

# Create a list of candidate text
candidate_text_list = list(candidate_text['preprocessed_text'])

# Create a list of 3 lists each
candidate_text_list = [candidate_text_list[i*3:i*3+3] for i in range(int(len(candidate_text_list)/3))]

### An example of a set of candidate texts

In [None]:
e1 = candidate_text[['preprocessed_text']][0:3]
e1.style.set_properties(subset=['preprocessed_text'], **{'width': '500px'})

Unnamed: 0,preprocessed_text
0,"automated phone call claiming it was from ministry_of_health with urgent information and to press 3 for more details. then a person came on the line speaking chinese, when i spoke english they hung up."
1,1 received a voice automated call from ministry_of_health asking me to follow their instructions as there is urgent information required by me.2 no money was lost
2,"receiving calls recently from the number originating from switzerland, suspect it could be related to wangiri scam. usually is a missed call, but if attend i hear automated voice of a lady saying hello darling, which i believe is targeted for guys to attract and trigger to call back them to charge call money value. luckily read few articles about the scam and have blocked it."


### Another example of a set of candidate texts

In [None]:
e2 = candidate_text[['preprocessed_text']][3:6]
e2.style.set_properties(subset=['preprocessed_text'], **{'width': '500px'})

Unnamed: 0,preprocessed_text
3,"received a scam call today from . it was an indian female with a strong accent, claiming to be olivia smith from singtel calling to check on suspicious connections on my internet. she gave me a number to call back and said her employee identity was dcl00198. when i asked her to verify that she is a singtel officer, she hung up."
4,"a lady claimed her name as olivia, calling from singtel tech department, employee number tlc00198. told me there is people stealing my internet, asked me to open my laptop terminal and execute netstat command, then explained to me what she made up how my ip has been misused by both domestic and foreign people. she was about to transfer me to another so called technical engineer before i hang up. she said she is authorized to resolve this today and asked me to call her back at .called my telco company and verified, this is totally a scam. do not trust the scammers."
5,"since 1st august 2018, i am getting numerous calls from this 370 numbers.i have not pick up or called this number because its checked coming from lithuania country which never makes sense since i do not have any business or partners over there. i have blocked these numbers on my phone but apparently getting calls with 370 with new extension numbers.advice please do not call back or receive call. its a wangiri scam dr.ptbn"


## Training the Model

### Experiment with different parameters

Here, we train doc2vec models using different combinations of parameters. The objective is to select the set of parameters which produces the most optimal Doc2Vec model in terms of (1) Average Similarity Index and (2) Self Similarity.

In [None]:
num_epochs = [10, 25, 50, 100]
num_dim = [20, 30, 40, 50]
dm = [0, 1]
min_count = 2

In [None]:
avg_sdq_norm = []
self_similarity = []
suffix = 1

for i in range(len(num_epochs)):
    for j in range(len(num_dim)):
        for k in range(2):
            
            # Train the model
            model = train_doc2vec_model(corpus=corpus, vec_size=num_dim[j], min_count=2, num_epochs=num_epochs[i], dm=dm[k])
            
            # For each model, compute average similarity index when compared against the candidate set
            sdq_norm = average_sdq_norm(model, candidate_text_list)
            avg_sdq_norm.append(sdq_norm)
            
            # Also compute self similarity score
            ssi = compute_self_similarity(model, corpus)
            self_similarity.append(ssi)
            print("%d epochs, %d dimensions, dm = %d --> Normalised SDQ : %f | SSI: %f" % (num_epochs[i], num_dim[j], dm[k], sdq_norm, ssi))

            # Save each model 
            path = "Models/Doc2Vec/" + "doc2vec_model_" + str(suffix) + ".model"
            model.save(path)
            print("Model saved at " + path, "\n")
            suffix += 1

10 epochs, 20 dimensions, dm = 0 --> ASI: 0.158610 | SS: 0.714540
Model saved at Models/Doc2Vec/doc2vec_model_1.model 

10 epochs, 20 dimensions, dm = 1 --> ASI: 0.181750 | SS: 0.729910
Model saved at Models/Doc2Vec/doc2vec_model_2.model 

10 epochs, 30 dimensions, dm = 0 --> ASI: 0.157200 | SS: 0.731440
Model saved at Models/Doc2Vec/doc2vec_model_3.model 

10 epochs, 30 dimensions, dm = 1 --> ASI: 0.201230 | SS: 0.864730
Model saved at Models/Doc2Vec/doc2vec_model_4.model 

10 epochs, 40 dimensions, dm = 0 --> ASI: 0.157520 | SS: 0.714100
Model saved at Models/Doc2Vec/doc2vec_model_5.model 

10 epochs, 40 dimensions, dm = 1 --> ASI: 0.192390 | SS: 0.912170
Model saved at Models/Doc2Vec/doc2vec_model_6.model 

10 epochs, 50 dimensions, dm = 0 --> ASI: 0.157460 | SS: 0.689940
Model saved at Models/Doc2Vec/doc2vec_model_7.model 

10 epochs, 50 dimensions, dm = 1 --> ASI: 0.192450 | SS: 0.926440
Model saved at Models/Doc2Vec/doc2vec_model_8.model 

25 epochs, 20 dimensions, dm = 0 --> ASI

## Saving the results as CSV file

In [None]:
epoch_list = [[10] * 8, [25] * 8, [50] * 8, [100] * 8]
epoch_list = [j for i in epoch_list for j in i]

dim_list = [[20] * 2, [30] * 2, [40] * 2, [50] * 2]
dim_list = [[j for i in dim_list for j in i] * 4][0]

dm_list = [[0, 1] * 16][0]

In [None]:
doc2vec_results = pd.DataFrame({'num_epoch': epoch_list, 'num_dim': dim_list, 'dm_mode': dm_list, 'SDQ_norm':avg_sdq_norm, 'SSI':self_similarity})
doc2vec_results.head()

Unnamed: 0,num_epoch,num_dim,dm_mode,ASI,SS
0,10,20,0,0.15861,0.71454
1,10,20,1,0.18175,0.72991
2,10,30,0,0.1572,0.73144
3,10,30,1,0.20123,0.86473
4,10,40,0,0.15752,0.7141


In [None]:
doc2vec_results.to_csv("Results/doc2vec_results.csv")

## Varying number of epochs (num dim = 50; PV-DM)

In [None]:
num_epochs = [25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300, 350, 400, 450, 500]

In [1]:
avg_sdq_norm = []
self_similarity = []
suffix = 1

for i in range(len(num_epochs)):   
    
    # Train the model
    model = train_doc2vec_model(corpus=corpus, vec_size=50, min_count=2, num_epochs=num_epochs[i], dm=1)

    # For each model, compute average SDQ norm when compared against the candidate set
    sdq_norm = average_sdq_norm(model, candidate_text_list)
    avg_sdq_norm.append(sdq_norm)

    # Also compute self similarity score
    ssi = compute_self_similarity(model, corpus)
    self_similarity.append(ssi)
    print("%d epochs, 50 dimensions, dm = PV-DM --> Normalised SDQ: %f | SSI: %f" % (num_epochs[i], sdq_norm, ssi))
    
    # Save each model 
    path = "Models/Doc2Vec_50D_PVDM/" + "doc2vec_model_" + str(suffix) + ".model"
    model.save(path)
    print("Model saved at " + path, "\n")
    suffix += 1

In [None]:
doc2vec_results_50D_PVDM = pd.DataFrame({'num_epochs': num_epochs, 'SDQ_norm':avg_sdq_norm, 'SSI':self_similarity})
doc2vec_results_50D_PVDM

In [None]:
doc2vec_results_50D_PVDM.to_csv("Results/doc2vec_results_50D_PVDM.csv")