# Generating Augmented Text

The objective of this Python notebook is to create synthetic text by applying text augmentation techniques on text in the "minority" classes of the dataset, while taking a random sample from the "majority" classes. The goal is to create a dataset with balanced classes.

## Import the necessary libraries

In [2]:
import pandas as pd
import random
from random import shuffle
import spacy
import nltk
from nltk import sent_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
random.seed(1)

In [3]:
%pprint

Pretty printing has been turned OFF


## Define functions

### Random Swap

Randomly swap two words in the sentence n times

In [4]:
def swap_word(new_words):
	random_idx_1 = random.randint(0, len(new_words)-1)
	random_idx_2 = random_idx_1
	counter = 0
	while random_idx_2 == random_idx_1:
		random_idx_2 = random.randint(0, len(new_words)-1)
		counter += 1
		if counter > 3:
			return new_words
	new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 
	return new_words

In [5]:
def random_swap(words, n=4):
	new_words = words.copy()
	for _ in range(n):
		new_words = swap_word(new_words)
	return new_words

### Random Deletion

Randomly delete words from the sentence with probability p

In [6]:
def random_deletion(words, p=0.4):

	#obviously, if there's only one word, don't delete it
	if len(words) == 1:
		return words

	#randomly delete words with probability p
	new_words = []
	for word in words:
		r = random.uniform(0, 1)
		if r > p:
			new_words.append(word)

	#if you end up deleting all words, just return a random word
	if len(new_words) == 0:
		rand_int = random.randint(0, len(words)-1)
		return [words[rand_int]]

	return new_words

### Random insertion

Randomly insert n words into the sentence

In [7]:
def random_insertion(words, n=3):
	new_words = words.copy()
	for _ in range(n):
		add_word(new_words)
	return new_words

In [8]:
def add_word(new_words):
	synonyms = []
	counter = 0
	while len(synonyms) < 1:
		random_word = new_words[random.randint(0, len(new_words)-1)]
		synonyms = get_synonyms(random_word)
		counter += 1
		if counter >= 10:
			return
	random_synonym = synonyms[0]
	random_idx = random.randint(0, len(new_words)-1)
	new_words.insert(random_idx, random_synonym)

In [9]:
def get_synonyms(word):
	synonyms = set()
	for syn in wordnet.synsets(word): 
		for l in syn.lemmas(): 
			synonym = l.name().replace("_", " ").replace("-", " ").lower()
			synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
			synonyms.add(synonym) 
	if word in synonyms:
		synonyms.remove(word)
	return list(synonyms)

### Other functions

In [10]:
def read_csv(path):
    
    """This function reads a CSV file from a specified filepath, while preserving the data types of each variable.
    Source: https://stackoverflow.com/questions/50047237/how-to-preserve-dtypes-of-dataframes-when-using-to-csv/50051542#50051542"""
    
    # Read types first line of csv
    dtypes = {key:value for (key,value) in pd.read_csv(path, nrows=1).iloc[0].to_dict().items() if 'date' not in value}

    parse_dates = [key for (key,value) in pd.read_csv(path, 
                   nrows=1).iloc[0].to_dict().items() if 'date' in value]
    
    # Read the rest of the lines with the types from above
    return pd.read_csv(path, dtype=dtypes, parse_dates=parse_dates, skiprows=[1])

In [11]:
def augment(data, n, scam_type):
    
    """This function creates n synthetic reports using three main methods and returns a dataframe:
    1. Random swap
    2. Random deletion
    3. Random insertion """
    
    report_list = list(data[data['scam_type'] == scam_type]['preprocessed_text'])
    func_list = [random_swap, random_deletion, random_insertion]
    
    augmented_reports = []
    
    for i in range(n): 

        augmented_report = []
        # Randomly select one report from the list of reports
        one_random_report = random.choice(report_list) 
        for sent in sent_tokenize(one_random_report):

            # For each sentence of the random report, apply a random function
            random_func = random.choice(func_list)
            if random_func == random_swap:
                sent = ' '.join(random_swap(sent.split(), n=4))
            elif random_func == random_deletion:
                sent = ' '.join(random_deletion(sent.split(), p=0.30))
            elif random_func == random_insertion:
                sent = ' '.join(random_insertion(sent.split(), n=2))

            # Append the augmented sentence 
            augmented_report.append(sent)

        augmented_reports.append(' '.join(augmented_report))
    
    return pd.DataFrame({"preprocessed_text": augmented_reports, "scam_type": scam_type})

In [12]:
def remove_stopwords(text_string):

    """This function takes a text string as input, tokenises it and returns a list of tokens without stopwords."""
    
    word_list = [word for word in nltk.word_tokenize(text_string) if not word in set(stopwords.words('english'))]
    text = ' '.join(word_list).replace(' .', '').replace(' ,', '').replace('< ', '<').replace(' >', '>')

    return text

In [13]:
def lemmatise(text_string):

    """This function takes a tokenised text string as input and returns another tokenised text string after lemmatisation."""

    list_of_tokens = [token.lemma_ for token in nlp(text_string)]
    text = ' '.join(list_of_tokens).replace('< ', '<').replace(' >', '>')
    
    return text

In [14]:
def data_top_n(data, n):
    
    """This function takes a scam dataframe and returns records corresponding to the top n categories."""
    
    count_df = data[['scam_type','preprocessed_text']].groupby('scam_type').aggregate({'preprocessed_text':'count'}).reset_index().sort_values('preprocessed_text',ascending=False)
    top_n_scam_types = list(count_df.nlargest(n, 'preprocessed_text').scam_type)
    
    return data[data['scam_type'].isin(top_n_scam_types)]

In [15]:
def augment_dataset(data, threshold=300):
    
    """This function takes a scam dataframe as input and does the following:
    1. For records corresponding to scam types equal to or above the threshold, take a random sample of the threshold size;
    2. For records corresponding to scam types below the threshold, perform text augmentation to make up the numbers."""
    
    count = dict(data.scam_type.value_counts())
    
    # Initialise an empty dataframe
    df = pd.DataFrame({"preprocessed_text": [], "scam_type": []})

    for scam_type, count in count.items():

        if count >= threshold:
            df = pd.concat([df, data[data.scam_type == scam_type].sample(threshold)])

        else:
            augment_num = threshold - count
            df = pd.concat([df, data[data.scam_type == scam_type], augment(data=data, n=augment_num, scam_type=scam_type)])

    return df

## Test on Dummy Text

In [16]:
test = "I received a call with an automated voice claiming to be from the Singapore Police Force."

### Random Swap

In [17]:
' '.join(random_swap(test.split()))

'I received with claiming a an automated voice call to be from Police Singapore Force. the'

### Random deletion

In [18]:
' '.join(random_deletion(test.split()))

'I call with an voice claiming be from Police Force.'

### Random Insertion

In [19]:
' '.join(random_insertion(test.split()))

'I received a call with an automated voice claiming to be from the machine controlled vocalization Singapore Police  Force.'

## Import dataset

In [20]:
data = read_csv("Data/scam_data_4.csv")[['preprocessed_text', 'scam_type']]
data.head()

Unnamed: 0,preprocessed_text,scam_type
0,they call me by whatsapp it was strange for th...,Impersonation Scam
1,it happened this morning hrs i received a phon...,Phishing Scam
2,i rceived a call from a lady claiming to be ca...,Phishing Scam
3,details i received a call from what seemed lik...,Impersonation Scam
4,an impersonated junior technical staff called ...,Phishing Scam


In [16]:
data = read_csv("Data/scam_data_4_v1.csv")[['preprocessed_text', 'scam_type']]
data.head()

Unnamed: 0,preprocessed_text,scam_type
0,they call me by whatsapp it was strange for th...,Impersonation Scam
1,it happened this morning hrs i received a phon...,Phishing Scam
2,i rceived a call from a lady claiming to be ca...,Phishing Scam
3,details i received a call from what seemed lik...,Impersonation Scam
4,an impersonated junior technical staff called ...,Phishing Scam


In [21]:
# Convert scam type from categorical to string data type 
data.scam_type = data.scam_type.astype('str')

# Extract records corresponding to top 6 scam types
data = data_top_n(data=data, n=6)

# Sanity check
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 0 to 4553
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   preprocessed_text  3818 non-null   object
 1   scam_type          3818 non-null   object
dtypes: object(2)
memory usage: 89.5+ KB


In [22]:
count = dict(data.scam_type.value_counts())
count

{'Impersonation Scam': 1611, 'Online Purchase Scam': 812, 'Internet Love Scam': 707, 'Investment Scam': 264, 'Home/Room Rental Scam': 227, 'Credit-for-Sex Scam': 197}

## Perform text augmentation

Set a threshold (e.g. threshold = 400)
- For scam types with counts equal to or above the threshold, take a random sample of the threshold size
- For scam types with counts below the threshold, perform text augmentation to make up the numbers.

In [19]:
# Set a threshold
count_threshold = 400

# Perform text augmentation
augmented_data = augment_dataset(data=data, threshold=count_threshold)

# Check value counts of scam types
augmented_data.scam_type.value_counts()

Investment Scam          400
Home/Room Rental Scam    400
Credit-for-Sex Scam      400
Online Purchase Scam     400
Impersonation Scam       400
Internet Love Scam       400
Name: scam_type, dtype: int64

In [20]:
augmented_data

Unnamed: 0,preprocessed_text,scam_type
330,i received a call from the guy called me sayin...,Impersonation Scam
375,i received an unsolicited call in regard to my...,Impersonation Scam
984,just received a phone call. the voice is machi...,Impersonation Scam
1457,got an automated call from dhl singapore and p...,Impersonation Scam
3754,received a voice call in an automated voice in...,Impersonation Scam
...,...,...
198,girl original was from which me given rekka. w...,Credit-for-Sex Scam
199,i got this email times already.. and keep sayi...,Credit-for-Sex Scam
200,basically cute saw i girl on facebook and so t...,Credit-for-Sex Scam
201,counter collect kidney care price then assigne...,Credit-for-Sex Scam


## Remove stopwords and lemmatise text

In [21]:
nlp = spacy.load('en_core_web_sm')

In [22]:
for index, text in augmented_data.iterrows():

    stopwords_removed = remove_stopwords(text['preprocessed_text'])
    augmented_data.loc[index, 'stopwords_removed'] = stopwords_removed

    lemmatised = lemmatise(stopwords_removed)
    augmented_data.loc[index, 'lemmatised'] = lemmatised

augmented_data.head(10)

Unnamed: 0,preprocessed_text,scam_type,stopwords_removed,lemmatised
330,i received a call from the guy called me sayin...,Impersonation Scam,received call guy called saying dbs bank accou...,receive call guy call say dbs bank account blo...
375,i received an unsolicited call in regard to my...,Impersonation Scam,received unsolicited call regard mycredit card...,receive unsolicited call regard mycredit card ...
984,just received a phone call. the voice is machi...,Impersonation Scam,received phone call voice machinegenerated cla...,receive phone call voice machinegenerat claim ...
1457,got an automated call from dhl singapore and p...,Impersonation Scam,got automated call dhl singapore press talk so...,get automate call dhl singapore press talk som...
3754,received a voice call in an automated voice in...,Impersonation Scam,received voice call automated voice mandarin m...,receive voice call automate voice mandarin may...
72,a call came at. private message on jun from im...,Impersonation Scam,met attractive lady ask chat sex offered hrs r...,meet attractive lady ask chat sex offer hrs ra...
1237,"received an automated voice call, in a heavy c...",Impersonation Scam,received automated voice call heavy chinese pe...,receive automate voice call heavy chinese peop...
2377,avoid picking up this number. an automated voi...,Impersonation Scam,avoid picking number automated voice message c...,avoid pick number automate voice message chine...
2438,received a call from at. private message telli...,Impersonation Scam,received call private message telling recorded...,receive call private message tell recorded voi...
908,a woman with strong indian accent self declare...,Impersonation Scam,woman strong indian accent self declared singt...,woman strong indian accent self declare singte...


## Compute lengths of preprocessed and lemmatised text

In [23]:
augmented_data['len_preprocessed_text'] = augmented_data.apply(lambda x: len(x['preprocessed_text'].split()), axis=1)
augmented_data['len_lemmatised'] = augmented_data.apply(lambda x: len(x['lemmatised'].split()), axis=1)

In [24]:
augmented_data

Unnamed: 0,preprocessed_text,scam_type,stopwords_removed,lemmatised,len_preprocessed_text,len_lemmatised
330,i received a call from the guy called me sayin...,Impersonation Scam,received call guy called saying dbs bank accou...,receive call guy call say dbs bank account blo...,99,44
375,i received an unsolicited call in regard to my...,Impersonation Scam,received unsolicited call regard mycredit card...,receive unsolicited call regard mycredit card ...,107,55
984,just received a phone call. the voice is machi...,Impersonation Scam,received phone call voice machinegenerated cla...,receive phone call voice machinegenerat claim ...,64,30
1457,got an automated call from dhl singapore and p...,Impersonation Scam,got automated call dhl singapore press talk so...,get automate call dhl singapore press talk som...,340,155
3754,received a voice call in an automated voice in...,Impersonation Scam,received voice call automated voice mandarin m...,receive voice call automate voice mandarin may...,38,19
...,...,...,...,...,...,...
198,girl original was from which me given rekka. w...,Credit-for-Sex Scam,girl original given rekka whatsapp advertised ...,girl original give rekka whatsapp advertise na...,97,49
199,i got this email times already.. and keep sayi...,Credit-for-Sex Scam,got email times already. keep saying boss.,get email time already . keep say boss .,16,9
200,basically cute saw i girl on facebook and so t...,Credit-for-Sex Scam,basically cute saw girl facebook seems added s...,basically cute see girl facebook seem add star...,119,51
201,counter collect kidney care price then assigne...,Credit-for-Sex Scam,counter collect kidney care price assigned the...,counter collect kidney care price assign thera...,56,31


## Save as a CSV file

In [25]:
filename = "Data/scam_data_4_augmented_top_" + str(len(count)) + "_" + str(count_threshold) + ".csv"
augmented_data.to_csv(filename)