In [164]:
import pandas as pd
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from transformers import pipeline, set_seed
import random
from flashtext import KeywordProcessor
from ast import literal_eval
import re

In [11]:
sentence1 = "Must be an extrovert with an innate quality of easily connecting with people."
sentence2 = "You're self-motivated and decisive, but willing to make changes with minimal grumbling when the client demands it."
sentence3 = "Ideal candidates will be caring, compassinate, adaptable, flexible, commited to high standards of care and possess excelent communication skills both verbal and written."
sentence4 = "To be considered for this position you must be energetic, friendly, compassionat and detail oriented."
sentence5 = "We are looking for a young and driven candidate who can bring innovation into the organization."
sentence6 = "Are you the master of technology and passionate person we are looking for?"

### Use BERT for masking arbitrary words

In [3]:
stop_words = set(stopwords.words('english'))

In [66]:
model = pipeline('fill-mask', model='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [149]:
def mask_random_words(sentence, stop_words, biased_phrase, masked_result=0):
    augmented_sentence = sentence
    temp = ""
    for _ in range(5):
        not_masked =  True
        for word in augmented_sentence.split(" "):
            random_num = random.random()
            if word not in stop_words and word not in biased_phrase and random_num <= 0.2 and not_masked:
                temp += "[MASK] "
                not_masked = False
            else:
                temp += word + " "
        if "[MASK]" in temp:
            result = model(temp)
            temp = temp.replace("[MASK]", result[masked_result]['token_str'])
        augmented_sentence = temp[:-1]
        temp = ""
            
    return augmented_sentence

In [104]:
print(sentence2)
mask_random_words(sentence2, stop_words, "decisive")

You're self-motivated and decisive, but willing to make changes with minimal grumbling when the client demands it.


"You're strong and decisive, but willing to make changes with no hesitation when the client demands ."

### Use GPT-2 to finish the sentence for you

In [9]:
generator = pipeline('text-generation', model='gpt2')
set_seed(42)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [49]:
def finish_sentence(sentence, num_generations=5, max_length=25):
    generated_sentences = []
    temp_generations = generator(sentence, max_length=max_length, num_return_sequences=num_generations)
    for generations in temp_generations:
        temp = generations['generated_text']
        temp = temp.split('.')[0]
        temp = temp + '.'
        generated_sentences.append(temp)
    return generated_sentences

In [53]:
finish_sentence("We are looking for a young and driven candidate")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['We are looking for a young and driven candidate who can be an effective, independent voice in a number of regions," he said.',
 'We are looking for a young and driven candidate who looks to move quickly, and speaks fluently; wants to get involved with.',
 'We are looking for a young and driven candidate who will help us get closer to the political level and to get past difficult times.',
 'We are looking for a young and driven candidate to carry the message.',
 'We are looking for a young and driven candidate.']

### Use flashtext to replace words in sentence

In [100]:
processor = KeywordProcessor(case_sensitive = False)

In [98]:
processor.add_keyword('decisive','caring')
replaced = processor.replace_keywords(sentence2)
print(replaced)

You're self-motivated and caring, but willing to make changes with minimal grumbling when the client demands it.


### Augment Behavioural Bias Data

In [105]:
df_behavioural_bias = pd.read_csv("behavioural_bias_labeled_data.csv")

In [110]:
def is_sentence_biased(label):
    if 'Behavioural Stereotypes' in label:
        return True
    return False

df_behavioural_bias['is_biased'] = df_behavioural_bias['label'].map(lambda x:is_sentence_biased(x))

In [113]:
to_augment = df_behavioural_bias.loc[df_behavioural_bias.is_biased == True]

In [160]:
# augment based on index from 0 to 208
index = 208
sentence_to_augment = to_augment['data'][index]
sentence_to_augment_label = literal_eval(to_augment['label'][index])
for i in range(2):
    for value in sentence_to_augment_label:
        temp_augmented = mask_random_words(sentence_to_augment, stop_words, sentence_to_augment[value[0]: value[1]], i)
        print(temp_augmented)

have a very aggressive development team and have a huge impact on the growth strategies used to become one of the cloud’s next great platforms.
Join a small, aggressive marketing campaign and have a big impact on the company you need to become one of the cloud’s next great platforms.


In [179]:
# use masking to augment biased sentences in labeled data
df_augmented_sentences = pd.DataFrame(columns=to_augment.columns)
for index, row in to_augment.iterrows():
    sentence_to_augment = row['data']
    sentence_to_augment_label = literal_eval(row['label'])
    for i in range(3):
        for value in sentence_to_augment_label:
            temp_augmented = mask_random_words(sentence_to_augment, stop_words, sentence_to_augment[value[0]: value[1]], i)
            new_label = []
            for label in sentence_to_augment_label:
                if sentence_to_augment[label[0]: label[1]] in temp_augmented:
                    find_old_bias = re.finditer(sentence_to_augment[label[0]: label[1]], temp_augmented)
                    for match in find_old_bias:
                        new_label.append([match.start(), match.end(), 'Behavioural Stereotypes'])
            df_augmented_sentences = df_augmented_sentences.append({'data':temp_augmented, 'label': new_label}, ignore_index = True)

In [197]:
df_augmented_sentences['is_biased'] =  df_augmented_sentences['is_biased'].fillna('augmented')

In [203]:
frames = [df_augmented_sentences, df_behavioural_bias]
merged_augmented_labeled = pd.concat(frames)

In [205]:
merged_augmented_labeled.loc[merged_augmented_labeled.is_biased != False].head()

Unnamed: 0,data,label,is_biased
0,We're looking for a young and stubborn new rec...,"[[30, 39, Behavioural Stereotypes]]",augmented
1,still looking for a talented and stubborn fash...,"[[33, 42, Behavioural Stereotypes]]",augmented
2,and looking for a brave and stubborn new styli...,"[[28, 37, Behavioural Stereotypes]]",augmented
3,always looking for a talented and self-confide...,"[[34, 48, Behavioural Stereotypes]]",augmented
4,just waiting for a beautiful and self-confiden...,"[[33, 47, Behavioural Stereotypes]]",augmented


In [223]:
merged_augmented_labeled.to_csv("merged_augmented_labeled.csv", index=False)

### Examples of masking augmentation
Join a small, aggressive <b>management</b> team and have a big impact on the <b>business</b> strategies used to become one of the cloud’s next great platforms.

Join a small, aggressive <b>marketing</b> team and have a big impact on the <b>growth</b> strategies used to become one of the cloud’s next great platforms.


In [224]:
merged_augmented_labeled

Unnamed: 0,data,label,is_biased
3,always looking for a talented and self-confide...,"[[34, 48, Behavioural Stereotypes]]",augmented
4,just waiting for a beautiful and self-confiden...,"[[33, 47, Behavioural Stereotypes]]",augmented
5,We're looking for a talented and self-confiden...,"[[33, 47, Behavioural Stereotypes]]",augmented
6,We're looking for a talented and self-sufficie...,"[[33, 48, Behavioural Stereotypes]]",augmented
7,always looking for a talented and self-suffici...,"[[34, 49, Behavioural Stereotypes]]",augmented
...,...,...,...
617,We have lots of ideas—some of them rather ambi...,"[[42, 51, 'Non-biased']]",False
618,All communication will be treated confidentially.,"[[34, 48, 'Non-biased']]",False
619,The role will be a confidant for the team and ...,"[[19, 29, 'Non-biased'], [92, 102, 'Non-biased']]",False
620,You’ll do this by applying your skills and int...,"[[98, 110, 'Non-biased'], [43, 53, 'Non-biased']]",False
