In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive so that its contents are reachable under /content/drive.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime;
# local runs need to skip this cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the project's CSV files.
# Keep the trailing slash: file names are appended directly to this string.
project_path = '/content/drive/MyDrive/NLP/'
 

In [None]:
## Read public_data.csv into `df` and input_data.csv into `df_test` from the project folder
df = pd.read_csv(f"{project_path}public_data.csv")
df_test = pd.read_csv(f"{project_path}input_data.csv")
########## For people running locally: read the CSV from the working directory instead #####
## NOTE(review): the variable and file name below do not match the ones used above - confirm or remove
#emails_df2 = pd.read_csv('enron_classification_df.csv') 

In [None]:
## Function for augmenting data using language translation (translate to another language and back)
## Could not find a free service for language translation; use a paid service such as Azure or Google Translate

from textblob import TextBlob
from textblob.translate import NotTranslated
import random
## Random source used to pick the pivot language (SystemRandom draws from the OS, so it cannot be seeded)
sr = random.SystemRandom()

## Language codes that a message may be translated into before being translated back to English
language = [
    "es", "de", "fr", "ar",
    "te", "hi", "ja", "fa",
    "sq", "bg", "nl", "gu",
    "ig", "kk", "mt", "ps",
]

def data_augmentation(message, language, aug_range=1):
    """Generate augmented variants of ``message`` by round-trip translation.

    Each variant is produced by translating the message into a language picked
    at random from ``language`` and translating that result back to English
    (``to="en"``).

    Args:
        message: Text to augment. Objects exposing ``decode`` (e.g. ``bytes``)
            are decoded as UTF-8 first.
        language: Sequence of language codes the pivot language is drawn from.
        aug_range: Number of variants to generate.

    Returns:
        A list with ``aug_range`` strings. If either translation step raises
        ``NotTranslated``, the original message is used for that slot, so the
        result never contains text that is still in the pivot language.

    NOTE(review): ``TextBlob.translate`` appears to be deprecated since
    TextBlob 0.16 and removed in 0.18 - confirm the installed version.
    """
    if hasattr(message, "decode"):
        message = message.decode("utf-8")

    augmented_messages = []
    for _ in range(aug_range):
        try:
            ## Converting to a random language for meaningful variation
            pivot = TextBlob(message).translate(to=sr.choice(language))
            variant = str(pivot.translate(to="en"))
        except NotTranslated:
            ## Fall back to the untouched message; previously a failed
            ## back-translation left the pivot-language text in the output.
            variant = str(message)
        augmented_messages.append(variant)

    return augmented_messages

In [None]:
## Mapping of intent -> number of rows carrying that intent
## The intent is stored in the "label" column of df
intent_count = df["label"].value_counts().to_dict()

In [None]:
## Display the per-intent counts to inspect the class imbalance
intent_count

{'bookflight': 260,
 'changeorder': 26,
 'changeseatassignment': 935,
 'checkbalance': 334,
 'checkclaimstatus': 448,
 'checkoffereligibility': 52,
 'checkserverstatus': 129,
 'closeaccount': 71,
 'disputecharge': 215,
 'expensereport': 358,
 'getboardingpass': 655,
 'getinformationintent': 194,
 'getpromotions': 10,
 'getproofofinsurance': 952,
 'getroutingnumber': 52,
 'getseatinfo': 221,
 'orderbreakfastintent': 28,
 'orderburgerintent': 285,
 'orderchecks': 40,
 'orderdessertintent': 291,
 'orderdrinkintent': 746,
 'orderpizzaintent': 978,
 'ordersaladintent': 298,
 'ordersideintent': 105,
 'providereceipt': 13,
 'replacecard': 72,
 'reportbrokenphone': 280,
 'reportbrokensoftware': 325,
 'reportlostcard': 426,
 'softwareupdate': 288,
 'startorder': 264,
 'startserviceintent': 1552,
 'stoporder': 10,
 'transfermoney': 231,
 'updateaddress': 370,
 'upgradeserviceintent': 12,
 'viewbillsintent': 84}

In [None]:
## Size of the largest class; minority classes are augmented until they reach this count
import operator
max_intent_count = max(intent_count.values())

In [None]:
## Balance the data set: every intent with fewer rows than max_intent_count is
## topped up with augmented messages until it reaches max_intent_count rows.
## Frames are collected in a list and concatenated once at the end, because
## DataFrame.append no longer exists in pandas 2.x and row-by-row appending
## copies the whole frame on every iteration.
import numpy as np
import math
import tqdm

balanced_parts = []
for intent, count in intent_count.items():
    intent_rows = df[df["label"] == intent]
    count_diff = max_intent_count - count    ## Difference to fill

    if count_diff <= 0:
        ## Already the largest class: keep its rows as they are.
        ## NOTE(review): these rows keep every column of df, while augmented
        ## intents only carry "message" and "label" - confirm df has no other columns.
        balanced_parts.append(intent_rows)
        continue

    ## Variants requested per message so that at least count_diff candidates exist
    multiplication_count = math.ceil(count_diff / count)

    ## Creating new augmented batch from existing minority class
    augmented_messages = []
    for message in tqdm.tqdm(intent_rows["message"], desc=str(intent)):
        augmented_messages.extend(data_augmentation(message, language, multiplication_count))

    ## Existing minority class batch
    old_message_df = intent_rows[["message", "label"]]
    new_message_df = pd.DataFrame({"message": augmented_messages}).assign(label=intent)

    ## Select random data points from augmented data
    ## (unseeded, so the chosen subset differs between runs)
    new_message_df = new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])

    ## Merge existing and augmented data points
    balanced_parts.extend([old_message_df, new_message_df])

## ignore_index gives the result a clean 0..n-1 index instead of repeated per-batch indices
newdf = pd.concat(balanced_parts, ignore_index=True) if balanced_parts else pd.DataFrame()

100%|██████████| 978/978 [28:29<00:00,  1.75s/it]
100%|██████████| 952/952 [21:42<00:00,  1.37s/it]
100%|██████████| 935/935 [20:39<00:00,  1.33s/it]
100%|██████████| 746/746 [42:21<00:00,  3.41s/it]
100%|██████████| 655/655 [23:08<00:00,  2.12s/it]
100%|██████████| 448/448 [25:45<00:00,  3.45s/it]
100%|██████████| 426/426 [25:43<00:00,  3.62s/it]
100%|██████████| 370/370 [33:57<00:00,  5.51s/it]
100%|██████████| 358/358 [30:32<00:00,  5.12s/it]
100%|██████████| 334/334 [29:33<00:00,  5.31s/it]
100%|██████████| 325/325 [34:15<00:00,  6.32s/it]
100%|██████████| 298/298 [35:39<00:00,  7.18s/it]
100%|██████████| 291/291 [31:43<00:00,  6.54s/it]
100%|██████████| 288/288 [29:08<00:00,  6.07s/it]
100%|██████████| 285/285 [35:53<00:00,  7.55s/it]
100%|██████████| 280/280 [30:26<00:00,  6.52s/it]
100%|██████████| 264/264 [29:55<00:00,  6.80s/it]
100%|██████████| 260/260 [24:38<00:00,  5.69s/it]
100%|██████████| 231/231 [34:44<00:00,  9.02s/it]
100%|██████████| 221/221 [18:58<00:00,  5.15s/it]


In [None]:
## Show how many rows each intent has after balancing
newdf["label"].value_counts()

startserviceintent       1552
transfermoney            1552
disputecharge            1552
getinformationintent     1552
checkserverstatus        1552
ordersideintent          1552
viewbillsintent          1552
replacecard              1552
closeaccount             1552
checkoffereligibility    1552
getroutingnumber         1552
orderchecks              1552
orderbreakfastintent     1552
changeorder              1552
providereceipt           1552
upgradeserviceintent     1552
stoporder                1552
getseatinfo              1552
bookflight               1552
orderpizzaintent         1552
startorder               1552
getproofofinsurance      1552
changeseatassignment     1552
orderdrinkintent         1552
getboardingpass          1552
checkclaimstatus         1552
reportlostcard           1552
updateaddress            1552
expensereport            1552
checkbalance             1552
reportbrokensoftware     1552
ordersaladintent         1552
orderdessertintent       1552
softwareup

In [None]:
## Persist the balanced data set without the DataFrame index.
## NOTE(review): the path is relative, so the file lands in the current working
## directory rather than under project_path - confirm this is the intended location.
newdf.to_csv('newdf_full.csv', index=False)