In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import pie, axis, show
import seaborn as sns
from sklearn.model_selection import train_test_split

# global variables (change these per customer)
# SAMPLE_SIZE: intents with at most this many utterances are kept whole;
#   intents with more are randomly down-sampled to this many rows
SAMPLE_SIZE = 40
# LARGE_SAMPLE_SIZE: number of rows drawn for each of the very largest intents
#   (despite the name this is a row count for the biggest intents, not a bigger sample)
LARGE_SAMPLE_SIZE = 20
# FILE: path of the raw CSV that is loaded with pd.read_csv
FILE = 'file_name.csv'
# DATE and IVA are concatenated into the exported train/test file names
DATE = 'MM.DD.YY'
IVA = 'CustomerName'

In [None]:
# load the raw export named by FILE into a DataFrame
data = pd.read_csv(filepath_or_buffer=FILE)

# show the first five rows as a sanity check
data.head(5)

In [None]:
# standardize data
# ------------------------------------------------------------------------

# map the raw column headers onto the names used by the rest of the notebook
column_map = {"Intent": "intent", "Clean": "utterance"}
data = data.rename(columns=column_map)

# keep just the two columns the sampling works with
df = data.loc[:, ['intent', 'utterance']]

# first rows of the trimmed frame
df.head()

In [None]:
# drop redundant data
# ---------------------------------------------------------------------

# count how often each identical (intent, utterance) row occurs,
# most frequently repeated rows first
group_keys = list(df.columns)
duplicates = (
    df.groupby(group_keys, as_index=False)
    .size()
    .sort_values(by='size', ascending=False)
)

# first rows of the repetition table
duplicates.head()

In [None]:
# drop redundant data
# ------------------------------------------------------------------------------------------------

# how many of the most-repeated (intent, utterance) rows to collapse
TOP_N = 5

# row count before
before = df.shape[0]

print("# of rows before dropping redundant data: ", before)

# head() never returns more rows than exist, so a small data set with fewer
# than TOP_N distinct rows no longer raises IndexError
top_repeated = duplicates.head(TOP_N)

# remove any utterance containing one of the top repeated phrases,
# then add that phrase back exactly once under its own intent
for intent_name, phrase in zip(top_repeated['intent'], top_repeated['utterance']):
    # regex=False: the phrase is matched as literal text, so characters such as
    # "?", "(" or "+" inside an utterance are not interpreted as a pattern.
    # na=True: rows with a missing utterance count as matches and are removed,
    # which is the same outcome the former `== False` comparison produced.
    matches = df["utterance"].str.contains(phrase, regex=False, na=True)
    df = df[~matches]

    # re-add a single representative row before the next phrase is processed
    kept_row = pd.DataFrame([{'intent': intent_name, 'utterance': phrase}])
    df = pd.concat([df, kept_row], ignore_index=True)

# row count after
after = df.shape[0]

print("# of rows after dropping redundant data: ", after)

# difference
diff = before - after

print("# of redundant rows dropped: ", diff)

# peek the reduced frame
df

In [None]:
# drop duplicate data
# ------------------------------------------------------

# size of the frame going in
before = len(df)
print("# of rows before dropping duplicates: ", before)

# keep the first occurrence of every identical row
df = df.drop_duplicates()

# size of the frame coming out
after = len(df)
print("# of rows after dropping duplicates: ", after)

# number of rows that were removed
diff = before - after
print("Total # of duplicates: ", diff)

In [None]:
# create sample
# --------------------------------------------------------------------

# number of utterances per intent, smallest intent first
intent_groups = df.groupby(['intent'])['intent']
data_intents = intent_groups.count().sort_values()

# retrieve the intents that have at most SAMPLE_SIZE utterances
is_small = data_intents <= SAMPLE_SIZE
small_intents = data_intents[is_small]

# show the small intents with their counts
small_intents

In [None]:
# create sample
# ----------------------------

# initialize sample dataframe
# NOTE: this is a plain assignment, not a copy -- `sample` and `df` refer to
# the same DataFrame object, so a column assigned through `sample` before
# `sample` is rebound also appears on `df`
sample = df

# view
sample

In [None]:
# create sample
# -------------------------------------------------------------

# flag every row whose intent is one of the small intents.
# `small_intents` is a Series indexed by intent name, so membership is tested
# against its index (this is also what `name in small_intents` checks).
# One vectorized assignment replaces the former row-by-row
# `sample['temp'].iloc[r] = True`: that form is chained assignment, which is
# silently ignored when pandas Copy-on-Write is active and would leave no row
# flagged.
# NOTE: when this runs right after the initialization cell, `sample` is still
# the same object as `df`, so `df` receives the 'temp' column too (as before).
sample['temp'] = sample['intent'].isin(small_intents.index)

# keep only the flagged rows, then discard the helper column
sample = sample[sample['temp']].drop(columns = 'temp')

# view
sample

In [None]:
# create sample
# -------------------------------------------------------------

# how many of the biggest intents get the reduced LARGE_SAMPLE_SIZE draw
# (2 = the two biggest intents, 1 = only the single biggest)
N_LARGEST = 2

# retrieve intents with counts > SAMPLE_SIZE
large_intents = data_intents[data_intents > SAMPLE_SIZE]

# the N_LARGEST biggest of those; head() simply returns fewer rows when
# fewer than N_LARGEST large intents exist
largest_intents = large_intents.sort_values(ascending = False).head(N_LARGEST)

# remove exactly those intents from large_intents, by label.
# Dropping by label (rather than by "last position") guarantees that the two
# groups never overlap and never leave an intent out, even when counts tie,
# and it does not fail when there are fewer than N_LARGEST large intents.
large_intents = large_intents.drop(largest_intents.index)

# peek both
print('Large Intents: \n', large_intents)
print('\nLargest Intents: \n', largest_intents)

In [None]:
# create sample
# ---------------------------------------------------------------------------------------

# fixed seed so the same rows are drawn on every run of the notebook
RANDOM_STATE = 42

# NOTE: this cell appends to `sample`; re-running it without first re-running
# the cell that selects the small intents adds the drawn rows a second time.
pieces = [sample]

# large intents get SAMPLE_SIZE rows each, the largest intents LARGE_SAMPLE_SIZE
draw_plan = (
    (large_intents, SAMPLE_SIZE),
    (largest_intents, LARGE_SAMPLE_SIZE),
)

for intent_counts, rows_wanted in draw_plan:
    for intent_name in intent_counts.index:
        intent_rows = df.loc[df['intent'] == intent_name]
        # never request more rows than the intent has (sample() would raise)
        n_rows = min(rows_wanted, len(intent_rows))
        pieces.append(intent_rows.sample(n = n_rows, random_state = RANDOM_STATE))

# one concat at the end instead of growing the frame inside the loop
sample = pd.concat(pieces, ignore_index=True)

# peek sample
sample

In [None]:
# clean sample
# remove the helper column if it is present; errors='ignore' keeps this cell
# safe to re-run and safe when no sampled rows carried a 'temp' column
sample = sample.drop(columns = 'temp', errors = 'ignore')

# preview sample
sample

In [None]:
# check sample for all intent representation
# ------------------------------------------------------

# distinct intent names on each side; missing values are left out because
# unique() would otherwise count NaN as an intent of its own
raw_intents = set(data['intent'].dropna().unique())
sample_intents = set(sample['intent'].dropna().unique())

# raw data intent count
before = len(raw_intents)
print("# of unique intents in raw data: ", before)

# sample intent count
after = len(sample_intents)
print("# of unique intents in sample: ", after)

# difference
diff = before - after
print("This should be zero: ", diff)

# name the intents that did not make it into the sample, if any
missing_intents = sorted(raw_intents - sample_intents, key = str)
if missing_intents:
    print("Intents missing from sample: ", missing_intents)

In [None]:
# separate sample into train and test
# fixed seed so the exported train/test files are identical on every run
RANDOM_STATE = 42

# 80 % train / 20 % test; train_size is implied by test_size
sample_train, sample_test = train_test_split(
    sample, test_size = 0.2, random_state = RANDOM_STATE
)

# sort both samples by intent and put the utterance column first
sample_train = sample_train.sort_values(by = 'intent').loc[:, ['utterance', 'intent']]
sample_test = sample_test.sort_values(by = 'intent').loc[:, ['utterance', 'intent']]

# peek one
sample_train.head()

In [None]:
# report how many rows ended up in each split
n_train = len(sample_train)
n_test = len(sample_test)

print(f'Train data points: {n_train}.')
print(f'Test data points: {n_test}.')

In [None]:
# build the output names as <IVA>_<split>_sample_<DATE>.csv
train_filename = "".join([IVA, "_train_sample_", DATE, ".csv"])
test_filename = "".join([IVA, "_test_sample_", DATE, ".csv"])

In [None]:
# safety stop: halts "Run All" here so the sample can be double checked
# before anything is written to disk.
# Set EXPORT_CONFIRMED to True once the sample has been reviewed, then re-run.
EXPORT_CONFIRMED = False

if not EXPORT_CONFIRMED:
    raise RuntimeError(
        "Double check the sample before exporting, "
        "then set EXPORT_CONFIRMED = True and re-run this cell."
    )

In [None]:
# write each split to its CSV file, without the index column
for frame, path in ((sample_train, train_filename), (sample_test, test_filename)):
    frame.to_csv(path, index=False)