# AI Tuning: EDA Template

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from wordcloud import WordCloud
from collections import Counter
from nltk.util import ngrams

# Global Variables

In [None]:
# change variables per customer
# NOTE: the data-import cell splits this name on "_" and reads the IVA name
# from the first field and the timeframe label (DATE) from the sixth field
# (with ".csv" removed), so the real filename needs at least six
# underscore-separated parts.
FILENAME = "file_name.csv"

## Imported Data Manipulation

In [None]:
# load the customer export into a dataframe
data = pd.read_csv(FILENAME)

# the filename is split on "_": the sixth field (minus ".csv") is used as the
# timeframe label, the first field as the IVA name
DATE = FILENAME.split('_')[5].replace('.csv', '')
IVA = FILENAME.split('_')[0]

# double check date range
print("Min date: ", data['Date'].min())
# label corrected: this line used to read "Max data: "
print("Max date: ", data['Date'].max())

# view shape
print("Shape:", data.shape)

# view df (kept as the last expression so the notebook renders it)
data.head()

In [None]:
# collect the intent of every row in the file
intents = list(data['Intent'])

# use a set so all double occurrences are discarded
intent_names = set(intents)
print(f'We identified {len(intent_names)} different intents in the file.\n')

# tally every intent in one pass instead of rescanning the list per intent
tally = Counter(intents)

# counting the total occurrences for each intent
# NaN never compares equal to itself, so the previous equality scan gave a
# missing intent a count of 0; that is kept because the frequency split
# further down uses `value != 0` to leave the 'nan' entry out.
intent_counts = {
    intent: tally[intent] if intent == intent else 0
    for intent in intent_names
}

# reorder dictionary based on the counts (ascending and descending);
# sorted() is stable, so intents with equal counts keep their relative order
ordered_intent_values = dict(sorted(intent_counts.items(), key=lambda pair: pair[1]))
ordered_intent_values_reversed = dict(
    sorted(intent_counts.items(), key=lambda pair: pair[1], reverse=True)
)

# create new arrays to separate
ordered_intents = list(ordered_intent_values.keys())
ordered_counts = list(ordered_intent_values.items())

keys = []
values = []
ratios = []

print('The identified intents and their counts/ratios are the following: ')

intent_items = ordered_intent_values.items()

# number of rows in the file, used as the denominator of every ratio
total_rows = data.shape[0]

for key, value in intent_items:
    keys.append(key)
    values.append(value)
    # share of all rows, as a percentage with two decimals
    ratio = round(((value / total_rows) * 100), 2)
    ratios.append(ratio)
    print(f'{key}: {value}, {ratio}%')

In [None]:
# keys/values were collected in ascending order of count; flip both so the
# most frequent intent comes first
numbers = list(reversed(values))
labels = list(reversed(keys))

# figure height scales with the number of distinct intents
height = len(intent_names) * 0.25

# open the figure and draw the horizontal bars with the winter palette
plt.figure(figsize=(10, height))
sns.barplot(x=numbers, y=labels, palette="winter")

# tick label sizes
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# axis labels and title
plt.xlabel("\nNumber of Utterances\n", fontsize=12, fontweight="bold")
plt.ylabel("\nIntents\n", fontsize=12, fontweight="bold")
plt.title(f'\n{IVA}: Intent Overview\n', fontsize=14, fontweight="bold")

# write the chart to disk, display it, then close it to release memory
plt.savefig('intent_overview.png', bbox_inches='tight')
plt.show()
plt.close()

# Intent Analysis

In [None]:
# split the ascending intent table into its rarest and its most frequent end
# (change the 6 / 3 cut-offs below to display more or fewer intents)

least_frequent_intents = {}
most_frequent_intents = {}
dict_length_slice = len(ordered_intent_values) - 1

index = 0

# positions are counted from 1, in ascending order of count
for index, (key, value) in enumerate(ordered_intent_values.items(), start=1):

    # first six positions: rarest intents; a zero count ('nan') is not kept here
    if index <= 6 and value != 0:
        least_frequent_intents[key] = value
        continue

    # positions past the cut-off at the tail: most frequent intents
    if index > dict_length_slice - 3:
        most_frequent_intents[key] = value

# descending copies of the intent names and their counts
names = list(reversed(keys))
counts = list(reversed(values))

print(f'\nMost frequent intents: {most_frequent_intents}.')
print(f'\nLeast frequent intents: {least_frequent_intents}.')

In [None]:
# find intents of interest
print("\nIntents of Interest:")

# lower-cased intent names that are singled out for a closer look
interest_names = {
    'other',
    'default fallback',
    'default fallback intent',
    'assistance',
    'agent',
    'agent intent',
    'transfer',
    'agent transfer',
    'operator',
    'customer-representative',
    'undefined',
}

intents_of_interest = []

for key, value, ratio in zip(keys, values, ratios):
    # a missing intent arrives as NaN (a float) and has no .lower();
    # skip anything that is not text instead of crashing on it
    if not isinstance(key, str):
        continue
    # case-insensitive match against the names above
    if key.lower() in interest_names:
        print(f'{key}, {value}, {ratio}%')
        intents_of_interest.append(key)

# set reference values: a quarter, an eighth and a sixteenth of the number of
# distinct intents, rounded to the nearest integer
QUARTER = round(len(intent_names) / 4)
EIGHTH = round(len(intent_names) / 8)
SIXTEENTH = round(len(intent_names) / 16)

print(f'\nQuarter value: {QUARTER}')
print(f'Eighth value: {EIGHTH}')
print(f'Sixteenth value: {SIXTEENTH}')

## Intents of Interest

In [None]:
# keep only the rows whose intent is one of the intents of interest
# (the empty placeholder dataframe that was built first has been dropped:
# it was overwritten by this assignment before ever being used)
critical = data[data['Intent'].isin(intents_of_interest)]

# view dataframe
critical.head()

## Timeframe Analysis

In [None]:
# map a 'YYYY-MM-DD' date string to the name of its weekday
def find_weekday(date):
  """Return the English weekday name for a date given as 'YYYY-MM-DD'.

  Raises whatever datetime.strptime raises when the string does not match
  that format.
  """
  # names indexed the way datetime.weekday() numbers them (0 = Monday)
  weekday_names = (
      "Monday", "Tuesday", "Wednesday", "Thursday",
      "Friday", "Saturday", "Sunday",
  )

  # parse the text, then look the weekday number up in the table
  parsed = datetime.strptime(date, '%Y-%m-%d')
  return weekday_names[parsed.weekday()]

# number of rows per date (index: date, value: count)
dates = data.groupby(['Date'])['Date'].count()

# Seed the running extremes with the first day's own count.
# The old seeds (max 0, min 1000) together with the if/elif chain meant the
# first day always took the "max" branch and could never be the slowest day,
# and no slowest day was found at all when every day had 1000 or more rows.
first_count = int(dates.iloc[0])
max_date = dates.index[0]
max_count = first_count
min_date = dates.index[0]
min_count = first_count
skew_date = dates.index[0]
skew_count = 0

# walk the remaining days, reading each count straight from the grouped
# series rather than re-filtering the whole dataframe per day
for date, date_count in dates.iloc[1:].items():

    if date_count > max_count:
        # new busiest day (strict comparison: the earliest day wins ties)
        max_count = date_count
        max_date = date
    elif date_count < min_count:
        # new slowest day (strict comparison: the earliest day wins ties)
        min_count = date_count
        min_date = date
    elif date_count > skew_count:
        # largest count among days that set neither a new max nor a new min
        skew_count = date_count
        skew_date = date

print('Number of utterances: ', len(intents))
print('Timeframe: ', DATE)

# number of rows per date
dates = data.groupby(['Date'])['Date'].count()

# create dataframe of dates
daily_df = pd.DataFrame(dates)

# rename the single column by assigning the labels; writing into
# `columns.values[0]` mutated the index's underlying array in place, which
# pandas does not guarantee to honour in later label lookups
daily_df.columns = ["Utterance Count"]

# sort dataframe by the 'Date' index level
daily_df = daily_df.sort_values(by='Date')

# average and median utterances per day, rounded to two decimals
average = round(daily_df['Utterance Count'].mean(), 2)
median = round(daily_df['Utterance Count'].median(), 2)

# plot the daily counts as a line
daily_df.plot(kind='line', figsize=(10, 6), label='Utterance Count')

# horizontal reference lines for the average and the median
plt.axhline(average, color='r', linestyle='--', label='Average Count')
plt.axhline(median, color='y', linestyle='--', label='Median Count')

# legend
plt.legend()

# label
plt.xlabel('\nDates', fontweight="bold", fontsize=12)
plt.ylabel('Utterance Count\n', fontweight="bold", fontsize=12)
plt.title(f'{IVA}: Utterance Count by Day\n', fontweight="bold", fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# save the plot as an image
plt.savefig('utterance_per_day.png', bbox_inches='tight')

# show the plot
plt.show()

# close the plot to release memory
plt.close()

# view details (max_date / min_date and their counts come from the scan above)
print(f'Busiest Day: {max_date} ({find_weekday(max_date)})')
print(f'Slowest Day: {min_date} ({find_weekday(min_date)})')
print(f'Max Utterance Count: {max_count}')
print(f'Min Utterance Count: {min_count}')
print(f'Average Utterance Count: {average}')
print(f'Median Utterance Count: {median}')

## Lowest Confidence Analysis

In [None]:
def lowest_conf(data, split):
    """Chart the `split` intents with the lowest mean confidence.

    Draws a horizontal bar chart of 'Confidence' per 'Intent' from `data`,
    restricted to the `split` intents whose mean confidence is smallest,
    saves it as 'lowest_confidence.png', shows it and closes the figure.
    """
    # rows ordered from least to most confident
    by_confidence = data.sort_values(by=['Confidence'], ascending=True)

    # mean confidence per intent, smallest first; keep the first `split` names
    mean_confidence = by_confidence.groupby(["Intent"])["Confidence"].mean()
    order = mean_confidence.sort_values(ascending=True).index[0:split]

    # figure height scales with the number of bars requested
    plt.figure(figsize=(10, split * 0.75))

    # pin the x-axis to 0..1 so the bars are not misleading to the reader
    plt.xlim(0, 1.0)

    # draw the bars in the order computed above
    sns.barplot(
        y='Intent',
        x='Confidence',
        data=data,
        order=order,
        palette="winter",
    )

    # tick sizes, axis labels and title
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.xlabel("\nConfidence Score", fontsize=12, fontweight="bold")
    plt.ylabel("Intent Name\n", fontsize=12, fontweight="bold")
    plt.title(f'{IVA}: Lowest Confidence Levels\n', fontsize=14, fontweight="bold")

    # write the chart to disk, display it, then close it to release memory
    plt.savefig('lowest_confidence.png', bbox_inches='tight')
    plt.show()
    plt.close()

# view (pass SIXTEENTH or EIGHTH instead of QUARTER for a shorter chart)
lowest_conf(data, QUARTER)

## Highest Confidence Analysis

In [None]:
# This cell used to re-declare lowest_conf verbatim, so the "Highest Confidence
# Analysis" section redrew the lowest-confidence chart and overwrote
# 'lowest_confidence.png'. It now charts the highest-confidence intents.
def highest_conf(data, split):
    """Chart the `split` intents with the highest mean confidence.

    Draws a horizontal bar chart of 'Confidence' per 'Intent' from `data`,
    restricted to the `split` intents whose mean confidence is largest,
    saves it as 'highest_confidence.png', shows it and closes the figure.
    """
    # sort by confidence, most confident rows first
    con_data = data.sort_values(by=['Confidence'], ascending=False)

    # group intents, order by mean confidence descending, and keep `split` names
    order = con_data.groupby(["Intent"])["Confidence"].mean().sort_values(ascending=False).index[0:split]

    # set height
    height = split * 0.75

    # set figure size
    plt.figure(figsize=(10, height))

    # set x-axis to 1 to avoid misleading reader
    plt.xlim(0, 1.0)

    # create bar plot
    sns.barplot(y='Intent', x='Confidence', data=data, order=order, palette="winter")

    # label plot
    plt.title(f'{IVA}: Highest Confidence Levels\n', fontweight="bold", fontsize=14)
    plt.xlabel("\nConfidence Score", fontweight="bold", fontsize=12)
    plt.ylabel("Intent Name\n", fontweight="bold", fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)

    # save the plot as an image
    plt.savefig('highest_confidence.png', bbox_inches='tight')

    # show the plot
    plt.show()

    # close the plot to release memory
    plt.close()

# view (pass SIXTEENTH or EIGHTH instead of QUARTER for a shorter chart)
highest_conf(data, QUARTER)

## Interest Distribution

In [None]:
def interest_dist(critical):
  """Chart the number of utterances per intent in `critical`.

  Draws a horizontal bar chart of the row count per 'Intent', saves it as
  'interest_distribution.png', shows it and closes the figure. When
  `critical` has no rows, a message is printed and no chart is produced.
  """
  # no matching rows would give a figure height of 0 and nothing to draw
  if critical.empty:
    print('No intents of interest found - skipping the distribution chart.')
    return

  # set height: 0.75 per distinct intent
  height = len(critical['Intent'].unique()) * 0.75

  # set figure size
  plt.figure(figsize=(10, height))

  # Calculate the number of utterances per intent
  utterances_per_intent = critical['Intent'].value_counts()

  # Reset the index to convert the Series to a DataFrame
  utterances_per_intent = utterances_per_intent.reset_index()

  # Rename the columns by position
  utterances_per_intent.columns = ['Intent', 'Count']

  # Plot the bar chart
  sns.barplot(y='Intent', x='Count', data=utterances_per_intent, palette='winter')

  # Set x and y labels
  plt.title(f'{IVA}: Number of Utterances per Intent\n', fontweight="bold", fontsize=14)
  plt.xlabel("\nNumber of Utterances", fontweight="bold", fontsize=12)
  plt.ylabel("Intents\n", fontweight="bold", fontsize=12)
  plt.xticks(fontsize=10)
  plt.yticks(fontsize=10)

  # Save the plot as an image
  plt.savefig('interest_distribution.png', bbox_inches='tight')

  # Show the plot
  plt.show()

  # Close the plot to release memory
  plt.close()

interest_dist(critical)

## Interest Confidence

In [None]:
def interest_conf(critical):
    """Chart the confidence per intent in `critical`, highest mean first.

    Draws a horizontal bar chart of 'Confidence' per 'Intent', ordered by
    descending mean confidence, saves it as 'interest_confidence.png', shows
    it and closes the figure. When `critical` has no rows, a message is
    printed and no chart is produced.
    """
    # no matching rows would give a figure height of 0 and nothing to draw
    if critical.empty:
        print('No intents of interest found - skipping the confidence chart.')
        return

    # sort by confidence, most confident rows first
    con_data = critical.sort_values(by=['Confidence'], ascending=False)

    # group intents and order them by mean confidence, descending
    order = con_data.groupby(["Intent"])["Confidence"].mean().sort_values(ascending=False).index

    # set height: 0.75 per distinct intent
    height = len(con_data['Intent'].unique()) * 0.75

    # set figure size
    plt.figure(figsize=(10, height))

    # set x-axis to 1 to avoid misleading reader
    plt.xlim(0, 1.0)

    # create bar plot
    sns.barplot(y='Intent', x='Confidence', data=con_data, order=order, palette="winter")

    # label plot
    plt.title(f'{IVA}: Confidence Levels of Interest\n', fontweight="bold", fontsize=14)
    plt.xlabel("\nConfidence Score", fontweight="bold", fontsize=12)
    plt.ylabel("Intent Name\n", fontweight="bold", fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)

    # save the plot as an image
    plt.savefig('interest_confidence.png', bbox_inches='tight')

    # show the plot
    plt.show()

    # close the plot to release memory
    plt.close()

interest_conf(critical)

## Most Frequent Intents Donut

In [None]:
# print the (up to) three most frequent (intent, count) pairs, largest first
for rank, entry in enumerate(ordered_counts[::-1][:3], start=1):
    print(f'{rank}. ', entry)

# name up to four slices individually; with fewer than four intents the
# fixed indices used before raised an IndexError
top_n = min(4, len(counts))
top_counts = list(counts[:top_n])

# combine all the rest
alr = sum(counts) - sum(top_counts)

# set size based on given values
sizes = top_counts + [alr]

# create labels (ordered_intents is ascending, so read it from the end)
labels = ordered_intents[::-1][:top_n] + ['ATR (all the rest)']

# set explosion: the same offset for every slice
explode = (0.05,) * len(sizes)

# plot pie chart
plt.pie(sizes, colors=sns.color_palette('Set2'), labels=labels,
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode, textprops={'fontsize': 10})

# draw circle
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# add circle to pie chart, turning the pie into a donut
fig.gca().add_artist(centre_circle)

# title
plt.title(f'{IVA}: Hits per Intent', fontweight="bold")

# legend
plt.legend(labels, loc="center right", bbox_to_anchor=(0, 1))

# save the plot as an image
plt.savefig('donut.png', bbox_inches='tight')

# show plot
plt.show()

# close the plot to release memory
plt.close()

## Intents with <10 hits Bar Chart

In [None]:
# intents hit fewer than ten times; the section heading, the printed message
# and the chart title all say "less than 10", so the cut-off is exclusive
# (the condition used to be `<= 10`, which also let in intents with 10 hits)
less_than_ten = {}

for key, value in intent_counts.items():
  if value < 10:
    less_than_ten[key] = value

# only report and plot when at least one intent qualifies
if less_than_ten:

  print('Intents with less than 10 hits:')

  # sort by count, largest first
  sorted_less_than_ten = sorted(less_than_ten.items(), key=lambda x: x[1], reverse=True)

  for key, value in sorted_less_than_ten:
    print(f'{key}: {value}')

  # plot graph for least frequently used intents (less than 10 hits)
  values = [item[1] for item in sorted_less_than_ten]
  labels = [item[0] for item in sorted_less_than_ten]

  # set height: 0.5 per intent shown
  height = len(labels) * 0.5

  # set figure size
  plt.figure(figsize=(10, height))

  # Create bar plot with winter palette
  sns.barplot(x=values, y=labels, palette="winter")

  # Set x and y labels
  plt.title(f'\n{IVA}: Intents with less than 10 hits\n', fontweight="bold", fontsize=14)
  plt.xlabel("\nNumber of utterances\n", fontweight="bold", fontsize=12)
  plt.ylabel("\nIntent name\n", fontweight="bold", fontsize=12)
  plt.xticks(fontsize=10)
  plt.yticks(fontsize=10)

  # save the plot as an image
  plt.savefig('less_than_ten.png', bbox_inches='tight')

  # show the plot
  plt.show()

  # close the plot to release memory
  plt.close()

## Least Frequent Bar Chart

In [None]:
# plot graph for the least frequently hit intents
# (the dict was filled in ascending order of count, so both lists are flipped)
values = list(least_frequent_intents.values())
values.reverse()
labels = list(least_frequent_intents.keys())
labels.reverse()

# figure height scales with the number of intents shown
height = len(labels) * 0.75

# open the figure and draw the bars with the winter palette
plt.figure(figsize=(10, height))
sns.barplot(x=values, y=labels, palette="winter")

# tick label sizes
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# axis labels and title
plt.xlabel("\nNumber of utterances\n", fontsize=12, fontweight="bold")
plt.ylabel("\nIntent name\n", fontsize=12, fontweight="bold")
plt.title(f'\n{IVA}: Least frequently used intents\n', fontsize=14, fontweight="bold")

# write the chart to disk, display it, then close it to release memory
plt.savefig('least_frequent.png', bbox_inches='tight')
plt.show()
plt.close()

## Most Frequent Bar Chart

In [None]:
# plot graph for the most frequently hit intents
# (the dict was filled in ascending order of count, so both lists are flipped)
values = list(most_frequent_intents.values())
values.reverse()
labels = list(most_frequent_intents.keys())
labels.reverse()

# figure height scales with the number of intents shown
height = len(labels) * 0.75

# open the figure and draw the bars with the winter palette
plt.figure(figsize=(10, height))
sns.barplot(x=values, y=labels, palette="winter")

# tick label sizes
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# axis labels and title
plt.xlabel("\nNumber of utterances\n", fontsize=12, fontweight="bold")
plt.ylabel("\nIntent name\n", fontsize=12, fontweight="bold")
plt.title(f'\n{IVA}: Most frequently used intents\n', fontsize=14, fontweight="bold")

# write the chart to disk, display it, then close it to release memory
plt.savefig('most_frequent.png', bbox_inches='tight')
plt.show()
plt.close()