# AI Tuning: Topic Modelling

### Imports

In [None]:
import pandas as pd

import nltk, re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from collections import Counter
from nltk.util import ngrams
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import numpy as np

import spacy

#Sentiment
nltk.download('vader_lexicon')

#Creating frequency distribution
from nltk.probability import FreqDist

from nltk.sentiment import SentimentIntensityAnalyzer

import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.pyplot import pie, axis, show
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer

#Topic modeling using Latent Dirichlet Allocation (LDA) and Gensim
import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

import gensim.corpora as corpora

#Importing pretty print
import pprint
from pprint import pprint

#Installing BERTopic for topic modeling (one of the tools)
%pip install bertopic
from bertopic import BERTopic

In [None]:
# @title Global Variables

# change variables per customer
# NOTE: the "Imported Data Manipulation" cell splits FILENAME_MODELLING on '_' and uses
# field 0 as the IVA name and field 5 (with '.csv' removed) as the date, so the real
# filename needs at least six '_'-separated fields.
FILENAME_MODELLING = "file_name"
# CSV loaded into `data_for_extraction` (the cells below read its 'Intent' and 'Clean' columns)
FILENAME_EXTRACTION = "HF_file_name"

In [None]:
# @title Bigram Function

def bigrams(target_intent, words, length):
  """Plot the most frequent bigrams of a token list and save the chart as a PNG.

  Args:
    target_intent: intent name; only used to build the output filename
      'bigram_<target_intent>.png'.
    words: sequence of tokens from which adjacent pairs are formed.
    length: maximum number of bigrams to plot; also scales the figure height.

  Returns:
    None. The chart is written to disk, shown, and then closed.
  """

  # count every adjacent token pair and keep the `length` most frequent ones
  bigrams_series = (pd.Series(nltk.ngrams(words, 2)).value_counts())[:length]

  # set height (a quarter of an inch per requested bar)
  height = length * 0.25

  # set figure size
  plt.figure(figsize = (10, height))

  # plot the bar chart (horizontal: counts on x, bigrams on y)
  sns.barplot(y = bigrams_series.index, x = bigrams_series.values, palette = 'winter')

  # set labels
  plt.title('Most Frequent Bigrams\n', fontweight = "bold", fontsize = 14)
  plt.xlabel("\nNumber of Occurrences", fontweight = "bold", fontsize = 12)
  plt.ylabel("Bigrams\n", fontweight = "bold", fontsize = 12)
  plt.xticks(fontsize = 10)
  plt.yticks(fontsize = 10)

  # set filename
  filename = 'bigram_' + target_intent + '.png'

  # save the plot as an image (before show(), while the figure is still current)
  plt.savefig(filename, bbox_inches = 'tight')

  # show the plot
  plt.show()

  # close the plot to release memory
  plt.close()

In [None]:
# @title Trigram Function

def trigrams(target_intent, words, length):
  """Plot the most frequent trigrams of a token list and save the chart as a PNG.

  Args:
    target_intent: intent name; only used to build the output filename
      'trigram_<target_intent>.png'.
    words: sequence of tokens from which adjacent triples are formed.
    length: maximum number of trigrams to plot; also scales the figure height.

  Returns:
    None. The chart is written to disk, shown, and then closed.
  """

  # count every run of three adjacent tokens and keep the `length` most frequent ones
  trigrams_series = (pd.Series(nltk.ngrams(words, 3)).value_counts())[:length]

  # set height (a quarter of an inch per requested bar)
  height = length * 0.25

  # set figure size
  plt.figure(figsize = (10, height))

  # plot the bar chart (horizontal: counts on x, trigrams on y)
  sns.barplot(y = trigrams_series.index, x = trigrams_series.values, palette = 'winter')

  # set labels
  plt.title('Most Frequent Trigrams\n', fontweight = "bold",fontsize = 14)
  plt.xlabel("\nNumber of Occurrences", fontweight = "bold",fontsize = 12)
  plt.ylabel("Trigrams\n", fontweight = "bold",fontsize = 12)
  plt.xticks(fontsize = 10)
  plt.yticks(fontsize = 10)

  # set filename
  filename = 'trigram_' + target_intent + '.png'

  # save the plot as an image (before show(), while the figure is still current)
  plt.savefig(filename, bbox_inches = 'tight')

  # show the plot
  plt.show()

  # close the plot to release memory
  plt.close()

In [None]:
# @title Imported Data Manipulation

# import data into dataframe
data = pd.read_csv(FILENAME_MODELLING)
data_for_extraction = pd.read_csv(FILENAME_EXTRACTION)

# extract date from filename: sixth '_'-separated field with the '.csv' extension removed
# (raises IndexError when the filename has fewer than six fields)
DATE = FILENAME_MODELLING.split('_')[5].replace('.csv', '')

# extract IVA name from filename: first '_'-separated field
IVA = FILENAME_MODELLING.split('_')[0]

# double check date range
print("Min date: ", data['Date'].min())
print("Max date: ", data['Date'].max())

# view shape
print("Shape:", data.shape)

# view df
data.head()

NB: Consider adding a general 'discovery' step for customers whose IVAs require a lot of improvements.

# Default Fallback

In [None]:
# Intent name
default_fallback_name = 'ADD'

default_df = data[data['Intent'] == default_fallback_name]
default_df.head()

In [None]:
default_df_count = default_df.shape[0]
print(default_df_count)

In [None]:
agent_sentences_def = []
non_agent_sentences_def = []

# substrings that mark an utterance as an agent-transfer request; phrases already implied by a
# shorter entry ('representative' by 'represent', 'corporate office' by 'office',
# 'customer assistance' by 'assistance') are omitted because they cannot change a substring test
agent_keywords = ('assistant', 'represent', 'office', 'customer service', 'speak', 'agent',
                  'talk', 'human', 'analyst', 'associate', 'person', 'support', 'desk',
                  'operator', 'assistance', 'connect', 'somebody', 'someone')

# agent vs non agent utterances
# dropna()/astype(str): skip missing cells and coerce the rest to text, so a non-string value
# (e.g. NaN from an empty CSV field) no longer raises TypeError in the substring test
for utterance in default_df['Utterance'].dropna().astype(str):
    if any(keyword in utterance for keyword in agent_keywords):
        agent_sentences_def.append(utterance)
    else:
        non_agent_sentences_def.append(utterance)
agent_count_def = len(agent_sentences_def)
non_agent_count_def = len(non_agent_sentences_def)

print(agent_count_def)
print(agent_sentences_def)

In [None]:
sizes = [agent_count_def, non_agent_count_def]
labels = ['Agent Transfer', 'All The Rest']

# explosion
explode = (0.05, 0.05)

# Pie Chart
plt.pie(sizes, colors=sns.color_palette('Set2'), labels=labels,
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode, textprops={'fontsize': 10})

# draw circle
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)

# Adding Title of chart
plt.title(f'{IVA}: {default_fallback_name}: Agent Transfer Requests')

# Add Legends
plt.legend(labels, loc="upper right")

# set filename
filename = 'agent_count_' + default_fallback_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

# Displaying Chart
plt.show()

In [None]:
# flat list of every token found in the non-agent utterances
all_words_def = []
# token count of each utterance, in iteration order
length = []

for utterance in non_agent_sentences_def:
    # str() guards against non-string entries before tokenising
    words = word_tokenize(str(utterance))
    length.append(len(words))
    all_words_def.extend(words)

print(all_words_def)
print(f'Total words: {len(all_words_def)}.')

In [None]:
# join the tokens into one space-separated string, the input format WordCloud.generate() is given below
wordcloud_text = ' '.join(all_words_def)
print(type(wordcloud_text))

In [None]:
wordcloud = WordCloud(width = 1800, height = 1000, background_color = 'white').generate(wordcloud_text)
plt.imshow(wordcloud)
plt.axis('off')

# set filename
filename = 'cloud_' + default_fallback_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

plt.show()

In [None]:
# Bag-of-words: frequency of every token, then the 20 most common
bag_of_words = Counter(all_words_def)

print('Most frequently used words are: ')
# NOTE: despite its name, `top_fifty` holds the 20 most common (word, count) pairs
top_fifty = bag_of_words.most_common(20)

# print one (word, count) pair per line for better readability
for word in top_fifty:
  print(word)

In [None]:
# call bigram function
bigrams(default_fallback_name, all_words_def, 25)

In [None]:
# call trigram function
trigrams(default_fallback_name, all_words_def, 25)

## Topic Extraction

In [None]:
# declare topics
Topic1 = 'ADD'
Topic2 = 'ADD'
Topic3 = 'ADD'
Topic4 = 'ADD'
Topic5 = 'ADD'
Topic6 = 'ADD'

default_df_extract = data_for_extraction[data_for_extraction['Intent']==default_fallback_name]
default_df_extract.head()

In [None]:
# splits the extraction-file utterances of the default fallback intent into agent / non-agent
agent_sentences_def_extract = []
non_agent_sentences_def_extract = []

# substrings that mark an utterance as an agent-transfer request; phrases already implied by a
# shorter entry ('representative' by 'represent', 'corporate office' by 'office',
# 'customer assistance' by 'assistance') are omitted because they cannot change a substring test
agent_keywords = ('assistant', 'represent', 'office', 'customer service', 'speak', 'agent',
                  'talk', 'human', 'analyst', 'associate', 'person', 'support', 'desk',
                  'operator', 'assistance', 'connect', 'somebody', 'someone')

# agent vs non agent utterances
# dropna()/astype(str): skip missing cells and coerce the rest to text, so a non-string value
# (e.g. NaN from an empty CSV field) no longer raises TypeError in the substring test
for utterance in default_df_extract['Clean'].dropna().astype(str):
    if any(keyword in utterance for keyword in agent_keywords):
        agent_sentences_def_extract.append(utterance)
    else:
        non_agent_sentences_def_extract.append(utterance)
agent_count_def_extract = len(agent_sentences_def_extract)
non_agent_count_def_extract = len(non_agent_sentences_def_extract)

print(agent_count_def_extract)
print(agent_sentences_def_extract)

In [None]:
# agent-transfer frame: tag each utterance with its source intent, then keep distinct rows
default_agent_df = (
    pd.DataFrame(agent_sentences_def_extract, columns=['Clean'])
    .assign(**{'Original Intent': default_fallback_name})
    .drop_duplicates()
)
# the recommendation is the same for every agent-transfer row
default_agent_df = default_agent_df.assign(**{'Recommended Action': 'Move training sentences to Agent Intent'})
default_agent_df.head()

In [None]:
# creating non-agent sentences df
default_non_agent_df = pd.DataFrame(non_agent_sentences_def_extract, columns=['Clean'])

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic1
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic1_def = default_non_agent_df[default_non_agent_df['Clean'].str.contains(Topic1)].copy()
topic1_def.head()

In [None]:
topic1_def_count = topic1_def.shape[0]
print(topic1_def_count)

In [None]:
topic1_def['Original Intent'] = default_fallback_name
topic1_def = topic1_def.drop_duplicates()
topic1_def['Recommended Action'] = 'ADD'
topic1_def.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic2
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic2_def = default_non_agent_df[default_non_agent_df['Clean'].str.contains(Topic2)].copy()
topic2_def.head()

In [None]:
topic2_def_count = topic2_def.shape[0]
print(topic2_def_count)

In [None]:
topic2_def['Original Intent'] = default_fallback_name
topic2_def = topic2_def.drop_duplicates()
topic2_def['Recommended Action'] = 'ADD'
topic2_def.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic3
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic3_def = default_non_agent_df[default_non_agent_df['Clean'].str.contains(Topic3)].copy()
topic3_def.head()

In [None]:
topic3_def_count = topic3_def.shape[0]
print(topic3_def_count)

In [None]:
topic3_def['Original Intent'] = default_fallback_name
topic3_def = topic3_def.drop_duplicates()
topic3_def['Recommended Action'] = 'ADD'
topic3_def.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic4
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic4_def = default_non_agent_df[default_non_agent_df['Clean'].str.contains(Topic4)].copy()
topic4_def.head()

In [None]:
topic4_def_count = topic4_def.shape[0]
print(topic4_def_count)

In [None]:
topic4_def['Original Intent'] = default_fallback_name
topic4_def = topic4_def.drop_duplicates()
topic4_def['Recommended Action'] = 'ADD'
topic4_def.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic5
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic5_def = default_non_agent_df[default_non_agent_df['Clean'].str.contains(Topic5)].copy()
topic5_def.head()

In [None]:
topic5_def_count = topic5_def.shape[0]
print(topic5_def_count)

In [None]:
topic5_def['Original Intent'] = default_fallback_name
topic5_def = topic5_def.drop_duplicates()
topic5_def['Recommended Action'] = 'ADD'
topic5_def.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic6
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic6_def = default_non_agent_df[default_non_agent_df['Clean'].str.contains(Topic6)].copy()
topic6_def.head()

In [None]:
topic6_def_count = topic6_def.shape[0]
print(topic6_def_count)

In [None]:
topic6_def['Original Intent'] = default_fallback_name
topic6_def = topic6_def.drop_duplicates()
topic6_def['Recommended Action'] = 'ADD'
topic6_def.head()

In [None]:
# one bar per category: agent-transfer requests first, then each declared topic
labels = ['agent transfer', Topic1, Topic2, Topic3, Topic4, Topic5, Topic6]
counts = [agent_count_def_extract, topic1_def_count, topic2_def_count, topic3_def_count, topic4_def_count, topic5_def_count, topic6_def_count]

# figure height grows with the number of bars
height = len(labels) * 0.5
plt.figure(figsize=(10, height))

# horizontal bars: counts along x, category names along y
sns.barplot(y=labels, x=counts, palette="winter")

# title, axis captions and tick sizes
plt.title(f'\n{IVA}: Number of extracted utterances per topic going to {default_fallback_name}\n', fontsize=14, fontweight="bold")
plt.xlabel("\nNumber of utterances\n", fontsize=12, fontweight="bold")
plt.ylabel("\nSuggested Topics\n", fontsize=12, fontweight="bold")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# write the chart to disk first, then display it
filename = f'topic_count_{default_fallback_name}.png'
plt.savefig(filename, bbox_inches='tight')
plt.show()

In [None]:
# creating df with undefined selected utterances
def_selected = pd.concat([topic1_def, topic2_def, topic3_def, topic4_def, topic5_def, topic6_def], axis=0)
def_selected.shape

# Agent Intent

In [None]:
# Intent name
agent_intent_name = 'ADD'

agent_intent_df = data[data['Intent']==agent_intent_name]
agent_intent_df.head()

In [None]:
agent_intent_df_count = agent_intent_df.shape[0]
print(agent_intent_df_count)

In [None]:
agent_sentences_ag = []
non_agent_sentences_ag = []

# substrings that mark an utterance as an agent-transfer request; phrases already implied by a
# shorter entry ('representative' by 'represent', 'corporate office' by 'office',
# 'customer assistance' by 'assistance') are omitted because they cannot change a substring test
agent_keywords = ('assistant', 'represent', 'office', 'customer service', 'speak', 'agent',
                  'talk', 'human', 'analyst', 'associate', 'person', 'support', 'desk',
                  'operator', 'assistance', 'connect', 'somebody', 'someone')

# agent vs non agent utterances
# dropna()/astype(str): skip missing cells and coerce the rest to text, so a non-string value
# (e.g. NaN from an empty CSV field) no longer raises TypeError in the substring test
for utterance in agent_intent_df['Utterance'].dropna().astype(str):
    if any(keyword in utterance for keyword in agent_keywords):
        agent_sentences_ag.append(utterance)
    else:
        non_agent_sentences_ag.append(utterance)
agent_count_ag = len(agent_sentences_ag)
non_agent_count_ag = len(non_agent_sentences_ag)

print(agent_count_ag)
print(agent_sentences_ag)

In [None]:
sizes = [agent_count_ag, non_agent_count_ag]
labels = ['Agent Transfer', 'All The Rest']

# explosion
explode = (0.05, 0.05)

# Pie Chart
plt.pie(sizes, colors=sns.color_palette('Set2'), labels=labels,
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode, textprops={'fontsize': 10})

# draw circle
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)

# Adding Title of chart
plt.title(f'{IVA}: {agent_intent_name}: Agent Transfer Requests')

# Add Legends
plt.legend(labels, loc="upper right")

# set filename
filename = 'agent_count_' + agent_intent_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

# Displaying Chart
plt.show()

In [None]:
# flat list of every token found in the non-agent utterances
all_words_ag = []
# token count of each utterance, in iteration order
length = []

for utterance in non_agent_sentences_ag:
    # str() guards against non-string entries before tokenising
    words = word_tokenize(str(utterance))
    length.append(len(words))
    all_words_ag.extend(words)

print(all_words_ag)
print(f'Total words: {len(all_words_ag)}.')

In [None]:
# join the tokens into one space-separated string, the input format WordCloud.generate() is given below
wordcloud_text = ' '.join(all_words_ag)
print(type(wordcloud_text))

In [None]:
wordcloud = WordCloud(width = 1800, height = 1000, background_color = 'white').generate(wordcloud_text)
plt.imshow(wordcloud)
plt.axis('off')

# set filename
filename = 'cloud_' + agent_intent_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

plt.show()

In [None]:
# Bag-of-words: frequency of every token, then the 20 most common
bag_of_words = Counter(all_words_ag)

print('Most frequently used words are: ')
# NOTE: despite its name, `top_fifty` holds the 20 most common (word, count) pairs
top_fifty = bag_of_words.most_common(20)

# print one (word, count) pair per line for better readability
for word in top_fifty:
  print(word)

In [None]:
# call bigram function
bigrams(agent_intent_name, all_words_ag, 25)

In [None]:
# call trigram function
trigrams(agent_intent_name, all_words_ag, 25)

## Topic Extraction

In [None]:
# declare topics
Topic1 = 'ADD'
Topic2 = 'ADD'
Topic3 = 'ADD'
Topic4 = 'ADD'
Topic5 = 'ADD'
Topic6 = 'ADD'

agent_intent_extract = data_for_extraction[data_for_extraction['Intent'] == agent_intent_name]
agent_intent_extract.head()

In [None]:
# splits the extraction-file utterances of the agent intent into agent / non-agent
agent_sentences_ag_extract = []
non_agent_sentences_ag_extract = []

# substrings that mark an utterance as an agent-transfer request; phrases already implied by a
# shorter entry ('representative' by 'represent', 'corporate office' by 'office',
# 'customer assistance' by 'assistance') are omitted because they cannot change a substring test
agent_keywords = ('assistant', 'represent', 'office', 'customer service', 'speak', 'agent',
                  'talk', 'human', 'analyst', 'associate', 'person', 'support', 'desk',
                  'operator', 'assistance', 'connect', 'somebody', 'someone')

# agent vs non agent utterances
# dropna()/astype(str): skip missing cells and coerce the rest to text, so a non-string value
# (e.g. NaN from an empty CSV field) no longer raises TypeError in the substring test
for utterance in agent_intent_extract['Clean'].dropna().astype(str):
    if any(keyword in utterance for keyword in agent_keywords):
        agent_sentences_ag_extract.append(utterance)
    else:
        non_agent_sentences_ag_extract.append(utterance)
agent_count_ag_extract = len(agent_sentences_ag_extract)
non_agent_count_ag_extract = len(non_agent_sentences_ag_extract)

print(agent_count_ag_extract)
print(agent_sentences_ag_extract)

In [None]:
# creating non-agent sentences df
default_non_agent_ag = pd.DataFrame(non_agent_sentences_ag_extract, columns=['Clean'])

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic1
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic1_ag = default_non_agent_ag[default_non_agent_ag['Clean'].str.contains(Topic1)].copy()
topic1_ag.head()

In [None]:
topic1_ag_count = topic1_ag.shape[0]
print(topic1_ag_count)

In [None]:
topic1_ag['Original Intent'] = agent_intent_name
topic1_ag = topic1_ag.drop_duplicates()
topic1_ag['Recommended Action'] = 'ADD'
topic1_ag.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic2
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic2_ag = default_non_agent_ag[default_non_agent_ag['Clean'].str.contains(Topic2)].copy()
topic2_ag.head()

In [None]:
topic2_ag_count = topic2_ag.shape[0]
print(topic2_ag_count)

In [None]:
topic2_ag['Original Intent'] = agent_intent_name
topic2_ag = topic2_ag.drop_duplicates()
topic2_ag['Recommended Action'] = 'ADD'
topic2_ag.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic3
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic3_ag = default_non_agent_ag[default_non_agent_ag['Clean'].str.contains(Topic3)].copy()
topic3_ag.head()

In [None]:
topic3_ag_count = topic3_ag.shape[0]
print(topic3_ag_count)

In [None]:
topic3_ag['Original Intent'] = agent_intent_name
topic3_ag = topic3_ag.drop_duplicates()
topic3_ag['Recommended Action'] = 'ADD'
topic3_ag.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic4
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic4_ag = default_non_agent_ag[default_non_agent_ag['Clean'].str.contains(Topic4)].copy()
topic4_ag.head()

In [None]:
topic4_ag_count = topic4_ag.shape[0]
print(topic4_ag_count)

In [None]:
topic4_ag['Original Intent'] = agent_intent_name
topic4_ag = topic4_ag.drop_duplicates()
topic4_ag['Recommended Action'] = 'ADD'
topic4_ag.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic5
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic5_ag = default_non_agent_ag[default_non_agent_ag['Clean'].str.contains(Topic5)].copy()
topic5_ag.head()

In [None]:
topic5_ag_count = topic5_ag.shape[0]
print(topic5_ag_count)

In [None]:
topic5_ag['Original Intent'] = agent_intent_name
topic5_ag = topic5_ag.drop_duplicates()
topic5_ag['Recommended Action'] = 'ADD'
topic5_ag.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic6
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic6_ag = default_non_agent_ag[default_non_agent_ag['Clean'].str.contains(Topic6)].copy()
topic6_ag.head()

In [None]:
topic6_ag_count = topic6_ag.shape[0]
print(topic6_ag_count)

In [None]:
topic6_ag['Original Intent'] = agent_intent_name
topic6_ag = topic6_ag.drop_duplicates()
topic6_ag['Recommended Action'] = 'ADD'
topic6_ag.head()

In [None]:
# one bar per declared topic, in declaration order
counts = [topic1_ag_count, topic2_ag_count, topic3_ag_count, topic4_ag_count, topic5_ag_count, topic6_ag_count]
labels = [Topic1, Topic2, Topic3, Topic4, Topic5, Topic6]

# set height (grows with the number of bars)
height = len(labels) * 0.75

# set figure size
plt.figure(figsize = (10, height))

# create horizontal bar graph: counts along x, topic names along y
sns.barplot(x = counts, y = labels, palette = "winter")

# set x and y labels
# the x axis carries the counts and the y axis the topics, so the captions follow that
# orientation (they were previously swapped)
plt.title(f'\n{IVA}: Number of extracted utterances per topic going to {agent_intent_name}\n', fontweight="bold", fontsize = 14)
plt.xlabel("\nNumber of utterances\n", fontweight="bold", fontsize = 12)
plt.ylabel("\nSuggested Topics\n", fontweight="bold", fontsize = 12)
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)

# set filename
filename = 'topic_count_' + agent_intent_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

# show the plot
plt.show()

In [None]:
# creating df with undefined selected utterances
ag_selected = pd.concat([topic1_ag, topic2_ag, topic3_ag, topic4_ag, topic5_ag, topic6_ag], axis=0)
ag_selected.shape

# Intent of Interest 1

In [None]:
# Intent name
intent1_name = 'ADD'

intent1_df = data[data['Intent'] == intent1_name]
intent1_df.head()

In [None]:
intent1_df_count = intent1_df.shape[0]
print(intent1_df_count)

In [None]:
agent_sentences_int1 = []
non_agent_sentences_int1 = []

# substrings that mark an utterance as an agent-transfer request; phrases already implied by a
# shorter entry ('representative' by 'represent', 'corporate office' by 'office',
# 'customer assistance' by 'assistance') are omitted because they cannot change a substring test
agent_keywords = ('assistant', 'represent', 'office', 'customer service', 'speak', 'agent',
                  'talk', 'human', 'analyst', 'associate', 'person', 'support', 'desk',
                  'operator', 'assistance', 'connect', 'somebody', 'someone')

# agent vs non agent utterances
# dropna()/astype(str): skip missing cells and coerce the rest to text, so a non-string value
# (e.g. NaN from an empty CSV field) no longer raises TypeError in the substring test
for utterance in intent1_df['Utterance'].dropna().astype(str):
    if any(keyword in utterance for keyword in agent_keywords):
        agent_sentences_int1.append(utterance)
    else:
        non_agent_sentences_int1.append(utterance)
agent_count_int1 = len(agent_sentences_int1)
non_agent_count_int1 = len(non_agent_sentences_int1)

print(agent_count_int1)
print(agent_sentences_int1)

In [None]:
sizes = [agent_count_int1, non_agent_count_int1]
labels = ['Agent Transfer', 'All The Rest']

# explosion
explode = (0.05, 0.05)

# Pie Chart
plt.pie(sizes, colors=sns.color_palette('Set2'), labels=labels,
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode, textprops={'fontsize': 10})

# draw circle
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)

# Adding Title of chart
plt.title(f'{IVA}: {intent1_name}: Agent Transfer Requests')

# Add Legends
plt.legend(labels, loc="lower right")

# set filename
filename = 'agent_count_' + intent1_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

# Displaying Chart
plt.show()

In [None]:
# flat list of every token found in the non-agent utterances
all_words_int1 = []
# token count of each utterance, in iteration order
length = []

for utterance in non_agent_sentences_int1:
    # str() guards against non-string entries before tokenising
    words = word_tokenize(str(utterance))
    length.append(len(words))
    all_words_int1.extend(words)

print(all_words_int1)
print(f'Total words: {len(all_words_int1)}.')

In [None]:
# join the tokens into one space-separated string, the input format WordCloud.generate() is given below
wordcloud_text = ' '.join(all_words_int1)
print(type(wordcloud_text))

In [None]:
wordcloud = WordCloud(width = 1800, height = 1000, background_color = 'white').generate(wordcloud_text)
plt.imshow(wordcloud)
plt.axis('off')

# set filename
filename = 'cloud_' + intent1_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

plt.show()

In [None]:
# Bag-of-words: frequency of every token, then the 20 most common
bag_of_words = Counter(all_words_int1)

print('Most frequently used words are: ')
# NOTE: despite its name, `top_fifty` holds the 20 most common (word, count) pairs
top_fifty = bag_of_words.most_common(20)

# print one (word, count) pair per line for better readability
for word in top_fifty:
  print(word)

In [None]:
# call bigram function
bigrams(intent1_name, all_words_int1, 25)

In [None]:
# call trigram function
trigrams(intent1_name, all_words_int1, 25)

## Topic Extraction

In [None]:
# declare topics
Topic1 = 'ADD'
Topic2 = 'ADD'
Topic3 = 'ADD'
Topic4 = 'ADD'
Topic5 = 'ADD'
Topic6 = 'ADD'

intent1_df_extract = data_for_extraction[data_for_extraction['Intent'] == intent1_name]
intent1_df_extract.head()

In [None]:
# splits the extraction-file utterances of intent 1 into agent / non-agent
agent_sentences_int1_extract = []
non_agent_sentences_int1_extract = []

# substrings that mark an utterance as an agent-transfer request; phrases already implied by a
# shorter entry ('representative' by 'represent', 'corporate office' by 'office',
# 'customer assistance' by 'assistance') are omitted because they cannot change a substring test
agent_keywords = ('assistant', 'represent', 'office', 'customer service', 'speak', 'agent',
                  'talk', 'human', 'analyst', 'associate', 'person', 'support', 'desk',
                  'operator', 'assistance', 'connect', 'somebody', 'someone')

# agent vs non agent utterances
# dropna()/astype(str): skip missing cells and coerce the rest to text, so a non-string value
# (e.g. NaN from an empty CSV field) no longer raises TypeError in the substring test
for utterance in intent1_df_extract['Clean'].dropna().astype(str):
    if any(keyword in utterance for keyword in agent_keywords):
        agent_sentences_int1_extract.append(utterance)
    else:
        non_agent_sentences_int1_extract.append(utterance)
agent_count_int1_extract = len(agent_sentences_int1_extract)
non_agent_count_int1_extract = len(non_agent_sentences_int1_extract)

print(agent_count_int1_extract)
print(agent_sentences_int1_extract)

In [None]:
# agent-transfer frame: tag each utterance with its source intent, then keep distinct rows
default_agent_int1 = (
    pd.DataFrame(agent_sentences_int1_extract, columns=['Clean'])
    .assign(**{'Original Intent': intent1_name})
    .drop_duplicates()
)
# the recommendation is the same for every agent-transfer row
default_agent_int1 = default_agent_int1.assign(**{'Recommended Action': 'Move to Agent Intent'})
default_agent_int1.head()

In [None]:
# creating non-agent sentences df
default_non_agent_int1 = pd.DataFrame(non_agent_sentences_int1_extract, columns=['Clean'])

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic1
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic1_int1 = default_non_agent_int1[default_non_agent_int1['Clean'].str.contains(Topic1)].copy()
topic1_int1.head()

In [None]:
topic1_int1_count = topic1_int1.shape[0]
print(topic1_int1_count)

In [None]:
topic1_int1['Original Intent'] = intent1_name
topic1_int1 = topic1_int1.drop_duplicates()
topic1_int1['Recommended Action'] = 'ADD'
topic1_int1.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic2
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic2_int1 = default_non_agent_int1[default_non_agent_int1['Clean'].str.contains(Topic2)].copy()
topic2_int1.head()

In [None]:
topic2_int1_count = topic2_int1.shape[0]
print(topic2_int1_count)

In [None]:
topic2_int1['Original Intent'] = intent1_name
topic2_int1 = topic2_int1.drop_duplicates()
topic2_int1['Recommended Action'] = 'ADD'
topic2_int1.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic3
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic3_int1 = default_non_agent_int1[default_non_agent_int1['Clean'].str.contains(Topic3)].copy()
topic3_int1.head()

In [None]:
topic3_int1_count = topic3_int1.shape[0]
print(topic3_int1_count)

In [None]:
topic3_int1['Original Intent'] = intent1_name
topic3_int1 = topic3_int1.drop_duplicates()
topic3_int1['Recommended Action'] = 'ADD'
topic3_int1.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic4
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic4_int1 = default_non_agent_int1[default_non_agent_int1['Clean'].str.contains(Topic4)].copy()
topic4_int1.head()

In [None]:
topic4_int1_count = topic4_int1.shape[0]
print(topic4_int1_count)

In [None]:
topic4_int1['Original Intent'] = intent1_name
topic4_int1 = topic4_int1.drop_duplicates()
topic4_int1['Recommended Action'] = 'ADD'
topic4_int1.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic5
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic5_int1 = default_non_agent_int1[default_non_agent_int1['Clean'].str.contains(Topic5)].copy()
topic5_int1.head()

In [None]:
topic5_int1_count = topic5_int1.shape[0]
print(topic5_int1_count)

In [None]:
topic5_int1['Original Intent'] = intent1_name
topic5_int1 = topic5_int1.drop_duplicates()
topic5_int1['Recommended Action'] = 'ADD'
topic5_int1.head()

In [None]:
# rows of the non-agent frame whose 'Clean' text matches Topic6
# (str.contains treats the topic as a regular expression by default);
# .copy() makes the selection an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
topic6_int1 = default_non_agent_int1[default_non_agent_int1['Clean'].str.contains(Topic6)].copy()
topic6_int1.head()

In [None]:
topic6_int1_count = topic6_int1.shape[0]
print(topic6_int1_count)

In [None]:
topic6_int1['Original Intent'] = intent1_name
topic6_int1 = topic6_int1.drop_duplicates()
topic6_int1['Recommended Action'] = 'ADD'
topic6_int1.head()

In [None]:
# one bar per category: agent-transfer requests first, then each declared topic.
# The agent bar uses the extraction-file count (agent_count_int1_extract) so that every bar is
# measured on the same data as the topic counts, matching the default-fallback chart.
counts = [agent_count_int1_extract, topic1_int1_count, topic2_int1_count, topic3_int1_count, topic4_int1_count, topic5_int1_count, topic6_int1_count]
labels = ['Agent Transfer', Topic1, Topic2, Topic3, Topic4, Topic5, Topic6]

# set height (grows with the number of bars)
height = len(labels) * 0.75

# set figure size
plt.figure(figsize = (10, height))

# create horizontal bar graph: counts along x, category names along y
sns.barplot(x = counts, y = labels, palette = "winter")

# set x and y labels
# the x axis carries the counts and the y axis the topics, so the captions follow that
# orientation (they were previously swapped)
plt.title(f'\n{IVA}: Number of extracted utterances per topic going to {intent1_name}\n', fontweight="bold", fontsize = 14)
plt.xlabel("\nNumber of utterances\n", fontweight="bold", fontsize = 12)
plt.ylabel("\nSuggested Topics\n", fontweight="bold", fontsize = 12)
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)

# set filename
filename = 'topic_count_' + intent1_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

# show the plot
plt.show()

In [None]:
# creating df with undefined selected utterances
int1_selected = pd.concat([default_agent_int1, topic1_int1, topic2_int1, topic3_int1, topic4_int1, topic5_int1, topic6_int1], axis=0)
int1_selected.shape

# Intent of Interest 2

In [None]:
# Intent name
intent2_name = 'ADD'

intent2_df = data[data['Intent'] == intent2_name]
intent2_df.head()

In [None]:
# splits the modelling-file utterances of intent 2 into agent / non-agent
agent_sentences_int2 = []
non_agent_sentences_int2 = []

# substrings that mark an utterance as an agent-transfer request; phrases already implied by a
# shorter entry ('representative' by 'represent', 'corporate office' by 'office',
# 'customer assistance' by 'assistance') are omitted because they cannot change a substring test
agent_keywords = ('assistant', 'represent', 'office', 'customer service', 'speak', 'agent',
                  'talk', 'human', 'analyst', 'associate', 'person', 'support', 'desk',
                  'operator', 'assistance', 'connect', 'somebody', 'someone')

# agent vs non agent utterances
# dropna()/astype(str): skip missing cells and coerce the rest to text, so a non-string value
# (e.g. NaN from an empty CSV field) no longer raises TypeError in the substring test
for utterance in intent2_df['Utterance'].dropna().astype(str):
    if any(keyword in utterance for keyword in agent_keywords):
        agent_sentences_int2.append(utterance)
    else:
        non_agent_sentences_int2.append(utterance)
agent_count_int2 = len(agent_sentences_int2)
non_agent_count_int2 = len(non_agent_sentences_int2)

print(agent_count_int2)
print(agent_sentences_int2)

In [None]:
intent2_df_count = intent2_df.shape[0]
print(intent2_df_count)

In [None]:
sizes = [agent_count_int2, non_agent_count_int2]
labels = ['Agent Transfer', 'All The Rest']

# explosion
explode = (0.05, 0.05)

# Pie Chart
plt.pie(sizes, colors=sns.color_palette('Set2'), labels=labels,
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode, textprops={'fontsize': 10})

# draw circle
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)

# Adding Title of chart
plt.title(f'{IVA}: {intent2_name}: Agent Transfer Requests')

# Add Legends
plt.legend(labels, loc="upper right")

# set filename
filename = 'agent_count_' + intent2_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

# Displaying Chart
plt.show()

In [None]:
# flat list of every token found in the non-agent utterances
all_words_int2 = []
# token count of each utterance, in iteration order
length = []

for utterance in non_agent_sentences_int2:
    # str() guards against non-string entries before tokenising
    words = word_tokenize(str(utterance))
    length.append(len(words))
    all_words_int2.extend(words)

print(all_words_int2)
print(f'Total words: {len(all_words_int2)}.')

In [None]:
# join the tokens into one space-separated string, the input format WordCloud.generate() is given below
wordcloud_text = ' '.join(all_words_int2)
print(type(wordcloud_text))

In [None]:
wordcloud = WordCloud(width = 1800, height = 1000, background_color = 'white').generate(wordcloud_text)
plt.imshow(wordcloud)
plt.axis('off')

# set filename
filename = 'cloud_' + intent2_name + '.png'

# save the plot as an image
plt.savefig(filename, bbox_inches = 'tight')

plt.show()

In [None]:
# Bag-of-words: frequency of every token, then the 20 most common
bag_of_words = Counter(all_words_int2)

print('Most frequently used words are: ')
# NOTE: despite its name, `top_fifty` holds the 20 most common (word, count) pairs
top_fifty = bag_of_words.most_common(20)

# print one (word, count) pair per line for better readability
for word in top_fifty:
  print(word)

In [None]:
# call bigram function
bigrams(intent2_name, all_words_int2, 25)

In [None]:
# call trigram function
trigrams(intent2_name, all_words_int2, 25)

## Topic Extraction

In [None]:
# declare topics
Topic1 = 'ADD'
Topic2 = 'ADD'
Topic3 = 'ADD'
Topic4 = 'ADD'
Topic5 = 'ADD'
Topic6 = 'ADD'

intent2_df_extract = data_for_extraction[data_for_extraction['Intent'] == intent2_name]
intent2_df_extract.head()

In [None]:
agent_sentences_int2_extract = []
non_agent_sentences_int2_extract = []

# substrings that mark an utterance as an agent-transfer request; phrases already implied by a
# shorter entry ('representative' by 'represent', 'corporate office' by 'office',
# 'customer assistance' by 'assistance') are omitted because they cannot change a substring test
agent_keywords = ('assistant', 'represent', 'office', 'customer service', 'speak', 'agent',
                  'talk', 'human', 'analyst', 'associate', 'person', 'support', 'desk',
                  'operator', 'assistance', 'connect', 'somebody', 'someone')

# agent vs non agent utterances
# dropna()/astype(str): skip missing cells and coerce the rest to text, so a non-string value
# (e.g. NaN from an empty CSV field) no longer raises TypeError in the substring test
for utterance in intent2_df_extract['Clean'].dropna().astype(str):
    if any(keyword in utterance for keyword in agent_keywords):
        agent_sentences_int2_extract.append(utterance)
    else:
        non_agent_sentences_int2_extract.append(utterance)
agent_count_int2_extract = len(agent_sentences_int2_extract)
non_agent_count_int2_extract = len(non_agent_sentences_int2_extract)

print(agent_count_int2_extract)
print(agent_sentences_int2_extract)

In [None]:
# create agent dataframe for intent 2: one row per distinct agent-transfer utterance
default_agent_int2 = pd.DataFrame(agent_sentences_int2_extract, columns=['Clean'])
# tag rows with the intent they came from (intent 2; this previously used intent1_name)
default_agent_int2['Original Intent'] = intent2_name
default_agent_int2 = default_agent_int2.drop_duplicates()
default_agent_int2['Recommended Action'] = 'Move to Agent Intent'
default_agent_int2.head()

In [None]:
# creating non-agent sentences df
default_non_agent_int2 = pd.DataFrame(non_agent_sentences_int2_extract, columns=['Clean'])

In [None]:
# select the non-agent utterances whose 'Clean' text contains Topic1
# (str.contains interprets Topic1 as a regular expression by default)
topic1_int2 = default_non_agent_int2[default_non_agent_int2['Clean'].str.contains(Topic1)]
topic1_int2.head()

In [None]:
# number of rows currently held in topic1_int2
topic1_int2_count = len(topic1_int2)
print(topic1_int2_count)

In [None]:
# de-duplicate the Topic1 matches, then tag them with their source intent and
# the recommended action. assign() builds a new frame instead of writing into
# a filtered slice, which avoids pandas' SettingWithCopyWarning.
topic1_int2 = topic1_int2.drop_duplicates().assign(
    **{'Original Intent': intent2_name, 'Recommended Action': 'ADD'}
)
topic1_int2.head()

In [None]:
# select the non-agent utterances whose 'Clean' text contains Topic2
# (str.contains interprets Topic2 as a regular expression by default)
topic2_int2 = default_non_agent_int2[default_non_agent_int2['Clean'].str.contains(Topic2)]
topic2_int2.head()

In [None]:
# number of rows currently held in topic2_int2
topic2_int2_count = len(topic2_int2)
print(topic2_int2_count)

In [None]:
# de-duplicate the Topic2 matches, then tag them with their source intent and
# the recommended action. assign() builds a new frame instead of writing into
# a filtered slice, which avoids pandas' SettingWithCopyWarning.
topic2_int2 = topic2_int2.drop_duplicates().assign(
    **{'Original Intent': intent2_name, 'Recommended Action': 'ADD'}
)
topic2_int2.head()

In [None]:
# select the non-agent utterances whose 'Clean' text contains Topic3
# (str.contains interprets Topic3 as a regular expression by default)
topic3_int2 = default_non_agent_int2[default_non_agent_int2['Clean'].str.contains(Topic3)]
topic3_int2.head()

In [None]:
# number of rows currently held in topic3_int2
topic3_int2_count = len(topic3_int2)
print(topic3_int2_count)

In [None]:
# de-duplicate the Topic3 matches, then tag them with their source intent and
# the recommended action. assign() builds a new frame instead of writing into
# a filtered slice, which avoids pandas' SettingWithCopyWarning.
topic3_int2 = topic3_int2.drop_duplicates().assign(
    **{'Original Intent': intent2_name, 'Recommended Action': 'ADD'}
)
topic3_int2.head()

In [None]:
# select the non-agent utterances whose 'Clean' text contains Topic4
# (str.contains interprets Topic4 as a regular expression by default)
topic4_int2 = default_non_agent_int2[default_non_agent_int2['Clean'].str.contains(Topic4)]
topic4_int2.head()

In [None]:
# number of rows currently held in topic4_int2
topic4_int2_count = len(topic4_int2)
print(topic4_int2_count)

In [None]:
# de-duplicate the Topic4 matches, then tag them with their source intent and
# the recommended action. assign() builds a new frame instead of writing into
# a filtered slice, which avoids pandas' SettingWithCopyWarning.
topic4_int2 = topic4_int2.drop_duplicates().assign(
    **{'Original Intent': intent2_name, 'Recommended Action': 'ADD'}
)
topic4_int2.head()

In [None]:
# select the non-agent utterances whose 'Clean' text contains Topic5
# (str.contains interprets Topic5 as a regular expression by default)
topic5_int2 = default_non_agent_int2[default_non_agent_int2['Clean'].str.contains(Topic5)]
topic5_int2.head()

In [None]:
# number of rows currently held in topic5_int2
topic5_int2_count = len(topic5_int2)
print(topic5_int2_count)

In [None]:
# de-duplicate the Topic5 matches, then tag them with their source intent and
# the recommended action. assign() builds a new frame instead of writing into
# a filtered slice, which avoids pandas' SettingWithCopyWarning.
topic5_int2 = topic5_int2.drop_duplicates().assign(
    **{'Original Intent': intent2_name, 'Recommended Action': 'ADD'}
)
topic5_int2.head()

In [None]:
# select the non-agent utterances whose 'Clean' text contains Topic6
# (str.contains interprets Topic6 as a regular expression by default)
topic6_int2 = default_non_agent_int2[default_non_agent_int2['Clean'].str.contains(Topic6)]
topic6_int2.head()

In [None]:
# number of rows currently held in topic6_int2
topic6_int2_count = len(topic6_int2)
print(topic6_int2_count)

In [None]:
# de-duplicate the Topic6 matches, then tag them with their source intent and
# the recommended action. assign() builds a new frame instead of writing into
# a filtered slice, which avoids pandas' SettingWithCopyWarning.
topic6_int2 = topic6_int2.drop_duplicates().assign(
    **{'Original Intent': intent2_name, 'Recommended Action': 'ADD'}
)
topic6_int2.head()

In [None]:
# bar lengths, in the same order as `labels`: the agent-transfer count computed
# from the extraction data for this intent, then one count per topic
counts = [agent_count_int2_extract, topic1_int2_count, topic2_int2_count, topic3_int2_count, topic4_int2_count, topic5_int2_count, topic6_int2_count]
labels = ['Agent Tranfer', Topic1, Topic2, Topic3, Topic4, Topic5, Topic6]

# set height (scales with the number of bars)
height = len(labels) * 0.75

# set figure size
plt.figure(figsize = (10, height))

# create horizontal bar graph: counts run along x, one bar per topic on y
sns.barplot(x = counts, y = labels, palette = "winter")

# set title and axis labels (x carries the counts, y carries the topic names)
plt.title(f'\n{IVA}: Number of extracted utterances per topic going to {intent2_name}\n', fontweight="bold", fontsize = 14)
plt.xlabel("\nNumber of utterances\n", fontweight="bold", fontsize = 12)
plt.ylabel("\nSuggested Topics\n", fontweight="bold", fontsize = 12)
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)

# set filename
filename = 'topic_count_' + intent2_name + '.png'

# save the plot as an image (must happen before plt.show(), which clears the figure)
plt.savefig(filename, bbox_inches = 'tight')

# show the plot
plt.show()

In [None]:
# stack the intent2 agent utterances and the utterances selected for each of
# the six topics into one frame (rows appended, original indexes kept)
int2_selected = pd.concat([default_agent_int2, topic1_int2, topic2_int2, topic3_int2, topic4_int2, topic5_int2, topic6_int2], axis=0)
int2_selected.shape

# Labelled File Creation

In [None]:
# stack the per-section selections row-wise into the final labelled dataset
selected_frames = [def_selected, ag_selected, int1_selected, int2_selected]
labelled_file = pd.concat(selected_frames, axis=0)
labelled_file.shape

In [None]:
# export dataframe into .csv file named after the IVA; the index is written
# as the first column and the column names as the header row
labelled_file.to_csv(f'{IVA}_labeled.csv', encoding='utf-8', index=True, header=True)