# Chat Intents

## Applying labels

**Summary**

This notebook provides a way to automatically extract and apply labels to document clusters. See the `chatintents_tutorial.ipynb` notebook for a tutorial of the chatintents package, which makes the methods outlined below easier to use.

In [1]:
import collections
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from spacy import displacy

# Raise pandas' display limits so that larger tables and longer text cells
# are shown before pandas starts truncating the notebook output.
PANDAS_DISPLAY_OPTIONS = {
    "display.max_rows": 600,
    "display.max_columns": 500,
    "display.max_colwidth": 400,
}
for option_name, option_value in PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [2]:
# Load the spaCy "en_core_web_sm" pipeline once; the module-level `nlp`
# object is reused by the cells and helper functions below that parse text.
nlp = spacy.load("en_core_web_sm")

In [29]:
# Read the clustered documents (presumably the output of an earlier
# clustering step -- confirm). The path is relative, so the CSV must sit in
# the notebook's working directory.
data_clustered = pd.read_csv('sample_clustered.csv')

In [30]:
# Keep only the document text column ('0') and the cluster assignment column.
data_clustered = data_clustered[['0', 'label_st1']]
# Fixed seed so the preview shows the same rows on every Restart & Run All.
data_clustered.sample(10, random_state=42)

Unnamed: 0,0,label_st1
64,service,2
652,service,2
536,owner price,3
226,bars tap water,3
571,dishes,3
568,atmosphere,3
680,place,0
156,dirt,3
22,Mexican street tacos staff,3
551,steak seafood plate,3


In [32]:
# Pull out every document assigned to cluster 3, re-indexed from 0, to use
# as a worked example in the next cells.
in_cluster_3 = data_clustered['label_st1'] == 3
example_category = data_clustered.loc[in_cluster_3].reset_index(drop=True)
example_category

Unnamed: 0,0,label_st1
0,Crust,3
1,texture,3
2,selection menu prices,3
3,pho,3
4,potatoes,3
...,...,...
612,table salad,3
613,food flavor texture,3
614,Appetite,3
615,experience Sushi,3


In [33]:
# Parse one document from the example cluster (position 12) and list the
# spaCy attributes of each token: text, lemma, coarse POS, fine-grained tag,
# dependency relation, and whether it is a stop word.
example_doc = nlp(example_category['0'].iloc[12])

print(f'{example_doc}\n')

for tok in example_doc:
    print(tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_, tok.is_stop)

interior

interior interior PROPN NNP ROOT False


In [34]:
# Visualize the dependency parse (style="dep") of the example document.
displacy.render(example_doc, style="dep")

In [35]:
# Optional: uncomment the lines below to save the dependency plot as an SVG file.
# fig = displacy.render(example_doc, style="dep", jupyter=False)
# output_path = Path("dependency_plot.svg")  # a bare filename saves the file in the folder the notebook is run from
# output_path.open("w", encoding="utf-8").write(fig)

## Helper functions

In [36]:
def get_group(df, category_col, category):
    """
    Select the documents that belong to one category

    Arguments:
        df: pandas dataframe of documents
        category_col: str, name of the column holding the category or
                      cluster assignment of each document
        category: value in category_col identifying the category to keep
                  (e.g. an int cluster number)
    Returns:
        pandas dataframe containing only the rows of df whose category_col
        equals category, re-indexed from 0
    """
    in_category = df[category_col] == category
    return df.loc[in_category].reset_index(drop=True)

In [37]:
def most_common(lst, n_words):
    """
    Rank the words in a list by how often they occur

    Arguments:
        lst: list, each element is a word
        n_words: int, how many of the top-ranked words to return

    Returns:
        list of (word, count) tuples for the n_words most frequent words,
        ordered from most to least frequent
    """
    return collections.Counter(lst).most_common(n_words)

In [38]:
def extract_labels(category_docs, print_word_counts=False):
    """
    Extract a label from documents in the same cluster by concatenating
    the most common verb, object, and nouns

    Arguments:
        category_docs: list of str documents, all from the same category or
                       cluster
        print_word_counts: bool, True will print word counts of each type
                           in this category

    Returns:
        label: str, group label derived from concatenating the most common
               verb, direct object, and two most common nouns, joined by
               underscores; parts that were not found are left out, so the
               label is '' when nothing could be extracted

    """

    verbs = []
    dobjs = []
    nouns = []
    adjs = []

    # for each document, append verbs, dobjs, nouns, and adjectives to
    # running lists for the whole cluster (stop words are ignored)
    for text in category_docs:
        for token in nlp(text):
            if token.is_stop:
                continue

            # the root of the parse fills the "verb" slot; its surface text
            # is kept, whereas the other slots store the lemma
            if token.dep_ == 'ROOT':
                verbs.append(token.text.lower())
            elif token.dep_ == 'dobj':
                dobjs.append(token.lemma_.lower())
            elif token.pos_ == 'NOUN':
                nouns.append(token.lemma_.lower())
            elif token.pos_ == 'ADJ':
                # adjectives are only reported by print_word_counts; they
                # are not used in the label
                adjs.append(token.lemma_.lower())

    # for printing out for inspection purposes
    if print_word_counts:
        for word_lst in (verbs, dobjs, nouns, adjs):
            print(collections.Counter(word_lst))

    # take the most common word of each form; a slot stays '' when the
    # cluster has no token of that form
    verb = most_common(verbs, 1)[0][0] if verbs else ''
    dobj = most_common(dobjs, 1)[0][0] if dobjs else ''
    top_nouns = [word for word, _ in most_common(nouns, 2)]

    # concatenate the most common verb-dobj-noun1-noun2 (if they exist),
    # skipping a noun that already appears as the verb or object
    label_words = [verb, dobj]
    for word in top_nouns:
        if word not in label_words:
            label_words.append(word)

    # Drop every empty slot. list.remove('') would only drop the first one,
    # which left a leading underscore (e.g. '_food') whenever both the verb
    # and the object were missing.
    label_words = [word for word in label_words if word]

    return '_'.join(label_words)

In [39]:
def apply_and_summarize_labels(df, category_col, doc_col='0'):
    """
    Assign groups to original documents and provide group counts

    Arguments:
        df: pandas dataframe of original documents of interest to
            cluster
        category_col: str, column name corresponding to categories or clusters
        doc_col: str, column name holding the document text; defaults to
                 '0', the column name used throughout this notebook

    Returns:
        summary_df: pandas dataframe with model cluster assignment, number
                    of documents in each cluster ('count') and derived
                    labels ('label'), sorted by count in descending order
    """

    # create dictionary of the numerical category to the generated label
    label_dict = {}
    for label in df[category_col].unique():
        current_category = list(get_group(df, category_col, label)[doc_col])
        label_dict[label] = extract_labels(current_category)

    # create summary dataframe of numerical labels and counts
    summary_df = (df.groupby(category_col)[doc_col].count()
                    .reset_index()
                    .rename(columns={doc_col: 'count'})
                    .sort_values('count', ascending=False))

    # apply generated labels; a column-wise map avoids building a row
    # Series for every cluster just to look up one dictionary key
    summary_df['label'] = summary_df[category_col].map(label_dict)

    return summary_df

In [40]:
def combine_ground_truth(df_clusters, df_ground, key):
    """
    Left-join the ground truth labels onto the documents with extracted labels

    Arguments:
        df_clusters: pandas dataframe, one document per row with its extracted label
        df_ground: pandas dataframe, one document per row with its ground truth label
        key: str, name of the column to merge the two tables on

    Returns:
        pandas dataframe with every row of df_clusters plus the matching
        columns of df_ground (missing where there is no match)
    """
    return df_clusters.merge(df_ground, how='left', on=key)

### Manual inspection

In [42]:
# Inspect cluster 2 by hand: print the word counts of each type that feed
# into its generated label, then show the label itself.
example_category = get_group(data_clustered, 'label_st1', 2)['0'].tolist()
extract_labels(example_category, print_word_counts=True)

Counter({'service': 39, 'atmosphere': 2, 'company': 1, 'served': 1})
Counter()
Counter({'service': 4, 'family': 1})
Counter()


'service_family'

### Without ground truth labels

In [43]:
# Generate a label for every cluster and tabulate the number of documents in
# each one, largest cluster first.
cluster_summary = apply_and_summarize_labels(data_clustered, 'label_st1')
cluster_summary

Unnamed: 0,label_st1,count,label
4,3,617,service_food
1,0,77,place_bathroom
2,1,52,food_attitude
3,2,43,service_family
0,-1,13,experience


In [44]:
# Attach the generated cluster label to each document; the left join keeps
# every document row.
cluster_labels = cluster_summary[['label_st1', 'label']]
labeled_clusters = data_clustered.merge(cluster_labels, on='label_st1', how='left')
labeled_clusters.head()

Unnamed: 0,0,label_st1,label
0,place,0,place_bathroom
1,Crust,3,service_food
2,texture,3,service_food
3,selection menu prices,3,service_food
4,pho,3,service_food


If we don't have the ground truth labels (which is the primary use case for this approach), then the tables above are the final results. In this case, since we do have the ground truth labels, we can investigate how well our model did.

#### Count and name of most common category of generated labels and clusters