# Import Packages and Data

In [1]:
# Import Packages
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

# Open AI & config
import os
import json
import openai
from dotenv import load_dotenv
load_dotenv()

# Clustering
from sklearn.cluster import KMeans

# Dimensionality Reduction
from sklearn.manifold import TSNE

# Visuals
import seaborn as sns
import matplotlib.pyplot as plt

# Similarity Search
from src.vector_similarity import get_embeddings, execute_similarity_search

In [2]:
# Make connections
# Point the module-level `openai` settings at the Azure OpenAI resource.
openai.api_type = "azure"
openai.api_base = "https://crrc-t170-cvx-france.openai.azure.com/"
openai.api_version = "2023-09-15-preview"
# The key comes from the environment (the import cell calls load_dotenv(), so a
# local .env file can supply it). os.getenv returns None when OPENAI_API_KEY is
# unset, and nothing in this cell checks for that.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
# Variables

#============#
# Data
#============#
# Base directory of the input CSV. It must keep its trailing separator: the
# load cell joins `filepath` and `filename` directly, adding no separator.
filepath = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\SETH employee questions\\"

# Input
filename = "SETH-questions-summaries.csv"

# Output

OUTPUT_DIRNAME = filepath
OUTPUT_filename = "SETH-questions-topicClusters.csv"


# Columns kept from the input file; every other column is dropped after loading.
keep_columns = ['Question', 'Topic', 'summary']

#============#
# Model Config
#============#
# `k` is passed to KMeans as n_clusters and is also the loop bound used when
# building the centroid table and when generating one description per cluster.
k = 24          # Number of clusters to fit ("topics")


#============#
# Open AI prompts
#============#
def _load_prompt(path):
    """Read one chat-prompt template from a JSON file.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the JSON has been parsed (a bare ``json.load(open(path))`` leaves the
    handle open until it is garbage-collected).
    """
    with open(path) as prompt_file:
        return json.load(prompt_file)


# Each template is handed to create_summary as the `prompt` argument, which
# fills the 'content' of the message at index 1 with the text to analyse.
cluster_naming_prompt = _load_prompt("prompts/cluster_naming.json")
cluster_summary_prompt = _load_prompt("prompts/cluster_summary.json")
positive_comment_prompt = _load_prompt("prompts/positive_comments.json")
concerns_prompt = _load_prompt("prompts/concerns.json")
popular_mentions_prompt = _load_prompt("prompts/popular_mentions.json")

In [4]:
# Import Data
# Read the summaries CSV, then narrow it to the columns listed in keep_columns.
csv_path = f"{filepath}{filename}"
data = pd.read_csv(csv_path).loc[:, keep_columns]

data.head()

Unnamed: 0,Question,Topic,summary
0,What leadership behavior changes need to be ma...,Performance,Identifying leadership behavior changes to imp...
1,I wanted to ask about unclear messaging coming...,Performance,Employee asks about unclear messaging from Sen...
2,"In the last few years, we have heard an overwh...",Performance,Questioning sudden deviation from positive mes...
3,The communication on where we missed and what ...,Performance,Clarity is needed on what actions are necessar...
4,Want to be respectful for sure but I have hear...,Performance,Employees are questioning the effectiveness of...


# Embed Employee Question Summaries

In [5]:
# Generate embeddings from text
# One input string per row of the 'summary' column, in row order.
summary_texts = data['summary'].tolist()
embeddings = get_embeddings(summary_texts)

# Cluster analysis

In [6]:
# Create cluster algorithm
# NOTE: `algorithm` is deliberately left at the library default. The explicit
# value 'auto' was deprecated in scikit-learn 1.1 and is rejected from 1.3 on,
# whereas omitting the argument is accepted by every version.
km = KMeans(n_clusters=k,
            max_iter=300,
            tol=1e-04,
            init='k-means++',
            n_init=10,          # restarts with different centroid seeds
            random_state=42)    # fixed seed so cluster labels are reproducible

# Fit clusters (the fitted estimator is also the cell's displayed value)
km.fit(embeddings)

KMeans(n_clusters=24, random_state=42)

# Top observations for each cluster

In [7]:
# Prepare data for similarity search

# Centroid df: one row per cluster label (0 .. k-1)
cluster_centroids_df = pd.DataFrame({'Cluster Label': range(0, k)}).reset_index(drop=True)


# Observation df: each observation's assigned cluster next to its text columns.
# reset_index(drop=False) keeps the row position as an explicit 'index' column.
observations_df = (
    pd.DataFrame({'Cluster Label': km.labels_.tolist()})
    .assign(Question=data['Question'],
            Topic=data['Topic'],
            summary=data['summary'])
    .reset_index(drop=False)
)

In [8]:
# Execute Search
n_per_cluster = 3                                   # How many observations per cluster?
centroid_vectors = km.cluster_centers_.tolist()     # Cluster embeddings

response = execute_similarity_search(
    n_per_cluster,
    cluster_centroids_df,   # Cluster DF
    centroid_vectors,
    observations_df,        # Observations DF
    embeddings,             # Observations embeddings
)

# Name each cluster

In [9]:
# =========== #
# Summarization -- Open AI API
# =========== #

def create_summary(data, prompt):
    """Ask the chat model to describe a group of texts.

    Parameters
    ----------
    data : iterable of str
        Texts to analyse; they are joined with single spaces into one message.
    prompt : list of dict
        Chat messages loaded from a prompt JSON file. The message at index 1 is
        the slot that receives the joined text. The caller's list and its
        message dicts are left unmodified.

    Returns
    -------
    str
        The model's reply, or ``"Error : <exception>"`` when the request fails
        (the exception is printed, not raised).
    """
    # Build a per-call message list instead of writing into the shared prompt
    # template: only the message at index 1 changes, so it is the only element
    # that needs to be replaced with a fresh dict.
    messages = list(prompt)
    messages[1] = {**prompt[1], 'content': ' '.join(data)}

    try:
        # Generate the response from the model
        completion = openai.ChatCompletion.create(
            engine="gpt-35-turbo",
            temperature=1,
            messages=messages,
            max_tokens=100,
            top_p=0.5,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None
            )
        return completion['choices'][0]['message']['content']
    except Exception as e:
        # Best effort: report the failure and return a marker string so that a
        # loop over many clusters keeps going.
        print(f"EXCEPTION {e}\nTHROWN FOR {data}")
        return f"Error : {e}"


# =========== #
# Count Observations
# =========== #
def count_obs(data):
    """Return how many observations `data` holds, i.e. its ``len``."""
    n_observations = len(data)
    return n_observations

In [10]:
# Execute Open AI Summaries

# Per-cluster result lists, filled in cluster-label order (0 .. k-1)
overall_original_names = []
generated_cluster_names = []
overall_counts = []
overall_summary = []
overall_positive_comments = []
overall_concerns = []
overall_popular_mentions = []
overall_raw_questions = []


for cluster_label in range(k):
    # Rows of the similarity-search result that carry this cluster label
    cluster_rows = response.loc[response['Cluster Label'] == cluster_label]
    summary_texts = cluster_rows['summary'].tolist()
    question_texts = cluster_rows['Question'].tolist()

    # Calls run in a fixed order: name, count, summary, positive comments,
    # concerns, popular mentions.
    generated_cluster_names.append(create_summary(summary_texts, cluster_naming_prompt))
    # NOTE(review): this counts rows of `response` for the cluster, not rows of
    # observations_df -- confirm that is the intended meaning of 'Topic Size'.
    overall_counts.append(count_obs(cluster_rows))
    overall_summary.append(create_summary(summary_texts, cluster_summary_prompt))
    overall_positive_comments.append(create_summary(summary_texts, positive_comment_prompt))
    # Concerns are extracted from the raw questions rather than the summaries.
    overall_concerns.append(create_summary(question_texts, concerns_prompt))
    overall_popular_mentions.append(create_summary(summary_texts, popular_mentions_prompt))


In [13]:
# Add summaries to cluster centroid df
# Column name -> per-cluster values; columns are added in this order.
generated_columns = {
    'Generated Topic Name': generated_cluster_names,
    'Topic Size': overall_counts,
    'Topic Summary': overall_summary,
    'Positive Comments': overall_positive_comments,
    'Concerns': overall_concerns,
    'Popular Themes': overall_popular_mentions,
}
for column_name, column_values in generated_columns.items():
    cluster_centroids_df[column_name] = column_values

In [16]:
# Number of rows in the centroid table
cluster_centroids_df.shape[0]

24

# Save DataFrames

In [15]:
# Make a join df to remove need to drop columns later.  (Adding cluster results to original observations.)
# join_df = pd.DataFrame()
# join_df['Cluster Label'] = cluster_centroids_df['Cluster Label']
# join_df['Topic Name'] = cluster_centroids_df['6 Word Summary']
# print(len(join_df))

# Add cluster name to observations dataframe
# Left join: every observation row is kept and gains its cluster's columns.
observations_df = observations_df.merge(
    cluster_centroids_df,
    how='left',
    on='Cluster Label',
)

In [17]:
# Destination folder for the clustering outputs: a "cluster results" sub-folder
# of the same project directory that `filepath` points to in the config cell.
OUTPUT_filepath = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\SETH employee questions\\cluster results\\"

# Create the folder if it is missing, so the writes below cannot fail on a
# missing directory after the OpenAI calls in earlier cells have already run.
os.makedirs(OUTPUT_filepath, exist_ok=True)

# Dataset where I performed semantic search of the top k observations against each cluster centroid.
filename_similar_obs = "Observations Close to Topic Clusters.csv"

# Employee questions with their cluster label and the generated cluster info merged in.
filename_obs_with_cluster_labels = "Employee Questions with Cluster Labels.csv"


# Both files are written with the DataFrame index as their first column
# (to_csv default).
response.to_csv(str(OUTPUT_filepath)+str(filename_similar_obs))
observations_df.to_csv(str(OUTPUT_filepath)+str(filename_obs_with_cluster_labels))

# Appendix

In [17]:
# Use this if you want to export any data

#filepath = 'Models\\XGBoost_classifier\\out\\'
#now = datetime.now()
#current_time = now.strftime("%Y_%m_%d-%H_%M_%S")
#filename_submission = current_time + '_XGBoost_Classifier_Results.csv'
#output_data = y_test

#output_data.to_csv(filepath+filename_submission, sep=',', index = False)

In [18]:
# KMeans: choose k by silhouette score (disabled; if re-enabled, note that the loop rebinds `k` and `km`)

# from sklearn.metrics import silhouette_score

# silhouette_scores = []
# for k in range(2, 7):
#     km = KMeans(n_clusters=k, 
#                 max_iter=300, 
#                 tol=1e-04, 
#                 init='k-means++', 
#                 n_init=10, 
#                 random_state=42, 
#                 algorithm='auto')
#     km.fit(embeddings)
#     silhouette_scores.append(silhouette_score(embeddings, km.labels_))

# fig, ax = plt.subplots()
# ax.plot(range(2, 7), silhouette_scores, 'bx-')
# ax.set_title('Silhouette Score Method')
# ax.set_xlabel('Number of clusters')
# ax.set_ylabel('Silhouette Scores')
# plt.xticks(range(2, 7))
# plt.tight_layout()
# plt.show()

In [19]:
# Possibly use for dimensionality reduction
#
# Disabled sketch: the code below sits inside a bare triple-quoted string, so it
# is never executed; as the cell's last expression, the string itself is what
# the cell displays. It refers to names (X_std, km_fit, cluster_colors, ax1,
# ax2, y, fig, np) that are not defined in any visible cell of this notebook.

"""
import pacmap

# PACMAP
embedding = pacmap.PaCMAP(random_state=42)
X_std_pacmap = embedding.fit_transform(X_std.to_numpy())

for l, c, m in zip(range(0, 3), cluster_colors[0:km_fit.n_clusters], ('^', 's', 'o')):
    ax2.scatter(X_std_pacmap[y == l, 0],
                X_std_pacmap[y == l, 1],
                color=c,
                label='cluster %s' % l,
                alpha=0.9,
                marker=m
                )
    
ax1.set_title("PCA Visualization")
ax2.set_title("PACMAP Visualization")

labels = np.unique(km_fit.labels_)
labels = ["cluster "+str(l) for l in labels]
fig.legend(labels, loc='lower center',ncol=len(labels), bbox_transform=(1,0),borderaxespad=-0.5)
plt.tight_layout()
plt.show()
"""

'\nimport pacmap\n\n# PACMAP\nembedding = pacmap.PaCMAP(random_state=42)\nX_std_pacmap = embedding.fit_transform(X_std.to_numpy())\n\nfor l, c, m in zip(range(0, 3), cluster_colors[0:km_fit.n_clusters], (\'^\', \'s\', \'o\')):\n    ax2.scatter(X_std_pacmap[y == l, 0],\n                X_std_pacmap[y == l, 1],\n                color=c,\n                label=\'cluster %s\' % l,\n                alpha=0.9,\n                marker=m\n                )\n    \nax1.set_title("PCA Visualization")\nax2.set_title("PACMAP Visualization")\n\nlabels = np.unique(km_fit.labels_)\nlabels = ["cluster "+str(l) for l in labels]\nfig.legend(labels, loc=\'lower center\',ncol=len(labels), bbox_transform=(1,0),borderaxespad=-0.5)\nplt.tight_layout()\nplt.show()\n'

In [20]:
# ============ #
#  Slice a df
# ============ #

# def slice_df_on_topic(data, topic):
#     return data[data['Topic__c']==topic]

In [21]:
## PUTS THE STARS BACK ON THE SCATTER PLOT (cluster centroids)

# # Centroids
# plt.scatter(
#         x= cluster_centroids_df['x'], y=cluster_centroids_df['y'],
#         c=cluster_centroids_df['Cluster Label'],
#         # annot=centroids_df_for_search['6 Word Summary'],
#         marker = '*',
#         edgecolors='red',
#         s=250
#         )