In [3]:
# !pip install -U sentence-transformers

In [29]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
import os, sys
import time

from pathlib import Path
import logging
import itertools
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


import openai
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    pca_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
    chart_from_components_3D
)


from capstone.config import CapstoneConfig
from capstone.data_access import DataClass
from capstone.features import Features
from capstone.evaluation import CustomEvaluation
from capstone.utils.constants import (
    TEXT,
    TARGET,
    ORIGINAL_TEXT,
    SPLIT,
    DEVELOP,
    TEST,
    PREDICTION,
    CORRECTED
)

import plotly.express as px

sns.set_style("darkgrid")
PARENT_PATH = Path(os.getcwd()).parent.absolute()
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_API_ORGANIZATION_KEY")


FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)

In [30]:
config = CapstoneConfig()
config.CURRENT_PATH = PARENT_PATH

data = DataClass(config)
df = data.build()
df.head(3)

Unnamed: 0,Text,Standard mapping?
0,General Quality System Requirements […] Manage...,"[quality management, quality planning]"
1,General Quality System Requirements […] Manage...,[quality management]
2,General Quality System Requirements […] Manage...,[quality management]


In [31]:
features = Features(config)
df_features = features.build(df)

2022-12-06 13:34:33,938 - capstone.utils.utils - INFO - func:clean took: 1.88 sec
2022-12-06 13:34:34,548 - capstone.utils.utils - INFO - func:fit took: 2.49 sec
2022-12-06 13:34:36,299 - capstone.utils.utils - INFO - func:clean took: 1.75 sec
2022-12-06 13:34:36,529 - capstone.utils.utils - INFO - func:transform took: 1.98 sec
2022-12-06 13:34:36,530 - capstone.utils.utils - INFO - func:fit_transform took: 4.48 sec
2022-12-06 13:34:36,720 - capstone.utils.utils - INFO - func:clean took: 0.19 sec
2022-12-06 13:34:36,748 - capstone.utils.utils - INFO - func:transform took: 0.22 sec
2022-12-06 13:34:36,773 - capstone.utils.utils - INFO - func:build took: 4.73 sec


In [7]:
# dev = df_features[df_features[SPLIT].isin([DEVELOP])].copy()
# dev.reset_index(drop=True, inplace=True)
# Y_dev = dev[features.mlb.classes_].copy()

# test = df_features[df_features[SPLIT].isin([TEST])].copy()
# test.reset_index(drop=True, inplace=True)
# Y_test = test[features.mlb.classes_].copy()

In [8]:
# test = test[[TEXT, TARGET]]
# test

Unnamed: 0,Text,Standard mapping?
0,personnel personnel engaged nonclinical labora...,[non clinical research]
1,protocol document describes objective design m...,[clinical research]
2,control document document required quality man...,"[documentation management, production process ..."
3,qualification validation accordance gmp pharma...,[vmp]
4,product complaint complaint investigation foll...,"[capa, nc, rca]"
...,...,...
379,manufacturing control packaging operation ensu...,[labeling and packaging operations]
380,production manufacturing operation identificat...,[production process controls]
381,examination design class 4 ivd medical device ...,"[audit inspection, production process controls]"
382,financial aspect trial documented agreement sp...,[clinical research]


In [38]:
df_features['num_labels'] = [len(df_features[TARGET][row]) for row in range(df_features.shape[0])]
df_features['single_label'] = [df_features[TARGET][row][0] for row in range(df_features.shape[0])]
df_single = df_features[df_features['num_labels']==1]
df_single = df_single[["Original Text","single_label"]]
df_single.reset_index(drop=True, inplace=True)

In [39]:
df_single

Unnamed: 0,Original Text,single_label
0,Validation […] Process Validation Program […] ...,process & packaging validation
1,Quality Documents and Records […]\nDistributio...,documentation management
2,General requirements regarding clinical invest...,clinical research
3,Manufacturing control / Agreement […] The agre...,purchasing controls
4,General Quality System Requirements […] Risk M...,risk management
...,...,...
2782,"There should be periodic requalification, as w...",change controls
2783,(a) Management should determine and provide ad...,quality management
2784,Manufacturing control / Packaging operations [...,labeling and packaging operations
2785,Production […] Manufacturing operations […] Id...,production process controls


In [40]:
print(df_features.shape)
print(df_single.shape)

(3831, 553)
(2787, 2)


In [41]:
df_single.value_counts('single_label')

single_label
production process controls                388
clinical research                          257
labeling and packaging operations          225
non clinical research                      174
data management                            148
purchasing controls                        144
material and product controls              139
risk management                            119
quality management                          99
design controls                             92
laboratory controls                         87
documentation management                    81
stability programs                          81
udi and serialization                       68
vigilance                                   66
qrb                                         59
training                                    54
samd                                        54
audit inspection                            52
medical device post market surveillance     48
controled substances                        46


In [12]:
# train_ft.to_json(os.path.join(data.data_path, "ft_training.jsonl"), orient="records", lines=True, force_ascii=False)
# val_ft.to_json(os.path.join(data.data_path, "ft_validation.jsonl"), orient="records", lines=True, force_ascii=False)

In [44]:
# MODEL = "ada:ft-columbia-university:jnj-multi-label-2022-12-01-01-02-06"
MODEL = "text-similarity-ada-001"

In [50]:
def get_embedding(text, engine = MODEL):

    # replace newlines, which can negatively affect performance.
    time.sleep(5)
    text = text.replace("\n", " ")

    return openai.Embedding.create(input=[text], engine=engine)["data"][0]["embedding"]

In [56]:
import statistics
def purity_score(labels, clusters, num_clusters):
  labels_clusters = list(zip(labels, clusters))
  purity_scores = []
  for cluster in range(num_clusters):
    #print(cluster)
    cluster_i = [row for row in labels_clusters if row[1] == cluster]
    #print(cluster_i)
    cluster_i_label = [row[0] for row in cluster_i]
    #print(cluster_i_label)
    mode_label = statistics.mode(cluster_i_label)
    purity = sum([label == mode_label for label in cluster_i_label])/len(cluster_i_label)
    purity_scores.append(purity)
    print('Purity socre for cluster ', cluster, ' is: ', '{0:.12f}'.format(purity), "|   Mode label is:", mode_label)
  print('Average purity score over all clusters is: ', np.mean(purity_scores))

In [48]:
choice_of_label = ['medical device post market surveillance','process & packaging validation','md product cybersecurity','audit inspection','calibration']
df_test = df_single[df_single['single_label'].isin(choice_of_label)]
df_test.reset_index(drop=True, inplace=True)
df_test

Unnamed: 0,Original Text,single_label
0,Validation […] Process Validation Program […] ...,process & packaging validation
1,"At EU level, the following legislative acts ar...",md product cybersecurity
2,Contract Manufacture and Analysis […] The Cont...,audit inspection
3,The PSUR should be a “self-standing” document ...,medical device post market surveillance
4,"Secure Design and Manufacture [...] \nSafety, ...",md product cybersecurity
...,...,...
213,Medical Devices Registration and Filing […] \...,medical device post market surveillance
214,"Self-inspection, quality audits and suppliers’...",audit inspection
215,IEC 80002(series) Medical device software: Pro...,md product cybersecurity
216,"For each device manufacturers shall plan, esta...",medical device post market surveillance


In [51]:
df_test["Embedding"] = df_test["Original Text"].apply(get_embedding)

2022-12-06 15:25:34,788 - openai - INFO - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-similarity-ada-001/embeddings
2022-12-06 15:25:35,330 - openai - INFO - message='OpenAI API response' path=https://api.openai.com/v1/engines/text-similarity-ada-001/embeddings processing_ms=117 request_id=989b9364814237dc7bbab784ac43a7ff response_code=200
2022-12-06 15:25:40,336 - openai - INFO - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-similarity-ada-001/embeddings
2022-12-06 15:25:40,419 - openai - INFO - message='OpenAI API response' path=https://api.openai.com/v1/engines/text-similarity-ada-001/embeddings processing_ms=15 request_id=b5f424a39e9d6887bf1aff5adea3b294 response_code=200
2022-12-06 15:25:45,423 - openai - INFO - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-similarity-ada-001/embeddings
2022-12-06 15:25:45,595 - openai - INFO - message='OpenAI API respons

In [52]:
df_test.to_csv('../data/ada-embedding.csv',index=False)

In [53]:
num_clusters = 5
model = KMeans(n_clusters=num_clusters)
embeddings = list(df_test["Embedding"])
# Fit the embedding with kmeans clustering.
model.fit(embeddings)
# Get the cluster id assigned to each news headline.
cluster_assignment = model.labels_

In [59]:
clusters = list(cluster_assignment)
strings = df_test["Original Text"].tolist()
labels = df_test['single_label'].tolist()
strings = ['(True label: ' + labels[i] + ')' + strings[i] for i in range(len(labels))] 
purity_score(labels, clusters, num_clusters)
print()
pca_components_3d = pca_components_from_embeddings(embeddings, n_components=3)
pca_components_2d = pca_components_from_embeddings(embeddings, n_components=2)
  
pcadf = pd.DataFrame(pca_components_2d, 
  columns=["component_1", "component_2"],
)
pcadf["predicted_cluster"] = clusters 
pcadf["true_label"] = labels
pcadf["Text"] = df_test["Original Text"]

fig = px.scatter(pcadf, y="component_2", x="component_1", color="predicted_cluster",
                 symbol="true_label", hover_data=["true_label", "Text"])
fig.update_layout(uniformtext_mode='hide')
fig.update_layout(legend_orientation="h")
fig.show()

Purity socre for cluster  0  is:  1.000000000000 |   Mode label is: calibration
Purity socre for cluster  1  is:  0.844444444444 |   Mode label is: audit inspection
Purity socre for cluster  2  is:  0.931818181818 |   Mode label is: process & packaging validation
Purity socre for cluster  3  is:  1.000000000000 |   Mode label is: md product cybersecurity
Purity socre for cluster  4  is:  0.746031746032 |   Mode label is: medical device post market surveillance
Average purity score over all clusters is:  0.9044588744588744

