To run this notebook, you will need to install: pandas, openai, tiktoken, transformers, plotly, matplotlib, scikit-learn, torch (a transformers dependency), torchvision, and scipy.

In [7]:
# imports
import os

import numpy as np
import openai
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
from sklearn.manifold import TSNE
# Read the API key from the OPENAI_API_KEY environment variable rather than
# hardcoding it in the notebook; the value is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [16]:
# embedding model parameters
# embedding_model is passed as `engine` to get_embedding; embedding_encoding is
# the tiktoken encoding used to count tokens; max_tokens is the per-row cutoff
# applied before embedding.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191


In [None]:
# Read the issue export, keep the fields of interest, and build the text to embed
working_file = "../data/Issues.csv"
issue_columns = [
    "Key",
    "Summary",
    "Assignee",
    "Status",
    "Created",
    "Start",
    "DueDate",
    "Priority",
    "IssueType",
    "Size",
    "Team",
    "Reporter",
    "Description",
]

df = pd.read_csv(working_file, index_col=0).loc[:, issue_columns]
print(df)

# Discard every row that has a missing value in any of the selected columns,
# then join the stripped summary and description into one string per row.
df = df.dropna().assign(
    combined=lambda frame: "Title: "
    + frame["Summary"].str.strip()
    + "; Content: "
    + frame["Description"].str.strip()
)
df.head(2)


In [18]:
# Keep at most the 1k most recently created rows whose text fits the token budget
top_n = 1000
# Start from the newest 2 * top_n rows (assuming fewer than half get filtered
# out below); "Created" is only needed for this ordering, so it is dropped.
df = df.sort_values("Created").tail(top_n * 2).drop(columns="Created")

encoding = tiktoken.get_encoding(embedding_encoding)

# Count the tokens of each combined text and discard rows above max_tokens
df = df.assign(
    n_tokens=df["combined"].apply(lambda text: len(encoding.encode(text)))
)
df = df.loc[df["n_tokens"] <= max_tokens].tail(top_n)
len(df)


111

### Get embeddings and save them for future reuse

In [11]:
# Embeds the combined text of every row, one get_embedding call per row,
# so this may take a few minutes; the result is written to CSV for reuse.
saved_file = "../data/Embedding_Issues.csv"

df = df.assign(
    embedding=df["combined"].apply(
        lambda text: get_embedding(text, engine=embedding_model)
    )
)
df.to_csv(saved_file)