#### Loading and preparing the dataset

In [1]:
from datasets import Dataset, load_dataset

In [8]:
# Pull the "train" split of my GitHub issues-and-comments dataset from the Hugging Face Hub
issues_dataset = load_dataset(
    path="rajknakka/github-issues-comments",
    split="train",
)
issues_dataset

Found cached dataset parquet (C:/Users/Raj/.cache/huggingface/datasets/rajknakka___parquet/rajknakka--github-issues-comments-d10cf254d383122f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 4900
})

In [9]:
# Keep only genuine issues that have at least one comment.
# Pull requests are dropped because they are rarely useful for answering user
# queries and would add noise to the search engine; issues with no comments
# are dropped as well.
issues_dataset = issues_dataset.filter(
    lambda issue: issue["is_pull_request"] == False and len(issue["comments"]) > 0
)
issues_dataset

Loading cached processed dataset at C:\Users\Raj\.cache\huggingface\datasets\rajknakka___parquet\rajknakka--github-issues-comments-d10cf254d383122f\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-be07580daa56c5b4.arrow


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 1598
})

In [10]:
# Keep only the title, body, comments and html_url columns and drop everything else.
columns_to_keep = ["title", "body", "comments", "html_url"]
columns = issues_dataset.column_names
# Build "every existing column that is not in the keep list", in the dataset's
# own column order. A symmetric difference is the wrong operation here: it
# would also return any kept name that is missing from the dataset, and
# remove_columns() would then be asked to drop a column that does not exist.
columns_to_remove = [column for column in columns if column not in columns_to_keep]
prepared_dataset = issues_dataset.remove_columns(columns_to_remove)
prepared_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 1598
})

In [7]:
# NOTE(review): this cell only re-displays prepared_dataset, which the previous cell already shows.
# Its execution count ([7]) is lower than the cells above ([8]-[10]) and its saved output still
# lists the full, unpruned column set, so it looks like a stale leftover — consider removing it.
prepared_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 1598
})

#### Create embeddings with the comments and their context

In [11]:
# First switch the dataset's output format to pandas, then take a full slice
# of it to obtain the whole dataset as a DataFrame
prepared_dataset.set_format(type="pandas")
issues_df = prepared_dataset[:]

In [25]:
# Peek at a single issue: list every comment stored for the row labelled 1
issues_df.loc[1, "comments"].tolist()



In [23]:
# Flatten the per-issue comment lists so that every comment sits on its own row
# (html_url, title and body are repeated for each comment; the index is renumbered)
comments_df = issues_df.explode(column="comments", ignore_index=True)
comments_df.head(n=4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Support for data with instance-wise dictionary...,Hi ! We use the Arrow columnar format under th...,### Feature request\n\nI notice that when load...
1,https://github.com/huggingface/datasets/issues...,Return the audio filename when decoding fails ...,Hi ! The audio data don't always exist as file...,### Feature request\r\n\r\nReturn the audio fi...
2,https://github.com/huggingface/datasets/issues...,Return the audio filename when decoding fails ...,"Thanks @lhoestq, I wasn't aware of the decode ...",### Feature request\r\n\r\nReturn the audio fi...
3,https://github.com/huggingface/datasets/issues...,IndexError Not Solving -> IndexError: Invalid ...,https://colab.research.google.com/#scrollTo=AQ...,### Describe the bug\n\nin <cell line: 1>:1 ...


In [26]:
# Done with pandas: convert the exploded DataFrame back into a Hugging Face Dataset.
# preserve_index=False makes sure the DataFrame index is never stored as an extra column.
comments_dataset = Dataset.from_pandas(comments_df, preserve_index=False)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 5650
})