In [59]:

import pandas as pd

df_issue_all = pd.read_csv('data/templates_repo.csv')
df_issue = pd.read_csv('data/individual_issue_contents.csv')
df_pr = pd.read_csv('data/individual_pr_contents.csv')

In [72]:
import pandas as pd
import numpy as np

df_issue = df_pr[:5]

# Define the template categories and their corresponding patterns
categories = {
    "Greeting contributors": ["introduce", "thanks", "gratitude", "redirect", "oss culture", "donations"],
    "Explaining project guidelines": ["community documentation", "add tests", "update documentation", "format submission", "search first", "not ask questions", "run existing tests", "latest version", "target branch", "discuss first", "sign cla", "not disclose vulnerabilities", "atomic contributions", "rebase commits", "provide comments", "list contributors", "review others' prs"],
    "Collecting relevant information": ["summary", "type", "environment", "steps to reproduce", "expected/actual behavior", "related issues", "additional context", "motivation", "suggested solutions", "screenshot", "label", "log/debugging", "stack trace", "side effect", "location", "willingness", "assignee", "list of main changes", "status", "severity", "knowledge level"]
}

# Preprocess the template contents
preprocessed_contents = [template.lower() for template in df_issue['body']]

# Function to classify a sentence into a category
def classify_sentence(sentence):
    for category, patterns in categories.items():
        for pattern in patterns:
            if pattern in sentence:
                return category
    return "Unknown"  # Category for sentences that do not match any pattern

# Classify each sentence in the templates
template_classifications = []
for template in preprocessed_contents:
    sentences = template.split('\n')
    classifications = [classify_sentence(sentence) for sentence in sentences]
    template_classifications.append(classifications)

# Count the number of occurrences for each category
category_counts = {category: 0 for category in categories}
for classifications in template_classifications:
    for category in classifications:
        if category != "Unknown":
            category_counts[category] += 1

# Print the category counts
for category, count in category_counts.items():
    print("{}: {}".format(category, count))

# Identify the most common category for each template
template_categories = [max(classifications, key=classifications.count) if classifications else "Unknown" for classifications in template_classifications]

# Print the category for each template
for i, category in enumerate(template_categories):
    print("Template {}: {}".format(i+1, category))


Greeting contributors: 1
Explaining project guidelines: 1
Collecting relevant information: 7
Template 1: Unknown
Template 2: Unknown
Template 3: Unknown
Template 4: Unknown
Template 5: Unknown


In [47]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

candidate_labels = ['Summary', 'Type', 'Environment', 'Steps to reproduce', 'Expected/Actual behavior', 'Related issues', 'Additional context', 'Screenshot', 'Log', 'Severity']

In [60]:

# iterate over all issues and extract the labels and get the total score for each lable across all issues
total_score = {}
from tqdm import tqdm

# get not null issues 
df_issue = df_issue[df_issue['body'].notnull()]
df_issue = df_issue[df_issue['body'] != ' ']
df_issue = df_issue[df_issue['body'] != '  ']
# split into two halves 
df_issue = df_issue.iloc[:int(len(df_issue)/2)]


for i in tqdm(range(len(df_issue))):
    results= classifier(df_issue.iloc[i]['body'], candidate_labels)
    print(results)
    score_for_a_file = {}
    for j in range(len(results['labels'])):
        # transpose the labels as columns and the scores as values
        # add repo name and issue file name 
        score_for_a_file[results['labels'][j]] = results['scores'][j]
        score_for_a_file['repo'] = df_issue.iloc[i]['repo']
        score_for_a_file['issue_file'] = df_issue.iloc[i]['issue_file']
    total_score[i] = score_for_a_file
    

# print the total score for each label
print(total_score)
print(len(df_issue))

 20%|██        | 1/5 [00:04<00:19,  4.77s/it]

{'sequence': "<!--\n  Please provide a clear and concise description of what the bug is. Include\n  screenshots if needed. Please test using the latest version of the relevant\n  React packages to make sure your issue has not already been fixed.\n-->\n\nReact version:\n\n## Steps To Reproduce\n\n1.\n2.\n\n<!--\n  Your bug will get fixed much faster if we can run your code and it doesn't\n  have dependencies other than React. Issues without reproduction steps or\n  code examples may be immediately closed as not actionable.\n-->\n\nLink to code example:\n\n<!--\n  Please provide a CodeSandbox (https://codesandbox.io/s/new), a link to a\n  repository on GitHub, or provide a minimal code example that reproduces the\n  problem. You may provide a screenshot of the application if you think it is\n  relevant to your bug report. Here are some tips for providing a minimal\n  example: https://stackoverflow.com/help/mcve.\n-->\n\n## The current behavior\n\n\n## The expected behavior\n", 'labels': 

 40%|████      | 2/5 [00:07<00:10,  3.66s/it]

{'sequence': '**Yes, I promise I\'ve read the [Contributions Guidelines](https://github.com/getify/You-Dont-Know-JS/blob/master/CONTRIBUTING.md)** (please feel free to remove this line).\n\n----\n\n**Please type "I already searched for this issue":**\n\n**Edition:** (1st or 2nd)\n\n**Book Title:**\n\n**Chapter:**\n\n**Section Title:**\n\n**Question:**\n', 'labels': ['Type', 'Additional context', 'Related issues', 'Steps to reproduce', 'Screenshot', 'Log', 'Expected/Actual behavior', 'Summary', 'Severity', 'Environment'], 'scores': [0.3140680193901062, 0.18901990354061127, 0.09696897119283676, 0.09348712116479874, 0.08452337980270386, 0.06173728406429291, 0.051031578332185745, 0.04978093504905701, 0.030410487204790115, 0.028972366824746132]}


 60%|██████    | 3/5 [00:13<00:09,  4.82s/it]

{'sequence': "Foreign translations are appreciated. However, any such requests should wait until the 2nd edition is complete.\n\n**I will not be accepting any new foreign translations for previous editions.**\n\n----\n\nPlease check these issues first:\n\n* https://github.com/getify/You-Dont-Know-JS/issues?utf8=%E2%9C%93&q=label%3A%22foreign+language+translations%22+\n* https://github.com/getify/You-Dont-Know-JS/issues/9\n* https://github.com/getify/You-Dont-Know-JS/issues/900\n* https://github.com/getify/You-Dont-Know-JS/issues/1378\n\nTo summarize, the steps for a foreign language translation are:\n\n1. Fork this repo\n2. Make your own translation entirely in your fork, preferably of all six books, but at a minimum of one whole book\n3. File an issue asking for a branch to be made on our main repo, named for that [language's ISO code](http://www.lingoes.net/en/translator/langcode.htm)\n4. Once the branch is created, you can PR to merge your translated work in\n5. Once the merge is co

 80%|████████  | 4/5 [00:16<00:04,  4.00s/it]

{'sequence': '**Yes, I promise I\'ve read the [Contributions Guidelines](https://github.com/getify/You-Dont-Know-JS/blob/master/CONTRIBUTING.md)** (please feel free to remove this line).\n\n----\n\n**Please type "I already searched for this issue":**\n\n**Edition:** (1st or 2nd)\n\n**Book Title:**\n\n**Chapter:**\n\n**Section Title:**\n\n**Problem:**\n', 'labels': ['Type', 'Additional context', 'Steps to reproduce', 'Screenshot', 'Related issues', 'Log', 'Expected/Actual behavior', 'Summary', 'Severity', 'Environment'], 'scores': [0.27921274304389954, 0.20820920169353485, 0.0985214114189148, 0.08654949069023132, 0.08408907055854797, 0.0658993050456047, 0.05716346204280853, 0.0541968047618866, 0.0375758558511734, 0.02858269028365612]}


100%|██████████| 5/5 [00:19<00:00,  3.91s/it]

{'sequence': '**Yes, I promise I\'ve read the [Contributions Guidelines](https://github.com/getify/You-Dont-Know-JS/blob/master/CONTRIBUTING.md)** (please feel free to remove this line).\n\n----\n\n**Please type "I already searched for this issue":**\n\n**Edition:** (1st or 2nd)\n\n**Book Title:**\n\n**Chapter:**\n\n**Section Title:**\n\n**Problem:**\n', 'labels': ['Type', 'Additional context', 'Steps to reproduce', 'Screenshot', 'Related issues', 'Log', 'Expected/Actual behavior', 'Summary', 'Severity', 'Environment'], 'scores': [0.27921274304389954, 0.20820920169353485, 0.0985214114189148, 0.08654949069023132, 0.08408907055854797, 0.0658993050456047, 0.05716346204280853, 0.0541968047618866, 0.0375758558511734, 0.02858269028365612]}
{0: {'Related issues': 0.2185780555009842, 'repo': 'facebook/react', 'issue_file': 'bug_report.md', 'Additional context': 0.1858135163784027, 'Steps to reproduce': 0.1413845270872116, 'Expected/Actual behavior': 0.0962735116481781, 'Type': 0.09152223914861




In [62]:
# write as csv file per repo 
df = pd.DataFrame.from_dict(total_score, orient='index')
# put as repo name as colum name and each label as column 
df = df.reset_index(drop=True)
df = df.rename(columns={'repo': 'repo_name'})
df.to_csv('data/issue_labels-test-10.csv', index=False)

In [51]:
# get average score for each label
avg_score = {}
for key in total_score:
    if key != 'repo' and key != 'issue_file':
        avg_score[key] = total_score[key]/len(df_issue)
    
avg_score

{'Additional context': 0.741646815177896,
 'Related issues': 0.7018280969301521,
 'Log': 0.5265731634969212,
 'Environment': 0.4886565209804041,
 'Expected/Actual behavior': 0.39079710676775126,
 'Screenshot': 0.4210901721245827,
 'Type': 0.638705986572979,
 'Steps to reproduce': 0.5836869422150784,
 'Severity': 0.3784270922558852,
 'Summary': 0.28673410863546533}

In [49]:
# write as csv file per repo 
df = pd.DataFrame.from_dict(total_score, orient='index')
# put as repo name as colum name and each label as column 
df = df.transpose()
df = df.reset_index(drop=True)
df = df.rename(columns={'repo': 'repo_name'})
df.to_csv('data/issue_labels-part-1.csv', index=False)

In [38]:
total_score_copy = total_score.copy()

In [39]:
# get the average score for each label in percentage
for key in total_score_copy:
    print(key, total_score_copy[key]/len(df_issue) * 100)
    


Related issues 9.242164197564124
Type 8.917043693363667
Screenshot 5.2369850330054755
Additional context 9.740497690439225
Summary 5.091745404154063
Log 7.404116675257683
Environment 6.341334345936775
Steps to reproduce 7.391885894536972
Severity 5.506631369143725
Expected/Actual behavior 4.766517766192555


In [37]:


# iterate over all issues and extract the labels and get the total score for each lable across all issues
total_score = {}

# get not null issues 
df_pr = df_pr[df_pr['body'].notnull()]
df_pr = df_pr[df_pr['body'] != ' ']
df_pr = df_pr[df_pr['body'] != '  ']
df_pr = df_pr.reset_index(drop=True)

# select 50 random prs
df_pr = df_pr.sample(n=1000, random_state=1)

for i in range(len(df_pr)):
    results= classifier(df_pr.iloc[i]['body'], candidate_labels, multi_label=True)
    for j in range(len(results['scores'])):
        if results['labels'][j] not in total_score:
            total_score[results['labels'][j]] = results['scores'][j]
        else:
            total_score[results['labels'][j]] += results['scores'][j]

# print the total score for each label
print(total_score)
print(len(df_pr))
    

KeyboardInterrupt: 