In [19]:

import pandas as pd

# Name each input CSV once, then load every file into its own DataFrame.
templates_repo_csv = 'data/templates_repo.csv'
issue_contents_csv = 'data/individual_issue_contents.csv'
pr_contents_csv = 'data/individual_pr_contents.csv'

df_issue_all = pd.read_csv(templates_repo_csv)
df_issue = pd.read_csv(issue_contents_csv)
df_pr = pd.read_csv(pr_contents_csv)

In [20]:
from transformers import pipeline
# Zero-shot classification pipeline; the checkpoint name is kept in one place.
zero_shot_model = "facebook/bart-large-mnli"
classifier = pipeline("zero-shot-classification", model=zero_shot_model)

# Labels that every text is scored against.
candidate_labels = [
    'Summary',
    'Type',
    'Environment',
    'Steps to reproduce',
    'Expected/Actual behavior',
    'Related issues',
    'Additional context',
    'Screenshot',
    'Log',
    'Severity',
]

In [21]:

# Score every issue body against `candidate_labels` and accumulate the results.
#
# Produces:
#   total_score  - dict mapping each label to the sum of its scores over all
#                  processed issues. It also carries 'repo' and 'issue_file',
#                  which are overwritten on every iteration and therefore only
#                  describe the LAST processed issue (kept so the cell that
#                  writes `total_score` to CSV keeps working).
#   issue_scores - list with one record per processed issue ('repo_name',
#                  'issue_file' and one score per label), so the per-issue
#                  results of this long-running loop are not thrown away.
#                  Use pd.DataFrame(issue_scores) to get a per-issue table.
from tqdm import tqdm

total_score = {}
issue_scores = []

# Drop issues without a body, or whose body is exactly one or two spaces.
issue_body = df_issue['body']
df_issue = df_issue[issue_body.notnull() & ~issue_body.isin([' ', '  '])]

# Only the second half of the remaining issues is processed here.
# NOTE: `df_issue` is reassigned, so re-running this cell without reloading
# the CSV first halves the data again.
df_issue = df_issue.iloc[int(len(df_issue)/2):]
df_issue = df_issue.reset_index(drop=True)

for _, issue in tqdm(df_issue.iterrows(), total=len(df_issue)):
    # multi_label=True: each label is scored independently of the others.
    results = classifier(issue['body'], candidate_labels, multi_label=True)

    record = {'repo_name': issue['repo'], 'issue_file': issue['issue_file']}
    for label, score in zip(results['labels'], results['scores']):
        total_score[label] = total_score.get(label, 0) + score
        record[label] = score
    issue_scores.append(record)

    # Metadata of the issue just processed (last one wins, see header note).
    total_score['repo'] = issue['repo']
    total_score['issue_file'] = issue['issue_file']

# print the total score for each label and the number of processed issues
print(total_score)
print(len(df_issue))

100%|██████████| 3143/3143 [8:02:58<00:00,  9.22s/it]   

{'Screenshot': 1333.5279111238196, 'Summary': 908.2945446299855, 'Related issues': 2207.674888310954, 'Additional context': 2332.6831313967705, 'Type': 2012.3376517063007, 'Steps to reproduce': 1842.7172833532095, 'Log': 1653.9070672942325, 'Environment': 1542.0549444421194, 'Severity': 1188.2451126333326, 'Expected/Actual behavior': 1233.4246628177352, 'repo': 'go-kratos_kratos', 'issue_file': 'question.md'}
3143





In [31]:

# Load the issue label-score CSV and display its column names.
issue_labels_csv = 'data/issue_labels.csv'
df_issue_scores = pd.read_csv(issue_labels_csv)
df_issue_scores.columns

Index(['Related issues', 'repo_name', 'issue_file', 'Additional context',
       'Steps to reproduce', 'Expected/Actual behavior', 'Type', 'Screenshot',
       'Log', 'Summary', 'Severity', 'Environment'],
      dtype='object')

In [34]:
# Score columns to average; 'repo_name' and 'issue_file' are deliberately left out.
columns = [
    'Related issues',
    'Additional context',
    'Steps to reproduce',
    'Expected/Actual behavior',
    'Type',
    'Screenshot',
    'Log',
    'Summary',
    'Severity',
    'Environment',
]

# Column-wise mean over all rows, scaled by 100.
average_labels = df_issue_scores.loc[:, columns].mean().mul(100)

# Show the averaged value of every label.
print(average_labels)


Related issues              18.581406
Additional context          20.673790
Steps to reproduce          17.256374
Expected/Actual behavior     9.926884
Type                         7.951364
Screenshot                   5.914838
Log                          4.712445
Summary                      7.267429
Severity                     3.486450
Environment                  4.229020
dtype: float64


In [39]:

# Category columns averaged for the second issue label file.
categories = [
    "Greeting and Introduction",
    "Submission Guidelines and Formatting",
    "Documentation and Testing",
    "Issue/Problem Description",
    "Proposed Solutions and Suggestions",
    "Additional Context and Supporting Information",
    "Related Issues and References",
    "Log/Debugging Information",
    "Severity and Impact Assessment",
    "Collaboration and Review",
]

issue_labels_2_csv = 'data/issue_labels-2.csv'
df_issue_scores_2 = pd.read_csv(issue_labels_2_csv)

# Column-wise mean of every category, scaled by 100.
average_labels_2 = df_issue_scores_2.loc[:, categories].mean().mul(100)

# Show the averaged value of every category.
print(average_labels_2)

Greeting and Introduction                         4.499977
Submission Guidelines and Formatting              5.142337
Documentation and Testing                         5.678349
Issue/Problem Description                        19.638139
Proposed Solutions and Suggestions               12.175001
Additional Context and Supporting Information    26.391812
Related Issues and References                     8.811750
Log/Debugging Information                         7.091120
Severity and Impact Assessment                    4.727414
Collaboration and Review                          5.844103
dtype: float64


In [40]:

# Category columns averaged for the second PR label file.
categories = [
    "Greeting and Introduction",
    "Submission Guidelines and Formatting",
    "Documentation and Testing",
    "Issue/Problem Description",
    "Proposed Solutions and Suggestions",
    "Additional Context and Supporting Information",
    "Related Issues and References",
    "Log/Debugging Information",
    "Severity and Impact Assessment",
    "Collaboration and Review",
]

pr_labels_2_csv = 'data/pr_labels-2.csv'
df_pr_scores_2 = pd.read_csv(pr_labels_2_csv)

# Column-wise mean of every category, scaled by 100.
average_pr_labels_2 = df_pr_scores_2.loc[:, categories].mean().mul(100)

# Show the averaged value of every category.
print(average_pr_labels_2)

Greeting and Introduction                         5.025954
Submission Guidelines and Formatting              6.096200
Documentation and Testing                        13.607567
Issue/Problem Description                        15.162385
Proposed Solutions and Suggestions                9.603613
Additional Context and Supporting Information    23.828308
Related Issues and References                     9.001904
Log/Debugging Information                         4.671558
Severity and Impact Assessment                    4.670882
Collaboration and Review                          8.331628
dtype: float64


In [35]:

# Load the PR label-score CSV and display its column names.
pr_labels_csv = 'data/pr_labels.csv'
df_pr_scores = pd.read_csv(pr_labels_csv)
df_pr_scores.columns

Index(['Summary', 'repo_name', 'pr_file', 'Additional context',
       'Steps to reproduce', 'Related issues', 'Screenshot',
       'Expected/Actual behavior', 'Type', 'Environment', 'Log', 'Severity'],
      dtype='object')

In [37]:
# Score columns to average; 'repo_name' and 'pr_file' are deliberately left out.
columns = [
    'Related issues',
    'Additional context',
    'Steps to reproduce',
    'Expected/Actual behavior',
    'Type',
    'Screenshot',
    'Log',
    'Summary',
    'Severity',
    'Environment',
]

# Column-wise mean over all rows, scaled by 100.
average_labels = df_pr_scores.loc[:, columns].mean().mul(100)

# Show the averaged value of every label.
print(average_labels)



Related issues              18.350389
Additional context          24.482078
Steps to reproduce          12.812169
Expected/Actual behavior     6.590542
Type                         8.913374
Screenshot                   5.695020
Log                          4.750893
Summary                     11.551874
Severity                     3.996439
Environment                  2.857222
dtype: float64


In [25]:
# Write the accumulated scores to CSV as a single row: one column per label
# plus 'repo_name' and 'issue_file'.
#
# Wrapping the dict in a list makes its keys the column headers. Do not build
# the frame with from_dict(orient='index') followed by reset_index(drop=True):
# that puts the keys in the index and then drops them, leaving one unnamed
# column of values, and the 'repo' -> 'repo_name' rename then matches nothing.
df = pd.DataFrame([total_score])
df = df.rename(columns={'repo': 'repo_name'})
df.to_csv('data/issue_labels-part-2.csv', index=False)

In [2]:
import pandas as pd 

In [3]:
# Combine both issue label files side by side into one score table.
df_issue_labels_1 = pd.read_csv('data/issue_labels.csv')
df_issue_labels_2 = pd.read_csv('data/issue_labels-2.csv')

# Keep only the columns of the second file that the first one does not already
# provide; a plain axis=1 concat would emit shared columns twice.
new_issue_columns = [
    column for column in df_issue_labels_2.columns
    if column not in df_issue_labels_1.columns
]

# Rows are paired by position (default RangeIndex); join='inner' keeps only
# positions present in both files.
# NOTE(review): assumes both CSVs list the issues in the same order - confirm.
df_issue_labels = pd.concat(
    [df_issue_labels_1, df_issue_labels_2[new_issue_columns]],
    axis=1,
    join='inner',
)

df_issue_labels.to_csv('data/issue_scores.csv', index=False)

In [None]:
# Combine both PR label files side by side into one score table.
df_pr_labels_1 = pd.read_csv('data/pr_labels.csv')
df_pr_labels_2 = pd.read_csv('data/pr_labels-2.csv')

# Keep only the columns of the second file that the first one does not already
# provide; a plain axis=1 concat would emit shared columns twice.
new_pr_columns = [
    column for column in df_pr_labels_2.columns
    if column not in df_pr_labels_1.columns
]

# Rows are paired by position (default RangeIndex); join='inner' keeps only
# positions present in both files.
# NOTE(review): assumes both CSVs list the PRs in the same order - confirm.
df_pr_labels = pd.concat(
    [df_pr_labels_1, df_pr_labels_2[new_pr_columns]],
    axis=1,
    join='inner',
)
df_pr_labels.to_csv('data/pr_scores.csv', index=False)