In [1]:
import datasets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import ast
import os
import tiktoken
from src.utils.git_utils import get_changed_files_between_commits, get_repo_content_on_commit, parse_changed_files_and_lines_from_diff

# Bug localization datasets metrics

In [25]:
# Load the precomputed bug-localization metrics table and preview its first five rows.
# NOTE(review): this is an absolute path on a shared mount; the notebook will not run
# elsewhere without adjusting it.
metrics_csv_path = '/mnt/data/shared-data/lca/bug_localization_data/metrics.csv'
df = pd.read_csv(metrics_csv_path)
df.head(5)

Unnamed: 0,id,text_id,repo_owner,repo_name,issue_url,pull_url,comment_url,links_count,issue_title,link_keyword,...,repo_files_without_tests_count,changed_symbols_count,changed_tokens_count,changed_lines_count,changed_files_without_tests_count,issue_symbols_count,issue_tokens_count,issue_lines_count,issue_links_count,issue_code_blocks_count
0,8543,thealgorithms/python/295/289,thealgorithms,python,https://github.com/TheAlgorithms/Python/issues...,https://github.com/TheAlgorithms/Python/pull/295,https://github.com/TheAlgorithms/Python/pull/295,1,ProjectEuler -- Problem 1 -- solv2.py -- Error,fixes,...,195,357,159,12,1,191,57.0,2,1,3
1,5531,electron/electron/8668/8555,electron,electron,https://github.com/electron/electron/issues/8555,https://github.com/electron/electron/pull/8668,https://github.com/electron/electron/pull/8668,1,Mac app store build uses non-public APIs,close,...,34,95,51,2,1,539,128.0,15,0,0
2,5532,electron/electron/8640/8608,electron,electron,https://github.com/electron/electron/issues/8608,https://github.com/electron/electron/pull/8640,https://github.com/electron/electron/pull/8640,1,Debug symbols not generated for Release build,close,...,34,418,140,8,2,154,36.0,1,0,0
3,5530,electron/electron/11103/11101,electron,electron,https://github.com/electron/electron/issues/11101,https://github.com/electron/electron/pull/11103,https://github.com/electron/electron/pull/11103,1,process.versions does not include new version ...,closes,...,1307,1064,244,27,3,261,71.0,14,0,0
4,6847,keras-team/keras/18352/15282,keras-team,keras,https://github.com/keras-team/keras/issues/15282,https://github.com/keras-team/keras/pull/18352,https://github.com/keras-team/keras/pull/18352,1,MobileNetV3 models can't infer the static shape,solve,...,677,100,39,2,1,5225,1229.0,103,3,1


# Repo metrics analysis

In [5]:
# Summary statistics for the repository-size columns, including the 1st and 99th percentiles.
repo_metric_columns = [
    'repo_symbols_count',
    'repo_tokens_count',
    'repo_lines_count',
    'repo_files_without_tests_count',
]
df[repo_metric_columns].describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99])

Unnamed: 0,repo_symbols_count,repo_tokens_count,repo_lines_count,repo_files_without_tests_count
count,10195.0,10153.0,10195.0,10195.0
mean,6602220.0,1754613.0,148603.2,1085.957136
std,20911080.0,7355560.0,367393.2,2057.825737
min,321.0,78.0,9.0,1.0
1%,85117.22,18828.36,2582.52,15.0
25%,651008.0,144960.0,17952.0,117.0
50%,2115322.0,499530.0,53370.0,333.0
75%,5450736.0,1261706.0,144879.0,1114.5
99%,53255790.0,15595710.0,956599.0,7854.54
max,788372300.0,225649700.0,8688752.0,33644.0


In [6]:
# Count the rows whose repo_tokens_count is missing; nothing is removed in this cell.
missing_repo_tokens_mask = df['repo_tokens_count'].isnull()
len(df[missing_repo_tokens_mask])

42

# Diff metrics analysis

In [8]:
# Summary statistics for the diff-size columns, including the 1st and 99th percentiles.
diff_metric_columns = [
    'changed_symbols_count',
    'changed_tokens_count',
    'changed_lines_count',
    'changed_files_count',
    'changed_files_without_tests_count',
]
df[diff_metric_columns].describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99])

Unnamed: 0,changed_symbols_count,changed_tokens_count,changed_lines_count,changed_files_count,changed_files_without_tests_count
count,10195.0,10195.0,10195.0,10195.0,10195.0
mean,4323.743,1382.341,72.649338,4.829917,3.996763
std,63908.77,28497.31,1437.670638,108.235589,104.134846
min,0.0,0.0,0.0,1.0,1.0
1%,38.0,8.0,1.0,1.0,1.0
25%,294.0,63.0,6.0,1.0,1.0
50%,759.0,163.0,15.0,2.0,1.0
75%,2044.5,428.0,42.0,3.0,3.0
99%,34027.12,7719.56,594.42,22.0,17.0
max,4513281.0,1649023.0,139716.0,10769.0,10385.0


In [9]:
# Print the issue text of the row with the largest changed_files_count.
ind = df['changed_files_count'].idxmax()
widest_diff_issue_body = df.loc[ind, 'issue_body']
print(widest_diff_issue_body)

Confirm by changing [ ] to [x] below to ensure that it's a bug:
- [x] I've gone though the [API reference](https://docs.aws.amazon.com/sdk-for-go/v2/api/)
- [x] I've checked [AWS Forums](https://forums.aws.amazon.com) and [StackOverflow](https://stackoverflow.com/questions/tagged/aws-sdk-go) for answers
- [x] I've searched for [previous similar issues](https://github.com/aws/aws-sdk-go-v2/issues) and didn't find any solution
  
**Describe the bug**
When describing an InstanceType like "t3a.large" or "t3a.xlarge" in returned 'InstanceTypeInfo' information about vCPUs is empty - `VCpuInfo` is nil.

In comparison `MemoryInfo` is fine and has all the details.

aws-sdk-go v1 does not have this issue and works as expected.

**Version of AWS SDK for Go?**
v0.29.0

**Version of Go (`go version`)?**
v1.15.2

**To Reproduce (observed behavior)**

	reqInput := &ec2.DescribeInstanceTypesInput{
		InstanceTypes: []types.InstanceType{types.InstanceType("t3a.large")},
	}

	info, err := p.ec2.DescribeI

In [10]:
# Report how many rows have no changed_tokens_count; this cell only counts, it deletes nothing.
missing_changed_tokens_rows = len(df[df['changed_tokens_count'].isnull()])
print("Failed to count tokens: {} rows to be deleted".format(missing_changed_tokens_rows))

Failed to count tokens: 0 rows to be deleted


In [11]:
# Report how many rows touch more than 22 files (22 is the 99th percentile of
# changed_files_count in the describe() table above). This cell only counts.
too_many_files_rows = len(df[df['changed_files_count'] > 22])
print("Too much changed files: {} rows to be deleted".format(too_many_files_rows))

Too much changed files: 100 rows to be deleted


In [12]:
# Report how many rows change more than 594 lines (594 is the 99th percentile of
# changed_lines_count in the describe() table above, rounded down). This cell only counts.
# The message now names "lines": the previous text said "files", copied from the
# changed_files_count cell, although the filter is on changed_lines_count.
print("Too much changed lines: {} rows to be deleted".format(
    len(df[df['changed_lines_count'] > 594]))
)

Too much changed files: 102 rows to be deleted


# Issue description analysis

In [14]:
# Summary statistics for the issue-description columns, including the 1st and 99th percentiles.
issue_metric_columns = [
    'issue_symbols_count',
    'issue_tokens_count',
    'issue_lines_count',
    'issue_links_count',
    'issue_code_blocks_count',
]
df[issue_metric_columns].describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99])

Unnamed: 0,issue_symbols_count,issue_tokens_count,issue_lines_count,issue_links_count,issue_code_blocks_count
count,10195.0,10194.0,10195.0,10195.0,10195.0
mean,1949.124865,521.842456,36.545463,0.945562,0.995586
std,4385.789636,1326.001422,59.989556,7.531692,1.321217
min,12.0,2.0,1.0,0.0,0.0
1%,60.0,13.0,1.0,0.0,0.0
25%,409.0,99.0,8.0,0.0,0.0
50%,895.0,227.0,22.0,0.0,1.0
75%,1963.5,525.0,45.0,1.0,2.0
99%,16651.92,4491.49,232.0,6.0,6.0
max,132846.0,51592.0,2402.0,601.0,31.0


In [15]:
# Print the issue text of the row with the largest issue_tokens_count.
ind = df['issue_tokens_count'].idxmax()
longest_issue_body = df.loc[ind, 'issue_body']
print(longest_issue_body)

**Describe the bug**
After searching for (older) email, used Reply and Reply All before selecting "Download Complete Message" resulted in error.  First time returns K9 to the inbox list.  After repeated attempts, K9 will FC. 

**To Reproduce**
Steps to reproduce the behavior:
1. From inbox or unified inbox, searched for older message
2.  found message with "Download Complete Message" shown at bottom (doesn't matter if attachment or not)
3.  Did NOT click on "Download Complete Message" but instead selected Reply (and Reply all - same result)
4.  After clicking Reply or Reply all, K9 reverted to inbox list of email (did not show new email response)
5.  With (2 or 3) repeated attempts, K9 then crashed/FC

**Expected behavior**
Reply or Reply All yields a new message that can edited/typed and sent.

**Screenshots**
n/a

**Environment (please complete the following information):**
 - K-9 Mail version: 5.734
 - Android version: 10  (Samsung ver FFUE1)
 - Device: Galaxy S9 SM-G960F 
 - Accoun

In [16]:
# Report how many rows have an issue description longer than 4491 tokens (the 99th
# percentile of issue_tokens_count in the describe() table above, rounded down).
# This cell only counts; it deletes nothing.
too_long_issue_rows = len(df[df['issue_tokens_count'] > 4491])
print("Too large issue description: {} rows to be deleted".format(too_long_issue_rows))

Too large issue description: 102 rows to be deleted


In [17]:
# Print the issue text of the row with the smallest issue_tokens_count.
ind = df['issue_tokens_count'].idxmin()
shortest_issue_body = df.loc[ind, 'issue_body']
print(shortest_issue_body)

Stephen requested


In [18]:
# Report how many rows have an issue description shorter than 13 tokens (13 is the
# 1st percentile of issue_tokens_count in the describe() table above). This cell only counts.
too_short_issue_rows = len(df[df['issue_tokens_count'] < 13])
print("Too small issue description: {} rows to be deleted".format(too_short_issue_rows))

Too small issue description: 84 rows to be deleted


In [19]:
# Report how many rows have no issue_tokens_count; this cell only counts, it deletes nothing.
# The message uses the same "Failed to count tokens" wording as the changed_tokens_count
# check: the previous "Too small issue description" text was copied from the size filter
# and did not describe a missing-value count.
print("Failed to count tokens: {} rows to be deleted".format(
    len(df[df['issue_tokens_count'].isnull()]))
)

Too small issue description: 1 rows to be deleted


# Outliers filters
We treat the 1st and 99th percentiles as the acceptable bounds for outlier filtering.

In [20]:
# Denominator used for the percentages in the per-filter report that follows.
# NOTE(review): presumably the row count of the raw dataset before earlier pipeline
# stages (it is far larger than len(df)) — TODO confirm and document the source.
initial = 4_351_064
# Number of rows currently in the metrics frame.
df.shape[0]

10195

In [21]:
# Rows that survive each filter when it is applied on its own to the current frame.
# The thresholds are the 1st/99th percentiles from the describe() tables above,
# rounded down to integers.
# Comparisons against NaN evaluate to False, so a row with a missing
# issue_tokens_count is counted as dropped by both issue-token filters below.
changed_files_count_filter = len(df[df['changed_files_count'] <= 22])
changed_lines_count_filter = len(df[df['changed_lines_count'] <= 594])
issue_tokens_count_filter_lo = len(df[df['issue_tokens_count'] >= 13])
issue_tokens_count_filter_up = len(df[df['issue_tokens_count'] <= 4491])
# dropna() with no arguments discards a row when any of its columns is missing.
tokens_filter = len(df.dropna())

# For each filter, print the number of dropped rows and that number as a
# percentage of `initial` (not of len(df)).
for rows_kept in (
    changed_files_count_filter,
    changed_lines_count_filter,
    tokens_filter,
    issue_tokens_count_filter_lo,
    issue_tokens_count_filter_up,
):
    rows_dropped = len(df) - rows_kept
    print(rows_dropped, rows_dropped / initial * 100)

100 0.0022982884186488637
102 0.002344254187021841
43 0.0009882640200190116
85 0.0019535451558515345
103 0.00236723707120833


In [26]:
# Drop every row that has a missing value in any column, then report the remaining size.
# Rebinding the result instead of using inplace=True follows the pandas-recommended
# style and leaves the previously loaded frame object untouched.
df = df.dropna()
print("{} rows left".format(len(df)))

10152 rows left


In [27]:
# Keep only rows at or below the 99th-percentile cutoffs for the diff size:
# at most 22 changed files and at most 594 changed lines.
within_files_limit = df['changed_files_count'] <= 22
within_lines_limit = df['changed_lines_count'] <= 594
df = df[within_files_limit & within_lines_limit]
print("{} rows left".format(len(df)))

9983 rows left


In [28]:
# Keep only rows whose issue description length lies between the 1st- and
# 99th-percentile cutoffs: from 13 to 4491 tokens, inclusive.
long_enough = df['issue_tokens_count'] >= 13
short_enough = df['issue_tokens_count'] <= 4491
df = df[long_enough & short_enough]
print("{} rows left".format(len(df)))

9801 rows left
