Skip to content

Commit

Permalink
Merge pull request #142 from symflower/128-character-count
Browse files (browse the repository at this point in the history)
Track how many characters were present in a model response and generated test files
  • Loading branch information
bauersimon committed May 28, 2024
2 parents 3f70a04 + 0b0458d commit 6b3b143
Show file tree
Hide file tree
Showing 11 changed files with 353 additions and 66 deletions.
76 changes: 67 additions & 9 deletions cmd/eval-dev-quality/cmd/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,23 +51,25 @@ func atoiUint64(t *testing.T, s string) uint64 {
type extractMetricsMatch *regexp.Regexp

// extractMetricsLogsMatch is a regular expression to extract metrics from log messages.
var extractMetricsLogsMatch = extractMetricsMatch(regexp.MustCompile(`score=(\d+), coverage-statement=(\d+), files-executed=(\d+), processing-time=(\d+), response-no-error=(\d+), response-no-excess=(\d+), response-with-code=(\d+)`))
var extractMetricsLogsMatch = extractMetricsMatch(regexp.MustCompile(`score=(\d+), coverage-statement=(\d+), files-executed=(\d+), generate-tests-for-file-character-count=(\d+), processing-time=(\d+), response-character-count=(\d+), response-no-error=(\d+), response-no-excess=(\d+), response-with-code=(\d+)`))

// extractMetricsCSVMatch is a regular expression to extract metrics from CSV rows.
var extractMetricsCSVMatch = extractMetricsMatch(regexp.MustCompile(`(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`))
var extractMetricsCSVMatch = extractMetricsMatch(regexp.MustCompile(`(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`))

// extractMetrics extracts multiple assessment metrics from the given string according to a given regular expression.
func extractMetrics(t *testing.T, regex extractMetricsMatch, data string) (assessments []metrics.Assessments, scores []uint64) {
matches := (*regexp.Regexp)(regex).FindAllStringSubmatch(data, -1)

for _, match := range matches {
assessments = append(assessments, metrics.Assessments{
metrics.AssessmentKeyCoverageStatement: atoiUint64(t, match[2]),
metrics.AssessmentKeyFilesExecuted: atoiUint64(t, match[3]),
metrics.AssessmentKeyProcessingTime: atoiUint64(t, match[4]),
metrics.AssessmentKeyResponseNoError: atoiUint64(t, match[5]),
metrics.AssessmentKeyResponseNoExcess: atoiUint64(t, match[6]),
metrics.AssessmentKeyResponseWithCode: atoiUint64(t, match[7]),
metrics.AssessmentKeyCoverageStatement: atoiUint64(t, match[2]),
metrics.AssessmentKeyFilesExecuted: atoiUint64(t, match[3]),
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: atoiUint64(t, match[4]),
metrics.AssessmentKeyProcessingTime: atoiUint64(t, match[5]),
metrics.AssessmentKeyResponseCharacterCount: atoiUint64(t, match[6]),
metrics.AssessmentKeyResponseNoError: atoiUint64(t, match[7]),
metrics.AssessmentKeyResponseNoExcess: atoiUint64(t, match[8]),
metrics.AssessmentKeyResponseWithCode: atoiUint64(t, match[9]),
})
scores = append(scores, atoiUint64(t, match[1]))
}
Expand Down Expand Up @@ -192,7 +194,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
Expand All @@ -209,7 +214,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"evaluation.log": nil,
"golang-summed.csv": func(t *testing.T, filePath, data string) {
Expand All @@ -222,7 +230,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"models-summed.csv": func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
Expand All @@ -234,7 +245,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"README.md": func(t *testing.T, filePath, data string) {
validateReportLinks(t, data, []string{"symflower_symbolic-execution"})
Expand All @@ -259,7 +273,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 2,
},
}, []uint64{28})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(393))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(393))
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
Expand All @@ -283,8 +300,13 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14, 14})
// Assert the non-deterministic processing time and the exact character counts of each assessment.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(139))
assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(139))
},
"golang-summed.csv": func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
Expand All @@ -296,7 +318,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"java-summed.csv": func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
Expand All @@ -308,7 +333,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(139))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(139))
},
"models-summed.csv": func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
Expand All @@ -320,7 +348,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 2,
},
}, []uint64{28})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(393))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(393))
},
"evaluation.log": nil,
"README.md": func(t *testing.T, filePath, data string) {
Expand Down Expand Up @@ -353,7 +384,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
Expand All @@ -370,7 +404,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"evaluation.log": nil,
"golang-summed.csv": func(t *testing.T, filePath, data string) {
Expand All @@ -383,7 +420,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"models-summed.csv": func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
Expand All @@ -395,7 +435,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"README.md": func(t *testing.T, filePath, data string) {
validateReportLinks(t, data, []string{"symflower_symbolic-execution"})
Expand All @@ -412,7 +455,7 @@ func TestEvaluateExecute(t *testing.T) {
},

ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Regexp(t, `Evaluation score for "symflower/symbolic-execution" \("code-no-excess"\): score=14, coverage-statement=10, files-executed=1, processing-time=\d+, response-no-error=1, response-no-excess=1, response-with-code=1`, output)
assert.Regexp(t, `Evaluation score for "symflower/symbolic-execution" \("code-no-excess"\): score=14, coverage-statement=10, files-executed=1, generate-tests-for-file-character-count=254, processing-time=\d+, response-character-count=254, response-no-error=1, response-no-excess=1, response-with-code=1`, output)
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
Expand All @@ -429,7 +472,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"evaluation.log": nil,
"golang-summed.csv": func(t *testing.T, filePath, data string) {
Expand All @@ -442,7 +488,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"models-summed.csv": func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
Expand All @@ -454,7 +503,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254))
},
"README.md": func(t *testing.T, filePath, data string) {
validateReportLinks(t, data, []string{"symflower_symbolic-execution"})
Expand Down Expand Up @@ -600,7 +652,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 3,
},
}, []uint64{42})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(762))
assert.Equal(t, 1, strings.Count(output, "Evaluation score for"))
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
Expand All @@ -615,7 +670,10 @@ func TestEvaluateExecute(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 3,
},
}, []uint64{42})
// Assert the non-deterministic processing time and the exact character counts.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(762))
},
"evaluation.log": func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "Run 1/3")
Expand Down
8 changes: 6 additions & 2 deletions evaluate/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,9 @@ func TestEvaluate(t *testing.T) {
mockedModel: map[language.Language]map[string]metrics.Assessments{
languageGolang: map[string]metrics.Assessments{
repositoryPath: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14,
metrics.AssessmentKeyResponseCharacterCount: 14,
metrics.AssessmentKeyResponseNoError: 1,
},
},
},
Expand Down Expand Up @@ -286,7 +288,9 @@ func TestEvaluate(t *testing.T) {
mockedModel: map[language.Language]map[string]metrics.Assessments{
languageGolang: map[string]metrics.Assessments{
repositoryPath: map[metrics.AssessmentKey]uint64{
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14,
metrics.AssessmentKeyResponseCharacterCount: 14,
metrics.AssessmentKeyResponseNoError: 1,
},
},
},
Expand Down
5 changes: 5 additions & 0 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ var (
// AssessmentKeyCoverageStatement counts the cases where 100% coverage was reached.
AssessmentKeyCoverageStatement = RegisterAssessmentKey("coverage-statement", 10)

// AssessmentKeyResponseCharacterCount counts the number of characters of a response.
AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count", 0)
// AssessmentKeyGenerateTestsForFileCharacterCount counts the number of characters of a generated test file.
AssessmentKeyGenerateTestsForFileCharacterCount = RegisterAssessmentKey("generate-tests-for-file-character-count", 0)

// AssessmentKeyResponseNoError indicates that a model responded without error.
AssessmentKeyResponseNoError = RegisterAssessmentKey("response-no-error", 1)
// AssessmentKeyResponseWithCode indicates that a model responded with code.
Expand Down
18 changes: 10 additions & 8 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,22 +137,24 @@ func TestAssessmentString(t *testing.T) {

Assessment: NewAssessments(),

ExpectedString: "score=0, coverage-statement=0, files-executed=0, processing-time=0, response-no-error=0, response-no-excess=0, response-with-code=0",
ExpectedString: "score=0, coverage-statement=0, files-executed=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0",
})

validate(t, &testCase{
Name: "Non-empty Metrics",

Assessment: Assessments{
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoError: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseWithCode: 5,
AssessmentKeyProcessingTime: 200,
AssessmentKeyGenerateTestsForFileCharacterCount: 50,
AssessmentKeyResponseCharacterCount: 100,
AssessmentKeyCoverageStatement: 1,
AssessmentKeyFilesExecuted: 2,
AssessmentKeyResponseNoError: 3,
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseWithCode: 5,
AssessmentKeyProcessingTime: 200,
},

ExpectedString: "score=15, coverage-statement=1, files-executed=2, processing-time=200, response-no-error=3, response-no-excess=4, response-with-code=5",
ExpectedString: "score=15, coverage-statement=1, files-executed=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5",
})
}

Expand Down
5 changes: 5 additions & 0 deletions evaluate/metrics/testing/assessments.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,10 @@ func AssertAssessmentsEqual(t *testing.T, expected metrics.Assessments, actual m
expected[metrics.AssessmentKeyProcessingTime] = 0
actual[metrics.AssessmentKeyProcessingTime] = 0

expected[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0
actual[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0
expected[metrics.AssessmentKeyResponseCharacterCount] = 0
actual[metrics.AssessmentKeyResponseCharacterCount] = 0

assert.Truef(t, expected.Equal(actual), "expected:%s\nactual:%s", expected, actual)
}
Loading

0 comments on commit 6b3b143

Please sign in to comment.