Skip to content

Commit

Permalink
fix, Explicitly track if a $language/plain repository was solved, because the problem-counting approach can disqualify succeeding runs
Browse files Browse the repository at this point in the history

Part of #127
  • Loading branch information
bauersimon authored and zimmski committed May 20, 2024
1 parent b207e28 commit df861dd
Show file tree
Hide file tree
Showing 2 changed files with 227 additions and 6 deletions.
17 changes: 13 additions & 4 deletions evaluate/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"path/filepath"

"github.com/symflower/eval-dev-quality/evaluate/report"
"github.com/symflower/eval-dev-quality/language"
evallanguage "github.com/symflower/eval-dev-quality/language"
evalmodel "github.com/symflower/eval-dev-quality/model"
)

Expand All @@ -16,7 +16,7 @@ type Context struct {
Log *log.Logger

// Languages determines which language should be used for the evaluation, or empty if all languages should be used.
Languages []language.Language
Languages []evallanguage.Language

// Models determines which models should be used for the evaluation, or empty if all models should be used.
Models []evalmodel.Model
Expand All @@ -40,6 +40,7 @@ const RepositoryPlainName = "plain"
// Evaluate runs an evaluation on the given context and returns its results.
func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePerRepository, totalScore uint64) {
// Check that models and languages can be evaluated by executing the "plain" repositories.
modelSucceededBasicChecksOfLanguage := map[evalmodel.Model]map[evallanguage.Language]bool{}
ctx.Log.Printf("Checking that models and languages can be used for evaluation")
// Ensure we report metrics for every model even if they are excluded.
assessments = report.NewAssessmentPerModelPerLanguagePerRepository(ctx.Models, ctx.Languages, ctx.RepositoryPaths)
Expand All @@ -55,6 +56,10 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer
modelID := model.ID()
languageID := language.ID()

if modelSucceededBasicChecksOfLanguage[model] == nil {
modelSucceededBasicChecksOfLanguage[model] = map[evallanguage.Language]bool{}
}

if r, ok := model.(evalmodel.SetQueryAttempts); ok {
r.SetQueryAttempts(ctx.QueryAttempts)
}
Expand All @@ -67,8 +72,10 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer
ps = append(ps, err)
}
if len(ps) > 0 {
ctx.Log.Printf("Excluding model %q since it was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
ctx.Log.Printf("Model %q was not able to solve the %q repository for language %q: %+v", modelID, repositoryPath, languageID, ps)
problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...)
} else {
modelSucceededBasicChecksOfLanguage[model][language] = true
}
}
}
Expand Down Expand Up @@ -111,7 +118,9 @@ func Evaluate(ctx *Context) (assessments report.AssessmentPerModelPerLanguagePer
for _, model := range ctx.Models {
modelID := model.ID()

if len(problemsPerModel[modelID]) > 0 {
// Skip models that never solved the "plain" repository of this language.
// Reading a missing entry (even through a nil inner map) yields false, so
// models that were never recorded as succeeding are excluded as well.
if !modelSucceededBasicChecksOfLanguage[model][language] {
	// Log through the context's logger, like every other message of this
	// evaluation, instead of the process-wide standard logger.
	ctx.Log.Printf("Excluding model %q for language %q cause it did not succeed basic checks", model.ID(), language.ID())

	continue
}

Expand Down
216 changes: 214 additions & 2 deletions evaluate/evaluate_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package evaluate

import (
"bytes"
"errors"
"os"
"path/filepath"
Expand Down Expand Up @@ -55,7 +56,9 @@ func TestEvaluate(t *testing.T) {
tc.Context.QueryAttempts = 1
}
tc.Context.ResultPath = temporaryPath
tc.Context.TestdataPath = filepath.Join("..", "testdata")
if tc.Context.TestdataPath == "" {
tc.Context.TestdataPath = filepath.Join("..", "testdata")
}
if tc.Context.Runs == 0 {
tc.Context.Runs = 1
}
Expand Down Expand Up @@ -146,7 +149,7 @@ func TestEvaluate(t *testing.T) {
})
}

t.Run("Failying model queries", func(t *testing.T) {
t.Run("Failing model queries", func(t *testing.T) {
{
languageGolang := &golang.Language{}
mockedModelID := "testing-provider/empty-response-model"
Expand Down Expand Up @@ -290,4 +293,213 @@ func TestEvaluate(t *testing.T) {
})
}
})

// Verifies how the outcome of the "plain" repository decides whether a model
// is evaluated on further repositories of the same language.
t.Run("Failing basic language checks should exclude model", func(t *testing.T) {
	repositoryPlainPath := filepath.Join("golang", "plain")
	repositoryNextPath := filepath.Join("golang", "next")

	// Build an isolated testdata tree holding the "plain" repository plus a
	// copy of it named "next", so the evaluation sees a second repository.
	// Fixture setup uses "require" throughout: continuing after a failed copy
	// would only produce misleading follow-up failures.
	temporaryTestdataPath := t.TempDir()
	require.NoError(t, osutil.CopyTree(filepath.Join("..", "testdata", repositoryPlainPath), filepath.Join(temporaryTestdataPath, repositoryPlainPath)))
	require.NoError(t, osutil.CopyTree(filepath.Join("..", "testdata", repositoryPlainPath), filepath.Join(temporaryTestdataPath, repositoryNextPath)))
	// Rewrite every "plain" occurrence in the copied "go.mod" to "next".
	repositoryNextConfigPath := filepath.Join(temporaryTestdataPath, repositoryNextPath, "go.mod")
	d, err := os.ReadFile(repositoryNextConfigPath)
	require.NoError(t, err)
	d = bytes.ReplaceAll(d, []byte("plain"), []byte("next"))
	// The file already exists (it was just read), so "os.WriteFile" truncates it and the permission argument is not applied.
	require.NoError(t, os.WriteFile(repositoryNextConfigPath, d, 0))

	// Side effect of a successful mocked generation: write a trivial test file into the directory given as the call's third argument.
	generateTestsForFilePlainSuccess := func(args mock.Arguments) {
		require.NoError(t, os.WriteFile(filepath.Join(args.String(2), "plain_test.go"), []byte("package plain\nimport \"testing\"\nfunc TestFunction(t *testing.T){}"), 0600))
	}
	generateTestsForFilePlainSuccessMetrics := metrics.Assessments{
		metrics.AssessmentKeyProcessingTime: 1,
	}
	generateTestsForFilePlainError := errors.New("generateTestsForFile error")

	// generateSuccess registers one successful "GenerateTestsForFile" call on the mocked model.
	generateSuccess := func(mockedModel *modeltesting.MockModel) {
		mockedModel.On("GenerateTestsForFile", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(generateTestsForFilePlainSuccessMetrics, nil).Run(generateTestsForFilePlainSuccess).Once()
	}
	// generateError registers one failing "GenerateTestsForFile" call on the mocked model.
	generateError := func(mockedModel *modeltesting.MockModel) {
		mockedModel.On("GenerateTestsForFile", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil, generateTestsForFilePlainError).Once()
	}

	// NOTE(review): the cases below rely on the single-use expectations being consumed in registration order, i.e. all "plain" runs before the "next" runs - confirm against the mock library and "Evaluate".
	{
		languageGolang := &golang.Language{}
		mockedModelID := "mocked-generation-model"
		mockedModel := modeltesting.NewMockModelNamed(t, mockedModelID)

		validate(t, &testCase{
			Name: "Problems of previous runs shouldn't cancel successive runs",

			Before: func(t *testing.T, logger *log.Logger, resultPath string) {
				// Set up mocks, when test is running.
				{
					// Succeed on both "plain" runs.
					generateSuccess(mockedModel)
					generateSuccess(mockedModel)

					// Error on the first run for the "next" repository.
					generateError(mockedModel)
					// Succeed on the second run for the "next" repository.
					generateSuccess(mockedModel)
				}
			},
			After: func(t *testing.T, logger *log.Logger, resultPath string) {
				// Two runs for each of the two repositories.
				mockedModel.AssertNumberOfCalls(t, "GenerateTestsForFile", 4)
			},

			Context: &Context{
				Languages: []language.Language{
					&golang.Language{},
				},

				Models: []evalmodel.Model{
					mockedModel,
				},

				RepositoryPaths: []string{
					repositoryPlainPath,
					repositoryNextPath,
				},
				TestdataPath: temporaryTestdataPath,

				Runs: 2,
			},

			ExpectedAssessments: map[evalmodel.Model]map[language.Language]map[string]metrics.Assessments{
				mockedModel: map[language.Language]map[string]metrics.Assessments{
					languageGolang: map[string]metrics.Assessments{
						repositoryPlainPath: map[metrics.AssessmentKey]uint64{
							metrics.AssessmentKeyFilesExecuted:   2,
							metrics.AssessmentKeyResponseNoError: 2,
						},
						// Only the second "next" run succeeded.
						repositoryNextPath: map[metrics.AssessmentKey]uint64{
							metrics.AssessmentKeyFilesExecuted:   1,
							metrics.AssessmentKeyResponseNoError: 1,
						},
					},
				},
			},
			ExpectedTotalScore: 0,
			ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
				filepath.Join(evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
				filepath.Join(evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"):  nil,
			},
		})
	}
	{
		languageGolang := &golang.Language{}
		mockedModelID := "mocked-generation-model"
		mockedModel := modeltesting.NewMockModelNamed(t, mockedModelID)

		validate(t, &testCase{
			Name: "Solving basic checks once is enough",

			Before: func(t *testing.T, logger *log.Logger, resultPath string) {
				// Set up mocks, when test is running.
				{
					// Succeed on only one "plain" run.
					generateError(mockedModel)
					generateSuccess(mockedModel)

					// Succeed on both "next" runs.
					generateSuccess(mockedModel)
					generateSuccess(mockedModel)
				}
			},
			After: func(t *testing.T, logger *log.Logger, resultPath string) {
				// The single "plain" success is enough for "next" to be evaluated in both runs.
				mockedModel.AssertNumberOfCalls(t, "GenerateTestsForFile", 4)
			},

			Context: &Context{
				Languages: []language.Language{
					&golang.Language{},
				},

				Models: []evalmodel.Model{
					mockedModel,
				},

				RepositoryPaths: []string{
					repositoryPlainPath,
					repositoryNextPath,
				},
				TestdataPath: temporaryTestdataPath,

				Runs: 2,
			},

			ExpectedAssessments: map[evalmodel.Model]map[language.Language]map[string]metrics.Assessments{
				mockedModel: map[language.Language]map[string]metrics.Assessments{
					languageGolang: map[string]metrics.Assessments{
						// Only the second "plain" run succeeded.
						repositoryPlainPath: map[metrics.AssessmentKey]uint64{
							metrics.AssessmentKeyFilesExecuted:   1,
							metrics.AssessmentKeyResponseNoError: 1,
						},
						repositoryNextPath: map[metrics.AssessmentKey]uint64{
							metrics.AssessmentKeyFilesExecuted:   2,
							metrics.AssessmentKeyResponseNoError: 2,
						},
					},
				},
			},
			ExpectedTotalScore: 0,
			ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
				filepath.Join(evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
				filepath.Join(evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"):  nil,
			},
		})
	}
	{
		languageGolang := &golang.Language{}
		mockedModelID := "mocked-generation-model"
		mockedModel := modeltesting.NewMockModelNamed(t, mockedModelID)

		validate(t, &testCase{
			Name: "Never solving basic checks leads to exclusion",

			Before: func(t *testing.T, logger *log.Logger, resultPath string) {
				// Set up mocks, when test is running.
				{
					// Error on every "plain" run.
					generateError(mockedModel)
					generateError(mockedModel)
				}
			},
			After: func(t *testing.T, logger *log.Logger, resultPath string) {
				// Only the two "plain" runs query the model; "next" must never be attempted.
				mockedModel.AssertNumberOfCalls(t, "GenerateTestsForFile", 2)
			},

			Context: &Context{
				Languages: []language.Language{
					&golang.Language{},
				},

				Models: []evalmodel.Model{
					mockedModel,
				},

				RepositoryPaths: []string{
					repositoryPlainPath,
					repositoryNextPath,
				},
				TestdataPath: temporaryTestdataPath,

				Runs: 2,
			},

			ExpectedAssessments: map[evalmodel.Model]map[language.Language]map[string]metrics.Assessments{
				mockedModel: map[language.Language]map[string]metrics.Assessments{
					languageGolang: map[string]metrics.Assessments{
						repositoryPlainPath: map[metrics.AssessmentKey]uint64{},
						repositoryNextPath:  map[metrics.AssessmentKey]uint64{},
					},
				},
			},
			ExpectedTotalScore: 0,
			// No log file is expected for the excluded "next" repository.
			ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
				filepath.Join(evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
			},
		})
	}
})
}

0 comments on commit df861dd

Please sign in to comment.