### For testing and evaluation
# Return type for `aiextract` when generating a Q&A set from document chunks
@kwdef struct QAItem
    question::String = ""
    answer::String = ""
end
# Holds a generated Q&A pair together with its source and context;
# saved in JSON format for later evaluation
@kwdef struct QAEvalItem
    source::String = ""
    context::String = ""
    question::String = ""
    answer::String = ""
end
@kwdef struct QAEvalResult
    source::AbstractString
    context::AbstractString
    question::AbstractString
    answer::AbstractString
    retrieval_score::Union{Number, Nothing} = nothing
    retrieval_rank::Union{Int, Nothing} = nothing
    answer_score::Union{Number, Nothing} = nothing
    parameters::Dict{Symbol, Any} = Dict{Symbol, Any}()
end
"Provide the `final_rating` between 1-5. Provide the rationale for it."
@kwdef struct JudgeRating
rationale::Union{Nothing, String} = nothing
final_rating::Int
end
"`final_rating` is the average of all scoring criteria. Explain the `final_rating` in `rationale`"
@kwdef struct JudgeAllScores
relevance::Int
completeness::Int
clarity::Int
consistency::Int
helpfulness::Int
rationale::Union{Nothing, String} = nothing
final_rating::Float64
end
function Base.isvalid(x::QAEvalItem)
    !isempty(x.question) && !isempty(x.answer) && !isempty(x.context)
end
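# Example (illustrative sketch): items with an empty question, answer, or context
# are invalid and get filtered out by `build_qa_evals`:
#   isvalid(QAEvalItem(; question = "Q?", answer = "A.", context = "C.")) # -> true
#   isvalid(QAEvalItem())                                                 # -> false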
# for equality tests
function Base.var"=="(x::Union{QAItem, QAEvalItem, QAEvalResult},
        y::Union{QAItem, QAEvalItem, QAEvalResult})
    typeof(x) == typeof(y) &&
        all(getfield(x, f) == getfield(y, f) for f in fieldnames(typeof(x)))
end
# Nicer show method with some colors!
function Base.show(io::IO, t::Union{QAItem, QAEvalItem, QAEvalResult})
    printstyled(io, "$(nameof(typeof(t))):\n", color = :green, bold = true)
    for f in fieldnames(typeof(t))
        printstyled(io, " ", f, color = :blue, bold = true)
        println(io, ": ", getfield(t, f))
    end
end
# Define how JSON3 should serialize/deserialize the struct into JSON files
JSON3.StructTypes.StructType(::Type{QAEvalItem}) = JSON3.StructTypes.Struct()
JSON3.StructTypes.StructType(::Type{QAEvalResult}) = JSON3.StructTypes.Struct()
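# Example (sketch, assumes a writable path "qa_evals.json"): with the `StructType`
# definitions above, evaluation items can be round-tripped through JSON:
#   items = [QAEvalItem(; source = "s1", context = "c1", question = "q1", answer = "a1")]
#   open(io -> JSON3.write(io, items), "qa_evals.json", "w")
#   loaded = JSON3.read(read("qa_evals.json", String), Vector{QAEvalItem})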
"""
    build_qa_evals(doc_chunks::Vector{<:AbstractString}, sources::Vector{<:AbstractString};
        model=PT.MODEL_CHAT, instructions="None.", qa_template::Symbol=:RAGCreateQAFromContext,
        verbose::Bool=true, api_kwargs::NamedTuple = NamedTuple(), kwargs...) -> Vector{QAEvalItem}

Create a collection of question and answer evaluations (`QAEvalItem`) from document chunks and sources.
This function generates Q&A pairs based on the provided document chunks, using a specified AI model and template.
# Arguments
- `doc_chunks::Vector{<:AbstractString}`: A vector of document chunks, each representing a segment of text.
- `sources::Vector{<:AbstractString}`: A vector of source identifiers corresponding to each chunk in `doc_chunks` (eg, filenames or paths).
- `model`: The AI model used for generating Q&A pairs. Default is `PT.MODEL_CHAT`.
- `instructions::String`: Additional instructions or context to provide to the model generating QA sets. Defaults to "None.".
- `qa_template::Symbol`: A template symbol that dictates the AITemplate that will be used. It must have the placeholder `context`. Default is `:RAGCreateQAFromContext`.
- `api_kwargs::NamedTuple`: Parameters that will be forwarded to the API endpoint.
- `verbose::Bool`: If `true`, additional information like costs will be logged. Defaults to `true`.
# Returns
`Vector{QAEvalItem}`: A vector of `QAEvalItem` structs, each containing a source, context, question, and answer. Invalid or empty items are filtered out.
# Notes
- The function internally uses `aiextract` to generate the Q&A pairs from the provided `qa_template`, so you can pass any additional keyword arguments that `aiextract` accepts.
- Each `QAEvalItem` includes the context (document chunk), the generated question and answer, and the source.
- The function tracks and reports the cost of AI calls if `verbose` is enabled.
- Items where the question, answer, or context is empty are considered invalid and are filtered out.
# Examples
Creating Q&A evaluations from a set of document chunks:
```julia
doc_chunks = ["Text from document 1", "Text from document 2"]
sources = ["source1", "source2"]
qa_evals = build_qa_evals(doc_chunks, sources)
```
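
To steer the generation, you can pass extra model instructions and API parameters (a sketch; the `temperature` setting assumes an OpenAI-compatible backend):
```julia
qa_evals = build_qa_evals(doc_chunks, sources;
    instructions = "Focus on practical usage questions.",
    api_kwargs = (; temperature = 0.2))
```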
"""
function build_qa_evals(doc_chunks::Vector{<:AbstractString},
        sources::Vector{<:AbstractString};
        model = PT.MODEL_CHAT, instructions = "None.",
        qa_template::Symbol = :RAGCreateQAFromContext, verbose::Bool = true,
        api_kwargs::NamedTuple = NamedTuple(), kwargs...)
    ##
    @assert length(doc_chunks)==length(sources) "Length of `doc_chunks` and `sources` must be the same."
    placeholders = only(aitemplates(qa_template)).variables # only one template should be found
    @assert (:context in placeholders) "Provided Q&A Template $(qa_template) is not suitable. It must have placeholder: `context`."
    ##
    cost_tracker = Threads.Atomic{Float64}(0.0)
    output = asyncmap(zip(doc_chunks, sources)) do (context, source)
        try
            msg = aiextract(qa_template;
                return_type = QAItem,
                context,
                instructions,
                verbose,
                model, api_kwargs)
            Threads.atomic_add!(cost_tracker, PT.call_cost(msg, model)) # track costs
            QAEvalItem(; context, msg.content.question, msg.content.answer, source)
        catch e
            verbose && @warn e
            QAEvalItem()
        end
    end
    verbose && @info "Q&A Sets built! (cost: \$$(round(cost_tracker[], digits=3)))"
    return filter(isvalid, output)
end
"Returns 1.0 if `context` overlaps or is contained within any of the `candidate_context`"
function score_retrieval_hit(orig_context::AbstractString,
candidate_context::Vector{<:AbstractString})
1.0 * (any(occursin.(Ref(orig_context), candidate_context)) ||
any(occursin.(candidate_context, Ref(orig_context))))
end
"Returns Integer rank of the position where `context` overlaps or is contained within a `candidate_context`"
function score_retrieval_rank(orig_context::AbstractString,
candidate_context::Vector{<:AbstractString})
findfirst((occursin.(Ref(orig_context), candidate_context)) .||
(occursin.(candidate_context, Ref(orig_context))))
end
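# Example (sketch) of the overlap scoring with illustrative strings:
#   score_retrieval_hit("France is a country", ["Germany", "France is a country in Europe"])  # -> 1.0
#   score_retrieval_rank("France is a country", ["Germany", "France is a country in Europe"]) # -> 2
#   score_retrieval_rank("France is a country", ["Germany"])                                  # -> nothing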
"""
    run_qa_evals(qa_item::QAEvalItem, ctx::RAGResult; verbose::Bool = true,
        parameters_dict::Dict{Symbol, <:Any} = Dict{Symbol, Any}(),
        judge_template::Symbol = :RAGJudgeAnswerFromContextShort,
        model_judge::AbstractString = PT.MODEL_CHAT,
        api_kwargs::NamedTuple = NamedTuple()) -> QAEvalResult

Evaluates a single `QAEvalItem` using RAG details (`RAGResult`) and returns a `QAEvalResult` structure. This function assesses the relevance and accuracy of the answers generated in a QA evaluation context.
# Arguments
- `qa_item::QAEvalItem`: The QA evaluation item containing the question and its answer.
- `ctx::RAGResult`: The RAG result used for generating the QA pair, including the original context and the answers.
  Comes from `airag(...; return_all = true)`.
- `verbose::Bool`: If `true`, enables verbose logging. Defaults to `true`.
- `parameters_dict::Dict{Symbol, Any}`: Track any parameters used for later evaluations. Keys must be Symbols.
- `judge_template::Symbol`: The template symbol for the AI model used to judge the answer. Defaults to `:RAGJudgeAnswerFromContextShort`.
- `model_judge::AbstractString`: The AI model used for judging the answer's quality.
  Defaults to the standard chat model, but it is advisable to use a more powerful model, eg, GPT-4.
- `api_kwargs::NamedTuple`: Parameters that will be forwarded to the API endpoint.
# Returns
`QAEvalResult`: An evaluation result that includes various scores and metadata related to the QA evaluation.
# Notes
- The function computes a retrieval score and rank based on how well the retrieved context (`ctx.context`) matches the QA item's original context.
- It then uses the `judge_template` and `model_judge` to score the answer's accuracy and relevance.
- In case of errors during evaluation, the function logs a warning (if `verbose` is `true`) and the `answer_score` will be set to `nothing`.
# Examples
Evaluating a QA pair using a specific context and model:
```julia
qa_item = QAEvalItem(question = "What is the capital of France?", answer = "Paris",
    context = "France is a country in Europe.")
ctx = airag(index; question = qa_item.question, return_all = true) # assumes `index` is your chunk index; returns a `RAGResult`
parameters_dict = Dict(:param1 => "value1", :param2 => "value2")
eval_result = run_qa_evals(qa_item, ctx; parameters_dict, model_judge = "MyAIJudgeModel")
```
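
The returned `QAEvalResult` carries the scores and your parameters for later analysis (a sketch of the fields you would typically inspect):
```julia
eval_result.retrieval_score # 1.0 if the original context was retrieved, 0.0 otherwise
eval_result.retrieval_rank  # position of the matching chunk, or `nothing` if not found
eval_result.answer_score    # the judge's rating, or `nothing` if the judging call failed
```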
"""
function run_qa_evals(qa_item::QAEvalItem, ctx::RAGResult;
        verbose::Bool = true, parameters_dict::Dict{Symbol, <:Any} = Dict{Symbol, Any}(),
        judge_template::Symbol = :RAGJudgeAnswerFromContextShort,
        model_judge::AbstractString = PT.MODEL_CHAT,
        api_kwargs::NamedTuple = NamedTuple())
    retrieval_score = score_retrieval_hit(qa_item.context, ctx.context)
    retrieval_rank = score_retrieval_rank(qa_item.context, ctx.context)
    # Note: we could also check that the RAGResult and QAEvalItem use at least the same sources, etc.
    answer_score = try
        msg = aiextract(judge_template; model = model_judge, verbose,
            ctx.context,
            ctx.question,
            answer = ctx.final_answer,
            return_type = JudgeAllScores, api_kwargs)
        final_rating = if msg.content isa AbstractDict && haskey(msg.content, :final_rating)
            # if return_type parsing failed
            msg.content[:final_rating]
        else
            # if return_type worked
            msg.content.final_rating
        end
    catch e
        verbose && @warn "Error in QA eval ($(qa_item.question)): $e"
        nothing
    end
    return QAEvalResult(;
        qa_item.source,
        qa_item.context,
        qa_item.question,
        answer = ctx.final_answer,
        retrieval_score,
        retrieval_rank,
        answer_score,
        parameters = parameters_dict)
end
"""
    run_qa_evals(index::AbstractChunkIndex, qa_items::AbstractVector{<:QAEvalItem};
        api_kwargs::NamedTuple = NamedTuple(),
        airag_kwargs::NamedTuple = NamedTuple(),
        qa_evals_kwargs::NamedTuple = NamedTuple(),
        verbose::Bool = true, parameters_dict::Dict{Symbol, <:Any} = Dict{Symbol, Any}()) -> Vector{QAEvalResult}

Evaluates a vector of `QAEvalItem`s and returns a vector of `QAEvalResult`s.
This function assesses the relevance and accuracy of the answers generated in a QA evaluation context.
See `?run_qa_evals` for more details.
# Arguments
- `qa_items::AbstractVector{<:QAEvalItem}`: The vector of QA evaluation items containing the questions and their answers.
- `verbose::Bool`: If `true`, enables verbose logging. Defaults to `true`.
- `api_kwargs::NamedTuple`: Parameters that will be forwarded to the API calls. See `?aiextract` for details.
- `airag_kwargs::NamedTuple`: Parameters that will be forwarded to `airag` calls. See `?airag` for details.
- `qa_evals_kwargs::NamedTuple`: Parameters that will be forwarded to `run_qa_evals` calls. See `?run_qa_evals` for details.
- `parameters_dict::Dict{Symbol, Any}`: Track any parameters used for later evaluations. Keys must be Symbols.
# Returns
`Vector{QAEvalResult}`: Vector of evaluation results that includes various scores and metadata related to the QA evaluation.
# Example
```julia
index = "..." # Assuming a proper index is defined
qa_items = [QAEvalItem(question="What is the capital of France?", answer="Paris", context="France is a country in Europe."),
QAEvalItem(question="What is the capital of Germany?", answer="Berlin", context="Germany is a country in Europe.")]
# Let's run a test with `top_k=5`
results = run_qa_evals(index, qa_items; airag_kwargs=(;top_k=5), parameters_dict=Dict(:top_k => 5))
# Filter out the "failed" calls
results = filter(x->!isnothing(x.answer_score), results);
# See average judge score
mean(x->x.answer_score, results)
```
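
Retrieval quality can be summarized the same way (a sketch using the `QAEvalResult` fields; `mean` and `median` come from the `Statistics` standard library imported above):
```julia
# Fraction of questions where the original context was retrieved
mean(x -> x.retrieval_score, results)
# Median rank among the questions whose context was retrieved at all
median(x.retrieval_rank for x in results if !isnothing(x.retrieval_rank))
```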
"""
function run_qa_evals(index::AbstractChunkIndex, qa_items::AbstractVector{<:QAEvalItem};
        api_kwargs::NamedTuple = NamedTuple(),
        airag_kwargs::NamedTuple = NamedTuple(),
        qa_evals_kwargs::NamedTuple = NamedTuple(),
        verbose::Bool = true, parameters_dict::Dict{Symbol, <:Any} = Dict{Symbol, Any}())
    # Run evaluations in parallel
    results = asyncmap(qa_items) do qa_item
        # Generate an answer; for judging, you often want `model_judge` to be the highest-quality model available, eg, "GPT-4 Turbo" (alias "gpt4t")
        ragresult = airag(index; qa_item.question, return_all = true,
            verbose, api_kwargs, airag_kwargs...)
        # Evaluate the response
        # Note: you can log key parameters for easier analysis later
        run_qa_evals(qa_item,
            ragresult;
            parameters_dict,
            verbose,
            api_kwargs,
            qa_evals_kwargs...)
    end
    success_count = count(x -> !isnothing(x.answer_score), results)
    verbose &&
        @info "QA Evaluations complete ($(success_count)/$(length(qa_items)) evals successful)!"
    return results
end