In [1]:
import pandas as pd 
from langchain_community.vectorstores import FAISS

from model import Model
from retriever import Retriever
from prompts import get_prompt, combined_template, interp_template
from agent import RAGAgent, InterpAgent

In [2]:
# --- Configuration: the values a reader is most likely to change -------------
DATA_PATH = 'random_sample.csv'
# Name handed to Model(model_name=...). Other names previously tried in this
# notebook: "llama3.3" and "qwen:32b".
MODEL_NAME = "mistral"
# Embedding model name handed to the Retriever.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"

# --- Pipeline objects ---------------------------------------------------------
df = pd.read_csv(DATA_PATH)
model = Model(model_name=MODEL_NAME)

# Retriever built in 'hybrid' mode with db=FAISS. An earlier variant of this
# cell built it as Retriever("schema", 'embed', embed_model_name=..., df=df).
retriever = Retriever('hybrid', embed_model_name=EMBED_MODEL_NAME, db=FAISS)
prompt = get_prompt(combined_template)

# Stage 1: per the recorded outputs below, processor.invoke(query) yields a dict
# with the keys 'code' and 'result'.
processor = RAGAgent(retriever, prompt, model, df)

# Stage 2: interp.invoke(ctx, query) produces the text answer printed below.
# It shares the same Model instance as stage 1.
interp_prompt = get_prompt(interp_template)
interp = InterpAgent(interp_prompt, model)


In [3]:
# Smoke test: push one question through both stages and print what each returns.
query = "What are the two lowest writing scores?"

# Stage 1 — processor.invoke, called with temperature=0.3 and top_p=0.9.
ctx = processor.invoke(query, temperature=0.3, top_p=0.9)

# Stage 2 — interp.invoke receives the stage-1 output plus the original question,
# here with a higher temperature (0.8) than stage 1.
# NOTE(review): in the output recorded with this notebook, the result Series
# prints index 11 -> 22 and index 9 -> 41, yet the generated answer reports
# "11 and 22" as the scores; the interpretation mixed up index labels and values.
answer = interp.invoke(ctx, query, temperature=0.8, top_p=0.9)

# One print call, newline-separated: same text as printing ctx and answer in turn.
print(ctx, answer, sep="\n")

{'code': " ```python\n   df['writing score'].sort_values().head(2)\n   ```", 'result': 11    22
9     41
Name: writing score, dtype: int64}
The question: What are the two lowest writing scores?
   The relative result: {'code': " ```python\n   df['writing score'].sort_values().head(2)\n   ```", 'result': 11    22
9     41
Name: writing score, dtype: int64}

The concluding response: According to the provided code and result, the two lowest writing scores are 11 and 22. These scores are associated with the row numbers 9 and 41 in your dataframe. It is essential to note that further context or additional analysis might be required for a comprehensive understanding of these scores within your dataset.


简单直接的问题

In [5]:
# Simple direct question, run through both stages.
query = "What is the highest math score?"

# Stage 1 keeps the same sampling settings as the smoke-test cell.
ctx = processor.invoke(query, temperature=0.3, top_p=0.9)

# Stage 2 is called without sampling keyword arguments, so interp.invoke falls
# back to whatever defaults it defines.
answer = interp.invoke(ctx, query)

# One print call, newline-separated: same text as printing ctx and answer in turn.
print(ctx, answer, sep="\n")

{'code': " ```python\n   df['math score'].max()\n   ```", 'result': 91}
The question: What is the highest math score?
   The relative result: {'code': " ```python\n   df['math score'].max()\n   ```", 'result': 91}
   The concluding response: Based on the provided Python code and its output, it can be concluded that the highest math score is 91.


In [None]:
# Simple direct query: asks for the minimum of a single score (reading).
processor.invoke("What is the lowest reading score?")

In [None]:
# Same question as the two-stage test cell near the top of the notebook, here
# sent to the processor only (no interpretation step).
processor.invoke("What are the two lowest writing scores?")

In [None]:
# Simple count query: number of students whose reading score is strictly above 80.
processor.invoke("How many students have a reading score higher than 80?")

综合问题

In [6]:
# Group-comparison question. In the recorded output below, the generated code
# compares the mean math score per gender.
processor.invoke("Which gender has a better math score?")

{'code': " ```python\n   df.groupby('gender')['math score'].mean()\n   ```",
 'result': gender
 female    60.375000
 male      66.166667
 Name: math score, dtype: float64}

In [None]:
# Relationship question: does parental level of education affect reading score?
processor.invoke("If parental level of education has the impact for reading score?")

In [None]:
# "comprehensive score" is not defined by the question itself; presumably it means
# the per-student sum of the three scores (a later cell computes
# df[...].sum(axis=1).max() by hand) — TODO confirm.
processor.invoke("What's the best comprehensive score?")

In [None]:
# Asks for the attributes of the student with the top writing score.
processor.invoke("What're the features of the student who has the best writing score?")

In [None]:
# Asks for the attributes of the student with the top "total score"; presumably
# this requires combining several score columns first — TODO confirm.
processor.invoke("What're the features of the student who has the best total score?")

需要分析的综合问题，回答得不好

字段近似表述

In [7]:
# Approximate field wording: the question says "food". In the recorded output
# below, the code finally returned groups by the 'lunch' column, after an earlier
# generated snippet was reported as a syntax error.
processor.invoke("If food impacts writing score?")

Code execution error: Invalid syntax in code: result = from sklearn.preprocessing import OneHotEncoder
   enc = OneHotEncoder(sparse=False)
   df['lunch'] = enc.fit_transform(df[['lunch']])
   corr_matrix = df[['writing score', 'lunch']].corr()
   print(corr_matrix)
Error: invalid syntax (<string>, line 1)


{'code': " ```python\n   df.groupby('lunch')['writing score'].mean()\n   ```",
 'result': lunch
 free/reduced    65.7
 standard        62.2
 Name: writing score, dtype: float64}

In [None]:
# Approximate field wording: "completed preparation" presumably refers to a
# test-preparation column — verify against df.dtypes.
processor.invoke("If students who completed preparation have a better writing score?")

In [None]:
# Approximate field wording: "racial" presumably maps to a race/ethnicity
# column — verify against df.dtypes.
processor.invoke("Which racial has the best writing score?")

In [None]:
# Show each column's dtype in the currently loaded DataFrame.
df.dtypes

In [None]:
# Manual cross-check computed directly with pandas (no model involved):
# add the three score columns per row, then take the largest row total.
(
    df[['reading score', 'writing score', 'math score']]
    .sum(axis='columns')
    .max()
)

实现：
1. 如果 result 的值为 None，则重新执行 processor.invoke(query)。
2. 根据用户的 query 和 processor.invoke(query) 的结果，用一个新的总结提炼模型以自然语言回答用户的问题，而不是只给出 code。
3. 如果步骤 2 得到的信息不足以总结提炼并回答 query，则重新执行 processor.invoke(query)，直到步骤 2 能顺利完成。

In [17]:
import pandas as pd 

from version4.model import Model
from retriever import Retriever
from template import get_prompt, combined_template
from agent import ChainProcessor

In [18]:
# Second experiment: the product sample, using the "llama3.1" model.
# NOTE: this rebinds df, model, retriever, prompt and processor, replacing the
# objects built for the student-score section earlier in the notebook.
df = pd.read_csv("product_sample.csv")
model = Model(model_name="llama3.1")

# Retriever built with the positional arguments "schema" and "embed" and given
# the DataFrame itself.
retriever = Retriever(
    "schema",
    "embed",
    embed_model_name="all-MiniLM-L6-v2",
    df=df,
)
prompt = get_prompt(combined_template)

# Single-stage processor; no interpretation agent is created in this section.
processor = ChainProcessor(retriever, prompt, model, df)

In [None]:
# Direct query against the product sample: lowest average rating.
processor.invoke("What's the worst average rating?")

近似

In [None]:
# Approximate field wording: "satisfaction" presumably maps to a rating-like
# column in the product sample — TODO confirm.
processor.invoke("What's the best satisfaction?")

In [None]:
# Approximate field wording: "inventory" presumably maps to a stock-quantity
# column in the product sample — TODO confirm.
processor.invoke("What's the average inventory?")

综合

In [None]:
# Asks which product ranks first by sales volume.
processor.invoke("Which product has the highest sales volume?")

In [None]:
# Count query: number of products currently on promotion.
processor.invoke("How many products are currently on promotion?")

In [None]:
# Asks for the attributes of the top product by "market feedback"; which column
# that phrase should map to is not stated — TODO confirm.
processor.invoke("What're the features of product which has the best market feedback?")