In [1]:
import pandas as pd 
from main import main

import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): presumably done to keep the printed answers uncluttered; it also hides
# deprecation warnings raised by pandas or by `main` - consider narrowing the filter.
warnings.filterwarnings("ignore")


In [2]:
# Load the student-performance data that the question cells pass to `main`.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('data/students_performance.csv') 

In [3]:
# Show the first two rows so the available column names are visible.
df.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [4]:
# (rows, columns) of the loaded data.
df.shape

(1000, 8)

Simple and direct questions: each one maps to a single aggregate or filter on one column.

In [5]:
# Single-column maximum; the question names the column ("math score") verbatim.
# `main` receives the DataFrame and the natural-language question and prints the
# question, the generated pandas code, its result and a written conclusion.
query = "What is the highest math score?"
main(df,query)

📝 The question: What is the highest math score?

💻 The python code:
"```python
df['math score'].max()
```"

📊 The execution result:
100

✅ The concluding response:
The highest math score achieved is 100, as determined by the maximum value in the 'math score' column of the dataset.



In [6]:
# Single-column minimum question.
query = "What is the lowest reading score?"
main(df,query)

📝 The question: What is the lowest reading score?

💻 The python code:
"```python
df['reading score'].min()
```"

📊 The execution result:
17

✅ The concluding response:
The lowest reading score is 17, as determined by the minimum value in the 'reading score' column of the dataset.



In [7]:
# Asks for more than one value (the two smallest), so the answer is a small table
# rather than a single number.
query = "What are the two lowest writing scores?"
main(df,query)

📝 The question: What are the two lowest writing scores?

💻 The python code:
"```python
df['writing score'].nsmallest(2)
```"

📊 The execution result:
index  writing score
0     59             10
1    596             15

✅ The concluding response:
The two lowest writing scores are 10 and 15, corresponding to index values 59 and 596, respectively, indicating the weakest writing performance.



In [8]:
# Filter-and-count question (strictly greater than 80).
# NOTE(review): the prompt contains a typo ("whoes") and is not a well-formed sentence;
# it is left unchanged here because the string is the exact input handed to `main`.
query = "How many students whoes reading score more than 80?"
main(df,query)

📝 The question: How many students whoes reading score more than 80?

💻 The python code:
"```python
df[df['reading score'] > 80].shape[0]
```"

📊 The execution result:
235

✅ The concluding response:
There are 235 students whose reading score is more than 80, indicating a significant number of students have achieved a high reading score.



Comprehensive questions: answering them requires grouping, combining several columns, or looking up a whole row.

In [9]:
# Comparison between groups. "Better" is not defined by the question, so the metric
# is chosen by `main`; the recorded run compared per-gender means.
query = "Which gender has a better math score?"
main(df,query)

📝 The question: Which gender has a better math score?

💻 The python code:
"```python
df.groupby('gender')['math score'].mean()
```"

📊 The execution result:
gender  math score
0  female   63.633205
1    male   68.728216

✅ The concluding response:
The data indicates that males have a better math score, with an average of 68.728216, compared to females who have an average of 63.633205.



In [10]:
# Open-ended relationship question between a categorical column and a score column.
query = "If parental level of education has the impact for reading score?"
main(df,query)

📝 The question: If parental level of education has the impact for reading score?

💻 The python code:
"```python
df.groupby('parental level of education')['reading score'].mean()
```"

📊 The execution result:
parental level of education  reading score
0          associate's degree      70.927928
1           bachelor's degree      73.000000
2                 high school      64.704082
3             master's degree      75.372881
4                some college      69.460177
5            some high school      66.938547

✅ The concluding response:
The data suggests that parental level of education has a positive impact on reading scores, as higher levels of education are associated with higher average reading scores, with master's degree having the highest average reading score of 75.37.



In [11]:
# "Comprehensive score" is not a column in the data; the recorded run answered it by
# summing the three score columns per student.
query = "What's the best comprehensive score?"
main(df,query)

📝 The question: What's the best comprehensive score?

💻 The python code:
"```python
df[['reading score' 'writing score' 'math score']].sum(axis=1).max()
```"

📊 The execution result:
300

✅ The concluding response:
The best comprehensive score is 300, as determined by summing the reading, writing, and math scores and finding the maximum total score.



In [12]:
# Row-lookup question: asks for every attribute of the student with the top writing score.
query = "What're the features of the student who has the best writing score?"
main(df,query)

📝 The question: What're the features of the student who has the best writing score?

💻 The python code:
"```python
df.iloc[df['writing score'].idxmax()]
```"

📊 The execution result:
index              106
0                       gender           female
1               race/ethnicity          group D
2  parental level of education  master's degree
3                        lunch         standard
4      test preparation course             none
5                   math score               87
6                reading score              100
7                writing score              100

✅ The concluding response:
The student with the best writing score is a female from group D, with a master's degree in parental level of education, having standard lunch, and no test preparation course, who achieved a math score of 87, a reading score of 100, and a perfect writing score of 100.



In [13]:
# Row lookup on a derived value: "total score" is not a column, so it has to be
# computed before the matching student can be found.
query = "What're the features of the student who has the best total score?"
main(df,query)

📝 The question: What're the features of the student who has the best total score?

💻 The python code:
"```python
df.iloc[df[['reading score' 'writing score' 'math score']].sum(axis=1).idxmax()]
```"

📊 The execution result:
index                458
0                       gender             female
1               race/ethnicity            group E
2  parental level of education  bachelor's degree
3                        lunch           standard
4      test preparation course               none
5                   math score                100
6                reading score                100
7                writing score                100

✅ The concluding response:
The student with the best total score is a female from group E, with a bachelor's degree as her parental level of education, having standard lunch and no test preparation course, and achieving perfect scores in math, reading, and writing.



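Field approximation: the question names a column only loosely (e.g. "food" for `lunch`), so the matching field has to be inferred.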

In [14]:
# "food" is not a column name; the closest field in this dataset is `lunch`.
query = "If food impacts writing score?"
main(df,query)

📝 The question: If food impacts writing score?

💻 The python code:
"```python
df.groupby('lunch')['writing score'].mean()
```"

📊 The execution result:
lunch  writing score
0  free/reduced      63.022535
1      standard      70.823256

✅ The concluding response:
The data suggests that food may impact writing score, as students receiving free or reduced lunch have a lower average writing score of 63.02 compared to those with standard lunch, who have an average writing score of 70.82.



In [15]:
# "preparation" is a shortened reference to the `test preparation course` column.
query = "If students who completed preparation have a better writing score?"
main(df,query)

📝 The question: If students who completed preparation have a better writing score?

💻 The python code:
"```python
df.groupby('test preparation course')['writing score'].mean()
```"

📊 The execution result:
test preparation course  writing score
0               completed      74.418994
1                    none      64.504673

✅ The concluding response:
The data analysis indicates that students who completed the preparation course have a higher average writing score of 74.42 compared to those who did not, suggesting that completing the preparation course is associated with better writing performance.



In [16]:
# "racial" only loosely refers to the `race/ethnicity` column.
query = "Which racial has the best writing score?"
main(df,query)

📝 The question: Which racial has the best writing score?

💻 The python code:
"```python
df.groupby('race/ethnicity')['writing score'].mean().idxmax()
```"

📊 The execution result:
'group E'

✅ The concluding response:
The analysis reveals that group E has attained the highest average writing score, thereby indicating it as the racial group with the best writing performance.



A different DataFrame: product data

In [17]:
# Load a second dataset (product data) so `main` is exercised on a different set of columns.
# NOTE(review): `df2` is an uninformative name; renaming it (e.g. to `products`) would
# have to be applied to every later cell that passes it to `main`.
df2 = pd.read_csv('data/product.csv') 

In [18]:
# Direct single-column minimum question against the product data.
query = "What's the worst average rating?"
main(df2,query)

📝 The question: What's the worst average rating?

💻 The python code:
"```python
df['average rating'].min()
```"

📊 The execution result:
1.1

✅ The concluding response:
The worst average rating is 1.1, indicating the lowest level of satisfaction or performance among the available data.



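Field approximation: the question names a column only loosely (e.g. "inventory" for `stock`), so the matching field has to be inferred.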

In [19]:
# "satisfaction" is a shortened reference to the `customer satisfaction` column.
query = "What's the best satisfaction?"
main(df2,query)

📝 The question: What's the best satisfaction?

💻 The python code:
"```python
df['customer satisfaction'].max()
```"

📊 The execution result:
9.8

✅ The concluding response:
The best satisfaction is 9.8, indicating the highest level of customer satisfaction achieved.



In [20]:
# "inventory" is a synonym rather than a column name; the recorded run mapped it to `stock`.
query = "What's the average inventory?"
main(df2,query)

📝 The question: What's the average inventory?

💻 The python code:
"```python
df['stock'].mean()
```"

📊 The execution result:
23.2

✅ The concluding response:
The average inventory is 23.2, indicating that the mean stock level across the dataset is approximately 23.2 units.



Comprehensive questions: the same multi-step kinds of question (row lookups, counts by category), now on the product data.

In [21]:
# Asks for a product (its name), not for the sales figure itself, so the answer needs
# a row lookup after the maximum is found.
query = "Which product has the highest sales volume?"
main(df2,query)

📝 The question: Which product has the highest sales volume?

💻 The python code:
"```python
df.loc[df['sales volume'].idxmax()]['product name']
```"

📊 The execution result:
'Product9'

✅ The concluding response:
The analysis result indicates that Product9 has achieved the highest sales volume, making it the product with the highest sales performance.



In [22]:
# Counting question over the Yes/No `on promotion` column.
query = "How many products are currently on promotion?"
main(df2,query)

📝 The question: How many products are currently on promotion?

💻 The python code:
"```python
df['on promotion'].value_counts()
```"

📊 The execution result:
on promotion  count
0           No      5
1          Yes      5

✅ The concluding response:
There are 5 products that are currently on promotion, as indicated by the 'Yes' count in the execution result.



In [23]:
# Row lookup keyed on the `market feedback` column.
# NOTE(review): in the recorded output the returned row has market feedback "Poor",
# i.e. the column holds text labels and "best" is not a plain maximum - verify the
# answer `main` produces for this question before relying on it.
query = "What're the features of product which has the best market feedback?"
main(df2,query)

📝 The question: What're the features of product which has the best market feedback?

💻 The python code:
"```python
df.iloc[df['market feedback'].idxmax()]
```"

📊 The execution result:
index                     0
0              product id                     1
1            product name              Product1
2                category           Electronics
3                   price                326.49
4            sales volume                    10
5            review count                   243
6          average rating                   4.8
7                   stock                    17
8             launch date            2022-01-01
9            on promotion                    No
10     promotion discount                   0.3
11            return rate                  0.14
12  customer satisfaction                   7.1
13        market feedback                  Poor
14       advertising cost                570.92
15                 region                  East
16       competitor