## 1. 纯SQL做数据库

In [5]:
import pandas as pd
import sqlite3

# Read the question/answer pairs from the CSV file.
csv_file_path = r'/home/ubuntu/mnt2/wxy/Ako_GPT/data/exam_entry_answer.csv'  # replace with the path to your CSV file
data = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# Open (or create) the SQLite database file in the current working directory.
conn = sqlite3.connect('questions.db')
cursor = conn.cursor()

# One row per question: `entry` is the lookup key, `answer` the stored text.
cursor.execute('''
CREATE TABLE IF NOT EXISTS questions (
    entry TEXT PRIMARY KEY,
    answer TEXT
)
''')

# Insert all rows in a single executemany call instead of a Python-level
# iterrows() loop. itertuples() keeps each column's own dtype, whereas
# iterrows() builds one Series per row and may upcast mixed columns before the
# values are bound. INSERT OR IGNORE skips entries whose key already exists,
# so re-running the cell does not fail on the PRIMARY KEY constraint.
cursor.executemany(
    'INSERT OR IGNORE INTO questions (entry, answer) VALUES (?, ?)',
    data[['entry', 'answer']].itertuples(index=False, name=None),
)

conn.commit()


def get_answer(entry_id):
    """Look up the stored answer for a question ID.

    Runs a parameterized SELECT against the ``questions`` table through the
    module-level ``cursor``.

    Args:
        entry_id: Value compared against the ``entry`` column.

    Returns:
        The ``answer`` value of the matching row, or the string ``"Nope"``
        when no row matches.
    """
    row = cursor.execute(
        "SELECT answer FROM questions WHERE entry=?", (entry_id,)
    ).fetchone()
    # fetchone() yields None when the query produced no row.
    if not row:
        return "Nope"
    return row[0]

# Example usage: look up the answer for the question ID typed by the user.
try:
    entry_id = input("Please enter the question ID: ")
    answer = get_answer(entry_id)
    print(f"answer: {answer}")
finally:
    # Close the database connection even when the prompt is interrupted or the
    # lookup raises, so the connection is not left open in the kernel.
    conn.close()


答案: Self absorption occurs at higher concentrations when the light that is emitted by an excited atom gets absorbed by the sample, instead of the light thats coming from the light source. This makes the sample look less concentrated than it is.


## 1.2 RAG+LLM

In [None]:
import pandas as pd
import sqlite3
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

conn = sqlite3.connect('questions.db')
def get_answer(entry_id):
    """Look up the stored answer for a question ID.

    Args:
        entry_id: Value compared against the ``entry`` column of the
            ``questions`` table.

    Returns:
        The ``answer`` value of the matching row, or ``None`` when no row
        matches.
    """
    # Create the cursor from this cell's connection. The cell itself never
    # defines a `cursor`, so relying on a global one either raises NameError on
    # a fresh kernel or reuses a cursor whose connection was already closed.
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT answer FROM questions WHERE entry=?", (entry_id,))
        result = cursor.fetchone()
    finally:
        cursor.close()
    if result:
        return result[0]
    else:
        return None

# Model setup: load the tokenizer and the causal LM from the same local
# checkpoint directory (replace the path with the actual model name).
_MODEL_DIR = r"/home/ubuntu/mnt2/wxy/model/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(_MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(_MODEL_DIR)

# Send the combined prompt text to the LLM and return the decoded output.
def send_to_llm(input_text):
    """Generate text from the module-level ``model`` for ``input_text``.

    Args:
        input_text: Prompt string that is encoded with the module-level
            ``tokenizer``.

    Returns:
        The first generated sequence decoded back to a string, with special
        tokens skipped.
    """
    prompt_ids = tokenizer.encode(input_text, return_tensors="pt")
    # NOTE(review): max_length presumably bounds prompt + generated tokens
    # together; confirm against the generate() documentation.
    generated = model.generate(prompt_ids, max_length=2000)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Handle a user query: extract the question ID, retrieve its answer, ask the LLM.
def process_query(query):
    """Answer a free-text query that mentions a question ID.

    The ID is extracted from phrases such as "question ID is Q12" or
    "Question number is Q12" (any letter case). The stored answer is fetched
    with ``get_answer`` and, when one exists, combined with the original query
    and passed to ``send_to_llm``.

    Args:
        query: Raw text typed by the user.

    Returns:
        The LLM response when the ID is found in the query and has a stored
        answer; otherwise one of two fixed explanatory messages.
    """
    # Accept "ID" as well as "number" and ignore case: the notebook's own
    # input prompt suggests "Question number is xxx", which a case-sensitive
    # 'question ID is' pattern would never match.
    match = re.search(r'question\s+(?:ID|number)\s+is\s+(\w+)', query, re.IGNORECASE)
    if not match:
        # NOTE(review): this branch means the *query* contained no recognizable
        # ID; the wording mentions the database - consider rephrasing.
        return "No valid question number was found in the exam database."

    entry_id = match.group(1)
    answer = get_answer(entry_id)
    if not answer:
        # No row for this ID, or the stored answer is empty.
        return "The specified question number was not found. Please check whether the question number is correct."

    # Give the model the retrieved answer as context ahead of the user's query.
    combined_input = f"Question ID: {entry_id}\nRetrieved Answer: {answer}\n\nUser Query: {query}"
    return send_to_llm(combined_input)

# Example: read one query from the user, run it through the pipeline, print the reply.
try:
    # The example phrase uses "question ID is", the wording process_query's
    # pattern recognizes, so following the prompt literally yields a match.
    user_input = input("Please enter your query (for example: The question ID is xxx, please help me solve this question):")
    output = process_query(user_input)
    print(output)
finally:
    # Close the connection even when the prompt is interrupted or the query fails.
    conn.close()

## 生成考试信息的数据集

In [None]:
import pandas as pd
import random

# Value pools from which the simulated exam records are drawn.

# Subjects: the "Class" series (Class1..Class10) followed by the "Chem" series
# (Chem1..Chem8).
subjects = [f"Class{i}" for i in range(1, 11)] + [f"Chem{i}" for i in range(1, 9)]

# Room codes (no embedded spaces, so a whitespace split is safe).
classrooms = (
    "A101 B205 C302 D401 E502 "
    "F203 G104 H306 I405 J201 "
    "K302 L404 M205 N106 O407"
).split()

# Teacher names, one per line.
teachers = [
    "Dr. Smith",
    "Prof. Johnson",
    "Dr. Williams",
    "Prof. Brown",
    "Dr. Miller",
    "Prof. Davis",
    "Dr. Garcia",
    "Prof. Rodriguez",
    "Dr. Lee",
    "Prof. Patel",
    "Dr. Chen",
    "Prof. Kim",
    "Dr. Martinez",
    "Prof. Wilson",
    "Dr. Taylor",
]

# Free-text notes attached to each record.
notes = [
    "Advanced Course",
    "Introductory Level",
    "Requires Prior Knowledge",
    "Practical Session",
    "Research Oriented",
    "Theoretical Framework",
    "Group Project Required",
    "Comprehensive Exam",
    "Elective Course",
    "Core Curriculum",
    "Lab Intensive",
    "Seminar Based",
    "Independent Study",
    "Field Research",
]

# Seed the generator so that re-running the cell reproduces exactly the same
# dataset (and therefore the same CSV file) instead of a new random draw.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate 100 rows of simulated data. Each column is drawn independently;
# random.choices samples with replacement, so values repeat across rows.
num_rows = 100
data = {
    'Subject': random.choices(subjects, k=num_rows),
    'Exam Time': [random.randint(1, 6) for _ in range(num_rows)],  # integer from 1 to 6 inclusive
    'Classroom': random.choices(classrooms, k=num_rows),
    'Teacher': random.choices(teachers, k=num_rows),
    'Notes': random.choices(notes, k=num_rows)
}

df = pd.DataFrame(data)

# Save to CSV (without the DataFrame index column)
df.to_csv(r'/home/ubuntu/mnt2/wxy/Ako_GPT/data/exam_info.csv', index=False)

# Show a preview and the row count as a quick sanity check.
print(df.head(10))
print(f"\nTotal rows generated: {len(df)}")