In [2]:
import pandas as pd
import json
import os
from sklearn.model_selection import train_test_split
import random
import numpy as np
# Seed the stdlib RNG so every later random.sample() draw is repeatable.
random.seed(42)

# Raw string: the Windows path is full of backslashes, and sequences such as
# "\B" are invalid escapes in a normal string literal (a warning today, an
# error in future Python versions). The resulting value is unchanged.
folder_path = r"E:\厌氧消化预测大模型\BP\文献数据"
output_dir = "literatur_data(no_process)"

os.makedirs(output_dir, exist_ok=True)

# Destination JSONL files: one test split plus three training subsets.
test_jsonl = os.path.join(output_dir, "test_10_percent.jsonl")
train_30_jsonl = os.path.join(output_dir, "train_30_percent.jsonl")
train_60_jsonl = os.path.join(output_dir, "train_60_percent.jsonl")
train_90_jsonl = os.path.join(output_dir, "train_90_percent.jsonl")

# File suffixes accepted as input tables. Each entry carries its leading dot
# so that only real extensions match (a bare 'csv' would also match a name
# like "notacsv").
excel_extensions = ('.xlsx', '.xls', '.csv')

# Accumulators filled across all input files and written out at the end.
all_test_data = []
all_train_30_data = []
all_train_60_data = []
all_train_90_data = []

# Convert every table in folder_path into instruction-tuning records, split
# each file 90/10 into train/test, and draw 30/60/90 % subsets of its training
# part. Results are appended to the all_* accumulators.
#
# os.listdir() returns entries in arbitrary order; because all files share one
# seeded RNG stream, the iteration order must be fixed for the sampled subsets
# to be reproducible, hence sorted().
for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith(excel_extensions):
        continue

    file_path = os.path.join(folder_path, filename)
    print(f"Processing file: {file_path}")
    try:
        # read_excel cannot parse CSV, so CSV files get their own reader.
        if filename.lower().endswith(".csv"):
            excel_data = pd.read_csv(file_path)
        else:
            excel_data = pd.read_excel(file_path)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"Error reading {file_path}: {e}")
        continue

    # A table without any column has no target column to index below.
    if excel_data.shape[1] == 0:
        print(f"No data processed for {file_path}. Skipping.")
        continue

    # The last column is the prediction target; every other numeric column is
    # described in the instruction text. Non-numeric columns are ignored.
    output_col = excel_data.columns[-1]
    numeric_cols = excel_data.select_dtypes(include=[np.number]).columns
    feature_names = [col for col in numeric_cols if col != output_col]

    instruction_prefix = f"Please predict {output_col} based on the following anaerobic digestion process description and process variable data."

    json_list = []
    for _, row in excel_data.iterrows():
        sentences = [instruction_prefix]
        for col_name in feature_names:
            value = row[col_name]
            # Missing measurements are simply left out of the description.
            if pd.notna(value):
                sentences.append(f" The {col_name} is {value}.")
        instruction = "".join(sentences)

        # NOTE: rows with a missing target are kept with an empty output.
        target = row[output_col]
        output = str(target) if pd.notna(target) else ""

        json_list.append({
            "instruction": instruction.strip(),
            "input": "",
            "output": output.strip(),
        })

    if not json_list:
        print(f"No data processed for {file_path}. Skipping.")
        continue

    if len(json_list) < 2:
        # train_test_split raises ValueError for a single sample (the train
        # side would be empty), so keep the lone record for training only.
        train_data, test_data = list(json_list), []
    else:
        train_data, test_data = train_test_split(json_list, test_size=0.1, random_state=42)
    all_test_data.extend(test_data)
    train_size = len(train_data)

    # Each subset holds at least one record. The three subsets are drawn
    # independently, so the smaller ones are not guaranteed to be contained in
    # the larger ones.
    n_30 = max(1, int(train_size * 0.3))
    n_60 = max(1, int(train_size * 0.6))
    n_90 = max(1, int(train_size * 0.9))
    train_30_data = random.sample(train_data, min(n_30, train_size))
    train_60_data = random.sample(train_data, min(n_60, train_size))
    train_90_data = random.sample(train_data, min(n_90, train_size))
    all_train_30_data.extend(train_30_data)
    all_train_60_data.extend(train_60_data)
    all_train_90_data.extend(train_90_data)
def _dump_jsonl(path, records):
    """Write *records* to *path* as UTF-8 JSON Lines, one object per line."""
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        f.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in records)


# Persist the test split and the three training subsets, reporting each one.
_dump_jsonl(test_jsonl, all_test_data)
print(f"Test set (10%) saved to: {test_jsonl}, {len(all_test_data)} entries")

_dump_jsonl(train_30_jsonl, all_train_30_data)
print(f"Train set (30%) saved to: {train_30_jsonl}, {len(all_train_30_data)} entries")

_dump_jsonl(train_60_jsonl, all_train_60_data)
print(f"Train set (60%) saved to: {train_60_jsonl}, {len(all_train_60_data)} entries")

_dump_jsonl(train_90_jsonl, all_train_90_data)
print(f"Train set (90%) saved to: {train_90_jsonl}, {len(all_train_90_data)} entries")

Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_1.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_10.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_11.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_12.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_13.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_14.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_15.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_16.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_17.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_18.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_2.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_3.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_4.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_5.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_6.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_7.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_8.xlsx
Processing file: E:\厌氧消化预测大模型\BP\文献数据\data_bp_9.xlsx
Test set (10%) saved to: literatur_da