# In [None]:
import os
import openai
import time
from dotenv import load_dotenv
import argparse

# Pull settings from the local ``api.env`` file into the process environment.
load_dotenv(r'api.env')


# The script cannot do anything useful without a key, so fail fast at import.
api_key = os.environ.get('OPENAI_API_KEY')
if api_key is None or api_key == "":
    raise ValueError("API key not found. Please ensure the .env file is correct and the key is properly set.")


openai.api_key = api_key


# Client-side throttle state: tokens spent since ``last_check_time`` are
# compared against ``TPM_LIMIT`` (tokens per 60-second window).
TPM_LIMIT = 200000
last_check_time = time.time()
tokens_used = 0

def gpt35_turbo_inference(prompt, text, model="gpt-4o-mini", max_tokens=1000, temperature=0.7):
    """Send ``text`` followed by ``prompt`` to a chat model and return the reply.

    A client-side token budget is kept in the module globals ``tokens_used``
    and ``last_check_time``: usage is accumulated per 60-second window and,
    when the next request is estimated to push the window past ``TPM_LIMIT``,
    the call sleeps until the window has elapsed before sending.

    The function name is historical; the default model is ``gpt-4o-mini``.
    NOTE(review): ``openai.ChatCompletion.create`` appears to be the pre-1.0
    interface of the ``openai`` package -- confirm the installed version.

    Args:
        prompt: Instruction text, appended after ``text``.
        text: Document content placed first in the user message.
        model: Model identifier passed to the API.
        max_tokens: Completion budget passed to the API; also counted in full
            towards the pre-flight token estimate.
        temperature: Sampling temperature passed to the API.

    Returns:
        A ``(reply, tokens_used)`` tuple. ``reply`` is the stripped message
        text, or ``None`` when the request failed or carried no text.
        ``tokens_used`` is the running total for the current window.
    """
    global tokens_used, last_check_time

    current_time = time.time()
    elapsed_time = current_time - last_check_time

    # A minute has passed since the window opened: start a fresh one.
    if elapsed_time > 60:
        tokens_used = 0
        last_check_time = current_time

    combined_prompt = f"{text}\n\n{prompt}"
    # Rough estimate only: whitespace-separated words plus the whole
    # completion budget. The real usage reported by the API is what gets
    # accumulated after the call.
    tokens_estimate = len(combined_prompt.split()) + max_tokens

    # Throttle only when tokens were actually spent in this window. If the
    # window is empty, waiting cannot make an oversized request fit, so the
    # previous unconditional 60 s stall was pure dead time.
    if tokens_used > 0 and tokens_used + tokens_estimate > TPM_LIMIT:
        # Clamp so an unexpected clock reading never yields a negative sleep.
        sleep_time = max(0.0, 60 - (current_time - last_check_time))
        print(f"Rate limit approaching. Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)
        tokens_used = 0
        last_check_time = time.time()

    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "user", "content": combined_prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature
        )
        tokens_used += response.usage['total_tokens']
        content = response.choices[0].message['content']
        if content is None:
            # Report the empty reply explicitly instead of failing on
            # ``None.strip()`` with an opaque AttributeError message.
            print("Error occurred: response contained no message content")
            return None, tokens_used
        return content.strip(), tokens_used
    except Exception as e:
        # Best-effort batch helper: report the failure and let the caller
        # decide what to do with a ``None`` reply.
        print(f"Error occurred: {e}")
        return None, tokens_used


def process_files(directory, prompt, output_directory, progress_file):
    """Run every ``.txt`` file in ``directory`` through the model and save the replies.

    Files whose names are already listed in ``progress_file`` are skipped, so
    an interrupted run can be resumed. A file name is appended to
    ``progress_file`` only after its output has actually been written.

    Args:
        directory: Folder scanned (non-recursively) for ``.txt`` inputs.
        prompt: Instruction text forwarded to ``gpt35_turbo_inference``.
        output_directory: Folder receiving one output file per input, under
            the same file name; created if missing.
        progress_file: Text file with one processed file name per line.
    """
    processed_files = set()
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as file:
            processed_files.update(file.read().splitlines())

    total_files = [f for f in os.listdir(directory) if f.endswith(".txt")]
    total_count = len(total_files)
    # Count only entries that exist in this directory; stale lines in the
    # progress file would otherwise push the counter past the total.
    processed_count = sum(1 for f in total_files if f in processed_files)

    os.makedirs(output_directory, exist_ok=True)

    for filename in total_files:
        if filename in processed_files:
            print(f"Skipping {filename}, already processed.")
            continue

        file_path = os.path.join(directory, filename)
        try:
            # Keep the try narrow so that only genuine read problems are
            # reported as such, and close the file before the network call.
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading {file_path}: {e}")
        else:
            result, _ = gpt35_turbo_inference(prompt, text)
            if not result:
                print(f"Failed to process {filename}")
            elif save_result(filename, result, output_directory):
                print(f"Processed and saved {filename}")
                processed_count += 1
                try:
                    with open(progress_file, 'a') as pfile:
                        pfile.write(f"{filename}\n")
                except OSError as e:
                    print(f"Error updating {progress_file}: {e}")
            else:
                # The write failed (details already printed by save_result);
                # do not record the file so a later run retries it.
                print(f"Failed to save result for {filename}")

        print(f"Progress: {processed_count}/{total_count} files processed.")


def save_result(filename, result, output_directory):
    """Write ``result`` to ``output_directory/filename`` as UTF-8 text.

    Errors are reported on stdout rather than raised, so one bad file does
    not abort a batch.

    Returns:
        ``True`` when the file was written, ``False`` when writing failed.
    """
    output_file_path = os.path.join(output_directory, filename)
    try:
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result)
    except Exception as e:
        print(f"Error writing to {output_file_path}: {e}")
        return False
    print(f"Output saved to {output_file_path}")
    return True
if __name__ == "__main__":
    # Command-line entry point: read the directories, pick the progress log
    # location, then hand everything to process_files.
    cli = argparse.ArgumentParser(description="Process text files with GPT-4o-mini.")
    cli.add_argument('--input_directory', type=str, required=True, help="Directory containing input text files")
    cli.add_argument('--output_directory', type=str, required=True, help="Directory to save output text files")
    cli.add_argument('--progress_file', type=str, help="File to record progress of processed files")
    options = cli.parse_args()

    # Without an explicit path, keep the progress log next to the outputs.
    progress_path = options.progress_file or os.path.join(options.output_directory, 'progress.txt')

    # Rewriting instructions sent along with every input file.
    rewrite_prompt = (
        "When generating a new question and answer pair, creatively rewrite the expressions of the existing QA pairs to enhance data diversity. "
        "Ensure that all key technical information and details are preserved within the QA structure, while making the language more varied and engaging.\n"
        "Steps:\n"
        "1.Carefully read the existing QA pair to understand its core information and technical details.\n"
        "2.Reformulate the question, changing the angle of inquiry or using different vocabulary, but ensuring that all essential information is preserved.\n"
        "3.Ensure that the answer is detailed and accurate, while moderately adjusting the language to improve readability and appeal.\n"
        "4.The generated QA pair should be presented in a clear and concise format: '\nQuestion: \nAnswer:'."
    )

    process_files(options.input_directory, rewrite_prompt, options.output_directory, progress_path)
