In [None]:
import os
import openai
import pinecone
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time
from json import JSONDecodeError
import concurrent.futures
import traceback
import requests
from PIL import Image
import pytesseract
from io import BytesIO

# API keys are taken from the environment (PINECONE_API_KEY / OPENAI_API_KEY)
# so they never have to be committed with this file. When a variable is unset
# the key falls back to an empty string, exactly as before; for a quick local
# run a key can still be pasted into the fallback literal below.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or ""
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or ""

pinecone.init(api_key=PINECONE_API_KEY, environment="us-west4-gcp")
index = pinecone.Index(index_name='gpt-dashboard-db')
# Re-export the resolved key rather than clobbering an already-configured
# environment variable with an empty string.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
openai.api_key = OPENAI_API_KEY

embed_id = 1


In [None]:
#STEP 2: Interrogate
from multiprocessing import Manager
import time
from concurrent.futures import ThreadPoolExecutor

# Shared boolean flag accessed through ``.value``. get_qa sets it to True when
# the API reports a rate limit (or an APIError) and clears it again after
# sleeping for 60 seconds at the start of a later call.
manager = Manager()
rate_limit_hit = manager.Value(bool, False)

# Names of the chunk files to process; each one is opened from
# ./chunks_to_interrogate/ by the driver loop below.
filenames = [] #<--- Add filenames with chunks to process here (e.g. 'channel.json')


def get_qa(messages):
    """Ask gpt-3.5-turbo to turn a thread history into question/answer pairs.

    Args:
        messages: Thread history as text. It is concatenated onto the prompt,
            so it must be a ``str``.

    Returns:
        A ``(response_message, prompt_tokens, completion_tokens)`` tuple: the
        generated text plus the token usage reported in the API response.

    Raises:
        TypeError: If every attempt failed with one of the retried errors
            (JSONDecodeError, ServiceUnavailableError, RateLimitError,
            APIError). The exception type is kept for backward
            compatibility; the last underlying error is chained as
            ``__cause__``. Any other exception propagates immediately.
    """
    # An earlier call flagged throttling: back off once, then clear the flag.
    if rate_limit_hit.value:
        print("Rate limit hit, sleeping for 60 seconds")
        time.sleep(60)
        rate_limit_hit.value = False

    user_prompt = """#Instructions# You are an AI tool capable of scanning and interpreting communication  \
      threads on platforms such as Slack.  Your primary function is to identify key topics of discussion and create insightful \
        question and answer pairs based on these topics. The goal \
        is to generate around 15 Q&A pairs that could potentially be posed by Acme Inc. employees in relation to the subject \
        matter being discussed. However, should the thread contain a wider variety of topics, you may extend the number of Q&A pairs to 20. \
    #Input Format# You will be provided with a thread history, capturing interactions between employees asking and answering questions. \
        Analyze this thread and generate approximately 10 to 20 Q&A pairs, depending on the diversity of topics discussed. The pairs should\
        reference the original authors of the questions and answers where applicable.
    #Example Output# Consider a scenario where the conversation thread focuses on cap table maintenance, with Branden asking questions \
    # and Jordan providing responses. An appropriate Q&A pair could be:
    Q (Branden): Should we provide cap table maintenance for the customer?
    A (Jordan): No, this is not...(continued)

    #Thread history input data: # """ + messages

    system_prompt = "You are helpful Q&A generating bot."

    # Use a separate name so the ``messages`` argument is not shadowed.
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    max_attempts = 6
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=chat_messages,
                max_tokens=900,
                temperature=0.8
            )
            response_message = response["choices"][0]["message"]["content"]
            usage = response['usage']
            return response_message, usage['prompt_tokens'], usage['completion_tokens']
        except JSONDecodeError as e:
            print(f'JSONDecodeError occurred: {e}')
            last_error, delay = e, 10
        except openai.error.ServiceUnavailableError as e:
            print(f'Server error: {e}. Retrying in 10 seconds.')
            last_error, delay = e, 10
        except openai.error.RateLimitError as e:
            print('Hit rate limit. Waiting 60 sec')
            rate_limit_hit.value = True
            last_error, delay = e, 60
        except openai.error.APIError as e:
            print('BAD GATEWAY ERROR. Waiting 20 sec')
            # NOTE(review): this also raises the shared rate-limit flag, so
            # later calls pause for 60 seconds as well -- confirm intended.
            rate_limit_hit.value = True
            last_error, delay = e, 20

        # Only wait when another attempt will actually follow.
        if attempt < max_attempts:
            time.sleep(delay)

    raise TypeError(
        f"OpenAI chat completion failed after {max_attempts} attempts"
    ) from last_error


def _record_failed_chunk(obj, path="error_file_chunks.json"):
    """Append *obj* to the error file as one JSON document per line."""
    # Serialise first and add a newline so repeated appends stay parseable
    # line by line instead of running together as '{...}{...}'.
    line = json.dumps(obj) + "\n"
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)


def process_chunk(obj):
    """Generate Q&A for one chunk and attach the result to it.

    Args:
        obj: Chunk dict; its ``"messages"`` entry is passed to ``get_qa``.

    Returns:
        The same ``obj``. On success it gains ``"qa"``, ``"prompt_tokens"``
        and ``"completion_tokens"``. On failure it is returned without those
        keys after being appended to ``error_file_chunks.json``; the error is
        printed, not re-raised, so one bad chunk does not stop the batch.
    """
    messages = obj["messages"]
    try:
        qa, prompt_tokens, completion_tokens = get_qa(messages)
        obj["qa"] = qa
        obj["prompt_tokens"] = prompt_tokens
        obj["completion_tokens"] = completion_tokens
    except openai.error.ServiceUnavailableError:
        print('Server overload error occurred. Writing current data to JSON file.')
        _record_failed_chunk(obj)
        traceback.print_exc()
    except Exception as e:
        _record_failed_chunk(obj)
        print(f'Unexpected error occurred: {e}')
        traceback.print_exc()
    return obj

def split_chunks(chunks, max_token_count=30000):
    """Partition *chunks* into consecutive groups bounded by a token budget.

    Chunks are taken in order. A chunk joins the current group while the
    group's summed ``'token_count'`` stays <= ``max_token_count``; otherwise
    the current group is closed and the chunk starts a new one.

    Args:
        chunks: Iterable of dicts, each with a numeric ``'token_count'`` key.
        max_token_count: Maximum summed token count per group.

    Returns:
        A list of non-empty lists of chunks. A single chunk larger than the
        budget is placed in a group of its own.
    """
    grouped_chunks = []
    current_group = []
    current_group_token_count = 0

    for chunk in chunks:
        token_count = chunk['token_count']
        if current_group_token_count + token_count <= max_token_count:
            current_group.append(chunk)
            current_group_token_count += token_count
            continue

        # Close the running group, but never emit an empty one: that happens
        # when the very first chunk already exceeds the budget.
        if current_group:
            grouped_chunks.append(current_group)
        current_group = [chunk]
        current_group_token_count = token_count

    # Append the last group if it isn't empty
    if current_group:
        grouped_chunks.append(current_group)

    return grouped_chunks


# Iterate through the chunk files, one per channel.
for filename in filenames:
    with open(f"./chunks_to_interrogate/{filename}", 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    grouped_chunks = split_chunks(chunks)
    group_sizes = [len(group) for group in grouped_chunks]
    print(group_sizes, filename)

    # Everything processed so far for this file, across all groups.
    processed_chunks = []

    for i, group in enumerate(grouped_chunks):
        with ThreadPoolExecutor() as executor:
            processed_chunks.extend(executor.map(process_chunk, group))

        # Rewrite the whole output after each group. Appending one JSON array
        # per group would leave '[...][...]' in the file, which json.load
        # cannot read; rewriting keeps the file a single valid array while
        # still saving progress after every group.
        with open(f'./to_pinecone/{filename}', 'w', encoding='utf-8') as f:
            json.dump(processed_chunks, f, ensure_ascii=False, indent=4)

        print(f"Done. {i+1}/{len(grouped_chunks)},chunks: {len(group)}")

        # Pause 65 seconds between groups (skipped after the last one).
        if i != len(grouped_chunks) - 1:
            time.sleep(65)