In [None]:
import requests
import time
import os
import json
import pickle
from tqdm import tqdm,trange
from openai import OpenAI
import pandas as pd
import numpy as np

# Set the OPENAI_* environment variables in one step: an empty API key and
# the DeepSeek base URL.
# NOTE(review): Query() in this notebook passes api_key/base_url explicitly
# when it builds its clients, so these variables may be unused - confirm.
os.environ.update(
    {
        "OPENAI_API_KEY": "",
        "OPENAI_BASE_URL": "https://api.deepseek.com",
    }
)

In [14]:
# Read the prompt templates from data/prompts.yaml into `prompts`.
import yaml

with open('data/prompts.yaml', 'r') as prompt_file:
    prompts = yaml.safe_load(prompt_file)

# Read the original TF.js framework issues into `df`.
df = pd.read_csv('./data/framework_tfjs_orig_issues.csv')

In [15]:
# Reference table; its 'Faults' column is matched against df['issue'] below.
df2 = pd.read_csv('./data/CollectedIssues.csv')

# Keep only the rows whose 'issue' value contains the text "issue"
# (case-insensitive; missing values count as non-matching).
issue_text = df['issue'].astype(str)
df_filtered = df[issue_text.str.contains('issue', case=False, na=False)]

# Distinct 'Faults' values from the reference table, as strings.
df2_faults = set(df2['Faults'].astype(str))

# Partition the filtered rows by whether their 'issue' value appears in
# df2_faults: list_1 holds the matches, list_2 the rest.
in_faults = df_filtered['issue'].astype(str).isin(df2_faults)
list_1 = df_filtered[in_faults]
list_2 = df_filtered[~in_faults]

# Draw up to 250 rows from each partition with a fixed seed and attach the
# class label (1 = present in df2_faults, 0 = absent).
np.random.seed(42)
sampled_1 = list_1.sample(n=min(250, len(list_1)), random_state=42).assign(label=1)
sampled_2 = list_2.sample(n=min(250, len(list_2)), random_state=42).assign(label=0)

# Merge both samples, shuffle them deterministically, renumber the index and
# persist the result.
result_df = (
    pd.concat([sampled_1, sampled_2])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
result_df.to_csv('./data/sampled_issues_dataset.csv', index=False)

In [16]:
# Display the sampled, labelled dataset (result_df) as this cell's output.
result_df

Unnamed: 0,issue,title,comments,state,created_at,updated_at,body,comments_content,label
0,https://github.com/tensorflow/tfjs/issues/2818,module not found on MacOS,13,closed,2020-03-04T11:09:26Z,2020-03-30T22:50:26Z,"To get help from the community, we encourage u...","['@march23hare thanks for reporting ,can you p...",0
1,https://github.com/tensorflow/tfjs/issues/4418,tfjs 2.8.0 is broken and introduces regression...,10,closed,2020-12-16T19:29:47Z,2020-12-18T02:02:27Z,"As subject line says, TFJS 2.8.0 unfortunately...",['cc @annxingyuan @lina128 for visibility - so...,1
2,https://github.com/tensorflow/tfjs/issues/2586,Firebase and Tensorflow-automl not working tog...,4,closed,2019-12-19T02:44:25Z,2020-01-23T22:23:16Z,Hello everybody. I have a very strange issue.W...,['```(node:21613) UnhandledPromiseRejectionWar...,0
3,https://github.com/tensorflow/tfjs/issues/5536,webgpu: fromPixels for HTMLVideoElement relate...,1,open,2021-08-26T08:17:18Z,2021-08-26T08:28:21Z,Run `yarn test` under tfjs-backend-webgpu.Expe...,"[""@qjia7 I'll take this=====""]",1
4,https://github.com/tensorflow/tfjs/issues/5575,Codelab on Tensorflow js,1,closed,2021-09-02T14:56:39Z,2021-09-02T18:46:56Z,https://codelabs.developers.google.com/codelab...,"['Submitted the change internally , will be up...",1
...,...,...,...,...,...,...,...,...,...
495,https://github.com/tensorflow/tfjs/issues/4271,[tfjs-core] Error: The output # of rows (11.4)...,5,closed,2020-11-19T12:42:07Z,2020-11-26T07:17:40Z,**System information**- TensorFlow.js version ...,['might be related issue [here](https://github...,1
496,https://github.com/tensorflow/tfjs/issues/2212,Convert Yolo9000 to Saved model then to GraphM...,3,closed,2019-10-15T16:56:10Z,2019-11-26T11:03:20Z,"To get help from the community, we encourage u...","[""@CharlesCousyn the files generated by darkfl...",0
497,https://github.com/tensorflow/tfjs/issues/5634,Enhance benchmark result,0,open,2021-09-16T22:55:50Z,2021-11-10T22:57:47Z,Refactor the database schema to include failur...,[],0
498,https://github.com/tensorflow/tfjs/issues/5440,[wasm]Error: Kernel 'Complex' not registered f...,0,open,2021-08-06T00:56:45Z,2021-08-12T16:52:55Z,"Case: ``` it('complex', async () => { const...",[],0


In [None]:
def _call_with_retries(request, attempts, delay):
    """Call ``request`` until it succeeds or ``attempts`` tries are used up.

    Every exception is printed. The function sleeps ``delay`` seconds between
    tries, but not after the last one, since nothing follows it.

    Returns the value of the first successful ``request()``, or None when
    every attempt raised.
    """
    for attempt in range(attempts):
        try:
            return request()
        except Exception as e:
            print(e)
            if attempt < attempts - 1:
                time.sleep(delay)
    return None


def Query(model, sys, usr, max_retries=3, retry_delay=10):
    """Send a system + user prompt to a chat model and return the reply text.

    The backend is chosen by substring match on ``model``:

    * ``'claude'``       -> Anthropic client via the openai-proxy endpoint
    * ``'gpt'`` / ``'o3'`` -> OpenAI client via the openai-proxy endpoint
    * ``'deepseek'``     -> OpenAI client pointed at api.deepseek.com

    Args:
        model: Model name; also forwarded as the model id on the OpenAI path.
        sys: System prompt text. (The name shadows the ``sys`` module inside
            this function; it is kept so keyword callers keep working.)
        usr: User prompt text.
        max_retries: Number of attempts on the OpenAI-compatible path.
        retry_delay: Seconds to wait between attempts on that path.

    Returns:
        The reply text, or None if every attempt failed.

    Raises:
        ValueError: If ``model`` matches none of the supported backends.
            Previously this case burned through every retry (sleeping each
            time) on an unbound ``client`` and then returned None.
    """
    # The api_key values below are empty strings in this notebook; real keys
    # must be supplied for the requests to succeed.
    if 'claude' in model:
        # NOTE(review): `Anthropic` is not imported in the visible cells;
        # confirm `from anthropic import Anthropic` runs before this is used.
        client = Anthropic(
            base_url='https://api.openai-proxy.org/anthropic',
            api_key='',
        )

        def ask_claude():
            message = client.messages.create(
                system=sys,
                messages=[
                    {
                        "role": "user",
                        "content": usr,
                    }
                ],
                model="claude-3-7-sonnet-20250219",
                max_tokens=2048
            )
            return message.content[0].text

        # This path keeps its historical fixed policy (5 tries, 2 s apart)
        # and pinned model id; max_retries/retry_delay are not applied here.
        return _call_with_retries(ask_claude, 5, 2)

    # 'gpt'/'o3' is tested first so that a name matching both families still
    # resolves to the proxy endpoint, as it did before.
    if 'gpt' in model or 'o3' in model:
        client = OpenAI(api_key="", base_url="https://api.openai-proxy.org/v1")
    elif 'deepseek' in model:
        client = OpenAI(api_key="", base_url="https://api.deepseek.com")
    else:
        raise ValueError(
            f"Unsupported model {model!r}: expected a name containing "
            "'claude', 'deepseek', 'gpt' or 'o3'"
        )

    def ask_openai():
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": sys},
                {"role": "user", "content": usr},
            ],
            stream=False
        )
        return response.choices[0].message.content

    return _call_with_retries(ask_openai, max_retries, retry_delay)

In [None]:

model = "gpt-4o-mini"
results = []

# Results are checkpointed to this file after every issue. Create the folder
# up front so the first save cannot fail after an API call has been made.
output_path = f'./res/Filteration_{model}.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Both templates are the same for every issue, so look them up once.
sys_prompt = prompts['sys_filteration']['template']
user_template = prompts['user_input']['template']


def _text(row, column):
    """Return ``row[column]`` as a string, mapping missing/NaN values to ''.

    A plain ``str()`` would turn an empty CSV field (NaN) into the literal
    text "nan", which would then be sent to the model as issue content.
    """
    value = row.get(column, '')
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ''
    return str(value)


# NOTE(review): this iterates the full `df`, not the sampled `result_df`
# built earlier in the notebook - confirm that is intended.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    # Skip rows whose 'issue' value does not contain the text "issue".
    url = str(row['issue'])
    if 'issue' not in url:
        continue

    title = _text(row, 'title')
    state = _text(row, 'state')
    created_at = _text(row, 'created_at')
    body = _text(row, 'body')
    comments_content = _text(row, 'comments_content')

    issue_report = f"##Issue Report\n###[Title]: {title}\n###[State]: {state}\n###[Created At]: {created_at}\n###[Body]:\n {body}\n###[Other Comments]:\n {comments_content}"

    user_prompt = user_template.replace("{{ISSUE REPORT}}", issue_report)

    response = Query(model, sys_prompt, user_prompt)

    # Keep the original row and add the model's answer (None when the query
    # failed) in a new 'LLM_classification' column.
    new_row = row.copy()
    new_row['LLM_classification'] = response
    results.append(new_row)

    # Rewrite the checkpoint after every issue so an interrupted run keeps
    # everything classified so far.
    pd.DataFrame(results).to_csv(output_path, index=False)








100%|██████████| 50/50 [00:28<00:00,  1.75it/s]
