In [None]:
import os
import sys
from project_path import PROJECT_PATH
sys.path.insert(0, PROJECT_PATH)
from src.llm_reviewer.notebook_reviewer import IssueLevel


# Service-account key file and the two Google Sheets read by this notebook.
service_account_path = PROJECT_PATH + "/creds/google__sa.json"
tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"
delivery_sheet_id = "1eUif5I8xhHU8fY0X9v8r2JI9hWPh7Dq_9VXpSIHwww4"


# Issue level passed to the review aggregation and embedded in the report file name.
ISSUE_LEVEL = IssueLevel.MEDIUM
# Output directory for this run. The trailing slash is required: other cells
# build file paths with plain string concatenation (DATA_DIR + 'name').
DATA_DIR = PROJECT_PATH + '/data/03_01_2024/'

# exist_ok=True replaces the separate exists() check, so re-running the cell is
# a no-op and there is no window between the check and the creation.
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
import pandas as pd

from src.sheets_utils import download_sheet_as_df


# "Contributors" tab of the tracking spreadsheet.
contributors_df = download_sheet_as_df(service_account_path, tracking_sheet_id, "Contributors")

# Download the per-batch conversation tabs in order and stack them into one
# frame with a fresh 0..n-1 index.
tasks_sheets_df = pd.concat(
    [
        download_sheet_as_df(service_account_path, tracking_sheet_id, sheet_name)
        for sheet_name in (
            "Conversations_Batch_2",
            "Conversations_Batch_3",
            "Conversations_Batch_4",
            "Conversations_Batch_5",
        )
    ],
    ignore_index=True,
)

# Parse the completion dates (format="mixed" lets each value use its own
# format) and keep only the calendar date.
parsed_completion = pd.to_datetime(tasks_sheets_df["completion_date"], format="mixed")
tasks_sheets_df["completion_date"] = parsed_completion.dt.date
tasks_sheets_df

In [None]:
# Work on a copy of every tracked task. The date cutoff that was commented out
# on this line (completion_date <= 2023/12/27) is not applied, so all rows are kept.
old = tasks_sheets_df.copy()
# Keep only the tasks whose completion_status is exactly "Done".
is_done = old["completion_status"] == "Done"
old_completed = old[is_done]
old_completed

In [None]:
# Download every delivery tab in order and stack them into one frame with a
# fresh 0..n-1 index.
delivered_df = pd.concat(
    [
        download_sheet_as_df(service_account_path, delivery_sheet_id, sheet_name)
        for sheet_name in ("Batch 1", "Batch 2", "Batch 3")
    ],
    ignore_index=True,
)
delivered_df

In [None]:
def fix_task_link(row):
    """Fill a missing ``task_link`` from the row's ``"#REF!"`` entry.

    Written for ``DataFrame.apply(fix_task_link, axis=1)``.

    Args:
        row: A pandas Series (one DataFrame row) with a ``"task_link"`` entry.

    Returns:
        ``row`` itself when ``task_link`` is not a float NaN; otherwise a copy
        of ``row`` whose ``task_link`` holds the value stored under ``"#REF!"``.

    Raises:
        KeyError: If ``task_link`` is NaN and the row has no ``"#REF!"`` entry.
    """
    link = row["task_link"]
    if isinstance(link, float) and pd.isna(link):
        # pandas does not support apply() callbacks that mutate the object they
        # are handed, so patch a copy and leave the caller's row untouched.
        row = row.copy()
        row["task_link"] = row["#REF!"]
    return row

# Completed tasks whose link does not appear in any delivery sheet.
undelivered_old = old_completed[~old_completed["task_link"].isin(delivered_df["task_link"])]
# Rows with a missing task_link are repaired from the "#REF!" column. Test for
# that column up front instead of wrapping apply() in `except KeyError: pass`:
# the blanket handler silently discarded the whole repair and would also have
# hidden any unrelated KeyError raised while processing a row.
if "#REF!" in undelivered_old.columns:
    undelivered_old = undelivered_old.apply(fix_task_link, axis=1)
else:
    print('No "#REF!" column found; rows with a missing task_link were left unchanged.')
undelivered_old

In [None]:
import io
import pickle
import nbformat

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload



def download_and_parse_notebook(service_account_file, file_id):
    """Download one Drive file and parse it as a version-4 notebook.

    Args:
        service_account_file: Path to the service-account JSON key file.
        file_id: ID of the Drive file to download.

    Returns:
        dict with ``'file_id'`` (the ID that was passed in) and
        ``'nb_parsed_notebook'`` (the object returned by ``nbformat.read``).
    """
    # Build an authenticated Drive v3 client for this call.
    drive_scopes = ['https://www.googleapis.com/auth/drive']
    creds = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=drive_scopes)
    drive = build('drive', 'v3', credentials=creds)

    # Stream the file content into an in-memory buffer, chunk by chunk.
    buffer = io.BytesIO()
    chunked_download = MediaIoBaseDownload(buffer, drive.files().get_media(fileId=file_id))

    finished = False
    while not finished:
        progress, finished = chunked_download.next_chunk()
        print("Download progress: %d%%." % int(progress.progress() * 100))

    # Rewind so nbformat reads the downloaded bytes from the start.
    buffer.seek(0)
    notebook = nbformat.read(buffer, as_version=4)

    return {'file_id': file_id, 'nb_parsed_notebook': notebook}


def threading_processor(service_account_path, file_id, results):
    """Download and parse one notebook, appending the outcome to ``results``.

    Args:
        service_account_path: Path to the service-account JSON key file.
        file_id: ID of the Drive file to download.
        results: List that receives the parsed-notebook dict (mutated in place).

    Returns:
        None.
    """
    parsed = download_and_parse_notebook(service_account_path, file_id)
    results.append(parsed)


from concurrent.futures import ThreadPoolExecutor

# Short alias for the undelivered tasks whose notebooks are downloaded next.
df = undelivered_old
# NOTE(review): `threads` is not referenced anywhere else in the visible notebook.
threads = list()

def get_file_id_from_task_link(task_link):
    """Extract the file ID (last path segment) from a task link.

    A fragment (``#scrollTo=...``), a query string (``?usp=sharing``),
    surrounding whitespace and trailing slashes are removed before the last
    path segment is taken, so they no longer end up inside the returned ID.

    Args:
        task_link: URL string. Any other value (for example a NaN cell) is
            reported on stdout and skipped.

    Returns:
        The file ID string, or None when no non-empty ID can be extracted.
    """
    if isinstance(task_link, str):
        # Drop the fragment first, then the query string, then trailing slashes.
        path = task_link.strip().split("#", 1)[0].split("?", 1)[0].rstrip("/")
        file_id = path.split("/")[-1]
        if file_id:
            return file_id
    # Non-string or empty link: report it and return None for the caller to drop.
    print('ERROR' + '='*60)
    print(task_link)
    return None

# File IDs of every task link in df; links that yield no ID (None) are dropped.
file_ids = df["task_link"].apply(get_file_id_from_task_link).dropna().tolist()
parsed_conversations = []

# Fan the downloads out over a thread pool and collect results in submission
# order. Each future is paired with its file ID so a failure can name the
# notebook that could not be fetched (the old message gave only the exception).
with ThreadPoolExecutor(max_workers=20) as executor:
    submitted = [
        (file_id, executor.submit(download_and_parse_notebook, service_account_path, file_id))
        for file_id in file_ids
    ]
    for file_id, future in submitted:
        try:
            result = future.result()
            parsed_conversations.append(result)
        except Exception as e:
            # Best effort: one failed notebook must not abort the whole batch.
            print(f"Failed to download and parse notebook {file_id}: {e}")


# Persist the parsed notebooks, then read them back from disk.
# NOTE: pickle.load executes code embedded in the file; only load files this
# notebook wrote itself.
with open(DATA_DIR + 'parsed_conversations.pkl', 'wb') as f:
    pickle.dump(parsed_conversations, f)

with open(DATA_DIR + 'parsed_conversations.pkl', 'rb') as f:
    parsed_conversations = pickle.load(f)



In [None]:
import pickle


import os

from src.llm_reviewer.notebook_reviewer import review_notebooks

# Load the parsed notebooks pickled under DATA_DIR.
with open(DATA_DIR + 'parsed_conversations.pkl', 'rb') as f:
    parsed_conversations = pickle.load(f)

# Index of the first notebook to review; the batch loop starts from here.
OFFSET = 0
notebooks = parsed_conversations

batch_size = 30
total_notebooks = len(notebooks)
reviews_pkl_folder = DATA_DIR + 'raw_reviews_pkls/'
os.makedirs(reviews_pkl_folder, exist_ok=True)

# Review the notebooks in slices of batch_size and write one pickle per slice,
# named after the 1-based range of notebooks it covers.
for i in range(OFFSET, total_notebooks, batch_size):
    batch_notebooks = notebooks[i:i+batch_size]
    print(f"Reviewing notebooks {i+1} to {min(i+batch_size, total_notebooks)} out of {total_notebooks}")
    batch_reviews = review_notebooks(
        batch_notebooks,
        max_threads_per_notebook=6,
        max_concurrent_notebooks=16,
    )
    batch_file_name = f'review_results{i+1}-{min(i+batch_size, total_notebooks)}.pkl'
    batch_file_path = os.path.join(reviews_pkl_folder, batch_file_name)
    with open(batch_file_path, 'wb') as f:
        pickle.dump(batch_reviews, f)


In [None]:
from src.llm_reviewer.notebook_reviewer import notebook_reviews_to_df

import glob

# glob returns matches in arbitrary order, but the de-duplication below keeps
# the LAST review seen for each notebook. Order the batch files by modification
# time (file name as tie-breaker) so "last" deterministically means "most
# recently written".
review_files = sorted(
    glob.glob(DATA_DIR + 'raw_reviews_pkls/*.pkl'),
    key=lambda path: (os.path.getmtime(path), path),
)
reviews = []
for file_path in review_files:
    with open(file_path, 'rb') as f:
        reviews.extend(pickle.load(f))

# Walk the reviews backwards so the latest entry per nb_path wins; None entries
# are dropped. Reversing again restores the original relative order.
seen_paths = set()
deduped_reviews = []
for review in reversed(reviews):
    if review is not None and review['nb_path'] not in seen_paths:
        seen_paths.add(review['nb_path'])
        deduped_reviews.append(review)
reviews = list(reversed(deduped_reviews))
len(reviews)

In [None]:
# Prompt template for the code-quality judgment. get_judgment() renders it with
# str.format(FEEDBACK=...), so {FEEDBACK} is the only placeholder and the
# literal braces of the JSON output example are escaped as {{ and }}.
CODE_PROMPT = """
You are a concise expert in evaluating and refining the code generated by an AI assistant based on a Large Language Model. You only point out things worth mentioning.

Attributes to consider:
- Code Correctness
- Code Efficiency
- Best Practices
- Code Readability
- Code style Consistency

**1. Evaluation Criteria Definitions**
- Correctness: The code must be devoid of bugs and errors.
- Efficiency: The code must be optimized for maximum performance.
- Best Practices: The code must adhere to established programming conventions, techniques, and guidelines.
- Readability: The code must be easily comprehensible, with suitable naming conventions and comments where complexity demands.
- Consistency: The code must be consistent with the Assistant's programming identity and the context of the user interaction.

**2. Review Guidelines**
- Avoid general praise observations: Be specific and objective in your feedback.
- Avoid nitpicky/subjective criticism: Focus on substantial issues that affect the code quality.

-----

You are provided with the issues found in each turn of an interaction between user and AI LLM Assistant.
If no issues reported, it means no issues were found.

# START OF JUDGMENT MATERIAL
{FEEDBACK}
# END OF JUDGMENT MATERIAL


# Grading rubric

### 5 - Excellent
- Well Formatted
- Correct
- Optimal
- Highly readable

### 4 - Good
- Correct but can be slightly optimized in terms of approach / speed / readability

### 3 - Acceptable
- The code is correct but can be significantly improved.
- The code is not readable.

### 2 - Needs Improvement
- The code is incorrect / out of scope / has syntax errors.
- Looks like it’s copied from ChatGPT.

### 1 - Poor
- Incomplete or Missing code.


Given the feedback above, generate a 1 sentence judgment and a score.
Your output should be as JSON in the following format:

{{"judgment": "single sentence", "score": "1 to 5 according to rubrics and provided by turn feedback"}}

Take a deep breath.
"""


# Prompt template for the language-quality judgment. get_judgment() renders it
# with str.format(FEEDBACK=...), so {FEEDBACK} is the only placeholder and the
# literal braces of the JSON output example are escaped as {{ and }}.
# Wording fixes relative to the earlier text: "Lnaguage" typo, the duplicated
# "persona and User's persona", and a review guideline that spoke of "code
# quality" although this prompt grades language.
LANG_PROMPT = """
You are a concise expert in analyzing and improving English usage by an AI assistant based on a Large Language Model. You only point out things worth mentioning.


Attributes to consider:
- English correctness, clarity, precision, and conciseness
- Alignment with the assistant's identity, the User's query, and the User's persona, background and level of knowledge
- Markdown style and formatting

You will be provided a single turn in the middle of the conversation between a user and an LLM Assistant. Assume that there might be other turns before or after the exchange you are  provided.


**1. Identification of Reply Text for Review**
- Target for analysis: *Text* Replies generated by the LLM Assistant.
- Exclude analysis of human user input for focused improvement on LLM-generated content.
- Exclude LLM **code** content, only review **text** parts. Code is for context only.

**2. Evaluation Criteria Definitions**
- English Correctness: Grammar, syntax, punctuation, and spelling.
- Clarity: The ease with which the intended audience(this particular user) can understand the reply.
- Precision: The accuracy and specificity of the information provided.
- Conciseness: The brevity of the reply while conveying complete information.
- Structure: The organization of information and logical flow within the reply.
- Relevance: The pertinence of the reply to the user's input.
- Leveraging appropriate markdown syntax tools in order to optimize information presented for easier readability and navigation.

**3. Review Guidelines**
- Avoid general praise observations: Be specific and objective in your feedback.
- Avoid nitpicky/subjective criticism: Focus on substantial issues that affect the language quality.

-----

You are provided with the issues found in each turn of an interaction between user and AI LLM Assistant.
If no issues reported, it means no issues were found.

# START OF JUDGMENT MATERIAL
{FEEDBACK}
# END OF JUDGMENT MATERIAL


# Grading rubric

### 5 - Excellent
- Authentic & Realistic (User) 
- Well Formatted markdown for ease of consumption and understanding of the information(Assistant)
- Maximum usefulness while being to the point (Assistant)
- Free of mistakes (Assistant)
- Tailored to the user & situation (Assistant)

### 4 - Good
- Clear but can be optimized with 1 or 2 minor issues.

### 3 - Acceptable
- You can still understand what’s being said but things can be phrased much better. 
- Reasoning/Explanations are missing.
- Can have some minor mistakes or 1 major mistake.

### 2 - Needs Improvement
- It’s hard to understand what’s being said. 
- Has many minor language mistakes or more than 1 major mistake. 
- Looks like it’s copied from ChatGPT.

### 1 - Poor
- Incomplete or Missing responses.


Given the feedback above, generate a 1 sentence judgment and a score.
Your output should be as JSON in the following format:

{{"judgment": "single sentence", "score": "1 to 5 according to rubrics and provided by turn feedback"}}

Take a deep breath.
"""


from src.llm_reviewer.llm_api import make_llm_request, LLMAPIFactory
from src.llm_reviewer.constants import PATH_TO_SECRETS

def get_judgment(prompt, feedback):
    """Request a judgment from the LLM for one piece of feedback.

    Args:
        prompt: Template with a ``{FEEDBACK}`` placeholder, rendered with
            ``str.format``.
        feedback: Text substituted for ``{FEEDBACK}``.

    Returns:
        Whatever ``make_llm_request`` returns. Callers in this notebook read
        ``'judgment'`` and ``'score'`` from it, so it is presumably the parsed
        JSON object (confirm against ``make_llm_request``).
    """
    client = LLMAPIFactory(PATH_TO_SECRETS).get()
    # The rendered prompt is sent as the only message, in the system role.
    system_message = {'role': 'system', 'content': prompt.format(FEEDBACK=feedback)}
    request_options = {
        'temperature': 0.0,
        'max_tokens': 4000,
        'response_format': {'type': "json_object"},
        'retries': 3,
    }
    return make_llm_request(client, [system_message], 'gpt-4-1106-preview', **request_options)

# Example of running the function
#judgment = get_judgment(CODE_PROMPT, gpt_reviews_df.iloc[0]['code_feedback'])
#judgment


In [None]:
def process_dataframe(reviews, issue_level):
    """Judge every review row one after another and return the annotated frame.

    Args:
        reviews: Iterable of review objects; falsy entries (e.g. None) are
            filtered out before aggregation.
        issue_level: Passed through to ``notebook_reviews_to_df`` and shown in
            the progress output.

    Returns:
        The frame built by ``notebook_reviews_to_df`` with five added columns:
        ``code_judgment``, ``code_judgment_score``, ``lang_judgment``,
        ``lang_judgment_score`` and ``total_score`` (code_score + lang_score).
    """
    df = notebook_reviews_to_df(filter(None, reviews), issue_level)
    row_count = len(df)
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        print(f"Row {position}/{row_count} for {issue_level}:")
        code_judgment = get_judgment(CODE_PROMPT, row['code_feedback'])
        lang_judgment = get_judgment(LANG_PROMPT, row['lang_feedback'])
        new_cells = {
            'code_judgment': code_judgment['judgment'],
            'code_judgment_score': code_judgment['score'],
            'lang_judgment': lang_judgment['judgment'],
            'lang_judgment_score': lang_judgment['score'],
            'total_score': row['code_score'] + row['lang_score'],
        }
        for column, value in new_cells.items():
            df.loc[index, column] = value
        # Values are read back from the frame so the log shows what was stored.
        print(f"Code Score: {row['code_score']}, Lang Score: {row['lang_score']}, Total Score: {df.loc[index, 'total_score']}")
        print(f"Code Judgment: {df.loc[index, 'code_judgment']}")
        print(f"Full Code Score: {df.loc[index, 'code_judgment_score']}")
        print(f"Language Judgment: {df.loc[index, 'lang_judgment']}")
        print(f"Full language Score: {df.loc[index, 'lang_judgment_score']}")
        print('='*60)
    return df

from concurrent.futures import ThreadPoolExecutor
import threading

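# process_row relies on the module-level globals `counter`, `counter_lock` and
# `results`; they are created outside the function, in the cell that calls
# process_dataframe_parallel.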

def process_row(index, row, total_reviews):
    """Judge one review row and record the outcome in the shared ``results`` dict.

    Uses the module-level globals ``results`` (index -> outcome dict),
    ``counter`` and ``counter_lock``. Any exception is printed and swallowed,
    in which case no entry is stored for ``index``; the progress counter is
    incremented either way.

    Args:
        index: Key under which the outcome is stored in ``results``.
        row: Mapping with ``code_feedback``, ``lang_feedback``, ``code_score``
            and ``lang_score`` entries.
        total_reviews: Denominator shown in the progress message.
    """
    global counter, results
    try:
        code_verdict = get_judgment(CODE_PROMPT, row['code_feedback'])
        lang_verdict = get_judgment(LANG_PROMPT, row['lang_feedback'])
        outcome = dict(
            code_judgment=code_verdict.get('judgment'),
            code_judgment_score=code_verdict.get('score'),
            lang_judgment=lang_verdict.get('judgment'),
            lang_judgment_score=lang_verdict.get('score'),
            total_score=row['code_score'] + row['lang_score'],
        )
        results[index] = outcome
    except Exception as e:
        print(f"Error processing row {index}: {str(e)}")
    # The lock keeps the increment and the progress line together per row.
    with counter_lock:
        counter += 1
        print(f"Processed {counter}/{total_reviews} rows")

def process_dataframe_parallel(reviews, issue_level, max_workers=20):
    """Judge every review row on a thread pool and return the annotated frame.

    All shared state used by ``process_row`` (``counter``, ``counter_lock``,
    ``results``) is re-created on every call. Previously only ``counter`` was
    reset, so outcomes left in ``results`` by an earlier run were merged into
    the new frame, and the function failed unless another cell had already
    defined ``results`` and ``counter_lock``.

    Args:
        reviews: Iterable of review objects; falsy entries (e.g. None) are
            filtered out before aggregation.
        issue_level: Passed through to ``notebook_reviews_to_df``.
        max_workers: Size of the thread pool.

    Returns:
        The frame built by ``notebook_reviews_to_df`` with one added column per
        key that ``process_row`` stored for a row. Rows whose processing failed
        get no values.
    """
    global counter, counter_lock, results
    counter = 0
    counter_lock = threading.Lock()
    results = {}
    df = notebook_reviews_to_df(filter(None, reviews), issue_level)
    # Report progress against the rows actually processed; len(reviews) also
    # counts the entries that were filtered out above.
    total_rows = len(df)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for index, row in df.iterrows():
            executor.submit(process_row, index, row, total_rows)
    # Leaving the `with` block waits for every submitted row, so `results` is
    # complete here and can be written back from a single thread.
    for index, result in results.items():
        for key, value in result.items():
            df.loc[index, key] = value
    return df

In [None]:

import pickle

# Shared state that process_row() updates from the worker threads.
counter = 0
counter_lock = threading.Lock()
results = {}

# Judge every review in parallel, then order the report best-first by code
# score and, within ties, by language score.
df_reviews = process_dataframe_parallel(reviews, ISSUE_LEVEL, max_workers=20).sort_values(
    by=['code_score', 'lang_score'],
    ascending=False,
)
df_reviews.to_csv(DATA_DIR + f'report_{ISSUE_LEVEL}.csv')

# Keep a pickled copy of the frame next to the CSV report.
with open(DATA_DIR + '2step_reviews_df.pkl', 'wb') as f:
    pickle.dump(df_reviews, f)


In [None]:

import matplotlib.pyplot as plt

def plot_histograms(df, issue_level):
    """Show a 2x2 grid of histograms for the four score columns.

    Panels, in order: ``code_score``, ``lang_score``, ``code_judgment_score``
    and ``lang_judgment_score``. Every panel uses integer bins 1..6 aligned
    left.

    Side effect: the two ``*_judgment_score`` columns of ``df`` are converted
    to float in place.

    Args:
        df: Frame containing the four score columns.
        issue_level: Shown in the titles of the two judgment-score panels.
    """
    plt.figure(figsize=(21, 7))

    for score_column in ('code_judgment_score', 'lang_judgment_score'):
        df[score_column] = df[score_column].astype(float)

    # (column, bar colour, title, x-axis label, title mentions the issue level?)
    panels = (
        ('code_score', 'skyblue', 'Histogram of AVG Code Scores', 'Code Score', False),
        ('lang_score', 'lightgreen', 'Histogram of AVG Language Scores', 'Language Score', False),
        ('code_judgment_score', 'lightcoral', 'Histogram of Code Judgment Scores', 'Code Judgment Score', True),
        ('lang_judgment_score', 'lightcoral', 'Histogram of Language Judgment Scores', 'Language Judgment Score', True),
    )
    for slot, (column, colour, title, x_label, per_level) in enumerate(panels, start=1):
        plt.subplot(2, 2, slot)
        plt.hist(df[column], bins=range(1, 7), align='left', color=colour, edgecolor='black')
        if per_level:
            title = title + f' for {issue_level} issues'
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()


In [None]:
# Draw the four score histograms for the judged reviews; ISSUE_LEVEL only
# appears in the titles of the two judgment-score panels.
plot_histograms(df_reviews, ISSUE_LEVEL)
