### CSC 791 - Natural Language Processing
### Project Assignment 2
#### Vikram Pande
#### News Classification with Prompt Engineering

In [1]:
# import libraries
import openai
import pandas as pd
import time
import re
from tqdm import tqdm

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

from sklearn.metrics import accuracy_score



In [2]:
# Function to load the API key from a file
def load_api_key(api_key_file_path):
    """Return the API key stored in a text file.

    Args:
        api_key_file_path: Path of the file that holds the key.

    Returns:
        The full contents of the file with leading and trailing
        whitespace removed.
    """
    with open(api_key_file_path, 'r') as key_file:
        return key_file.read().strip()

# Read the key from api_key.txt and assign it to openai.api_key in one step;
# the module-level name `api_key` is kept as well.
openai.api_key = api_key = load_api_key('api_key.txt')

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), reraise=True)
def _create_chat_completion(**kwargs):
    """Send one chat-completion request, retrying with random exponential backoff.

    Up to 6 attempts are made, waiting a randomized, exponentially growing
    interval (bounded to 1-60 seconds) between them. `reraise=True` makes the
    last underlying exception propagate instead of tenacity's RetryError, so
    callers see the same exception types as an undecorated call would raise.

    Args:
        **kwargs: Forwarded unchanged to `openai.chat.completions.create`.

    Returns:
        The response object returned by `openai.chat.completions.create`.
    """
    return openai.chat.completions.create(**kwargs)


# Function to return prompt response
def completion_with_backoff(prompt, model="gpt-3.5-turbo", max_tokens=256, temperature=0):
    """Return the model's reply to `prompt` as a stripped string.

    The request is sent as a two-message chat (a fixed system message plus the
    user prompt). Transient failures are retried by `_create_chat_completion`;
    only `Exception` subclasses are retried, so KeyboardInterrupt still stops
    the notebook immediately.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice's message with surrounding whitespace
        removed.

    Raises:
        Exception: Whatever the final failed attempt raised, once all retry
            attempts are exhausted.
    """
    response = _create_chat_completion(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=1.0,
    )
    return response.choices[0].message.content.strip()


In [3]:
# Read test.csv from the news-ag-data folder under the data directory
data_path = 'data'
test_csv_path = data_path + '/news-ag-data/test.csv'
data = pd.read_csv(test_csv_path)
print(data.shape)


(7600, 3)


Class mapping of Dataset
1. World
2. Sports
3. Business
4. Science/technology

In [4]:
# Ground-truth labels for the first 100 rows, as a plain Python list
actual_classes = data['Class Index'][0:100].to_list()
print(len(actual_classes))

# Matching descriptions for the same 100 rows (kept as a pandas Series)
descriptions = data['Description'][0:100]


100


In [5]:
# Query the model once per description and collect the raw replies.
predicted_classes = []
input_text = []

for idx, description in enumerate(tqdm(descriptions, desc="Processing", unit="element")):
    # The prompt asks for step-by-step reasoning; the reply is stored exactly
    # as returned by completion_with_backoff (a string), without parsing.
    prompt = f"""
                Q: Classify given sentence into one of the folloiwng classes
                1-world news/world, 2-sports, 3-business, 4-science/technology
                Return class index for each.
                {description}

                A: Let's think step by step...
              """
    try:
        response = completion_with_backoff(prompt)
    except Exception as exc:
        # Catch Exception only, so Ctrl-C / kernel interrupt can still stop
        # this long-running loop. Failed items keep the -1 sentinel, and the
        # error is reported so the failure is visible during error analysis.
        tqdm.write(f"Item {idx}: request failed ({exc!r}); recording -1")
        response = -1

    predicted_classes.append(response)
    input_text.append(description)


Processing: 100%|██████████| 100/100 [56:00<00:00, 33.61s/element]  


In [8]:
# Keep an independent shallow copy of the raw model replies. The previous
# `[predicted_classes]` wrapped the same list object in a one-element list,
# so it was not a copy and changed along with the original.
pc_copy = list(predicted_classes)
# `explanations` is an alias for the same list object as predicted_classes.
explanations = predicted_classes


In [16]:
# Assemble one table for error analysis: input description, raw model reply,
# and ground-truth class for each row.
df = pd.DataFrame(
    dict(
        description=descriptions,
        predicted=explanations,
        actual_class=actual_classes,
    )
)


In [18]:
# Write the error-analysis table to an Excel workbook in the working directory
df.to_excel(excel_writer='error_analysis.xlsx')
