### CSC 791 - Natural Language Processing
### Project Assignment 2
#### Vikram Pande
#### News Classification with Prompt Engineering

In [1]:
# import libraries
import openai
import pandas as pd
import time
import re
from tqdm import tqdm

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

from sklearn.metrics import accuracy_score


In [2]:
# Function to load the API key from a file
def load_api_key(api_key_file_path):
    """Read an API key from a text file.

    Args:
        api_key_file_path: Path of the file that contains the key.

    Returns:
        The file contents with leading and trailing whitespace removed.
    """
    with open(api_key_file_path, 'r') as key_file:
        return key_file.read().strip()

# Read the key from the local key file (kept out of the notebook itself)
api_key = load_api_key('api_key.txt')

# Register the key with the openai module so later requests are authenticated
openai.api_key = api_key

# Function to return prompt response
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    reraise=True,
)
def completion_with_backoff(prompt, model="gpt-3.5-turbo", max_tokens=256, temperature=0):
    """Send a single-turn chat request and return the model's reply text.

    This is the only definition of the helper: the retry policy is attached
    directly to the chat-completions call, so a failed request is retried
    with randomized exponential backoff (waits capped at 60 seconds) for up
    to 6 attempts in total.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of tokens in the reply.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice's message, stripped of surrounding
        whitespace.

    Raises:
        Exception: Whatever the last attempt raised, once every attempt has
            failed (``reraise=True`` surfaces the original error instead of
            tenacity's ``RetryError``).
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=1.0,
    )
    return response.choices[0].message.content.strip()


In [3]:
# Read Data
data_path = 'data'
# Test split of the news dataset, stored under the data directory
test_csv_path = data_path + '/news-ag-data/test.csv'
data = pd.read_csv(test_csv_path)
print(data.shape)


(7600, 3)


Class mapping of Dataset
1. World
2. Sports
3. Business
4. Science/technology

In [4]:
# Smoke test: classify the first description with a single worked-example prompt
text = data.loc[0, 'Description']
print(text)

test_prompt = f"""
 Q: Classify given sentence into one of the folloiwng classes
                1-world news/world, 2-sports, 3-business, 4-science/technology
                The football game was amazing.
                Return only the class index.

                The word football belongs to sport and game also could belong to sports.
                Hence the answer is sports news - Class 2

                A: 2

                Q: Classify given sentence into one of the folloiwng classes
                1-world news/world, 2-sports, 3-business, 4-science/technology
                {text}
                Return only the class index.

                A:
"""
# Send the prompt and show the raw reply
response = completion_with_backoff(test_prompt)
print(response)


Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
3


### Input test data descriptions in prompts

### Strategy 1: Baseline — direct input and output (zero-shot prompting)

In [7]:
# True labels for the evaluation subset (rows 100 up to, not including, 600)
actual_classes = data['Class Index'][100:600].to_list()
print(len(actual_classes))


500


In [8]:
# Article descriptions for the same row range as the labels
descriptions = data['Description'][100:600]


In [9]:
# Zero-shot run: one request per description, collecting the raw replies
predicted_classes = []
input_text = []

for idx, description in enumerate(tqdm(descriptions, desc="Processing", unit="element")):
    prompt = f"""
                Classify given sentence into one of the folloiwng classes
                1-world news/world, 2-sports, 3-business, 4-science/technology
                Return only the class index.
                {description}
              """
    try:
        response = completion_with_backoff(prompt)
    except Exception as exc:
        # Catch Exception (not a bare except) so the long loop can still be
        # interrupted from the keyboard. Report the failure and store -1 so
        # the list stays the same length as the labels.
        tqdm.write(f"Request failed for element {idx}: {exc!r}")
        response = -1

    predicted_classes.append(response)
    input_text.append(description)


Processing: 100%|██████████| 500/500 [1:25:23<00:00, 10.25s/element]   


In [10]:
# Shallow copy of the zero-shot results, kept under a separate name
pc_copy = list(predicted_classes)
len(pc_copy)


500

In [14]:
# Parse each response on its own so that position i of the predictions always
# corresponds to position i of the labels. Joining all responses and scanning
# for digits would shift every later prediction whenever a reply contains no
# number or more than one number.
parsed_classes = []
for response in predicted_classes:
    if isinstance(response, int):
        # Already numeric: the -1 failure marker, or a value parsed on an
        # earlier run of this cell (keeps the cell safe to re-run).
        parsed_classes.append(response)
        continue
    match = re.search(r'\d+', str(response))
    # A reply without any number is counted as a wrong prediction (-1)
    parsed_classes.append(int(match.group()) if match else -1)

predicted_classes = parsed_classes
print(len(predicted_classes))


500


In [15]:
# predicted_classes = [int(x) for x in predicted_classes]
# predicted_classes


In [17]:
# Share of zero-shot predictions that equal the true label
accuracy_baseline = accuracy_score(y_true=actual_classes, y_pred=predicted_classes)
print(accuracy_baseline)


0.692


### Strategy 2: Few-shot prompting (one worked example in the prompt)

In [18]:
# Few-shot run: the prompt carries one solved example before the real question
predicted_classes_fs = []
# Start a fresh list here so this cell does not append to the inputs
# collected by an earlier strategy's loop.
input_text = []

for idx, description in enumerate(tqdm(descriptions, desc="Processing", unit="element")):

    prompt = f"""
                Q: Classify given sentence into one of the folloiwng classes
                1-world news/world, 2-sports, 3-business, 4-science/technology
                The football game was amazing.
                Return only the class index.

                A: 2

                Q: Classify given sentence into one of the folloiwng classes
                1-world news/world, 2-sports, 3-business, 4-science/technology
                {description}
                Return only the class index.

                A:
              """
    try:
        response = completion_with_backoff(prompt)
    except Exception as exc:
        # Catch Exception (not a bare except) so the long loop can still be
        # interrupted from the keyboard. Report the failure and store -1 so
        # the list stays the same length as the labels.
        tqdm.write(f"Request failed for element {idx}: {exc!r}")
        response = -1

    predicted_classes_fs.append(response)
    input_text.append(description)


Processing: 100%|██████████| 500/500 [1:43:50<00:00, 12.46s/element]   


In [20]:
# Shallow copy of the few-shot results, kept under a separate name
pcfs_copy = list(predicted_classes_fs)
len(pcfs_copy)


500

In [22]:
# Parse each response on its own so that position i of the predictions always
# corresponds to position i of the labels. Joining all responses and scanning
# for digits would shift every later prediction whenever a reply contains no
# number or more than one number.
parsed_classes_fs = []
for response in predicted_classes_fs:
    if isinstance(response, int):
        # Already numeric: the -1 failure marker, or a value parsed on an
        # earlier run of this cell (keeps the cell safe to re-run).
        parsed_classes_fs.append(response)
        continue
    match = re.search(r'\d+', str(response))
    # A reply without any number is counted as a wrong prediction (-1)
    parsed_classes_fs.append(int(match.group()) if match else -1)

predicted_classes_fs = parsed_classes_fs
print(len(predicted_classes_fs))


500


In [23]:
# Share of few-shot predictions that equal the true label
accuracy_fs = accuracy_score(y_true=actual_classes, y_pred=predicted_classes_fs)
print(accuracy_fs)


0.686


### Strategy 3: Chain-of-Thought (CoT) prompting

In [24]:
# Chain-of-thought run: the solved example in the prompt includes its reasoning
predicted_classes_cot = []
input_text = []

for idx, description in enumerate(tqdm(descriptions, desc="Processing", unit="element")):

    prompt = f"""
                Q: Classify given sentence into one of the folloiwng classes
                1-world news/world, 2-sports, 3-business, 4-science/technology
                The football game was amazing.
                Return only the class index.
                The word football belongs to sport and game also could belong to sports.
                Hence the answer is sports news - Class 2

                A: 2

                Q: Classify given sentence into one of the folloiwng classes
                1-world news/world, 2-sports, 3-business, 4-science/technology
                {description}
                Return only the class index.

                A:
              """
    try:
        response = completion_with_backoff(prompt)
    except Exception as exc:
        # Catch Exception (not a bare except) so the long loop can still be
        # interrupted from the keyboard. Report the failure and store -1 so
        # the list stays the same length as the labels.
        tqdm.write(f"Request failed for element {idx}: {exc!r}")
        response = -1

    predicted_classes_cot.append(response)
    input_text.append(description)


Processing: 100%|██████████| 500/500 [1:00:59<00:00,  7.32s/element]  


In [25]:
# Shallow copy of the chain-of-thought results, kept under a separate name
pccot_copy = list(predicted_classes_cot)
len(pccot_copy)


500

In [28]:
# Parse each response on its own, taking the first number in the reply.
# A plain int() conversion would raise on any reply that is not a bare
# integer (for example one that includes a word or trailing punctuation).
parsed_classes_cot = []
for response in predicted_classes_cot:
    if isinstance(response, int):
        # Already numeric: the -1 failure marker, or a value parsed on an
        # earlier run of this cell (keeps the cell safe to re-run).
        parsed_classes_cot.append(response)
        continue
    match = re.search(r'\d+', str(response))
    # A reply without any number is counted as a wrong prediction (-1)
    parsed_classes_cot.append(int(match.group()) if match else -1)

predicted_classes_cot = parsed_classes_cot


In [29]:
# Share of chain-of-thought predictions that equal the true label
accuracy_cot = accuracy_score(y_true=actual_classes, y_pred=predicted_classes_cot)
print(accuracy_cot)


0.648
