## Detailed Article Explanation
The detailed code explanation for this article is available at the following link:

https://www.daniweb.com/programming/computer-science/tutorials/543568/openai-o3-vs-anthropic-claude-4-for-text-classification-summarization


For my other articles on Daniweb.com, please see this link:

https://www.daniweb.com/members/1235222/usmanmalik57

## Installing and Importing Required Libraries

In [None]:

!pip install anthropic
!pip install openai
!pip install rouge-score
!pip install --upgrade openpyxl
!pip install pandas openpyxl

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4a323086e532e3e1e5b9c4b1c8b3907d911c5bbb7a11c901f5b2d94f83a334c6
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from collections import Counter
from sklearn.metrics import hamming_loss, accuracy_score
from rouge_score import rouge_scorer
import anthropic
from openai import OpenAI

# API keys are fetched through google.colab's userdata helper instead of being
# hard-coded; this import is only available inside a Google Colab runtime.
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')  # passed to the OpenAI client below
ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')  # passed to the Anthropic client below

## Text Classification Comparison

In [None]:
# Twitter airline sentiment dataset (Tweets.csv), downloadable from Kaggle:
# https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment?select=Tweets.csv
# The CSV is read from /content, so it has to exist at that path before this cell runs.

dataset = pd.read_csv(r"/content/Tweets.csv")
# Print the (rows, columns) shape, then preview the first rows as the cell output.
print(dataset.shape)
dataset.head()

(14640, 15)


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:

# Fixed seed so the sampled evaluation set (and therefore the accuracy reported
# further down) is the same on every Restart & Run All.
SAMPLE_SEED = 42

# Remove rows where 'airline_sentiment' or 'text' are NaN
dataset = dataset.dropna(subset=['airline_sentiment', 'text'])

# Remove rows where 'airline_sentiment' or 'text' are empty / whitespace-only strings
dataset = dataset[(dataset['airline_sentiment'].str.strip() != '') & (dataset['text'].str.strip() != '')]

# Split the cleaned frame by sentiment label
neutral_df = dataset[dataset['airline_sentiment'] == 'neutral']
positive_df = dataset[dataset['airline_sentiment'] == 'positive']
negative_df = dataset[dataset['airline_sentiment'] == 'negative']

# Draw a seeded random sample from each sentiment (34 + 33 + 33 = 100 tweets)
neutral_sample = neutral_df.sample(n=34, random_state=SAMPLE_SEED)
positive_sample = positive_df.sample(n=33, random_state=SAMPLE_SEED)
negative_sample = negative_df.sample(n=33, random_state=SAMPLE_SEED)

# Stack the three samples and renumber the rows 0..99; reset_index returns a new
# frame, so no in-place mutation is needed.
dataset = pd.concat([neutral_sample, positive_sample, negative_sample]).reset_index(drop=True)

# Sanity check: class balance of the evaluation set
print(dataset["airline_sentiment"].value_counts())


airline_sentiment
neutral     34
positive    33
negative    33
Name: count, dtype: int64


In [None]:
def make_prediction(client, model, content, max_tokens):
    """Send a single-turn prompt to the selected provider and return the reply text.

    Args:
        client: API client matching ``model``: an object exposing
            ``chat.completions.create`` for ``"o3"``, or ``messages.create``
            for ``"claude-opus-4-0"``.
        model: Provider selector, either ``"o3"`` or ``"claude-opus-4-0"``.
        content: Prompt text, sent as the only user message.
        max_tokens: Token limit forwarded to the API call as ``max_tokens``.

    Returns:
        The message content of the first choice (``"o3"`` branch) or the text
        of the first content block (``"claude-opus-4-0"`` branch).

    Raises:
        ValueError: If ``model`` is not one of the two supported selectors.
    """
    if model == "o3":
        # NOTE(review): this branch is selected by "o3" but the request itself is
        # sent with model="gpt-4", so the results labelled "o3" in this notebook
        # come from gpt-4. Left unchanged on purpose: pointing the request at o3
        # may require different request parameters -- confirm against the OpenAI
        # API reference before switching.
        response = client.chat.completions.create(
            model="gpt-4",
            temperature=0,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": content}],
        )
        return response.choices[0].message.content

    if model == "claude-opus-4-0":
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            temperature=0.0,
            messages=[{"role": "user", "content": content}],
        )
        return response.content[0].text

    # Previously an unknown selector fell through both branches and failed with
    # an UnboundLocalError at the return statement; fail with a clear message.
    raise ValueError(
        f"Unsupported model {model!r}; expected 'o3' or 'claude-opus-4-0'."
    )

In [None]:

def classify_tweets(client, model, dataset, max_tokens):
    """Ask the model for the sentiment of every tweet in ``dataset["text"]``.

    Args:
        client: API client forwarded to ``make_prediction``.
        model: Model selector forwarded to ``make_prediction``.
        dataset: DataFrame with a ``"text"`` column holding the tweets.
        max_tokens: Token limit forwarded to ``make_prediction``.

    Returns:
        A list with one predicted sentiment string per tweet, in row order.
        Each reply is stripped of surrounding whitespace and lower-cased so it
        can be compared directly with the lower-case dataset labels; a missing
        (``None``) reply becomes an empty string.
    """
    # The continuation lines keep their original leading spaces: they are part
    # of the prompt text that is sent to the model.
    prompt_template = """What is the sentiment expressed in the following tweet about an airline?
      Select sentiment value from positive, negative, or neutral.
      Return only the sentiment value in small letters e.g. positive, negative, or neutral in the output.
      Here is the tweet: {}"""

    all_sentiments = []

    for tweet in dataset["text"].tolist():
        raw_reply = make_prediction(client, model, prompt_template.format(tweet), max_tokens)

        # Normalise so that "Positive" or "positive\n" is not scored as a miss.
        sentiment_value = (raw_reply or "").strip().lower()

        print(sentiment_value)
        all_sentiments.append(sentiment_value)

    return all_sentiments

### Text Classification with OpenAI o3

In [None]:
%%time

client = OpenAI(api_key = OPENAI_API_KEY,)
model = "o3"
max_tokens = 10

predictions = classify_tweets(client, model, dataset, max_tokens)
accuracy = accuracy_score(predictions, dataset["airline_sentiment"])
print("Accuracy:", accuracy)

neutral
neutral
negative
positive
positive
neutral
neutral
neutral
neutral
neutral
positive
neutral
negative
neutral
neutral
neutral
positive
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
positive
positive
neutral
negative
positive
neutral
negative
neutral
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
neutral
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
neutral
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
Accuracy: 0.87
CPU times: user 897 ms, sys: 99.6 ms, total: 996 ms
Wall time: 1min 4s


### Text Classification with Claude 4 Opus

In [None]:
%%time

client = anthropic.Anthropic(api_key = ANTHROPIC_API_KEY)
model = "claude-opus-4-0"

predictions = classify_tweets(client, model, dataset, max_tokens)
accuracy = accuracy_score(predictions, dataset["airline_sentiment"])
print("Accuracy:", accuracy)

neutral
neutral
negative
neutral
positive
neutral
neutral
neutral
neutral
neutral
positive
neutral
negative
neutral
neutral
neutral
positive
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
positive
neutral
neutral
negative
neutral
neutral
negative
neutral
neutral
positive
neutral
positive
positive
positive
positive
positive
positive
neutral
positive
positive
positive
neutral
positive
positive
positive
positive
neutral
positive
neutral
positive
positive
positive
positive
positive
positive
neutral
neutral
neutral
positive
positive
positive
negative
negative
neutral
negative
negative
negative
negative
negative
negative
negative
negative
negative
neutral
negative
negative
neutral
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
neutral
negative
negative
Accuracy: 0.79
CPU times: user 2.02 s, sys: 279 ms, total: 2.3 s
Wall time: 5min 6s


## Text Summarization Comparison

In [None]:
# Source of the summarization dataset:
# https://github.com/reddzzz/DataScience_FP/blob/main/dataset.xlsx
# NOTE(review): the link names the file dataset.xlsx while this cell reads
# summary_datasets.xlsx from /content -- presumably the download was renamed; confirm.

dataset = pd.read_excel(r"/content/summary_datasets.xlsx")
# `dataset` is rebound here, replacing the tweet sample used in the classification section.
print(dataset.shape)
dataset.head()

(1000, 10)


Unnamed: 0.1,Unnamed: 0,id,human_summary,publication,author,date,year,month,theme,content
0,0,17283,In successfully seeking a temporary halt in th...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,politics,WASHINGTON — Congressional Republicans have...
1,0,17284,Officers put her in worse danger some months l...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,crime,"After the bullet shells get counted, the blood..."
2,0,17285,The film striking appearance had been created ...,New York Times,Margalit Fox,2017-01-06,2017.0,1.0,entertainment,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,0,17286,The year was only days old when the news came ...,New York Times,William McDonald,2017-04-10,2017.0,4.0,entertainment,"Death may be the great equalizer, but it isn’t..."
4,0,17287,If North Korea conducts a test in coming month...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,politics,"SEOUL, South Korea — North Korea’s leader, ..."


In [None]:
def calculate_rouge(reference, candidate):
    """Score ``candidate`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: Reference text, passed as the first argument to the scorer.
        candidate: Text to evaluate, passed as the second argument to the scorer.

    Returns:
        A dict mapping each metric name returned by the scorer to the
        F-measure of its score.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    f_measures = {}
    for metric, score in rouge.score(reference, candidate).items():
        f_measures[metric] = score.fmeasure
    return f_measures

In [None]:
def summarize_articles(client, model_id, dataset, max_tokens, num_articles=10):
    """Summarize the first ``num_articles`` rows of ``dataset`` and score each summary.

    Args:
        client: API client forwarded to ``make_prediction``.
        model_id: Model selector forwarded to ``make_prediction``.
        dataset: DataFrame with ``content``, ``human_summary`` and ``id`` columns.
        max_tokens: Token limit forwarded to ``make_prediction``.
        num_articles: Number of leading rows to summarize (default 10).

    Returns:
        A list with one dict per article, holding ``article_id``,
        ``generated_summary`` and the ``rouge1`` / ``rouge2`` / ``rougeL``
        F-measures of the generated summary against ``human_summary``.
    """
    results = []
    for i, (_, row) in enumerate(dataset.iloc[:num_articles].iterrows(), start=1):
        article = row['content']
        human_summary = row['human_summary']

        print(f"Summarizing article {i}.")

        content = f"Summarize the following article in 1150 characters. The summary should look like human created:\n\n{article}\n\nSummary:"

        # Use the model_id argument; the previous code read a notebook-level
        # `model` variable, so the argument passed by the caller was ignored.
        generated_summary = make_prediction(client, model_id, content, max_tokens)

        rouge_scores = calculate_rouge(human_summary, generated_summary)

        results.append({
            # Explicit column lookup instead of attribute access on the row.
            'article_id': row['id'],
            'generated_summary': generated_summary,
            'rouge1': rouge_scores['rouge1'],
            'rouge2': rouge_scores['rouge2'],
            'rougeL': rouge_scores['rougeL']
        })

    return results


### Text Summarization with OpenAI o3

In [None]:
%%time

client = OpenAI(api_key = OPENAI_API_KEY,)
model = "o3"
max_tokens = 1150
results = summarize_articles(client, model, dataset, max_tokens)
results_df = pd.DataFrame(results)
mean_values = results_df[["rouge1", "rouge2", "rougeL"]].mean()
print(mean_values)

Summarizing article 1.
Summarizing article 2.
Summarizing article 3.
Summarizing article 4.
Summarizing article 5.
Summarizing article 6.
Summarizing article 7.
Summarizing article 8.
Summarizing article 9.
Summarizing article 10.
rouge1    0.351287
rouge2    0.115566
rougeL    0.179441
dtype: float64
CPU times: user 668 ms, sys: 59.4 ms, total: 727 ms
Wall time: 1min 21s


### Text Summarization with Claude 4 Opus

In [None]:
%%time

client = anthropic.Anthropic(api_key = ANTHROPIC_API_KEY)
model = "claude-opus-4-0"

results = summarize_articles(client, model, dataset, max_tokens)
results_df = pd.DataFrame(results)
mean_values = results_df[["rouge1", "rouge2", "rougeL"]].mean()
print(mean_values)

Summarizing article 1.
Summarizing article 2.
Summarizing article 3.
Summarizing article 4.
Summarizing article 5.
Summarizing article 6.
Summarizing article 7.
Summarizing article 8.
Summarizing article 9.
Summarizing article 10.
rouge1    0.341579
rouge2    0.067680
rougeL    0.141958
dtype: float64
CPU times: user 974 ms, sys: 115 ms, total: 1.09 s
Wall time: 2min 12s
