## Detailed Article Explanation

The detailed code explanation for this article is available at the following link:

https://www.daniweb.com/programming/computer-science/tutorials/542287/comparison-of-fine-tuning-gpt-4o-mini-vs-gpt-3-5-for-text-classification
For my other articles on Daniweb.com, please see this link:

https://www.daniweb.com/members/1235222/usmanmalik57

## Importing and Installing Required Libraries

In [None]:
!pip install openai

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from openai import OpenAI
import pandas as pd
import os
import json

  from pandas.core import (


## Importing the Dataset

In [2]:

# Load the raw tweets CSV. The absolute Windows path is machine-specific and
# has to be adjusted when the notebook is run elsewhere.
dataset = pd.read_csv(r"D:\Datasets\Tweets.csv")

def preprocess_data(dataset, n, n_neutral=34, n_positive=33, n_negative=33):
    """Return a small, class-balanced slice of the tweets data.

    Rows whose ``text`` or ``airline_sentiment`` is missing or blank are
    dropped first. Then, for each sentiment class, a contiguous run of rows
    starting at position ``n`` (counted within that class) is taken.

    Args:
        dataset: DataFrame with at least the string columns ``text`` and
            ``airline_sentiment``. It is not modified.
        n: Zero-based offset, within each class, of the first row to take.
        n_neutral: Number of neutral rows to take (default 34).
        n_positive: Number of positive rows to take (default 33).
        n_negative: Number of negative rows to take (default 33).

    Returns:
        A new DataFrame holding only ``text`` and ``airline_sentiment``, with a
        fresh 0..N-1 index, ordered neutral, positive, negative. A class with
        fewer than ``n + size`` usable rows contributes fewer rows.
    """
    # Remove rows where 'airline_sentiment' or 'text' are NaN
    labelled = dataset.dropna(subset=['airline_sentiment', 'text'])

    # Remove rows where 'airline_sentiment' or 'text' are empty/whitespace-only
    has_label = labelled['airline_sentiment'].str.strip() != ''
    has_text = labelled['text'].str.strip() != ''
    labelled = labelled[has_label & has_text]

    # Per-class sample sizes; dict order fixes the order of the output rows
    class_sizes = {'neutral': n_neutral, 'positive': n_positive, 'negative': n_negative}

    # .iloc makes the slice explicitly positional within each class
    samples = [
        labelled[labelled['airline_sentiment'] == label].iloc[n: n + size]
        for label, size in class_sizes.items()
    ]

    sampled = pd.concat(samples)[["text", "airline_sentiment"]]
    return sampled.reset_index(drop=True)

In [3]:
# Build the training split from offset 0 within each sentiment class, then
# show the class balance and a preview of the rows.
training_data = preprocess_data(dataset, 0)
print(training_data["airline_sentiment"].value_counts())
training_data.head()

airline_sentiment
neutral     34
positive    33
negative    33
Name: count, dtype: int64


Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica I didn't today... Must mean I n...,neutral
2,@VirginAmerica Really missed a prime opportuni...,neutral
3,@VirginAmerica did you know that suicide is th...,neutral
4,@VirginAmerica will you be making BOS&gt;LAS n...,neutral


In [4]:
# Build the test split from offset 100 within each sentiment class, so it does
# not overlap the training rows (which are taken from offset 0, at most 34 per class).
test_data = preprocess_data(dataset, 100)
print(test_data["airline_sentiment"].value_counts())
test_data.head()

airline_sentiment
neutral     34
positive    33
negative    33
Name: count, dtype: int64


Unnamed: 0,text,airline_sentiment
0,@VirginAmerica are you ready!? Let's say it to...,neutral
1,@VirginAmerica can u help this 👸 @FreyaBevan_F...,neutral
2,@virginamerica may start service to Hawaii fro...,neutral
3,@VirginAmerica add DTW and I'm sold!,neutral
4,@VirginAmerica momma I made it! 😁😁😁,neutral


## Converting Training Data to JSON Format for OpenAI Model Fine-tuning

In [5]:
# Output path for the fine-tuning examples (written further down as one JSON
# object per line). The absolute path is machine-specific.
json_file_path = r"D:\Datasets\airline_sentiments.json"

# Function to create the JSON structure for each row
def create_json_structure(row):
    """Turn one labelled tweet into a chat-format fine-tuning example.

    Args:
        row: Mapping-like object with ``text`` and ``airline_sentiment`` keys
            (a DataFrame row when used with ``apply(axis=1)``).

    Returns:
        A dict with a single ``"messages"`` list: a fixed system prompt, the
        tweet as the user turn, and the sentiment label as the assistant turn.
    """
    system_message = {
        "role": "system",
        "content": "You are a Twitter sentiment analysis expert who can predict sentiment expressed in the tweets about an airline. You select sentiment value from positive, negative, or neutral.",
    }
    user_message = {"role": "user", "content": row['text']}
    assistant_message = {"role": "assistant", "content": row['airline_sentiment']}

    return {"messages": [system_message, user_message, assistant_message]}

# Build one chat-format example per row of the training set
json_structures = training_data.apply(create_json_structure, axis=1).tolist()

# Serialise as JSON Lines: each example becomes one line of the output file.
# The generator keeps serialisation inside the `with`, one record at a time.
with open(json_file_path, 'w') as f:
    f.writelines(json.dumps(example) + '\n' for example in json_structures)

print(f"Data has been written to {json_file_path}")


Data has been written to D:\Datasets\airline_sentiments.json


In [24]:
# Create the API client; the key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(
    # This is the default and can be omitted
    api_key = os.environ.get('OPENAI_API_KEY'),
)


# Upload the training examples for fine-tuning. The context manager closes the
# file handle as soon as the upload call returns, so it is not leaked.
with open(json_file_path, "rb") as training_fh:
    training_file = client.files.create(
      file=training_fh,
      purpose="fine-tune"
    )


## Fine-Tuning GPT-4o Mini for Text Classification

In [7]:
# Start a fine-tuning job on the uploaded training file, with GPT-4o mini as the base model.
fine_tuning_job_gpt4o_mini = client.fine_tuning.jobs.create(
  training_file=training_file.id, 
  model="gpt-4o-mini-2024-07-18"
)

In [16]:
# Look up the GPT-4o mini job. `fine_tuned_model` is only populated once the job
# has succeeded, so fail fast here instead of passing None on as a model id.
gpt4o_mini_job = client.fine_tuning.jobs.retrieve(fine_tuning_job_gpt4o_mini.id)
if gpt4o_mini_job.fine_tuned_model is None:
    raise RuntimeError(
        f"Fine-tuning job {gpt4o_mini_job.id} has not produced a model yet "
        f"(status: {gpt4o_mini_job.status}); re-run this cell once it has succeeded."
    )
ft_model_id = gpt4o_mini_job.fine_tuned_model

In [17]:
def find_sentiment(client, model, dataset, max_retries=5, retry_delay=1.0):
    """Classify each tweet in ``dataset`` with ``model`` and report the accuracy.

    Every value in ``dataset["text"]`` is sent as a single user message through
    ``client.chat.completions.create``. The reply text is stripped, lower-cased
    and compared with ``dataset["airline_sentiment"]``.

    Args:
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client in this notebook).
        model: Model identifier passed through to the API call.
        dataset: DataFrame with ``text`` and ``airline_sentiment`` columns.
        max_retries: How many times a failed request for one tweet is retried
            before the error is re-raised. Negative values are treated as 0.
        retry_delay: Seconds to wait between retries.

    Returns:
        The value computed by ``accuracy_score`` (it is also printed).

    Raises:
        Exception: The last error raised by the request, once the retries for
            a single tweet are used up. Without this bound a persistent failure
            (bad model id, bad API key) would retry forever.
    """
    import time

    tweets_list = dataset["text"].tolist()
    all_sentiments = []
    attempts_allowed = max(int(max_retries), 0) + 1

    # NOTE(review): the fine-tuning examples built by create_json_structure start
    # with a system message that is not sent here - consider aligning the two.
    for position, tweet in enumerate(tweets_list, start=1):
        content = """What is the sentiment expressed in the following tweet about an airline?
            Select sentiment value from positive, negative, or neutral. Return only the sentiment value in small letters.
            tweet: {}""".format(tweet)

        for attempt in range(attempts_allowed):
            try:
                response = client.chat.completions.create(
                    model=model,
                    temperature=0,
                    max_tokens=10,
                    messages=[
                        {"role": "user", "content": content}
                    ]
                )
                raw_reply = response.choices[0].message.content
                break
            except Exception as e:
                print("===================")
                print("Exception occurred:", e)
                if attempt == attempts_allowed - 1:
                    # Out of retries: surface the error instead of looping forever
                    raise
                time.sleep(retry_delay)

        # The reply may be None or carry stray whitespace/casing; normalise it so
        # the exact-match comparison against the labels is not thrown off.
        sentiment_value = (raw_reply or "").strip().lower()

        all_sentiments.append(sentiment_value)
        print(position, sentiment_value)

    # scikit-learn's argument order is (y_true, y_pred)
    accuracy = accuracy_score(dataset["airline_sentiment"], all_sentiments)
    print(f"Accuracy: {accuracy}")
    return accuracy
    
# Evaluate the fine-tuned GPT-4o mini model on the test split
find_sentiment(client,ft_model_id, test_data)

1 positive
2 positive
3 neutral
4 positive
5 positive
6 negative
7 negative
8 positive
9 positive
10 positive
11 positive
12 positive
13 neutral
14 neutral
15 neutral
16 neutral
17 neutral
18 neutral
19 neutral
20 positive
21 positive
22 neutral
23 neutral
24 positive
25 neutral
26 neutral
27 positive
28 neutral
29 neutral
30 neutral
31 neutral
32 neutral
33 positive
34 negative
35 positive
36 positive
37 positive
38 positive
39 positive
40 positive
41 positive
42 positive
43 positive
44 positive
45 positive
46 positive
47 positive
48 positive
49 positive
50 positive
51 positive
52 positive
53 positive
54 positive
55 positive
56 negative
57 positive
58 positive
59 neutral
60 positive
61 positive
62 positive
63 positive
64 positive
65 positive
66 positive
67 positive
68 neutral
69 negative
70 neutral
71 negative
72 negative
73 negative
74 negative
75 negative
76 negative
77 negative
78 neutral
79 negative
80 negative
81 negative
82 negative
83 negative
84 negative
85 negative
86 negativ

## Fine-Tuning GPT-3.5 Turbo for Text Classification

In [18]:
# Start a second fine-tuning job on the same uploaded training file, this time
# with GPT-3.5 Turbo as the base model.
fine_tuning_job_gpt_3_5 = client.fine_tuning.jobs.create(
  training_file=training_file.id, 
  model="gpt-3.5-turbo"
)

In [23]:
# Look up the GPT-3.5 Turbo job. `fine_tuned_model` is only populated once the job
# has succeeded, so fail fast here instead of passing None on as a model id.
gpt_3_5_job = client.fine_tuning.jobs.retrieve(fine_tuning_job_gpt_3_5.id)
if gpt_3_5_job.fine_tuned_model is None:
    raise RuntimeError(
        f"Fine-tuning job {gpt_3_5_job.id} has not produced a model yet "
        f"(status: {gpt_3_5_job.status}); re-run this cell once it has succeeded."
    )
ft_model_id = gpt_3_5_job.fine_tuned_model

# Evaluate the fine-tuned GPT-3.5 Turbo model on the same test split
find_sentiment(client,ft_model_id, test_data)

1 positive
2 positive
3 neutral
4 positive
5 positive
6 negative
7 negative
8 positive
9 positive
10 positive
11 positive
12 positive
13 neutral
14 negative
15 neutral
16 neutral
17 neutral
18 neutral
19 neutral
20 positive
21 neutral
22 neutral
23 neutral
24 neutral
25 neutral
26 neutral
27 neutral
28 neutral
29 neutral
30 neutral
31 neutral
32 neutral
33 positive
34 negative
35 positive
36 positive
37 positive
38 positive
39 positive
40 neutral
41 positive
42 positive
43 positive
44 positive
45 positive
46 positive
47 positive
48 positive
49 positive
50 positive
51 positive
52 positive
53 positive
54 positive
55 positive
56 positive
57 positive
58 positive
59 positive
60 positive
61 positive
62 positive
63 positive
64 positive
65 positive
66 positive
67 positive
68 negative
69 negative
70 negative
71 negative
72 negative
73 negative
74 negative
75 negative
76 negative
77 negative
78 negative
79 negative
80 negative
81 negative
82 negative
83 negative
84 negative
85 positive
86 negati