In [None]:
!pip install openai
!pip install transformers
!pip install datasets

Collecting openai
  Downloading openai-1.21.1-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.7/309.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

**Préparation des données**

In [None]:
import pandas as pd
import json

# Load the IMDB reviews CSV file into a DataFrame
data = pd.read_csv('IMDB_Dataset.csv')

def prepare_jsonl(row):
    """Serialize one review row as a JSON ``{"prompt", "completion"}`` string.

    ``row`` must support ``row['review']`` and ``row['sentiment']`` lookups that
    return strings. Every ``<br />`` tag in the review is replaced by a space;
    the sentiment is capitalized and prefixed with a single space.
    """
    cleaned = row['review'].replace('<br />', ' ')
    record = {
        "prompt": "Review: '" + cleaned + "'\nSentiment:",
        "completion": " " + row['sentiment'].capitalize(),
    }
    return json.dumps(record)

# Convert every DataFrame row into its JSON string
jsonl_data = data.apply(prepare_jsonl, axis=1).tolist()

# Write the records to disk, one JSON document per line
with open('formatted_data.jsonl', 'w') as f:
    f.writelines(item + '\n' for item in jsonl_data)

print("Les données ont été converties et sauvegardées en format JSONL.")


Les données ont été converties et sauvegardées en format JSONL.


**Formatage des données au format "chat" GPT**

In [None]:
import json

# Convert each prompt/completion record into an OpenAI chat fine-tuning example.
# Fine-tuning teaches the model to produce the *assistant* messages, so the
# sentiment label must be the assistant turn. The fixed cue is sent with the
# "system" role, which is how the single-prompt test cell queries the model.
PROMPT_PREFIX = "Review: '"
PROMPT_SUFFIX = "'\nSentiment:"

with open('formatted_data.jsonl', 'r', encoding='utf-8') as file, \
        open('chat_formatted_data.jsonl', 'w', encoding='utf-8') as outfile:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines
        record = json.loads(line)
        # Remove only the wrapper added around the review; a plain replace()
        # would also delete the same text if it occurred inside the review.
        review = record['prompt']
        if review.startswith(PROMPT_PREFIX):
            review = review[len(PROMPT_PREFIX):]
        if review.endswith(PROMPT_SUFFIX):
            review = review[:-len(PROMPT_SUFFIX)]
        output = {
            "messages": [
                {"role": "user", "content": review},
                {"role": "system", "content": "The sentiment of the review is:"},
                # Target the model must learn to generate
                {"role": "assistant", "content": record['completion'].strip()},
            ]
        }
        json.dump(output, outfile)
        outfile.write('\n')


**Réduction de la taille du dataset pour l'entraînement** (car sinon c'est trop cher)

In [None]:
import pandas as pd
import json

# Source chat-format file and destination of the down-sampled copy
input_path, output_path = 'chat_formatted_data.jsonl', 'reduced_chat_formatted_data.jsonl'

# Read the whole JSONL file (one record per line)
data = pd.read_json(input_path, lines=True)

# Draw 5000 rows at random; the fixed seed makes the draw repeatable
reduced_data = data.sample(random_state=42, n=5000)

# Persist the sample as a new JSONL file
reduced_data.to_json(output_path, lines=True, orient='records')

print(f"Reduced dataset with 5000 entries saved to {output_path}")


Reduced dataset with 5000 entries saved to reduced_chat_formatted_data.jsonl


**Split en set d'entraînement et de validation**

In [None]:
import pandas as pd

# Load the down-sampled chat examples (one JSON record per line)
input_path = 'reduced_chat_formatted_data.jsonl'
data = pd.read_json(input_path, lines=True)

# Shuffle with a fixed seed so the train/validation split is reproducible
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# 80% of the rows go to training, the remainder to validation
train_size = int(0.8 * len(data))

# Positional split of the shuffled frame
train_data = data.iloc[:train_size]
validation_data = data.iloc[train_size:]

# Output file paths
train_output_path = 'train_data.jsonl'
validation_output_path = 'validation_data.jsonl'

# Save both subsets as JSONL files
train_data.to_json(train_output_path, orient='records', lines=True)
validation_data.to_json(validation_output_path, orient='records', lines=True)

print(f"Training data saved to {train_output_path}")
print(f"Validation data saved to {validation_output_path}")


Training data saved to train_data.jsonl
Validation data saved to validation_data.jsonl


**Entrer la clé API d'OpenAI**

In [None]:
import os

from getpass import getpass

# Prompt for the API key without echoing it, so the secret is not stored in
# the notebook output, then expose it through an environment variable.
api_key = getpass("Enter your OpenAI API key: ").strip()
os.environ['OPENAI_API_KEY'] = api_key

**Création d'une instance de client**

In [None]:
import os
from openai import OpenAI

# Build the API client from the key held in the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


**Charger les fichiers d'entraînement sur les serveurs d'OpenAI**

In [None]:
# Upload the complete reduced dataset with the "fine-tune" purpose.
# NOTE(review): `file_id` is reassigned by a later upload cell.
with open("reduced_chat_formatted_data.jsonl", "rb") as file_stream:
    response = client.files.create(purpose="fine-tune", file=file_stream)

file_id = response.id
print("File uploaded with ID:", file_id)


File uploaded with ID: file-VRr1vsgWZ9OJ2O5S8nGgDrvN


In [None]:
# Upload the training split with the "fine-tune" purpose and keep its ID in `file_id`
with open("train_data.jsonl", "rb") as file_stream:
    response = client.files.create(purpose="fine-tune", file=file_stream)

file_id = response.id
print("File uploaded with ID:", file_id)

File uploaded with ID: file-hrnWk9POrgveDm1se2VTIyu3


In [None]:
# Upload the validation split. Its ID goes into a separate variable so that
# `file_id` still refers to the training file when the fine-tuning job is
# created (overwriting `file_id` here made the job train on validation data).
with open("validation_data.jsonl", "rb") as file_stream:
    response = client.files.create(
        file=file_stream,
        purpose="fine-tune"
    )

validation_file_id = response.id
print("File uploaded with ID:", validation_file_id)

File uploaded with ID: file-C8CtoFi28KPKgYYqEvBCINYg


**Démarrage du finetuning du modèle gpt 3.5**

In [None]:
# Launch the fine-tuning job on the uploaded file.
# NOTE(review): `file_id` holds whatever the most recently executed upload cell
# assigned; confirm it is the training file before running this cell.
hyperparameters = {"n_epochs": 1, "batch_size": 8, "learning_rate_multiplier": 0.1}
finetune_response = client.fine_tuning.jobs.create(
    training_file=file_id,
    model="gpt-3.5-turbo-0125",
    hyperparameters=hyperparameters,
)

# Keep the job ID and display it
finetune_job_id = finetune_response.id
print("Finetuning started with ID:", finetune_job_id)


Finetuning started with ID: ftjob-DTD9oTLd26E97SOHZQcXLAan


**Vérifier l'état du finetuning**

On peut aussi regarder l'avancement de l'entraînement sur le site d'openAI: https://platform.openai.com/finetune

In [None]:
# Retrieve the status of the job started above through `finetune_job_id`
# instead of a hard-coded ID that may belong to a different job.
status_response = client.fine_tuning.jobs.retrieve(fine_tuning_job_id=finetune_job_id)
print("Status of the fine-tuning:", status_response.status)



Status of the fine-tuning: succeeded


**Fonction pour compter les tokens d'un fichier**

Peut servir à estimer le prix du finetuning

In [None]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer used for counting.
# NOTE(review): this is GPT-2's vocabulary, not necessarily the one the fine-tuned
# gpt-3.5 model is billed with; treat the counts as an estimate and confirm.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def count_tokens_in_file(file_path, encoder=None):
    """Sum the token counts of every chat example in a JSONL file.

    Each non-blank line must be a JSON object with a ``messages`` list whose
    items have a ``content`` string. The contents of one example are joined with
    single spaces and encoded as one sequence. Sequences are not truncated, so
    long reviews contribute their full length to the total (truncating at 1024
    tokens would under-estimate the cost).

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.
        encoder: Object with an ``encode(text)`` method returning a token
            sequence; defaults to the notebook-level ``tokenizer``.

    Returns:
        Total number of tokens over all examples in the file.
    """
    enc = tokenizer if encoder is None else encoder
    total_tokens = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # skip blank lines instead of failing in json.loads
            example = json.loads(line)
            # Concatenate all message contents to tokenize them together
            full_content = ' '.join(msg['content'] for msg in example['messages'])
            total_tokens += len(enc.encode(full_content))
    return total_tokens

# Count the tokens of the reduced dataset file and print the total
file_path = 'reduced_chat_formatted_data.jsonl'
token_count = count_tokens_in_file(file_path)
print(f"Total tokens in the file: {token_count}")


Total tokens in the file: 1480357


**Tester le modèle finetuné sur un prompt**

In [None]:
import openai
import json


# ID of the fine-tuned model
model_id = 'ft:gpt-3.5-turbo-0125:personal:1-epoch:9F9CPo9M'

# Example review used as the test prompt
prompt_text = "Last summer I had an appointment to get new tires and had to wait a super long time. I also went in this week for them to fix a minor problem with a tire they put on. They fixed it for free, and the very next morning I had the same issue. I called to complain, and the manager didn't even apologize!!! So frustrated. Never going back.  They seem overpriced, too."

# Send the review followed by the sentiment cue to the fine-tuned model
response = client.chat.completions.create(
    model=model_id,
    messages=[
        {"role": "user", "content": prompt_text},  # the review to classify
        {"role": "system", "content": "The sentiment of the review is:"}  # cue asking for the sentiment
    ],
    max_tokens=1  # cap the generated reply at a single token
)

# Read the generated text straight from the response object (no JSON round trip);
# `content` may be None, so fall back to an empty string.
content = response.choices[0].message.content or ""

prediction = content.strip().lower()
print(prediction)

# 1 = positive, 0 = negative, 2 = any other answer
number = {"positive": 1, "negative": 0}.get(prediction, 2)

print(number)

Role: assistant, Content: negative
negative
0


**Evaluation sur le Dataset Yelp_polarity**

In [None]:
from datasets import load_dataset

# Load the "yelp_polarity" dataset; later cells read dataset['test']['text'] and dataset['test']['label']
dataset = load_dataset("yelp_polarity")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/256M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

**Evaluation du modèle finetuné**

In [None]:
# Ask a chat model (the fine-tuned one by default) for the sentiment of one review
def get_prediction(text, model='ft:gpt-3.5-turbo-0125:personal:1-epoch:9F9CPo9M', api_client=None):
    """Classify one review and map the model's answer to an integer code.

    Args:
        text: Review text, sent as the user message.
        model: Chat model to query; defaults to the fine-tuned model.
        api_client: Object exposing ``chat.completions.create``; defaults to
            the notebook-level ``client``.

    Returns:
        1 for "positive", 0 for "negative", 2 for any other answer or when the
        request raises an exception.
    """
    api = client if api_client is None else api_client
    try:
        response = api.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": text},
                {"role": "system", "content": "Sentiment:"}  # cue asking for the sentiment
            ],
            max_tokens=1  # cap the generated reply at a single token
        )

        # Read the reply directly from the response object; content may be None
        content = response.choices[0].message.content or ""

        # Ignore surrounding whitespace/punctuation and letter case
        prediction = content.strip().strip(".:!'\"").lower()

        # "positive" -> 1, "negative" -> 0, anything else -> 2
        if prediction == "positive":
            return 1
        elif prediction == "negative":
            return 0
        else:
            print(f"Incorrect prediction: {prediction}")
            return 2  # non-conforming answer
    except Exception as e:
        # Best effort: a failed request is reported and counted as non-conforming
        print(f"Error with prediction: {e}")
        return 2

# Test-set reviews and their labels
test_reviews = dataset['test']['text']
test_labels = dataset['test']['label']

# Query the model once per review, collecting the predictions in order
predicted_labels = []
for i, review in enumerate(test_reviews):
    predicted_labels.append(get_prediction(review))
    # Report progress every 100 reviews
    if (i + 1) % 100 == 0:
        print(f"Processed {i+1}/{len(test_reviews)} reviews.")

# Accuracy of the predictions against the labels
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# Number of non-conforming answers (code 2)
num_inconclusive = predicted_labels.count(2)
print(f"Number of inconclusive responses: {num_inconclusive}")

**Score sur 4500 exemples**

Car les inférences coûtent cher...

In [None]:
# Score only the reviews that were actually classified: the reference data is
# truncated to the number of predictions collected so far (4509 in the recorded
# run) so that labels and predictions always have the same length.
n_scored = len(predicted_labels)
test_reviews = dataset['test']['text'][:n_scored]
test_labels = dataset['test']['label'][:n_scored]

# Accuracy of the predictions against the labels
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# Number of non-conforming answers (code 2)
num_inconclusive = predicted_labels.count(2)
print(f"Number of inconclusive responses: {num_inconclusive}")

Accuracy: 0.93
Number of inconclusive responses: 225


**Evaluation de GPT non finetuné pour comparaison**

Score sur 1000 exemples

In [None]:
# Ask a chat model (the base, non-fine-tuned one by default) for the sentiment of one review
def get_prediction(text, model='gpt-3.5-turbo-0125', api_client=None):
    """Classify one review and map the model's answer to an integer code.

    Args:
        text: Review text, sent as the user message.
        model: Chat model to query; defaults to the base gpt-3.5 model.
        api_client: Object exposing ``chat.completions.create``; defaults to
            the notebook-level ``client``.

    Returns:
        1 for "positive", 0 for "negative", 2 for any other answer or when the
        request raises an exception.
    """
    api = client if api_client is None else api_client
    try:
        response = api.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": text},
                {"role": "system", "content": "Sentiment:"}  # cue asking for the sentiment
            ],
            max_tokens=1  # cap the generated reply at a single token
        )

        # Read the reply directly from the response object; content may be None
        content = response.choices[0].message.content or ""

        # Ignore surrounding whitespace/punctuation and letter case
        prediction = content.strip().strip(".:!'\"").lower()

        # "positive" -> 1, "negative" -> 0, anything else -> 2
        if prediction == "positive":
            return 1
        elif prediction == "negative":
            return 0
        else:
            print(f"Incorrect prediction: {prediction}")
            return 2  # non-conforming answer
    except Exception as e:
        # Best effort: a failed request is reported and counted as non-conforming
        print(f"Error with prediction: {e}")
        return 2

# First 1000 test reviews and their labels
test_reviews = dataset['test']['text'][:1000]
test_labels = dataset['test']['label'][:1000]

# Query the model once per review, collecting the predictions in order
predicted_labels = []
for i, review in enumerate(test_reviews):
    predicted_labels.append(get_prediction(review))
    # Report progress every 100 reviews
    if (i + 1) % 100 == 0:
        print(f"Processed {i+1}/{len(test_reviews)} reviews.")

# Accuracy of the predictions against the labels
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# Number of non-conforming answers (code 2)
num_inconclusive = predicted_labels.count(2)
print(f"Number of inconclusive responses: {num_inconclusive}")

Incorrect prediction: mixed
Incorrect prediction: sounds
Incorrect prediction: overall
Incorrect prediction: the
Incorrect prediction: the
Incorrect prediction: overall
Incorrect prediction: neutral
Incorrect prediction: i
Incorrect prediction: the
Incorrect prediction: i
Incorrect prediction: mixed
Incorrect prediction: it
Incorrect prediction: it
Incorrect prediction: it
Incorrect prediction: it
Incorrect prediction: overall
Incorrect prediction: that
Incorrect prediction: it
Incorrect prediction: overall
Incorrect prediction: mixed
Incorrect prediction: that
Incorrect prediction: neutral
Incorrect prediction: overall
Incorrect prediction: love
Incorrect prediction: overall
Incorrect prediction: overall
Incorrect prediction: mike
Incorrect prediction: mixed
Incorrect prediction: it
Incorrect prediction: overall
Incorrect prediction: it
Incorrect prediction: n
Incorrect prediction: i
Incorrect prediction: neutral
Incorrect prediction: it
Incorrect prediction: overall
Incorrect predict

In [None]:
# Compute both metrics from the collected predictions, then report them
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_labels, predicted_labels)
num_inconclusive = predicted_labels.count(2)  # answers coded 2 (non-conforming)

print(f"Accuracy: {accuracy:.2f}")
print(f"Number of inconclusive responses: {num_inconclusive}")

Accuracy: 0.73
Number of inconclusive responses: 263


Bloc servant uniquement à récupérer les sorties au format incorrect du modèle finetuné sur 1000 exemples (les sorties précédentes ont été supprimées par une mauvaise manipulation).

In [None]:
import json

# Ask a chat model (the fine-tuned one by default) for the sentiment of one review
def get_prediction(text, model='ft:gpt-3.5-turbo-0125:personal:1-epoch:9F9CPo9M', api_client=None):
    """Classify one review and map the model's answer to an integer code.

    The review is followed by an assistant-role cue,
    "The sentiment of the review is:".

    Args:
        text: Review text, sent as the user message.
        model: Chat model to query; defaults to the fine-tuned model.
        api_client: Object exposing ``chat.completions.create``; defaults to
            the notebook-level ``client``.

    Returns:
        1 for "positive", 0 for "negative", 2 for any other answer or when the
        request raises an exception.
    """
    api = client if api_client is None else api_client
    try:
        response = api.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": text},
                {"role": "assistant", "content": "The sentiment of the review is:"}  # cue asking for the sentiment
            ],
            max_tokens=1  # cap the generated reply at a single token
        )

        # Read the reply directly from the response object; content may be None
        content = response.choices[0].message.content or ""

        # Ignore surrounding whitespace/punctuation and letter case
        prediction = content.strip().strip(".:!'\"").lower()

        # "positive" -> 1, "negative" -> 0, anything else -> 2
        if prediction == "positive":
            return 1
        elif prediction == "negative":
            return 0
        else:
            print(f"Incorrect prediction: {prediction}")
            return 2  # non-conforming answer
    except Exception as e:
        # Best effort: a failed request is reported and counted as non-conforming
        print(f"Error with prediction: {e}")
        return 2

# First 1000 test reviews and their labels
test_reviews = dataset['test']['text'][:1000]
test_labels = dataset['test']['label'][:1000]

# Query the model once per review, collecting the predictions in order
predicted_labels = []
for i, review in enumerate(test_reviews):
    predicted_labels.append(get_prediction(review))
    # Report progress every 100 reviews
    if (i + 1) % 100 == 0:
        print(f"Processed {i+1}/{len(test_reviews)} reviews.")


Incorrect prediction: mixed
Incorrect prediction: mixed
Incorrect prediction: neutral
Incorrect prediction: fr
Incorrect prediction: neutral
Incorrect prediction: pr
Incorrect prediction: nost
Incorrect prediction: nost
Incorrect prediction: the
Processed 100/1000 reviews.
Incorrect prediction: the
Incorrect prediction: dis
Incorrect prediction: the
Incorrect prediction: mixed
Processed 200/1000 reviews.
Incorrect prediction: neutral
Incorrect prediction: the
Incorrect prediction: neutral
Incorrect prediction: the
Incorrect prediction: neutral
Incorrect prediction: fr
Incorrect prediction: neutral
Incorrect prediction: neutral
Processed 300/1000 reviews.
Incorrect prediction: neutral
Incorrect prediction: the
Incorrect prediction: dis
Incorrect prediction: mixed
Incorrect prediction: neutral
Processed 400/1000 reviews.
Incorrect prediction: neutral
Incorrect prediction: mixed
Incorrect prediction: neutral
Incorrect prediction: neutral
Incorrect prediction: neutral
Incorrect prediction:

In [None]:
# Compute both metrics from the collected predictions, then report them
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_labels, predicted_labels)
num_inconclusive = predicted_labels.count(2)  # answers coded 2 (non-conforming)

print(f"Accuracy: {accuracy:.2f}")
print(f"Number of inconclusive responses: {num_inconclusive}")

Accuracy: 0.92
Number of inconclusive responses: 64
