In [1]:
# Pin httpx to 0.27.2; per the install log below this replaces the
# preinstalled httpx 0.28.0 in the runtime.
# NOTE(review): presumably needed for compatibility with the installed
# openai client -- confirm, and record the reason here.
!pip install httpx==0.27.2

Collecting httpx==0.27.2
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Downloading httpx-0.27.2-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: httpx
  Attempting uninstall: httpx
    Found existing installation: httpx 0.28.0
    Uninstalling httpx-0.28.0:
      Successfully uninstalled httpx-0.28.0
Successfully installed httpx-0.27.2


In [1]:
import openai

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
from google.colab import userdata

In [4]:
import os
# Copy the key stored under 'OPENAI_API_KEY' in Colab's userdata into the
# process environment instead of hard-coding it in the notebook.
# NOTE(review): presumably openai.OpenAI() reads this variable -- confirm.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path read by the next
# cell lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the topic-classification dataset from the mounted Drive folder into `df`.
# The cells below read the 'LEWISSPLIT', 'Text' and 'Topics' columns.
file_path = '/content/drive/MyDrive/topics_classification_dataset.csv'
df = pd.read_csv(file_path)

In [55]:
# Display the DataFrame.
# NOTE(review): this cell's execution count (55) is higher than that of every
# later cell, so the output shown was produced out of order -- presumably
# after the embeddings had been computed; re-check after Restart & Run All.
df

Unnamed: 0,LEWISSPLIT,Text,Topics,embeddings
0,TRAIN,JAGUAR SEES STRONG GROWTH IN NEW MODEL SALES J...,earn,"[0.0133146271109581, 0.02631673403084278, 0.02..."
1,TRAIN,NORD RESOURCES CORP <NRD> 4TH QTR NET Shr 19 c...,earn,"[-0.0001450378622394055, 0.02493785135447979, ..."
2,TRAIN,FIVE GROUPS APPLY TO BUY FRENCH TELEPHONE GROU...,acq,"[-0.009246275760233402, -0.030360525473952293,..."
3,TRAIN,BLIZZARD CLOSES BOSPHORUS Blizzard conditions ...,ship,"[0.014441131614148617, 0.028736023232340813, 0..."
4,TRAIN,JAPAN FUND <JPN> SEEKERS CONFIDENT OF FINANCIN...,acq,"[-0.0040326304733753204, 0.00496899988502264, ..."
...,...,...,...,...
7052,TRAIN,BAKER INTERNATIONAL CORP SUES HUGHES TOOL SEEK...,acq,"[-0.018963780254125595, -0.02632923424243927, ..."
7053,TRAIN,USAIR GROUP REJECTS TRANS WORLD AIRLINES TAKEO...,acq,"[-0.05530624836683273, -0.03717518970370293, 0..."
7054,TRAIN,BAKER <BKO> SUES TO FORCE HUGHES <HT> MERGER B...,acq,"[-0.04654429480433464, -0.008925688453018665, ..."
7055,TRAIN,SPAIN DEREGULATES BANK DEPOSIT INTEREST RATES ...,interest,"[0.02821340411901474, -0.008079394698143005, 0..."


In [19]:
def generate_embeddings(text, model="text-embedding-3-small", client=None):
    """Request embeddings for ``text`` from the embeddings endpoint.

    Args:
        text: Value forwarded unchanged as the ``input`` argument of
            ``client.embeddings.create`` (this notebook passes a list of
            strings).
        model: Name of the embedding model. Defaults to the model that was
            previously hard-coded, so existing calls behave the same.
        client: Optional object exposing ``embeddings.create(input=..., model=...)``.
            When omitted, a fresh ``openai.OpenAI()`` client is created for
            this call. Passing one in lets callers reuse a single client
            across many requests.

    Returns:
        ``response.data`` of the API response, unmodified. Later cells in
        this notebook read the vector from each element's ``.embedding``
        attribute.
    """
    if client is None:
        # Same default as before: build a client from the environment.
        client = openai.OpenAI()

    response = client.embeddings.create(
        input=text,
        model=model,
    )
    return response.data

In [20]:
def generate_embeddings_in_batches(text_list, batch_size=50):
    """Embed ``text_list`` in consecutive chunks of at most ``batch_size`` items.

    Args:
        text_list: Sequence of inputs. It is sliced into chunks and each
            chunk is passed to ``generate_embeddings``.
        batch_size: Maximum number of items sent per call. Must be positive.

    Returns:
        A single flat list holding the results of every chunk, concatenated
        in chunk order. An empty ``text_list`` yields an empty list without
        making any call.

    Raises:
        ValueError: If ``batch_size`` is zero or negative. A negative value
            would otherwise make ``range`` empty and silently return ``[]``
            for a non-empty input.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    embeddings = []
    for start in range(0, len(text_list), batch_size):
        chunk = text_list[start:start + batch_size]
        embeddings.extend(generate_embeddings(chunk))
    return embeddings


In [21]:
# Embed every entry of the 'Text' column, overriding the helper's default
# batch size of 50 with 1000 items per call.
# NOTE(review): this calls the embeddings API for the whole dataset on every
# run; consider caching the result.
batch_size = 1000
text_embeddings = generate_embeddings_in_batches(df['Text'].tolist(), batch_size=batch_size)

In [32]:
# Store the raw API results in the 'embeddings' column. As the output of the
# next cell shows, each entry is an Embedding(...) object, not a plain vector.
# NOTE(review): a later cell overwrites this column with the `.embedding`
# vectors, so this intermediate assignment looks redundant.
df.loc[:, "embeddings"] = text_embeddings

In [33]:
# Inspect the frame after storing the raw API results in 'embeddings'.
df

Unnamed: 0,LEWISSPLIT,Text,Topics,embeddings
0,TRAIN,JAGUAR SEES STRONG GROWTH IN NEW MODEL SALES J...,earn,"Embedding(embedding=[0.0133146271109581, 0.026..."
1,TRAIN,NORD RESOURCES CORP <NRD> 4TH QTR NET Shr 19 c...,earn,"Embedding(embedding=[-0.0001450378622394055, 0..."
2,TRAIN,FIVE GROUPS APPLY TO BUY FRENCH TELEPHONE GROU...,acq,"Embedding(embedding=[-0.009246275760233402, -0..."
3,TRAIN,BLIZZARD CLOSES BOSPHORUS Blizzard conditions ...,ship,"Embedding(embedding=[0.014441131614148617, 0.0..."
4,TRAIN,JAPAN FUND <JPN> SEEKERS CONFIDENT OF FINANCIN...,acq,"Embedding(embedding=[-0.0040326304733753204, 0..."
...,...,...,...,...
7052,TRAIN,BAKER INTERNATIONAL CORP SUES HUGHES TOOL SEEK...,acq,"Embedding(embedding=[-0.018963780254125595, -0..."
7053,TRAIN,USAIR GROUP REJECTS TRANS WORLD AIRLINES TAKEO...,acq,"Embedding(embedding=[-0.05530624836683273, -0...."
7054,TRAIN,BAKER <BKO> SUES TO FORCE HUGHES <HT> MERGER B...,acq,"Embedding(embedding=[-0.04654429480433464, -0...."
7055,TRAIN,SPAIN DEREGULATES BANK DEPOSIT INTEREST RATES ...,interest,"Embedding(embedding=[0.02821340411901474, -0.0..."


In [34]:
# Replace the 'embeddings' column with the plain vectors, read from the
# `.embedding` attribute of each API result (one entry per row, in order).
df["embeddings"] = [
    item.embedding
    for item in text_embeddings
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["embeddings"] = [x.embedding for x in text_embeddings]


In [35]:
# Inspect the frame again: 'embeddings' now holds plain lists of floats.
df

Unnamed: 0,LEWISSPLIT,Text,Topics,embeddings
0,TRAIN,JAGUAR SEES STRONG GROWTH IN NEW MODEL SALES J...,earn,"[0.0133146271109581, 0.02631673403084278, 0.02..."
1,TRAIN,NORD RESOURCES CORP <NRD> 4TH QTR NET Shr 19 c...,earn,"[-0.0001450378622394055, 0.02493785135447979, ..."
2,TRAIN,FIVE GROUPS APPLY TO BUY FRENCH TELEPHONE GROU...,acq,"[-0.009246275760233402, -0.030360525473952293,..."
3,TRAIN,BLIZZARD CLOSES BOSPHORUS Blizzard conditions ...,ship,"[0.014441131614148617, 0.028736023232340813, 0..."
4,TRAIN,JAPAN FUND <JPN> SEEKERS CONFIDENT OF FINANCIN...,acq,"[-0.0040326304733753204, 0.00496899988502264, ..."
...,...,...,...,...
7052,TRAIN,BAKER INTERNATIONAL CORP SUES HUGHES TOOL SEEK...,acq,"[-0.018963780254125595, -0.02632923424243927, ..."
7053,TRAIN,USAIR GROUP REJECTS TRANS WORLD AIRLINES TAKEO...,acq,"[-0.05530624836683273, -0.03717518970370293, 0..."
7054,TRAIN,BAKER <BKO> SUES TO FORCE HUGHES <HT> MERGER B...,acq,"[-0.04654429480433464, -0.008925688453018665, ..."
7055,TRAIN,SPAIN DEREGULATES BANK DEPOSIT INTEREST RATES ...,interest,"[0.02821340411901474, -0.008079394698143005, 0..."


In [39]:
# Training split: rows whose 'LEWISSPLIT' equals "TRAIN".
# Labels are the 'Topics' values as a plain list; features are the per-row
# embedding vectors stacked row-wise into a single array.
y_train = df.loc[df['LEWISSPLIT'] == "TRAIN", 'Topics'].tolist()
X_train = np.vstack(df.loc[df['LEWISSPLIT'] == "TRAIN", 'embeddings'].to_numpy())

In [53]:
# Held-out split: rows whose 'LEWISSPLIT' equals "TEST", built the same way
# as the training split (stacked embedding vectors and a list of labels).
X_test = np.vstack(df.loc[df['LEWISSPLIT'] == "TEST", 'embeddings'].to_numpy())
y_test = df.loc[df['LEWISSPLIT'] == "TEST", 'Topics'].tolist()

In [48]:
# Fit a 100-tree random forest on the training embeddings.
# random_state is fixed so that bootstrap sampling and feature selection --
# and therefore the metrics reported below -- are reproducible across runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [49]:
# Predict a topic label for every row of the held-out feature matrix.
preds = clf.predict(X_test)

In [51]:
from sklearn.metrics import classification_report

In [54]:
# Per-class precision / recall / F1 on the test split. In the output below,
# 'ship' and 'interest' have much lower recall than 'acq' and 'earn', so
# the macro average is more informative than overall accuracy.
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

         acq       0.93      0.99      0.96       695
        earn       0.98      0.99      0.99      1077
    interest       0.98      0.68      0.81        82
    money-fx       0.84      0.84      0.84        87
        ship       1.00      0.22      0.36        36

    accuracy                           0.96      1977
   macro avg       0.95      0.74      0.79      1977
weighted avg       0.96      0.96      0.95      1977

