In [None]:
import backoff
import tiktoken
import pandas as pd
from openai import OpenAI, RateLimitError
import os
import time

# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# Embedding model parameters (each one is defined exactly once).
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# Load the line-delimited JSON data from the file
json_df = pd.read_json(
    './data/News_Category_Dataset_v3.json', lines=True)

# Create a combined column with headline and short_description
json_df["combined"] = (
    "headline: " + json_df.headline.str.strip()
    + "; short_description: " + json_df.short_description.str.strip()
)

# Get the encoding from tiktoken library
encoding = tiktoken.get_encoding(embedding_encoding)

# Count the number of tokens using the encoding's encode method
json_df["n_tokens"] = json_df.combined.apply(lambda x: len(encoding.encode(x)))

# Keep only the rows whose token count fits within the max_tokens limit
filtered_json_df = json_df[json_df['n_tokens'] <= max_tokens]

# Display the number of lines before and after filtering
lines_before = len(json_df)
lines_after = len(filtered_json_df)

print("Lines of text before filtering: ", lines_before)
print("Lines of text after filtering: ", lines_after)

# Number of texts sent to the embeddings endpoint per request
batch_size = 10

def get_embeddings(text, model):
    """Return the embedding vector of a single text.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(model=model, input=[single_line])
    first_item = response.data[0]
    return first_item.embedding


@backoff.on_exception(
    backoff.expo,
    RateLimitError,
    max_time=60,
    max_tries=10
)
def get_embeddings_with_backoff(batch, engine):
    """Embed a batch of texts in one request, retrying on rate limits.

    The whole batch is sent in a single embeddings request so that a retry
    triggered by ``RateLimitError`` repeats exactly one API call.

    Args:
        batch: List of strings to embed.
        engine: Name of the embedding model to use.

    Returns:
        A list with one embedding per input text, in the same order as
        ``batch``. An empty batch yields an empty list without any API call.

    Raises:
        RateLimitError: Re-raised once the backoff limits (``max_time`` /
            ``max_tries``) are exhausted.
    """
    if not batch:
        return []

    # Newlines are replaced with spaces, matching what get_embeddings does
    # for a single text.
    cleaned = [text.replace("\n", " ") for text in batch]
    response = client.embeddings.create(input=cleaned, model=engine)

    # Each returned item carries the index of the input it belongs to;
    # sorting by it keeps the output aligned with ``batch``.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]


# Randomly sample up to 200000 data points from the rows that passed the
# token-limit filter (sampling the unfiltered frame would bypass that filter).
sample_size = min(200000, len(filtered_json_df))
df_200k = filtered_json_df.sample(sample_size, random_state=42)

# Divide the prompts into batches of `batch_size` texts each
prompts = df_200k.combined.tolist()
prompt_batches = [prompts[i:i+batch_size]
                  for i in range(0, len(prompts), batch_size)]

# Initialize embedding vector list
embeddings = []
batch_count = 0

# Processing each batch.
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(batch, embedding_model)
    embeddings.extend(batch_embeddings)
    batch_count += 1

    # Pause for 65 seconds after every three batches are processed.
    if batch_count % 3 == 0:
        time.sleep(65)

# Add the embedding vectors to the dataframe (one vector per sampled row).
df_200k['embedding'] = embeddings

# Save the data as a Parquet file.
df_200k.to_parquet(
    "./data/News_Category_With_Samples_200k.parquet", index=True)


In [1]:
import pandas as pd

# Location of the embedded sample written by the embedding step.
parquet_file_path = './data/News_Category_With_Samples_200k.parquet'

# Read the Parquet file back into a DataFrame.
df = pd.read_parquet(path=parquet_file_path)

# Bare last expression: the notebook renders the frame as the cell output.
df


Unnamed: 0,link,headline,category,short_description,authors,date,combined,embedding
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,headline: Over 4 Million Americans Roll Up Sle...,"b'[-0.019705895334482193,-0.019851570948958397..."
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,"headline: American Airlines Flyer Charged, Ban...","b'[-0.020039038732647896,-0.016167860478162766..."
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,headline: 23 Of The Funniest Tweets About Cats...,"b'[0.0028837064746767282,0.012555263936519623,..."
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,headline: The Funniest Tweets From Parents Thi...,"b'[0.0026038987562060356,0.02257823757827282,0..."
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,headline: Woman Who Called Cops On Black Bird-...,"b'[-0.027758052572607994,-0.023510990664362907..."
...,...,...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28,headline: RIM CEO Thorsten Heins' 'Significant...,"b'[-0.032943569123744965,-0.0286103505641222,-..."
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28,headline: Maria Sharapova Stunned By Victoria ...,"b'[-0.03897290304303169,-0.0015498248394578695..."
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28,"headline: Giants Over Patriots, Jets Over Colt...","b'[-0.028294872492551804,-0.01749710738658905,..."
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28,headline: Aldon Smith Arrested: 49ers Lineback...,"b'[-0.0012897694250568748,0.00182717340067029,..."


# Random Forest

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np
import json

# Read the embedded news sample from disk.
training_data = pd.read_parquet(
    path="./data/News_Category_With_Samples_200k.parquet"
)

# Shuffle every row (frac=1) with a fixed seed so the order is reproducible.
df = training_data.sample(random_state=42, frac=1)

# The embeddings may be stored either as a JSON list in text form or as an
# actual sequence of numbers, so both forms are converted to a numpy array.
def embeddings_to_floats(embedding_str):
    """Convert one stored embedding into a numpy array.

    Args:
        embedding_str: Either JSON text (``str``, ``bytes`` or ``bytearray``)
            encoding a list of numbers, or an already-decoded sequence
            (list, tuple, numpy array).

    Returns:
        A numpy array holding the embedding values.
    """
    if isinstance(embedding_str, (str, bytes, bytearray)):
        # json.loads parses the text without evaluating it as code.
        return np.array(json.loads(embedding_str))
    # Already a sequence of numbers: json.loads would raise TypeError here.
    return np.asarray(embedding_str)

# Decode every stored embedding and stack the vectors into one feature matrix.
# Note: all vectors are held in memory at once; a very large dataset may need
# to be processed in batches instead.
X = np.stack([embeddings_to_floats(raw) for raw in df['embedding']])
y = df['category'].values

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Build a 100-tree random forest and fit it on the training split.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)

# Predict categories for the held-out rows.
preds = clf.predict(X_test)

# Per-category precision / recall / f1 summary.
report = classification_report(y_test, preds)
print(report)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                precision    recall  f1-score   support

          ARTS       0.54      0.48      0.51       462
ARTS & CULTURE       0.57      0.26      0.36       394
  BLACK VOICES       0.56      0.44      0.49      1424
      BUSINESS       0.57      0.56      0.57      1755
       COLLEGE       0.58      0.51      0.54       368
        COMEDY       0.58      0.46      0.52      1613
         CRIME       0.58      0.67      0.62      1047
CULTURE & ARTS       0.82      0.35      0.49       319
       DIVORCE       0.82      0.73      0.77      1069
     EDUCATION       0.51      0.39      0.44       309
 ENTERTAINMENT       0.68      0.82      0.75      5189
   ENVIRONMENT       0.71      0.35      0.47       470
         FIFTY       0.53      0.27      0.36       420
  FOOD & DRINK       0.66      0.86      0.75      1910
     GOOD NEWS       0.42      0.30      0.35       381
         GREEN       0.47      0.52      0.49       740
HEALTHY LIVING       0.60      0.36      0.45  

# Logistic Regression

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np
import json

# Read the embedded news sample from disk.
training_data = pd.read_parquet(
    path="./data/News_Category_With_Samples_200k.parquet"
)

# Shuffle every row (frac=1) with a fixed seed so the order is reproducible.
df = training_data.sample(random_state=42, frac=1)

# The embeddings may be stored either as a JSON list in text form or as an
# actual sequence of numbers, so both forms are converted to a numpy array.
def embeddings_to_floats(embedding_str):
    """Convert one stored embedding into a numpy array.

    Args:
        embedding_str: Either JSON text (``str``, ``bytes`` or ``bytearray``)
            encoding a list of numbers, or an already-decoded sequence
            (list, tuple, numpy array).

    Returns:
        A numpy array holding the embedding values.
    """
    if isinstance(embedding_str, (str, bytes, bytearray)):
        # json.loads parses the text without evaluating it as code.
        return np.array(json.loads(embedding_str))
    # Already a sequence of numbers: json.loads would raise TypeError here.
    return np.asarray(embedding_str)

# Apply the conversion function to each embedding.
# Note: If the dataset is very large, this may consume a significant amount of memory, and batch processing may be necessary.
X = np.stack(df['embedding'].apply(embeddings_to_floats).values)
y = df['category'].values

# split the dataset (20% held out for evaluation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train a logistic regression model
# The default iteration cap stops the solver before it converges on this data
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the cap is raised explicitly.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# make predictions
preds = clf.predict(X_test)

# generate a classification report
report = classification_report(y_test, preds)
print(report)
