In [3]:
import pandas as pd
import os
import gcsfs
import numpy as np

# Dataset taken from https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
dataset = 'amazon-reviews-sentiment'
# NOTE(review): "anon" presumably selects anonymous (credential-free) access in gcsfs -- confirm.
storage_options = {"token": "anon"}
fs = gcsfs.GCSFileSystem(**storage_options)
# Remote source directory, and the local cache directory that later cells read from
# and write their own cache files into.
remote_dir = f'gs://lilac-data-us-east1/datasets/{dataset}/'
local_dir = os.path.join('data', '.cache', 'datasets', dataset)

# Download only when the local cache directory does not exist yet. The guard checks
# the directory itself, not its contents, so an interrupted download is treated as
# complete; delete `local_dir` to force a fresh download.
if not os.path.exists(local_dir):
  print('Downloading dataset from GCS...')
  fs.download(remote_dir, local_dir, recursive=True)

## Parsing raw data


In [4]:
# Every line of the raw file starts with a label prefix followed by the review text:
# `__label__2 ` marks a positive review and `__label__1 ` a negative one.
positive_label = '__label__2 '
negative_label = '__label__1 '

texts = []
labels = []

# Decode explicitly as UTF-8 so that parsing does not depend on the platform's
# default encoding.
with open(os.path.join(local_dir, 'amazon-reviews.txt'), 'r', encoding='utf-8') as f:
  for line in f:
    if line.startswith(positive_label):
      label, prefix = True, positive_label
    elif line.startswith(negative_label):
      label, prefix = False, negative_label
    else:
      # Lines without a known prefix are reported and skipped.
      print('label not found in line', line)
      continue
    # Strip the prefix that actually matched, rather than assuming both prefixes
    # have the same length. The trailing newline is kept as part of the text.
    text = line[len(prefix):]
    texts.append(text)
    labels.append(label)

# One row per review: the raw text and a boolean label (True = positive).
df = pd.DataFrame({'text': texts, 'label': labels})
df

Unnamed: 0,text,label
0,Great CD: My lovely Pat has one of the GREAT v...,True
1,One of the best game music soundtracks - for a...,True
2,Batteries died within a year ...: I bought thi...,False
3,"works fine, but Maha Energy is better: Check o...",True
4,Great for the non-audiophile: Reviewed quite a...,True
...,...,...
399995,Unbelievable- In a Bad Way: We bought this Tho...,False
399996,"Almost Great, Until it Broke...: My son reciev...",False
399997,Disappointed !!!: I bought this toy for my son...,False
399998,Classic Jessica Mitford: This is a compilation...,True


## Getting training data


In [5]:
from lilac.embeddings.openai import OpenAI

# Embedding signal used by `get_balanced_data` to embed the review texts.
signal = OpenAI()
signal.setup()
# NOTE(review): this overrides a private attribute; presumably it disables splitting
# each text into chunks so that one embedding is produced per review (the caller
# only reads the first embedding of each result) -- confirm against the lilac API.
signal._split = False


def get_balanced_data(df, is_train, group_size, random_state=None, exclude_index=None):
  """Returns a class-balanced, shuffled sample of `df` with an added `embeddings` column.

  The result is cached as a pickle inside `local_dir`. When the cache file exists it
  is returned as-is and no sampling or embedding computation takes place.

  Args:
    df: DataFrame with a `text` column and a `label` column.
    is_train: Only used to build the cache file name, so that the train and test
      samples are cached separately.
    group_size: Number of rows to sample for each distinct label value.
    random_state: Optional seed forwarded to pandas for both the per-label sampling
      and the final shuffle. `None` keeps the original unseeded behavior.
    exclude_index: Optional index labels of `df` that must not be sampled, e.g. the
      index of the training sample when drawing the test sample.

  Returns:
    A DataFrame with `group_size` rows per label, in shuffled order, with the
    original index labels of `df` and a float32 `embeddings` array per row.
  """
  fname = os.path.join(local_dir, f'balanced_train={is_train}_group_size={group_size}.pkl')
  if os.path.exists(fname):
    # The cache key does not include `random_state` or `exclude_index`; delete the
    # file to resample with different settings.
    # NOTE(review): unpickling can execute arbitrary code; only load cache files
    # from a trusted `local_dir`.
    return pd.read_pickle(fname)
  print('Computing embeddings for balanced dataset...')
  candidates = df if exclude_index is None else df.drop(index=exclude_index, errors='ignore')
  res_df = candidates.groupby(candidates['label']).sample(group_size, random_state=random_state)
  # Each computed item is indexed as x[0]['embedding']: only the first embedding
  # returned for a text is kept.
  res_df['embeddings'] = [
    np.array(x[0]['embedding'], np.float32) for x in signal.compute(res_df['text'])
  ]
  # Shuffle the rows.
  res_df = res_df.sample(frac=1, random_state=random_state)
  res_df.to_pickle(fname)
  return res_df


# Seed both samples so a cold-cache run is reproducible, and keep the training rows
# out of the test sample so the test metrics are not inflated by leakage.
train_df = get_balanced_data(df, is_train=True, group_size=50, random_state=0)
test_df = get_balanced_data(
  df, is_train=False, group_size=1024, random_state=1, exclude_index=train_df.index)

# Caches written before the exclusion was introduced may still overlap.
overlap = train_df.index.intersection(test_df.index)
if len(overlap) > 0:
  print(f'WARNING: {len(overlap)} rows appear in both train and test; '
        'delete the cached .pkl files to resample.')
train_df.head()

Unnamed: 0,text,label,embeddings
351288,Starting To Be Annoyed By Becky...: I'm not su...,False,"[-0.030075073, -0.027816772, 0.0109939575, -0...."
288426,the cover is fine - the pool is horrible: The ...,False,"[0.0067443848, 0.013435364, 0.016174316, -0.03..."
37540,"Good album, not their best.: This album is pro...",True,"[-0.020095825, -0.0065460205, 0.0008125305, -0..."
367738,"This is a horror novel, right?: Never one to p...",False,"[-2.1219254e-05, -0.010925293, 0.006416321, -0..."
37435,Superb mix of global non secular musical denom...,True,"[-0.008049011, -0.0131073, -0.022949219, -0.02..."


## Training a model


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from lilac.utils import DebugTimer

# Logistic-regression classifier fitted on the training embeddings.
model = LogisticRegression(
  class_weight='balanced',
  C=30,
  tol=1e-5,
  warm_start=True,
  max_iter=10_000,
)

# Stack the per-row embedding arrays into 2-D feature matrices and pull the labels
# out as plain lists.
embeddings = np.array(train_df['embeddings'].tolist())
labels = train_df['label'].tolist()
test_embeddings = np.array(test_df['embeddings'].tolist())
test_labels = test_df['label'].tolist()

with DebugTimer('Training a model'):
  model.fit(embeddings, labels)

# Metrics on the training set (column 1 of predict_proba is the positive class).
train_scores = model.predict_proba(embeddings)[:, 1]
train_preds = model.predict(embeddings)
roc_auc = roc_auc_score(labels, train_scores)
f1_val = f1_score(labels, train_preds)
print(f'Train set: {len(labels)} examples: AUC: {roc_auc:.3f} F1: {f1_val:.3f}')

# The same metrics on the held-out test set.
test_scores = model.predict_proba(test_embeddings)[:, 1]
test_preds = model.predict(test_embeddings)
roc_auc = roc_auc_score(test_labels, test_scores)
f1_val = f1_score(test_labels, test_preds)
print(f'Test set: {len(test_labels)} examples: AUC: {roc_auc:.3f} F1: {f1_val:.3f}')

accuracy = model.score(test_embeddings, test_labels)
print(f'Accuracy on test set, {len(test_labels)} examples: {accuracy:.3f}')

Training a model took 0.020s.
Train set: 100 examples: AUC: 1.000 F1: 1.000
Test set: 2048 examples: AUC: 0.984 F1: 0.935
Accuracy on test set, 2048 examples: 0.934


## Save the concept


In [10]:
from lilac.concepts.concept import Concept, Example


def save_concept(positive_sentiment):
  """Writes the training examples to disk as a lilac concept JSON file.

  One `Example` is created per row of `train_df`, keyed by its position in the
  frame. The concept is written to
  `data/concept/lilac/<concept_name>/concept.json`; the directory is created when
  missing.

  Args:
    positive_sentiment: When True, saves the `positive_sentiment` concept using the
      labels as-is. When False, saves the `negative_sentiment` concept with every
      label inverted.
  """
  data = {}

  # Read the labels straight from `train_df` rather than from the notebook-global
  # `labels`: that name is also assigned by the raw-data parsing cell, so an
  # out-of-order re-run would silently pair the wrong labels with these texts.
  for index, (label, text) in enumerate(zip(train_df['label'], train_df['text'])):
    example_id = str(index)
    is_positive = bool(label)
    example_label = is_positive if positive_sentiment else not is_positive
    data[example_id] = Example(label=example_label, text=text, id=example_id)

  concept_name = 'positive_sentiment' if positive_sentiment else 'negative_sentiment'
  description = 'Positive sentiment' if positive_sentiment else 'Negative sentiment'
  concept = Concept(
    namespace='lilac', concept_name=concept_name, type='text', data=data, description=description)

  concept_path = f'data/concept/{concept.namespace}/{concept.concept_name}/concept.json'
  # open() does not create parent directories, so make sure they exist first.
  os.makedirs(os.path.dirname(concept_path), exist_ok=True)
  # Write UTF-8 explicitly: the review texts may contain non-ASCII characters.
  with open(concept_path, 'w', encoding='utf-8') as f:
    f.write(concept.model_dump_json(exclude_none=True, exclude_defaults=True, indent=2))


save_concept(positive_sentiment=True)
save_concept(positive_sentiment=False)