### Text Classification using Cohere Embeddings and Scikit-Learn

In [106]:
%pip install --upgrade pandas --quiet
%pip install --upgrade cohere --quiet
%pip install --upgrade scikit-learn --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


#### 1. Load Dataset

In [107]:
import os
import cohere
import pandas as pd
import tqdm as notebook_tqdm

from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


_ = load_dotenv(find_dotenv()) # read local .env file
cohere_api_key = os.environ['COHERE_API_KEY']

In [119]:
ds = load_dataset("FinGPT/fingpt-sentiment-cls", split="train")

In [123]:
df = pd.DataFrame()
df["sentence"] = pd.DataFrame(ds['input'][0:500])
df["label"] = pd.DataFrame(ds['output'][0:500])

In [124]:
sentences_train, sentences_test, labels_train, labels_test = train_test_split(list(df.sentence), 
                                                                              list(df.label), 
                                                                              test_size=0.25,
                                                                              random_state=0)

#### 2. Get Cohere Embeddings

In [125]:
co = cohere.Client(cohere_api_key)

##### 2.1 Embed training set

In [127]:
embeddings_train = co.embed(texts=sentences_train,
                            model="embed-english-v2.0",
                            truncate="END").embeddings

##### 2.2 Embed test set

In [None]:
embeddings_test = co.embed(texts=sentences_test,
                           model="embed-english-v2.0",
                           truncate="END").embeddings

In [None]:
print(f"sentence: {sentences_train[10]}")
print(f"embedding-vector: {embeddings_train[0][:10]}")

sentence: AB InBev looks to win over SABMiller investors
embedding-vector: [-1.7871094, 2.109375, 0.01259613, 0.12561035, 0.6064453, -0.6826172, 0.13586426, -2.640625, -0.11907959, 0.6948242]


#### 3. Train Classifier

In [None]:
# The 'balanced' mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))
svm_classifier = make_pipeline(StandardScaler(), SVC(class_weight='balanced')) 
svm_classifier.fit(embeddings_train, labels_train)

#### 4. Evaluate the Model

In [None]:
score = svm_classifier.score(embeddings_test, labels_test)
print(f"Test Accuracy: {100*score}%!")

Validation accuracy on Large is 88.0%!
