In [21]:
import pandas as pd
import numpy as np


# Read the pre-split train and test sets; later cells use their 'query' and
# 'intent' columns.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./c_train.csv', './c_test.csv')
)

In [22]:
from transformers import AutoModel

In [23]:
import torch

# Load the Jina v3 embedding model. trust_remote_code=True runs the modelling
# code shipped in the Hugging Face repo, so only use it with a repo you trust.
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)

# Prefer Apple-silicon MPS (the device this notebook was originally run on),
# but fall back to CUDA or CPU so the cell does not fail on other machines.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Put the model in evaluation mode; assigning the result to `_` keeps the
# model repr out of the cell output.
_ = model.eval()

In [24]:
# Embed every query using the encoder's task="classification" setting.
# The column is passed as a plain list of strings rather than a pandas Series:
# that is the input type `encode` is documented for, and it keeps any
# positional indexing inside the encoder independent of the DataFrame's index
# labels (e.g. if the frame is ever filtered or shuffled before this cell).
train_embeddings = model.encode(train_df['query'].tolist(), task="classification")
test_embeddings = model.encode(test_df['query'].tolist(), task="classification")

# Attempt 1: Logistic Regression as classification head

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier

# Fit a one-vs-rest logistic regression on the sentence embeddings.
lr = OneVsRestClassifier(LogisticRegression(random_state=0, max_iter=1000))
lr.fit(train_embeddings, train_df['intent'])

# Predict intents for the held-out queries.
test_pred = lr.predict(test_embeddings)

# Overall accuracy on the test set.
test_accuracy = accuracy_score(test_df['intent'], test_pred)
print(f"Test accuracy: {test_accuracy:.4f}")

# Per-class report. Several intents are never predicted, so their precision is
# undefined; zero_division=0 reports those as 0 (the same numbers as the
# default) without emitting an UndefinedMetricWarning for every metric.
print("\nClassification Report:")
print(classification_report(test_df['intent'], test_pred, zero_division=0))


Test accuracy: 0.8543

Classification Report:
                precision    recall  f1-score   support

  abbreviation       0.84      1.00      0.91        26
      aircraft       0.31      0.62      0.42         8
       airfare       0.72      0.56      0.63        61
       airline       0.93      0.47      0.62        30
       airport       0.00      0.00      0.00        13
      capacity       1.00      0.05      0.09        21
          city       0.00      0.00      0.00         5
      day_name       0.00      0.00      0.00         2
      distance       1.00      0.10      0.18        10
        flight       0.88      0.99      0.93       627
     flight_no       0.00      0.00      0.00         9
   flight_time       0.00      0.00      0.00         1
   ground_fare       0.00      0.00      0.00         7
ground_service       0.80      0.97      0.88        36
          meal       0.00      0.00      0.00         6
      quantity       0.00      0.00      0.00         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
import pickle
from pathlib import Path

# Persist the fitted classifier held in `lr`. The target directory is created
# first so the dump does not fail with FileNotFoundError on a fresh checkout.
# NOTE: pickle files execute code on load; only unpickle this file from a
# trusted location.
weights_path = Path('../weights/lr_jina_model.pkl')
weights_path.parent.mkdir(parents=True, exist_ok=True)
with weights_path.open('wb') as f:
    pickle.dump(lr, f)

## Conclusion: Logistic regression with NN embeddings performs slightly worse than with tf-idf
Plus:
* Multilingual
* The embedder does not require re-training

Minus:
* Slower than tf-idf and requires more resources
* New intents still require labeled examples

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a 100-tree random forest on the sentence embeddings (fixed seed for
# repeatable results).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_embeddings, train_df['intent'])

# Predict intents for the held-out queries.
rf_test_pred = rf.predict(test_embeddings)

# Overall accuracy on the test set.
rf_test_accuracy = accuracy_score(test_df['intent'], rf_test_pred)

print(f"Random Forest Test accuracy: {rf_test_accuracy:.4f}")

# Per-class report. Some intents are never predicted, so their precision is
# undefined; zero_division=0 reports those as 0 (the same numbers as the
# default) without emitting an UndefinedMetricWarning for every metric.
print("\nRandom Forest Classification Report:")
print(classification_report(test_df['intent'], rf_test_pred, zero_division=0))

Random Forest Test accuracy: 0.8659

Random Forest Classification Report:
                precision    recall  f1-score   support

  abbreviation       0.93      1.00      0.96        26
      aircraft       0.43      0.38      0.40         8
       airfare       0.90      0.44      0.59        61
       airline       1.00      0.47      0.64        30
       airport       1.00      0.08      0.14        13
      capacity       1.00      0.52      0.69        21
          city       0.00      0.00      0.00         5
      day_name       0.00      0.00      0.00         2
      distance       1.00      0.50      0.67        10
        flight       0.86      1.00      0.92       627
     flight_no       0.00      0.00      0.00         9
   flight_time       0.00      0.00      0.00         1
   ground_fare       1.00      0.43      0.60         7
ground_service       0.89      0.92      0.90        36
          meal       0.00      0.00      0.00         6
      quantity       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Let's try reducing the embedding dimensionality

In [19]:
# Re-encode the queries, asking the encoder for 512-dimensional output via
# truncate_dim. The column is passed as a plain list of strings (the input
# type `encode` is documented for) so indexing inside the encoder does not
# depend on the DataFrame's index labels.
# NOTE: this rebinds train_embeddings / test_embeddings, so the full-size
# embeddings are no longer available once this cell has run; re-running an
# earlier classifier cell afterwards would silently use the truncated vectors.
train_embeddings = model.encode(train_df['query'].tolist(), task="classification", truncate_dim=512)
test_embeddings = model.encode(test_df['query'].tolist(), task="classification", truncate_dim=512)

## Attempt 3: Logistic regression on reduced space

In [20]:
# Fit a one-vs-rest logistic regression on whatever train_embeddings currently
# holds. NOTE: this rebinds `lr`, replacing any classifier fitted earlier.
lr = OneVsRestClassifier(LogisticRegression(random_state=0, max_iter=1000))
lr.fit(train_embeddings, train_df['intent'])

# Predict intents for the held-out queries.
test_pred = lr.predict(test_embeddings)

# Overall accuracy on the test set.
test_accuracy = accuracy_score(test_df['intent'], test_pred)
print(f"Test accuracy: {test_accuracy:.4f}")

# Per-class report. Several intents are never predicted, so their precision is
# undefined; zero_division=0 reports those as 0 (the same numbers as the
# default) without emitting an UndefinedMetricWarning for every metric.
print("\nClassification Report:")
print(classification_report(test_df['intent'], test_pred, zero_division=0))

Test accuracy: 0.8532

Classification Report:
                precision    recall  f1-score   support

  abbreviation       0.84      1.00      0.91        26
      aircraft       0.27      0.50      0.35         8
       airfare       0.73      0.52      0.61        61
       airline       0.93      0.47      0.62        30
       airport       0.00      0.00      0.00        13
      capacity       1.00      0.05      0.09        21
          city       0.00      0.00      0.00         5
      day_name       0.00      0.00      0.00         2
      distance       1.00      0.20      0.33        10
        flight       0.87      1.00      0.93       627
     flight_no       0.00      0.00      0.00         9
   flight_time       0.00      0.00      0.00         1
   ground_fare       0.00      0.00      0.00         7
ground_service       0.81      0.97      0.89        36
          meal       0.00      0.00      0.00         6
      quantity       0.00      0.00      0.00         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Conclusion: performs similarly to the full-sized embeddings (test accuracy 0.8532 vs 0.8543)