In [1]:
import pandas as pd
import numpy as np


# Read the training and test CSV files from the current working directory.
# Later cells read the 'query' (text) and 'intent' (label) columns of both frames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./c_train.csv', './c_test.csv')
)

In [6]:
# Seed NumPy's global random generator so anything drawing from np.random is repeatable.
# The scikit-learn estimators in this notebook also receive an explicit random_state.
np.random.seed(seed=0)

In [5]:
# Build the TF-IDF vocabulary on the training queries only, then reuse it for the test queries
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: min_df=2 and max_df=0.95 bound the document frequency of kept terms
tfidf = TfidfVectorizer(max_df=0.95, min_df=2)

# Text columns that get vectorized
train_queries, test_queries = train_df['query'], test_df['query']

# Fit on train (learns vocabulary + IDF weights) and vectorize it in one pass
train_tfidf = tfidf.fit_transform(train_queries)
# Vectorize test with the already-fitted vocabulary; no re-fitting on test data
test_tfids = tfidf.transform(test_queries)


## Attempt 1: Logistic regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a one-vs-rest logistic regression on the TF-IDF features of the train queries
lr = OneVsRestClassifier(LogisticRegression(random_state=0, max_iter=1000))
lr.fit(train_tfidf, train_df['intent'])

# Make predictions on test set
test_pred = lr.predict(test_tfids)

# Calculate and print metrics
test_accuracy = accuracy_score(test_df['intent'], test_pred)
print(f"Test accuracy: {test_accuracy:.4f}")

# Print detailed classification report.
# Some intents are never predicted, which makes their precision undefined;
# zero_division=0 reports those scores as 0.0 explicitly instead of emitting
# an UndefinedMetricWarning for each affected metric.
print("\nClassification Report:")
print(classification_report(test_df['intent'], test_pred, zero_division=0))


Test accuracy: 0.8682

Classification Report:
                precision    recall  f1-score   support

  abbreviation       1.00      0.96      0.98        26
      aircraft       0.44      0.88      0.58         8
       airfare       0.85      0.64      0.73        61
       airline       1.00      0.63      0.78        30
       airport       0.00      0.00      0.00        13
      capacity       1.00      0.14      0.25        21
          city       0.00      0.00      0.00         5
      day_name       0.00      0.00      0.00         2
      distance       1.00      0.10      0.18        10
        flight       0.87      0.99      0.93       627
     flight_no       0.00      0.00      0.00         9
   flight_time       0.00      0.00      0.00         1
   ground_fare       1.00      0.57      0.73         7
ground_service       0.97      0.89      0.93        36
          meal       0.00      0.00      0.00         6
      quantity       0.30      1.00      0.46         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# Persist the fitted TF-IDF vectorizer and the logistic regression model with pickle
import os
import pickle

# Make sure the output directory is there before writing (no error if it already exists)
os.makedirs('../weights', exist_ok=True)

# (destination file, fitted object) pairs, written in this order
artifacts = (
    ('../weights/tfidf_vectorizer.pkl', tfidf),
    ('../weights/logistic_regression.pkl', lr),
)

for weights_path, fitted_obj in artifacts:
    with open(weights_path, 'wb') as weights_file:
        pickle.dump(fitted_obj, weights_file)



### Conclusion: better results than the fixed predictor
Pluses:
* easy and fast to train
* fast inference

Minuses:
* fixed dictionary: handling unseen words in queries requires re-training
* not multilingual unless re-trained
* new intents need markup (labeled examples) before they can be learned

## Attempt 2: Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train Random Forest model on the TF-IDF features of the train queries
rf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=42)
rf.fit(train_tfidf, train_df['intent'])

# Make predictions
rf_test_pred = rf.predict(test_tfids)

# Calculate and print metrics
rf_test_accuracy = accuracy_score(test_df['intent'], rf_test_pred)

print(f"Random Forest Test accuracy: {rf_test_accuracy:.4f}")

# Print detailed classification report.
# Some intents are never predicted, which makes their precision undefined;
# zero_division=0 reports those scores as 0.0 explicitly instead of emitting
# an UndefinedMetricWarning for each affected metric.
print("\nRandom Forest Classification Report:")
print(classification_report(test_df['intent'], rf_test_pred, zero_division=0))


Random Forest Test accuracy: 0.9064

Random Forest Classification Report:
                precision    recall  f1-score   support

  abbreviation       1.00      1.00      1.00        26
      aircraft       0.44      1.00      0.62         8
       airfare       0.86      0.82      0.84        61
       airline       1.00      0.57      0.72        30
       airport       1.00      0.54      0.70        13
      capacity       1.00      0.57      0.73        21
          city       0.00      0.00      0.00         5
      day_name       0.00      0.00      0.00         2
      distance       1.00      0.60      0.75        10
        flight       0.92      0.98      0.95       627
     flight_no       0.00      0.00      0.00         9
   flight_time       1.00      1.00      1.00         1
   ground_fare       1.00      0.57      0.73         7
ground_service       0.95      1.00      0.97        36
          meal       0.00      0.00      0.00         6
      quantity       0.43    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Pickle the fitted random forest into the weights directory.
# Relies on `pickle` imported, and '../weights' created, by the earlier save cell.
rf_weights_path = '../weights/random_forest.pkl'
with open(rf_weights_path, 'wb') as rf_file:
    pickle.dump(rf, rf_file)

### Conclusion: better metrics, with all the same minuses
