In [14]:
import pandas as pd
import numpy as np


# Read the ATIS splits from tab-separated files parsed without a header row,
# then label the two columns as the utterance text and its intent.
train_df = pd.read_csv('../data/atis/train.tsv', sep='\t', header=None).set_axis(
    ['query', 'intent'], axis=1
)
test_df = pd.read_csv('../data/atis/test.tsv', sep='\t', header=None).set_axis(
    ['query', 'intent'], axis=1
)

In [15]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,query,intent
0,i want to fly from boston at 838 am and arrive...,flight
1,what flights are available from pittsburgh to ...,flight
2,what is the arrival time in san francisco for ...,flight_time
3,cheapest airfare from tacoma to orlando,airfare
4,round trip fares from pittsburgh to philadelph...,airfare


In [16]:
# Summary statistics (count / unique / top / freq) for the training split.
train_df.describe()

Unnamed: 0,query,intent
count,4634,4634
unique,4634,22
top,i want to fly from boston at 838 am and arrive...,flight
freq,1,3426


In [17]:
# Summary statistics (count / unique / top / freq) for the test split.
test_df.describe()

Unnamed: 0,query,intent
count,850,850
unique,850,20
top,i would like to find a flight from charlotte t...,flight
freq,1,613


In [18]:
# Number of training queries per intent label, most frequent first.
train_df['intent'].value_counts()

intent
flight                        3426
airfare                        403
ground_service                 235
airline                        148
abbreviation                   108
aircraft                        78
flight_time                     52
quantity                        49
distance                        20
city                            18
airport                         18
ground_fare                     17
flight+airfare                  17
capacity                        16
flight_no                       12
meal                             6
restriction                      5
airline+flight_no                2
ground_service+ground_fare       1
airfare+flight_time              1
cheapest                         1
aircraft+flight+flight_no        1
Name: count, dtype: int64

In [20]:
# Number of test queries per intent label, most frequent first.
test_df['intent'].value_counts()

intent
flight               613
airfare               48
ground_service        36
airline               28
abbreviation          26
capacity              21
airport               13
flight+airfare        12
distance              10
aircraft               8
flight_no              8
ground_fare            7
meal                   6
city                   5
quantity               3
day_name               2
flight_time            1
airfare+flight         1
flight+airline         1
flight_no+airline      1
Name: count, dtype: int64

#### Note: the dataset is imbalanced — ~74% of train queries and ~72% of test queries have the `flight` intent

In [39]:
# Fixed (majority-class) classifier accuracy: every test query gets the same label.
from sklearn.metrics import accuracy_score, classification_report

# Take the most frequent intent from the training split rather than hard-coding
# it, so the baseline stays correct if the data changes.
majority_intent = train_df['intent'].value_counts().idxmax()
fixed_pred = [majority_intent] * len(test_df)

test_accuracy = accuracy_score(test_df['intent'], fixed_pred)
print(f"Test accuracy: {test_accuracy:.4f}")

print("\nClassification Report:")
# Every class except the majority one is never predicted, so its precision is
# undefined; zero_division=0 reports 0.00 for those classes without emitting
# an undefined-metric warning per metric.
print(classification_report(test_df['intent'], fixed_pred, zero_division=0))

Test accuracy: 0.7212

Classification Report:
                   precision    recall  f1-score   support

     abbreviation       0.00      0.00      0.00        26
         aircraft       0.00      0.00      0.00         8
          airfare       0.00      0.00      0.00        48
   airfare+flight       0.00      0.00      0.00         1
          airline       0.00      0.00      0.00        28
          airport       0.00      0.00      0.00        13
         capacity       0.00      0.00      0.00        21
             city       0.00      0.00      0.00         5
         day_name       0.00      0.00      0.00         2
         distance       0.00      0.00      0.00        10
           flight       0.72      1.00      0.84       613
   flight+airfare       0.00      0.00      0.00        12
   flight+airline       0.00      0.00      0.00         1
        flight_no       0.00      0.00      0.00         8
flight_no+airline       0.00      0.00      0.00         1
      fli

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Split combined `+` intents into single-intent rows (combinations will be provided through multi-class predictors)

In [33]:
def split_intent(df):
    """Split combined ``a+b`` intent labels into one row per single intent.

    Rows whose ``intent`` contains no ``+`` are kept as they are. Each row
    whose ``intent`` contains ``+`` is replaced by one copy per component
    label, with all other columns repeated. The split rows are placed after
    the untouched rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``intent`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and no ``+`` in ``intent``.
    """
    # Match '+' literally (regex=False) instead of via the '\+' pattern, which
    # is an invalid escape sequence in a normal string literal. na=False makes
    # rows with a missing intent count as "not combined" so the mask stays
    # strictly boolean.
    is_combined = df['intent'].str.contains('+', regex=False, na=False)

    # Turn each combined label into a list of parts, then emit one row per part.
    split_rows = (
        df[is_combined]
        .assign(intent=lambda rows: rows['intent'].str.split('+'))
        .explode('intent')
    )

    # ignore_index avoids duplicate index labels in the combined result.
    return pd.concat([df[~is_combined], split_rows], ignore_index=True)

In [34]:
# Build single-intent versions of both splits.
clean_train_df = split_intent(train_df)
clean_test_df = split_intent(test_df)

# Post-conditions: no combined label survives, and splitting never loses rows.
assert not clean_train_df['intent'].str.contains('+', regex=False, na=False).any(), \
    "combined '+' intents remain in the train split"
assert not clean_test_df['intent'].str.contains('+', regex=False, na=False).any(), \
    "combined '+' intents remain in the test split"
assert len(clean_train_df) >= len(train_df)
assert len(clean_test_df) >= len(test_df)

In [37]:
# Write the single-intent splits to CSV with a header row and no index column.
clean_train_df.to_csv(path_or_buf='./c_train.csv', index=False, header=True)
clean_test_df.to_csv(path_or_buf='./c_test.csv', index=False, header=True)