In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas warnings
# (e.g. SettingWithCopyWarning, deprecations) — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [9]:

# Root of the dataset on the Hugging Face Hub and the parquet file of each split.
HF_DATASET_ROOT = "hf://datasets/raghavdw/cci-dataset-v2/"
splits = {'train': 'data/train/train.parquet', 'test': 'data/test/test.parquet'}

# Read both splits (remote reads over the hf:// protocol, so network access is needed).
train_df = pd.read_parquet(HF_DATASET_ROOT + splits["train"])
test_df = pd.read_parquet(HF_DATASET_ROOT + splits["test"])

# Stack the splits row-wise. ignore_index=True already produces a fresh 0..n-1 index,
# so no separate reset_index() call is required afterwards.
# NOTE(review): split membership is not recorded in the combined frame.
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Sanity check: concatenation must neither lose nor duplicate rows.
assert len(combined_df) == len(train_df) + len(test_df)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Combined shape: {combined_df.shape}")


Train shape: (12478, 12)
Test shape: (3120, 12)
Combined shape: (15598, 12)


In [10]:
# Preview the first rows of the combined train + test frame.
combined_df.head()

Unnamed: 0,Index,Utterance,Predicted_Intent,Intent_Score,Sentiment,empathy_score,listening_score,Topic,Topic_Name,rsic_score,RSICs,fallback_type
0,2819,Can I get assistance in LHR when I land on my ...,ChangeFlight,0.3,POSITIVE,0,0,2,Reservations and Ticketing,1.0,Low,no fallback
1,9253,"If I book my flight through princess cruise, s...",BookFlight,0.3,POSITIVE,0,0,2,Reservations and Ticketing,1.0,Low,no fallback
2,220,"Okay,arrive in Oslo at 5:00pm and catch Flight...",PetTravel,0.3,POSITIVE,0,0,8,Special Services and Assistance,1.0,Low,no fallback
3,7736,I did not see my canceled reservation in my ac...,CancelFlight,0.3,NEGATIVE,0,0,2,Reservations and Ticketing,1.0,Low,no fallback
4,60,I already booked a flight feb. 16th that i wou...,BookFlight,0.3,POSITIVE,0,0,2,Reservations and Ticketing,1.0,Low,no fallback


In [13]:
# Frequency of each empathy_score value; the saved output shows the column is almost entirely 0.
combined_df["empathy_score"].value_counts()

Unnamed: 0_level_0,count
empathy_score,Unnamed: 1_level_1
0,15324
1,240
2,34


In [16]:
# Column dtypes, non-null counts and memory footprint of the combined frame.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15598 entries, 0 to 15597
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Index             15598 non-null  int64  
 1   Utterance         15598 non-null  object 
 2   Predicted_Intent  15598 non-null  object 
 3   Intent_Score      15598 non-null  float64
 4   Sentiment         15598 non-null  object 
 5   empathy_score     15598 non-null  int64  
 6   listening_score   15598 non-null  int64  
 7   Topic             15598 non-null  int64  
 8   Topic_Name        15598 non-null  object 
 9   rsic_score        15598 non-null  float64
 10  RSICs             15598 non-null  object 
 11  fallback_type     15598 non-null  object 
dtypes: float64(2), int64(4), object(6)
memory usage: 1.4+ MB


In [19]:
# Summary statistics for every column: include="all" reports count/unique/top/freq for the
# object columns alongside the usual numeric statistics.
combined_df.describe(include="all")

Unnamed: 0,Index,Utterance,Predicted_Intent,Intent_Score,Sentiment,empathy_score,listening_score,Topic,Topic_Name,rsic_score,RSICs,fallback_type
count,15598.0,15598,15598,15598.0,15598,15598.0,15598.0,15598.0,15598,15598.0,15598,15598
unique,,1752,18,,3,,,,9,,2,1
top,,How many award miles do I need for a Mileagepl...,Other,,NEGATIVE,,,,General Customer Service,,Low,no fallback
freq,,750,5414,,7522,,,,6586,,15582,15598
mean,7799.5,,,0.301016,,0.019746,0.017566,3.55084,,1.003775,,
std,4502.899085,,,0.008228,,0.154004,0.131373,2.807851,,0.082237,,
min,1.0,,,0.3,,0.0,0.0,1.0,,1.0,,
25%,3900.25,,,0.3,,0.0,0.0,1.0,,1.0,,
50%,7799.5,,,0.3,,0.0,0.0,2.0,,1.0,,
75%,11698.75,,,0.3,,0.0,0.0,6.0,,1.0,,


In [20]:
# Number of missing values per column.
combined_df.isna().sum()

Unnamed: 0,0
Index,0
Utterance,0
Predicted_Intent,0
Intent_Score,0
Sentiment,0
empathy_score,0
listening_score,0
Topic,0
Topic_Name,0
rsic_score,0


In [21]:
# Count rows that are exact duplicates across *all* columns.
# NOTE(review): the `Index` column is still part of the frame at this point and is presumably
# unique per row, so a result of 0 does not rule out repeated utterances — describe() above
# reports 1752 unique utterances for 15598 rows. Consider duplicated(subset=["Utterance"]).
combined_df.duplicated().sum()

np.int64(0)

In [22]:
# List the available column names before deciding which ones to keep.
combined_df.columns

Index(['Index', 'Utterance', 'Predicted_Intent', 'Intent_Score', 'Sentiment',
       'empathy_score', 'listening_score', 'Topic', 'Topic_Name', 'rsic_score',
       'RSICs', 'fallback_type'],
      dtype='object')

In [24]:
# Columns that are not carried forward into the prepared dataset.
unused_columns = [
    "Index",
    "Predicted_Intent",
    "Intent_Score",
    "rsic_score",
    "RSICs",
]
combined_df = combined_df.drop(columns=unused_columns)

In [25]:
# Confirm which columns remain after the drop.
combined_df.head()

Unnamed: 0,Utterance,Sentiment,empathy_score,listening_score,Topic,Topic_Name,fallback_type
0,Can I get assistance in LHR when I land on my ...,POSITIVE,0,0,2,Reservations and Ticketing,no fallback
1,"If I book my flight through princess cruise, s...",POSITIVE,0,0,2,Reservations and Ticketing,no fallback
2,"Okay,arrive in Oslo at 5:00pm and catch Flight...",POSITIVE,0,0,8,Special Services and Assistance,no fallback
3,I did not see my canceled reservation in my ac...,NEGATIVE,0,0,2,Reservations and Ticketing,no fallback
4,I already booked a flight feb. 16th that i wou...,POSITIVE,0,0,2,Reservations and Ticketing,no fallback


In [26]:
# Normalise the text columns and discard utterances of three characters or fewer.
#
# Written as a single assign/filter chain: every step returns a new frame, so the result is
# an independent DataFrame rather than a boolean-indexed slice that is then assigned into
# (the pattern that triggers pandas' SettingWithCopyWarning).
combined_df = (
    combined_df
    # Force string dtype and trim surrounding whitespace before measuring length.
    .assign(Utterance=lambda df: df['Utterance'].astype(str).str.strip())
    # Keep only utterances longer than 3 characters.
    .loc[lambda df: df['Utterance'].str.len() > 3]
    .assign(
        # Lower-case the sentiment labels so later comparisons can use 'negative' / 'positive' / 'neutral'.
        Sentiment=lambda df: df['Sentiment'].str.lower().str.strip(),
        Topic_Name=lambda df: df['Topic_Name'].astype(str).str.strip(),
    )
)


In [27]:
# Check the normalised text columns (Sentiment is now lower-case).
combined_df.head()

Unnamed: 0,Utterance,Sentiment,empathy_score,listening_score,Topic,Topic_Name,fallback_type
0,Can I get assistance in LHR when I land on my ...,positive,0,0,2,Reservations and Ticketing,no fallback
1,"If I book my flight through princess cruise, s...",positive,0,0,2,Reservations and Ticketing,no fallback
2,"Okay,arrive in Oslo at 5:00pm and catch Flight...",positive,0,0,8,Special Services and Assistance,no fallback
3,I did not see my canceled reservation in my ac...,negative,0,0,2,Reservations and Ticketing,no fallback
4,I already booked a flight feb. 16th that i wou...,positive,0,0,2,Reservations and Ticketing,no fallback


In [28]:
# Length-based features derived from the utterance text.
utterance_words = combined_df['Utterance'].str.split()  # whitespace-delimited tokens per row

# Total characters in the utterance, whitespace included.
combined_df['text_len'] = combined_df['Utterance'].str.len()
# Number of whitespace-delimited words.
combined_df['word_count'] = utterance_words.str.len()
# Mean characters per word. The numerator counts only characters that belong to words;
# dividing text_len by word_count would also count the separating whitespace and
# overstate the average (e.g. "hello world" -> 5.5 instead of 5.0).
chars_in_words = utterance_words.apply(lambda words: sum(len(word) for word in words))
combined_df['avg_word_len'] = chars_in_words / combined_df['word_count']

In [29]:
# One 0/1 indicator column per sentiment label (labels are compared in lower case).
sentiment_label = combined_df['Sentiment']
combined_df['is_negative'] = sentiment_label.eq('negative').astype(int)
combined_df['is_positive'] = sentiment_label.eq('positive').astype(int)
combined_df['is_neutral'] = sentiment_label.eq('neutral').astype(int)

In [30]:
import re
import string

# Built once at definition time rather than on every call to clean_text().
_URL_PATTERN = re.compile(r"http\S+")
_WHITESPACE_PATTERN = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(t: str) -> str:
    """Return a normalised copy of ``t`` for text modelling.

    Steps, in order:
      1. lower-case the text;
      2. remove anything that starts with ``http`` up to the next whitespace (URLs);
      3. delete every character in ``string.punctuation`` (ASCII punctuation only);
      4. collapse runs of whitespace to a single space and trim both ends.

    Args:
        t: Raw text. Must be a ``str``; other types raise ``AttributeError``.

    Returns:
        The cleaned text, possibly an empty string.

    Note:
        Punctuation is deleted rather than replaced by a space, so tokens joined only by
        punctuation are fused ("Okay,arrive" -> "okayarrive").
        NOTE(review): confirm this is the intended tokenisation.
    """
    t = _URL_PATTERN.sub("", t.lower())
    t = t.translate(_PUNCTUATION_TABLE)
    return _WHITESPACE_PATTERN.sub(" ", t).strip()

# Add the cleaned version of every utterance as its own column.
raw_utterances = combined_df['Utterance']
combined_df['clean_text'] = raw_utterances.map(clean_text)


In [31]:
# Keep only rows usable downstream: a non-empty cleaned text plus a sentiment and a topic label.
# Empty strings are filtered explicitly because dropna() does not treat "" as missing, yet an
# empty field written to CSV is read back as NaN by pandas.read_csv with default settings.
required_columns = ['clean_text', 'Sentiment', 'Topic_Name']
has_clean_text = combined_df['clean_text'].fillna('').ne('')
combined_df = combined_df[has_clean_text].dropna(subset=required_columns)


In [32]:
# Persist the prepared frame next to the notebook; the row index is not written.
output_csv = "cci_clean_prepared.csv"
combined_df.to_csv(output_csv, index=False)
