In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
import re
import numpy as np

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English entries in a set
# so later membership checks do not scan a list.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the two raw tables. Both are parsed with the Python engine using
# double-quote quoting and backslash escapes; only the calls file passes an
# explicit encoding.
df1 = pd.read_csv(
    '/content/callsf0d4f5a.csv',
    encoding='utf-8',
    quotechar='"',
    escapechar='\\',
    engine='python',
)
df2 = pd.read_csv(
    '/content/reason18315ff.csv',
    quotechar='"',
    escapechar='\\',
    engine='python',
)

In [None]:
# Reference only: a hand-copied list of raw label spellings (stray and
# repeated spaces, "&" vs "and", hyphen vs space). Nothing in this cell
# reads it; it is kept in case a later cell does.
data = [np.nan, 'Seating', 'Voluntary Cancel', 'Voluntary Change', 'Post-Flight',
        'Communications', 'Baggage', 'Mileage Plus', 'IRROPS', 'IRROPS  ',
        'Digital   Support', 'Checkout', 'Check-In', 'Upgrade', 'Voluntary   Change',
        'Booking', 'Other Topics', 'ETC', 'Products & Services', 'Products and Services',
        'Digital Support', 'Post-Flight  ', 'Traveler   Updates', 'Voluntary  Change',
        'Traveler Updates', 'Voluntary  Cancel', 'Disability', 'Upgrade  ',
        'Digital  Support', 'Unaccompanied Minor', 'Mileage Plus  ', 'Traveler  Updates',
        'Check In', 'Schedule Change', 'Products and Services  ', 'Post Flight',
        'Mileage   Plus', 'Voluntary Change  ', 'Seating  ', 'Mileage  Plus',
        'Digital Support  ', 'Baggage  ', 'Other  Topics', 'Booking  ', ' Baggage',
        'Communications  ', 'Check-In  ', 'Traveler Updates  ', 'Unaccompanied Minor  ',
        'Checkout  ', 'ETC  ', 'Schedule Change  ', 'Other Topics  ', 'Voluntary Cancel  ',
        'Disability  ']

# Normalise the label column in one idempotent pass:
#   1. strip leading/trailing whitespace,
#   2. collapse any run of internal whitespace to a single space, which
#      covers every "Voluntary  Change"-style variant without listing them,
#   3. title-case, so "IRROPS" -> "Irrops" and "Products and Services"
#      -> "Products And Services",
#   4. fold the remaining spelling variants into one canonical label each.
# The mapping keys must be written in their post-title-case form. The
# earlier version mapped "Products & Services" to the lower-case-"and"
# spelling, which left two separate labels for the same category.
df2['primary_call_reason'] = (
    df2['primary_call_reason']
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
    .str.title()
    .replace({
        'Products & Services': 'Products And Services',
        'Check In': 'Check-In',
        'Post Flight': 'Post-Flight',
    })
)

# View the cleaned data
print(df2['primary_call_reason'].value_counts())

primary_call_reason
Irrops                   13311
Voluntary Change         10848
Seating                   6365
Mileage Plus              5851
Post-Flight               4330
Communications            3840
Products And Services     2856
Baggage                   2832
Upgrade                   2738
Booking                   2637
Check-In                  1904
Checkout                  1888
Voluntary Cancel          1607
Digital Support           1225
Etc                        952
Traveler Updates           937
Other Topics               818
Schedule Change            731
Products and Services      476
Disability                 403
Unaccompanied Minor        104
Name: count, dtype: int64


In [None]:
# Attach each call's reason label to its call record; only call_ids present
# in both tables are kept.
df = df1.merge(df2, how='inner', on='call_id')

In [None]:
# Preview the first ten merged rows.
df.head(n=10)

Unnamed: 0,call_id,customer_id,agent_id,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,primary_call_reason
0,4667960400,2033123310,963118,7/31/2024 23:56,8/1/2024 0:03,8/1/2024 0:34,\n\nAgent: Thank you for calling United Airlin...,Voluntary Cancel
1,1122072124,8186702651,519057,8/1/2024 0:03,8/1/2024 0:06,8/1/2024 0:18,\n\nAgent: Thank you for calling United Airlin...,Booking
2,6834291559,2416856629,158319,7/31/2024 23:59,8/1/2024 0:07,8/1/2024 0:26,\n\nAgent: Thank you for calling United Airlin...,Irrops
3,2266439882,1154544516,488324,8/1/2024 0:05,8/1/2024 0:10,8/1/2024 0:17,\n\nAgent: Thank you for calling United Airlin...,Upgrade
4,1211603231,5214456437,721730,8/1/2024 0:04,8/1/2024 0:14,8/1/2024 0:23,\n\nAgent: Thank you for calling United Airlin...,Seating
5,5297766997,5590154991,817160,8/1/2024 0:11,8/1/2024 0:16,8/1/2024 0:40,\n\nAgent: Thank you for calling United Airlin...,Mileage Plus
6,324593040,6774865122,519057,8/1/2024 0:08,8/1/2024 0:21,8/1/2024 0:34,\n\nAgent: Thank you for calling United Airlin...,Checkout
7,8902603117,7974326984,488324,8/1/2024 0:13,8/1/2024 0:21,8/1/2024 0:29,\n\nAgent: Thank you for calling United Airlin...,Mileage Plus
8,7222687732,8023417234,957331,8/1/2024 0:14,8/1/2024 0:22,8/1/2024 0:35,\n\nAgent: Thank you for calling United Airlin...,Mileage Plus
9,4113684017,1528835057,158319,8/1/2024 0:20,8/1/2024 0:28,8/1/2024 0:37,\n\nAgent: Thank you for calling United Airlin...,Irrops


In [None]:
def preprocess_text(text):
    """Normalise a transcript into a space-separated string of content words.

    Steps: lower-case, delete every character that is not ``a``-``z`` or
    whitespace, split on whitespace, and drop tokens found in the
    module-level ``stop_words`` set.

    Args:
        text: Raw transcript. Non-string values (for example the NaN that
            pandas produces for a missing cell, or ``None``) are treated as
            an empty transcript instead of raising ``AttributeError``.

    Returns:
        The cleaned text; ``''`` when nothing survives or the input is not
        a string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Characters are deleted, not replaced by a space, so "check-in"
    # becomes the single token "checkin".
    text = re.sub(r'[^a-z\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Clean every transcript with the same routine that is applied to unseen
# text at prediction time (see the end of this cell).
df['cleaned_transcript'] = df['call_transcript'].apply(preprocess_text)

X = df['cleaned_transcript']

# Encode the string labels as integers; le.classes_ keeps the mapping back.
le = LabelEncoder()
y = le.fit_transform(df['primary_call_reason'])

# The classes are heavily imbalanced, so stratify the split to keep each
# call reason represented in the same proportion in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the vocabulary on the training split only, then reuse it for test data.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# n_jobs=-1 builds the trees on all cores; with a fixed random_state the
# fitted model is unchanged.
rf_model = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1
)
rf_model.fit(X_train_tfidf, y_train)

y_pred = rf_model.predict(X_test_tfidf)

# Passing labels explicitly keeps target_names aligned with the encoder even
# if a class is missing from the test split. zero_division=0 reports 0.0 for
# classes that were never predicted instead of emitting a warning per metric.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))

# New text must go through the same preprocessing as the training data
# before it is vectorized.
new_call_transcript = ["Hi, I'm calling because my flight was delayed by 3 hours."]
X_new = vectorizer.transform([preprocess_text(t) for t in new_call_transcript])
predicted_class = le.inverse_transform(rf_model.predict(X_new))
print("Predicted Reason:", predicted_class[0])

                       precision    recall  f1-score   support

              Baggage       0.00      0.00      0.00       604
              Booking       0.00      0.00      0.00       513
             Check-In       0.00      0.00      0.00       359
             Checkout       0.00      0.00      0.00       384
       Communications       0.00      0.00      0.00       757
      Digital Support       0.00      0.00      0.00       255
           Disability       0.00      0.00      0.00        86
                  Etc       0.00      0.00      0.00       197
               Irrops       0.22      0.65      0.32      2763
         Mileage Plus       0.08      0.01      0.02      1130
         Other Topics       0.00      0.00      0.00       174
          Post-Flight       0.08      0.01      0.01       848
Products And Services       0.00      0.00      0.00       572
Products and Services       0.00      0.00      0.00        86
      Schedule Change       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
