In [13]:
import pandas as pd
import numpy as np
import nltk
import re
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.data.path.append('C:/Users/Divyanshu/AppData/Roaming/nltk_data')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

customers = pd.read_csv('customers2afd6ea.csv')
reasons = pd.read_csv('reason18315ff.csv')
sentiments = pd.read_csv('sentiment_statisticscc1e57a.csv')
test = pd.read_csv('testbc7185d.csv')

calls = pd.read_csv('callsf0d4f5a.csv')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Divyanshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Divyanshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Divyanshu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [2]:
df = pd.merge(calls, sentiments, on=['call_id', 'agent_id'], how='left')
df = pd.merge(df, reasons, on='call_id', how='left')
df = pd.merge(df, customers, on='customer_id', how='left')

In [6]:
df['call_transcript'] = df['call_transcript'].fillna('')
df['agent_tone'] = df['agent_tone'].fillna('neutral')
df['customer_tone'] = df['customer_tone'].fillna('neutral')
df['average_sentiment'] = df['average_sentiment'].fillna(0)
df['silence_percent_average'] = df['silence_percent_average'].fillna(df['silence_percent_average'].mean())
df['elite_level_code'] = df['elite_level_code'].fillna(0)

In [7]:
df['call_start_datetime'] = pd.to_datetime(df['call_start_datetime'])
df['agent_assigned_datetime'] = pd.to_datetime(df['agent_assigned_datetime'])
df['call_end_datetime'] = pd.to_datetime(df['call_end_datetime'])
df['call_duration'] = (df['call_end_datetime'] - df['agent_assigned_datetime']).dt.total_seconds()
df['wait_time'] = (df['agent_assigned_datetime'] - df['call_start_datetime']).dt.total_seconds()

In [None]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
df['clean_transcript'] = df['call_transcript'].fillna('')
transcript_embeddings = bert_model.encode(df['clean_transcript'].tolist(), show_progress_bar=True)
X_text = np.array(transcript_embeddings)

In [14]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [15]:
df['clean_transcript'] = df['call_transcript'].apply(preprocess_text)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_text = tfidf_vectorizer.fit_transform(df['clean_transcript'])

In [17]:
tone_mapping = {
    'angry': -2,
    'frustrated': -1,
    'neutral': 0,
    'calm': 1,
    'polite': 2
}

df['agent_tone_encoded'] = df['agent_tone'].map(tone_mapping)
df['customer_tone_encoded'] = df['customer_tone'].map(tone_mapping)

In [19]:
numerical_features = ['call_duration', 'wait_time', 'average_sentiment', 'silence_percent_average', 'elite_level_code', 'agent_tone_encoded', 'customer_tone_encoded']
X_numeric = df[numerical_features].fillna(0)

In [20]:
from scipy.sparse import hstack

X = hstack([X_text, X_numeric])

In [21]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['primary_call_reason'])

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

In [24]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [27]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
labels_in_y_val = np.unique(y_val)
print(classification_report(y_val, y_pred, labels=labels_in_y_val, target_names=label_encoder.classes_[:len(labels_in_y_val)]))

Accuracy: 0.28287610824861903
                         precision    recall  f1-score   support

                Baggage       0.00      0.00      0.00        27
                Baggage       0.20      0.00      0.00       845
              Baggage         0.00      0.00      0.00        31
                Booking       0.23      0.01      0.02       733
              Booking         0.00      0.00      0.00        14
               Check In       0.00      0.00      0.00       119
               Check-In       0.00      0.00      0.00       454
             Check-In         0.00      0.00      0.00         9
               Checkout       0.71      0.01      0.02       550
             Checkout         0.00      0.00      0.00        19
         Communications       0.22      0.00      0.01      1083
       Communications         0.00      0.00      0.00        20
      Digital   Support       0.00      0.00      0.00        43
       Digital  Support       0.00      0.00      0.00     