In [None]:
%%capture
!pip install -r requirements.txt

In [None]:
from clearml import Task, Dataset

# Names under which this run is registered in ClearML. `project_name` is
# reused further down when the dataset is looked up.
task_name = "02-exploratory-data-analysis"
project_name = 'userxx/CustomerSupport'

task = Task.init(
    project_name=project_name,
    task_name=task_name,
)

# Optional remote-execution settings, currently disabled:
#Task.add_requirements("./requirements.txt")
#task.set_base_docker(docker_image="nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04")
#task.execute_remotely(queue_name="q-group-a-cpu-050")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Fetch the dataset registered in ClearML and load its CSV into `df`.
dataset_name = "bitext-customer-support"
dataset = Dataset.get(
    dataset_project=project_name,
    dataset_name=dataset_name,
    only_completed=True,
    only_published=False,
)
dataset_path = dataset.get_local_copy()
files = dataset.list_files()
print(dataset_path)

# Fail with a clear message instead of an opaque IndexError on files[0].
if not files:
    raise FileNotFoundError(
        f"Dataset '{dataset_name}' in project '{project_name}' contains no files"
    )

# Prefer an actual CSV over blindly taking the first listed entry; fall back
# to the first file so a single-file dataset without a .csv suffix still loads.
csv_files = [name for name in files if name.lower().endswith(".csv")]
data_file = csv_files[0] if csv_files else files[0]

# os.path.join avoids hand-built separators (and doubled slashes).
df = pd.read_csv(os.path.join(dataset_path, data_file))

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Display pandas' default summary statistics for the frame.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Bar chart with the number of rows per category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='category', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # turn the tick labels vertical
ax.set_title("Distribution of Category")
plt.show()

In [None]:
# Bar chart with the number of rows per intent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='intent', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # turn the tick labels vertical
ax.set_title("Distribution of Intent")
plt.show()

In [None]:
# Character count of every response. `.str.len()` is the vectorised form of
# `apply(len)` and yields NaN for a missing response instead of raising
# "TypeError: object of type 'float' has no len()".
df['response_length'] = df['response'].str.len()

# Histogram of the response lengths across the whole dataset.
plt.figure(figsize=(10, 6))
sns.histplot(df['response_length'], bins=50)
plt.title("Distribution of Response Lengths")
plt.show()

In [None]:
# NOTE(review): this cell draws the same "Distribution of Intent" chart as an
# earlier cell in this notebook; consider removing one of the two.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='intent', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # turn the tick labels vertical
ax.set_title("Distribution of Intent")
plt.show()

In [None]:
# Summary statistics of response length for each intent.
# The list order fixes the column order of the resulting table.
summary_funcs = ['mean', 'min', 'max', 'median', 'std']
lengths_by_intent = df.groupby('intent')['response_length']
intent_length_stats = lengths_by_intent.agg(summary_funcs)

# Last expression of the cell, so the table is rendered.
intent_length_stats

- Mean: The average length of responses for each intent.
- Min & Max: The shortest and longest response lengths recorded.
- Median: The middle value, offering insight into the typical response length.
- Standard Deviation (std): Measures variation in response length, indicating consistency or diversity in responses.

In [None]:
# Box plot of the response-length distribution for each intent.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='intent', y='response_length', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # turn the tick labels vertical
ax.set_title("Response Length Distribution by Intent")
ax.set_xlabel("Intent")
ax.set_ylabel("Response Length")
plt.show()

##### Observation
- Complex inquiries (e.g., check_refund_policy, cancel_order) tend to have longer responses, suggesting detailed explanations.
- Straightforward queries (e.g., track_order, check_cancellation_fee) generally have shorter responses, indicating concise replies.


In [None]:
# Summary statistics of response length for each category.
# The list order fixes the column order of the resulting table.
length_stats = ['mean', 'min', 'max', 'median', 'std']
lengths_by_category = df.groupby('category')['response_length']
category_length_stats = lengths_by_category.agg(length_stats)
category_length_stats

In [None]:
# Box plot of the response-length distribution for each category.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='category', y='response_length', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # turn the tick labels vertical
ax.set_title("Response Length Distribution by Category")
ax.set_xlabel("Category")
ax.set_ylabel("Response Length")
plt.show()

##### Observation

- Complex interactions, such as REFUND (mean: 913.54) and ACCOUNT (mean: 707.84), have longer responses, likely due to detailed explanations required.
- Shorter responses are seen in CANCEL (mean: 260.87), indicating straightforward resolutions.
- High standard deviation values, such as in REFUND (std: 566.09) and ORDER (std: 356.25), suggest significant variation in response length, potentially due to differing levels of query complexity within these categories.
- DELIVERY (mean: 609.99, std: 267.13) and PAYMENT (mean: 664.40, std: 214.24) exhibit moderate response lengths with some variability, indicating the need for flexibility in handling customer inquiries.

#### Text Polarity

In [None]:
from textblob import TextBlob
import nltk
# Download the NLTK 'punkt' resource.
# NOTE(review): presumably needed by TextBlob's tokenisation — confirm it is
# actually required for the sentiment call used below.
nltk.download('punkt')

def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The value to score. Non-string values are converted with
            ``str()`` before being passed to TextBlob.

    Returns:
        The polarity as a float, or NaN when ``text`` is missing
        (``None``/NaN/``pd.NA``).
    """
    # A missing value must stay missing: str(NaN) is the literal text "nan",
    # which would be scored as if it were a real response and skew the
    # per-category / per-intent averages. pandas aggregations skip NaN.
    if pd.isna(text):
        return float("nan")
    return TextBlob(str(text)).sentiment.polarity

In [None]:
# Score every response with get_polarity and keep the result as a new column.
df["response_polarity"] = df["response"].map(get_polarity)

In [None]:
# Mean response polarity per category, with `category` kept as a regular column.
category_sentiment = df.groupby("category", as_index=False)["response_polarity"].mean()
category_sentiment

In [None]:
# Mean response polarity per intent, with `intent` kept as a regular column.
intent_sentiment = df.groupby("intent", as_index=False)["response_polarity"].mean()
intent_sentiment

In [None]:
# Histogram of all polarity scores with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df["response_polarity"], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Response Polarity")
ax.set_xlabel("Polarity Score")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Descriptive statistics of the polarity score within each category.
polarity_by_category = df.groupby("category")["response_polarity"]
category_polarity = polarity_by_category.describe()
category_polarity

In [None]:
# Descriptive statistics of the polarity score within each intent.
polarity_by_intent = df.groupby("intent")["response_polarity"]
intent_polarity = polarity_by_intent.describe()
intent_polarity

In [None]:
# Box plot of the polarity distribution for each category.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x="category", y="response_polarity", ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # turn the tick labels vertical
ax.set_title("Polarity Distribution by Category")
plt.show()


In [None]:
# Box plot of the polarity distribution for each intent.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x="intent", y="response_polarity", ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # turn the tick labels vertical
ax.set_title("Polarity Distribution by Intent")
plt.show()

In [None]:
# Close the ClearML task that was opened with Task.init at the top of the notebook.
task.close()