In [18]:
from pymongo import MongoClient
import pandas as pd

In [19]:
client = MongoClient('mongodb://localhost:27017/')
db = client['flask_db']

In [21]:
def extract_user_info():
    collection = db['chat_client_info']
    data = list(collection.find({}, {'_id': 0, "session_id": 1, "client_id": 1, "product": 1, "grateful": 1, "ranting": 1, "expression": 1, "civil": 1}))
    df = pd.DataFrame(data)
    df.to_csv('user_info.csv', index=False)
    return df

def extract_chat_history():
    collection = db['chat_history_collection']
    data = list(collection.find({}, {'_id': 0, "session_id": 1, "client_id": 1, "turn_number": 1, "sender": 1, "receiver": 1, "message": 1, "timestamp": 1}))
    df = pd.DataFrame(data)
    df.to_csv('chat_history.csv', index=False)
    return df

def extract_user_feedback():
    collection = db['chat_emo_feedback']
    data = list(collection.find({}, {'_id': 0, "session_id": 1, "client_id": 1, "turn_number": 1, "support_type": 1, "support_content": 1, "timestamp_arrival": 1, "timestamp_feedback": 1, "user_feedback": 1}))
    df = pd.DataFrame(data)
    df.to_csv('user_feedback.csv', index=False)
    return df

In [22]:
# export csv run functions

if __name__ == "__main__":
    user_info_df = extract_user_info()
    chat_history_df = extract_chat_history()
    user_feedback_df = extract_user_feedback()

In [23]:
# Analyze the value from csv table

user_info_df = pd.read_csv('user_info.csv')
chat_history_df = pd.read_csv('chat_history.csv')
user_feedback_df = pd.read_csv('user_feedback.csv')


In [9]:
chat_history_df

Unnamed: 0,session_id,client_id,turn_number,sender,receiver,message,timestamp
0,44f5b12c-f174-4be2-a48f-a83d87b2df2e,7f0046bc-a96a-4c76-a313-c09f36acf7cb,1,representative,client,"Hi, I recently stayed at your hotel and I was ...",2024-07-01 21:05:49.438
1,44f5b12c-f174-4be2-a48f-a83d87b2df2e,e6b6b40b-6b2a-4032-bd13-a84a73e0dbe7,1,representative,client,"Sorry to bother you, but I just checked into m...",2024-07-01 21:05:56.795
2,44f5b12c-f174-4be2-a48f-a83d87b2df2e,e6b6b40b-6b2a-4032-bd13-a84a73e0dbe7,1,client,representative,hello! sorry for the inconvenience caused,2024-07-01 21:06:30.649
3,44f5b12c-f174-4be2-a48f-a83d87b2df2e,e6b6b40b-6b2a-4032-bd13-a84a73e0dbe7,2,representative,client,"Sorry to bother you, but I just checked into m...",2024-07-01 21:06:30.649
4,44f5b12c-f174-4be2-a48f-a83d87b2df2e,e6b6b40b-6b2a-4032-bd13-a84a73e0dbe7,2,client,representative,no worries! yes! I am gonna ask my co worker t...,2024-07-01 21:07:15.437
...,...,...,...,...,...,...,...
71,ca488fb6-5e5c-46f3-ac78-33ea49e7a8c8,49da3987-cf0c-4511-8b95-f2ec2fa86631,5,representative,client,You're most welcome! It's always a pleasure to...,2024-07-02 06:21:22.044
72,ca488fb6-5e5c-46f3-ac78-33ea49e7a8c8,49da3987-cf0c-4511-8b95-f2ec2fa86631,5,client,representative,wish you all the best!,2024-07-02 06:21:43.461
73,ca488fb6-5e5c-46f3-ac78-33ea49e7a8c8,49da3987-cf0c-4511-8b95-f2ec2fa86631,6,representative,client,Thank you so much! Wishing you all the best as...,2024-07-02 06:21:43.461
74,ca488fb6-5e5c-46f3-ac78-33ea49e7a8c8,49da3987-cf0c-4511-8b95-f2ec2fa86631,6,client,representative,take care! bye!,2024-07-02 06:21:56.212


In [24]:
# User Info
print(user_info_df.describe())  # Basic statistics
print(user_info_df['product'].value_counts())
print(user_info_df['grateful'].mean())
print(user_info_df['ranting'].mean())
print(user_info_df['expression'].mean())

        grateful  ranting  expression      civil
count  12.000000     12.0        12.0  12.000000
mean    0.250000      1.0         1.0   0.250000
std     0.452267      0.0         0.0   0.452267
min     0.000000      1.0         1.0   0.000000
25%     0.000000      1.0         1.0   0.000000
50%     0.000000      1.0         1.0   0.000000
75%     0.250000      1.0         1.0   0.250000
max     1.000000      1.0         1.0   1.000000
product
hotel      10
airline     2
Name: count, dtype: int64
0.25
1.0
1.0


In [25]:
# Chat History
chat_history_df['timestamp'] = pd.to_datetime(chat_history_df['timestamp'])

# Report average message length for client and for rep
average_msg_length_client = chat_history_df[chat_history_df['sender'] == 'client']['message'].apply(len).mean()
average_msg_length_rep = chat_history_df[chat_history_df['sender'] == 'representative']['message'].apply(len).mean()
print("Average Message Length (Client):", average_msg_length_client)
print("Average Message Length (Rep):", average_msg_length_rep)

# Report the average time of response for client and for rep
response_times = chat_history_df.sort_values(by=['session_id', 'timestamp'])
response_times['response_time'] = response_times.groupby('session_id')['timestamp'].diff().dt.seconds
average_response_time_client = response_times[response_times['sender'] == 'client']['response_time'].mean()
average_response_time_rep = response_times[response_times['sender'] == 'representative']['response_time'].mean()
print("Average Response Time (Client):", average_response_time_client)
print("Average Response Time (Rep):", average_response_time_rep)

# Report the average number of turns
chat_history_df['turn_number'] = pd.to_numeric(chat_history_df['turn_number'], errors='coerce')  # Ensure turn numbers are numeric
max_turns_per_session = chat_history_df.groupby(['client_id', 'session_id'])['turn_number'].max().reset_index()
average_turns = max_turns_per_session.groupby('client_id')['turn_number'].mean()
print("Average Number of Turns per Session:", average_turns.mean())

Average Message Length (Client): 75.0625
Average Message Length (Rep): 159.29545454545453
Average Response Time (Client): 58.46875
Average Response Time (Rep): 6.333333333333333
Average Number of Turns per Session: 3.6666666666666665


In [None]:
# Analysis for User Feedback
average_feedback_by_type = user_feedback_df.groupby('support_type')['user_feedback'].mean()
print(average_feedback_by_type)

In [None]:
# New testing！----Joy

In [16]:
# print(chat_history_df['timestamp'].min())
# print(chat_history_df['timestamp'].max())
# print(chat_history_df['timestamp'])
for timestamp in chat_history_df['timestamp']:
    print(timestamp)

2024-07-01 21:05:49.438000
2024-07-01 21:05:56.795000
2024-07-01 21:06:30.649000
2024-07-01 21:06:30.649000
2024-07-01 21:07:15.437000
2024-07-01 21:07:15.437000
2024-07-01 21:07:48.385000
2024-07-01 21:07:48.385000
2024-07-01 21:08:39.747000
2024-07-01 21:08:39.747000
2024-07-01 21:09:04.233000
2024-07-01 21:09:04.233000
2024-07-01 21:09:27.812000
2024-07-01 21:09:27.812000
2024-07-01 21:10:25.280000
2024-07-01 21:10:30.290000
2024-07-01 21:11:06.393000
2024-07-01 21:11:06.393000
2024-07-01 21:11:44.543000
2024-07-01 21:11:44.543000
2024-07-01 21:13:17.363000
2024-07-01 21:13:17.363000
2024-07-01 21:14:32
2024-07-01 21:14:32
2024-07-01 21:15:31.469000
2024-07-01 21:15:31.469000
2024-07-01 21:16:39.129000
2024-07-01 21:16:39.129000
2024-07-01 21:18:40.695000
2024-07-01 21:18:40.695000
2024-07-01 21:19:33.683000
2024-07-01 21:19:33.683000
2024-07-01 21:20:05.023000
2024-07-01 21:20:05.023000
2024-07-01 21:21:20.799000
2024-07-01 21:21:20.799000
2024-07-01 21:21:59.669000
2024-07-02 06:0

In [49]:
import pandas as pd
from datetime import datetime, timedelta

# analysis of duration and other info depends on client_id
grouped = chat_history_df.groupby('client_id')

chat_history_df['timestamp'] = pd.to_datetime(chat_history_df['timestamp'])
# create a list include duration,message, turns
results_list = []

for client_id, group in grouped:

    duration = group['timestamp'].max() - group['timestamp'].min()
    
    word_count = group['message'].str.split().str.len().sum()
    
    turn_count = group['turn_number'].nunique()
    
    results_list.append({
        'client_id': client_id,
        'conversation_duration': duration,
        'word_count': word_count,
        'turn_count': turn_count
    })

results = pd.DataFrame(results_list)

average_duration = results['conversation_duration'].mean()
average_word_count = results['word_count'].mean()
average_turn_count = results['turn_count'].mean()

# filtrate wrong data
results = results[results['conversation_duration'] >= timedelta(minutes=1)]

results = results[results['turn_count'] >= 2]

# print(results)
# print("-"*30)
print(f"\nAverage conversation duration: {average_duration}")
print(f"\nAverage conversation words: {average_word_count}")
print(f"\nAverage conversation turns: {average_turn_count}")

# analysis userInfo + chatHistory
merged_df = results.merge(user_info_df, on='client_id')

emo_1 = merged_df[merged_df['grateful'] == 1]
emo_0 = merged_df[merged_df['grateful'] == 0]
print("-"*30)
print("\nAverages when grateful=1:")
print(f"Length: {emo_1['word_count'].mean():.2f} words")
print(f"Time: {emo_1['conversation_duration'].dt.total_seconds().mean() / 60:.2f} minutes")
print(f"Turns: {emo_1['turn_count'].mean():.2f}")

print("\nAverages when grateful=0:")
print(f"Length: {emo_0['word_count'].mean():.2f} words")
print(f"Time: {emo_0['conversation_duration'].dt.total_seconds().mean() / 60:.2f} minutes")
print(f"Turns: {emo_0['turn_count'].mean():.2f}")

civil_1 = merged_df[merged_df['civil'] == 1]
civil_0 = merged_df[merged_df['civil'] == 0]

print("\nAverages when civil=1:")
print(f"Length: {civil_1['word_count'].mean():.2f} words")
print(f"Time: {civil_1['conversation_duration'].dt.total_seconds().mean() / 60:.2f} minutes")
print(f"Turns: {civil_1['turn_count'].mean():.2f}")

print("\nAverages when civil=0:")
print(f"Length: {civil_0['word_count'].mean():.2f} words")
print(f"Time: {civil_0['conversation_duration'].dt.total_seconds().mean() / 60:.2f} minutes")
print(f"Turns: {civil_0['turn_count'].mean():.2f}")

#  What is the time to complete post-task survey
user_feedback_df['timestamp_arrival'] = pd.to_datetime(user_feedback_df['timestamp_arrival'])
user_feedback_df['timestamp_feedback'] = pd.to_datetime(user_feedback_df['timestamp_feedback'])

user_feedback_df['survey_completion_time'] = user_feedback_df['timestamp_feedback'] - user_feedback_df['timestamp_arrival']

average_completion_time = user_feedback_df['survey_completion_time'].mean()

print("-"*30)
print(f"Average time of completing survey: {average_completion_time.total_seconds() / 60:.2f} minutes")


Average conversation duration: 0 days 00:02:37.458333333

Average conversation words: 146.0

Average conversation turns: 3.6666666666666665
------------------------------

Averages when grateful=1:
Length: 236.50 words
Time: 3.50 minutes
Turns: 7.00

Averages when grateful=0:
Length: 474.50 words
Time: 12.25 minutes
Turns: 11.00

Averages when civil=1:
Length: 236.50 words
Time: 3.50 minutes
Turns: 7.00

Averages when civil=0:
Length: 474.50 words
Time: 12.25 minutes
Turns: 11.00
------------------------------
Average time of completing survey: 1.11 minutes
