In [4]:
from typing import List
from tqdm import tqdm
import pandas as pd
from datetime import datetime
from collections import Counter
import os

In [2]:
args = {'filename': r'data\ubuntu dialogue\Ubuntu-dialogue-corpus\dialogueText.csv',
        'question_length_threshold': 20,
        'answer_length_threshold': 20}

In [5]:
class Dialogue:
    id = 0,
    utterances = [],
    questionCount = 0,
    combinedQuestionWordLength = 0,
    combinedAnswerWordLength = 0,
    answerCount = 0

    def __init__(self, id):
        self.id = id
        self.utterances = []
        self.questionCount = 0
        self.combinedQuestionWordLength = 0
        self.combinedAnswerWordLength = 0
        self.answerCount = 0

class Utterance:
    text = "",
    dialogueId = -1,
    fromUser = "",
    isQuestion = False,
    isAnswer = False

    def __init__(self, text, dialogueId, fromUser):
        self.text = text
        self.dialogueId = dialogueId
        self.fromUser = fromUser
        self.isQuestion = False
        self.isAnswer = False

#region Logger Setup

# Log to console, and to a timestamped log file
def Log(text):    
    print(text)
    with open(logFilePath, 'a') as f:
        f.write(datetime.now().strftime("%H:%M:%S") + " " + text + "\n")

#endregion

#region Setup

fileName = args['filename']
questionWordThreshold = args['question_length_threshold']
answerWordThreshold = args['answer_length_threshold']

if not os.path.exists("./output"):
    os.makedirs("./output")

startTime = datetime.now()
logFileName = startTime.strftime("%Y%m%d_%H%M%S.txt")
logFilePath = "./output/{logFileName}".format(logFileName = logFileName)

totalQuestionCount = 0
totalAnswerCount = 0
total3TurnDialogueCount = 0
totalNon3TurnDialogueCount = 0
totalLTEQuestionThresholdWord3TurnDialogueCount = 0
totalOverQuestionThresholdWord3TurnDialogueCount = 0
totalLTEAnswerThresholdWord3TurnDialogueCount = 0
totalOverAnswerThresholdWord3TurnDialogueCount = 0
totalDialoguesWithinThreshold = 0

df = (pd.read_csv(fileName))

#endregion

#region Data Handling

def GetUtterances():
    parsedUtterances = []

    for index, row in df.iterrows():
        dialogueId = row["dialogueID"]
        # Strip .tsv from end of dialogue Id
        dialogueId = dialogueId[0:len(dialogueId) - 4]

        # Make unique id from folder and dialogue id
        strUniqueId = "{folderId}{dialogueId}".format(folderId = row["folder"], dialogueId = dialogueId)
        parsedUtterances.append(Utterance(row['text'], int(strUniqueId), row['from']))

    return parsedUtterances

#endregion

#region Data Analysis

# def PerformAnalysis():
#     Log("Proceeding with analysis tasks...")

#     startTime = datetime.now()

#     Log("Commencing analysis of {commentClassDescription} class comments".format(commentClassDescription = commentClass.description))

#     for comment in tqdm(commentClass.comments):
#         TokeniseForAnalysis(comment, commentClass)

#     Log("Spelling corrections required for {count} words".format(count = commentClass.correctedSpellingsCount))
#     Log("Sentence count: {count}".format(count = commentClass.sentenceCount))
#     Log("Token counts - before processing: {preTokensCount}, after processing: {postTokensCount} ".format(
#         preTokensCount = commentClass.preProcessedTokenCount, postTokensCount = commentClass.postProcessedTokenCount))
#     Log("Most commonly-appearing words: {top10}".format(top10 = Counter(commentClass.tokens).most_common(10)))

#     endTime = datetime.now()
#     secondsElapsed = str(endTime - startTime)
#     Log("Finished analysing '{commentClassDescription}' class in {elapsed}".format(
#         commentClassDescription = commentClass.description, elapsed = secondsElapsed))
    
#     Log("Analysis complete.")

# endregion

# region Program Flow

Log("Parsing utterances...")
utterances:List[Utterance] = GetUtterances()
Log("Done. {count} utterances parsed.".format(count = len(utterances)))

Log("Parsing into dialogues")
dialogues:List[Dialogue] = []
dialogue = Dialogue(utterances[0].dialogueId)
lastDialogueId = utterances[0].dialogueId

for u in tqdm(utterances):
    if u.dialogueId != lastDialogueId:
        # Stash the current dialogue and create a new one to work with
        dialogues.append(dialogue)
        dialogue = Dialogue(u.dialogueId)

    # NOTE: Following our meeting 26/7/23, this logic is questionable
    if len(dialogue.utterances) == 0:
        # must the the question, first message
        u.isQuestion = True
    elif len(dialogue.utterances) == 1:
        u.isQuestion = (dialogue.utterances[0].fromUser == u.fromUser)
        u.isAnswer = (dialogue.utterances[0].fromUser != u.fromUser)
    else:
        # third turn cannot be the question
        u.isQuestion = False
        u.isAnswer = True

    dialogue.utterances.append(u)

    if u.isQuestion:
        dialogue.questionCount += 1
        totalQuestionCount += 1
        dialogue.combinedQuestionWordLength += len(str(u.text))

    if u.isAnswer:
        dialogue.answerCount += 1
        totalAnswerCount += 1
        dialogue.combinedAnswerWordLength += len(str(u.text))

    lastDialogueId = u.dialogueId

# now push the final dialogue we were working on
dialogues.append(dialogue)

Log("Parsed utterances into {count} distinct dialogues".format(count = len(dialogues)))

for dialogue in dialogues:
    if len(dialogue.utterances) == 3:
        total3TurnDialogueCount += 1
        if dialogue.combinedQuestionWordLength <= questionWordThreshold:
            totalLTEQuestionThresholdWord3TurnDialogueCount += 1
        else:
            totalOverQuestionThresholdWord3TurnDialogueCount += 1

        if dialogue.combinedAnswerWordLength <= answerWordThreshold:
            totalLTEAnswerThresholdWord3TurnDialogueCount += 1
        else:
            totalOverAnswerThresholdWord3TurnDialogueCount += 1

        if dialogue.combinedQuestionWordLength <= questionWordThreshold and dialogue.combinedAnswerWordLength <= answerWordThreshold:
            totalDialoguesWithinThreshold += 1
    else:
        totalNon3TurnDialogueCount += 1
        Log("Non 3 Turn Dialogue found, Our Unique ID: {id}".format(id=dialogue.id))

Log("There are {count} non-three-turn dialogues".format(count = totalNon3TurnDialogueCount))
Log("There are {count} three-turn dialogues".format(count = total3TurnDialogueCount))
Log("Among the three-turn dialogues, there are {count} with <={threshold} question words (in threshold), and {count2} over threshold"
    .format(count = totalLTEQuestionThresholdWord3TurnDialogueCount, count2 = totalOverQuestionThresholdWord3TurnDialogueCount, threshold=questionWordThreshold))
Log("Among the three-turn dialogues, there are {count} with <={threshold} answer words (in threshold), and {count2} over threshold"
    .format(count = totalLTEAnswerThresholdWord3TurnDialogueCount, count2 = totalOverAnswerThresholdWord3TurnDialogueCount, threshold=answerWordThreshold))
Log("Total Number of Dialogues Falling Within the specified Thresholds: {count}".format(count=totalDialoguesWithinThreshold))

Log("Getting Word Counts")
# Note that we are given the following data from toc.csv in the ubuntu dataset, so no need to get it again
#lines,words,characters,filename
#9212878,91660344,996253904,dialogueText_196.csv
#16587831,166392849,1799936480,dialogueText_301.csv
#1038325,11035331,116070597,dialogueText.csv

# Note, I used 'Counter' in my mid-module assignment to get the most popular words.
# I think you'll need a quick and dirty spaCy tokeniser to rip the utterances list into just a flat list of words.
# It's going to be massive, but once you've done that, you can do Counter(flatListOfWords).most_common(20) 

Log("Analyse.py ceased executing at {now}".format(now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
Log("Shell output logged to {file}".format(file = logFilePath))

Parsing utterances...
Done. 1038324 utterances parsed.
Parsing into dialogues


100%|██████████| 1038324/1038324 [00:01<00:00, 713331.86it/s]


Parsed utterances into 346108 distinct dialogues
There are 0 non-three-turn dialogues
There are 346108 three-turn dialogues
Among the three-turn dialogues, there are 18881 with <=20 question words (in threshold), and 327227 over threshold
Among the three-turn dialogues, there are 60173 with <=20 answer words (in threshold), and 285935 over threshold
Total Number of Dialogues Falling Within the specified Thresholds: 3821
Getting Word Counts
Analyse.py ceased executing at 2023-07-27 01:16:22
Shell output logged to ./output/20230727_011535.txt


In [None]:
from collections import Counter
import os
import spacy
import csv
import pandas as pd

! spacy download en_core_web_lg'
nlp = spacy.load("en_core_web_lg")

In [None]:
 ! pip install -q kaggle
os.environ['KAGGLE_USERNAME'] = "avinashfernando" # username from the json file
os.environ['KAGGLE_KEY'] = "7ba1d8c04bcf3cf93dc66bf86a81f0ab" # key from the json file
!kaggle datasets download -d rtatman/ubuntu-dialogue-corpus
! unzip ubuntu-dialogue-corpus.zip

In [None]:
# Replace 'path_to_dataset.csv' with the actual path to your CSV file
df = pd.read_csv('Ubuntu-dialogue-corpus/dialogueText.csv')

# Display the first few rows of the DataFrame (head)
print(df.head())

Calculate the average, minimum, and maximum length of dialogues in terms of the number of turns (rows) present in each dialogue. This will give an overview of how long the conversations usually are.

In [None]:
# 1. Dialogue Length Analysis
dialogue_lengths = df.groupby("text").size()
average_dialogue_length = dialogue_lengths.mean()
min_dialogue_length = dialogue_lengths.min()
max_dialogue_length = dialogue_lengths.max()
print("Average Dialogue Length:", average_dialogue_length)
print("Minimum Dialogue Length:", min_dialogue_length)
print("Maximum Dialogue Length:", max_dialogue_length)

Analyze the distribution of "from" and "to" fields to identify the most active users in the dataset. This can help in understanding which users are more engaged in the conversations.

In [None]:
# 2. User Interaction Analysis
user_activity = df["from"].value_counts()
most_active_user = user_activity.idxmax()
print("Most Active User:", most_active_user)

Examine the "date" field to determine patterns in dialogue activity over time. This could involve identifying peak hours or days with higher dialogue activity.

In [None]:
# 3. Time-based Analysis
df["date"] = pd.to_datetime(df["date"])  # Convert date column to datetime format
df["hour"] = df["date"].dt.hour
hourly_activity = df["hour"].value_counts()
print("Hourly Activity:\n", hourly_activity)

Investigate how many dialogues are complete (i.e., have both "from" and "to" fields filled for at least two rows) and how many remain incomplete (have only one row or no "to" field). This can provide insights into the quality of the dataset and potential data cleaning needs.

In [None]:
# 4. Dialogue Completion Analysis
complete_dialogues = df.groupby("text").filter(lambda x: x["to"].count() > 1)
incomplete_dialogues = df.groupby("text").filter(lambda x: x["to"].count() <= 1)
print("Number of Complete Dialogues:", len(complete_dialogues["text"].unique()))
print("Number of Incomplete Dialogues:", len(incomplete_dialogues["text"].unique()))

Conduct a word frequency analysis on the "text" field to identify the most common words used by users in their dialogues. This can help in understanding the most prevalent topics of conversation.

In [None]:
# 5. Word Frequency Analysis

# Preprocess the text to remove unwanted characters and convert to lowercase
def preprocess_text(text):
    if isinstance(text, str):  # Check if the value is a string
        text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
        text = text.lower()  # Convert to lowercase
    return text

# Process text with spaCy to obtain lemmatized words and exclude stop words
def process_text_with_spacy(text):
    if isinstance(text, str):  # Check if the value is a string
        doc = nlp(text)
        lemmatized_words = [token.lemma_ for token in doc if not token.is_stop]
        return lemmatized_words
    else:
        return []

# Apply preprocessing and processing to the 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)
df['lemmatized_words'] = df['processed_text'].apply(process_text_with_spacy)

# Flatten the list of lemmatized words for word frequency analysis
all_words = [word for words_list in df['lemmatized_words'] for word in words_list]

# Calculate word frequencies
word_freq = Counter(all_words)

# Top N words and their frequencies (change N to any number you desire)
top_n = 20
top_words, top_word_counts = zip(*word_freq.most_common(top_n))

# Plot the word frequency distribution
plt.figure(figsize=(10, 6))
plt.bar(top_words, top_word_counts)
plt.xticks(rotation=45)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title(f'Top {top_n} Words Frequency')
plt.tight_layout()
plt.show()

To determine the distribution of dialogue types, we need to analyze the patterns in the "from" and "to" fields. We can classify the dialogues into different types based on the presence or absence of "from" and "to" values in each row.

QQA (Question-Question-Answer): A dialogue where a user asks a question, another user responds with a question, and finally, a user provides an answer.
QAA (Question-Answer-Answer): A dialogue where a user asks a question, and two other users provide separate answers.
QA (Question-Answer): A dialogue where a user asks a question, and another user provides an answer.
QQ (Question-Question): A dialogue where two users exchange questions without any direct answers.
A (Answer): A dialogue where a user provides an answer without any preceding questions.

In [None]:
# 6. Distribution Analysis
# Identify dialogue types based on "from" and "to" fields
def classify_dialogue_type(row):
    from_user = row["from"]
    to_user = row["to"]

    if pd.notnull(from_user) and pd.notnull(to_user):
        return "QQA"
    elif pd.notnull(from_user) and pd.isnull(to_user):
        return "QA"
    elif pd.notnull(to_user) and pd.isnull(from_user):
        return "A"
    elif pd.isnull(from_user) and pd.isnull(to_user):
        return "QQ"

# Apply the dialogue type classification to each row
df["dialogue_type"] = df.apply(classify_dialogue_type, axis=1)

# Calculate the distribution of dialogue types
dialogue_type_distribution = df["dialogue_type"].value_counts()

# Display the distribution
print("Dialogue Type Distribution:")
print(dialogue_type_distribution)

Analyzing the distribution of sentence lengths in the dataset involves calculating the length of each sentence (turn of dialogue) in terms of the number of words or characters and then analyzing the distribution of these lengths across the dataset. This can provide insights into the typical sentence length in the dialogues and help identify any patterns or trends.

In [None]:
import pandas as pd

# Step 2: Drop rows with missing text values
df = df.dropna(subset=['text'])

# Step 3: Preprocess the text to calculate sentence lengths
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
df['char_count'] = df['text'].apply(lambda x: len(str(x)))

# Step 4: Save distribution of sentence lengths (Word Count) into a CSV file
word_count_distribution = df['word_count'].value_counts().sort_index()
word_count_distribution.to_csv('word_count_distribution.csv', header=['Frequency'])

# Step 5: Save distribution of sentence lengths (Character Count) into a CSV file
char_count_distribution = df['char_count'].value_counts().sort_index()
char_count_distribution.to_csv('char_count_distribution.csv', header=['Frequency'])

# Step 6: Calculate summary statistics for word count and character count
summary_word_count = df['word_count'].describe()
summary_char_count = df['char_count'].describe()

# Step 7: Save summary statistics for word count and character count into CSV files
summary_word_count.to_csv('word_count_summary.csv', header=True)
summary_char_count.to_csv('char_count_summary.csv', header=True)