In [5]:
import pandas as pd
import numpy as np
import re

import emoji
import collections as c

# for visualization
import matplotlib.pyplot as plt
import plotly.express as px
# word cloud
from wordcloud import WordCloud, STOPWORDS

In [6]:
def startsWithDateTime(s):
 #   pattern = '^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)(\d{2}|\d{4}), ([0-9])|([0-9]):([0-9][0-9]) '
    pattern = '^([2][0-9][0-9][0-9])+(/)(((0)[0-9])|((1)[0-2]))+(/)([0-2][0-9]|(3)[0-1]), ([0-9])|([0-9]):([0-9][0-9]) '
    result = re.match(pattern, s)
    if result:
        return True
    return False

In [8]:
def startsWithAuthor(s):
    """
        This function is used to verify the string(s) contains 'Author' or not with the help of regular expressions.
        
        Parameters:
            s: String
        
        Returns:
            True if it contains author name otherwise False
    """
    
    pattern = '^([\w()\[\]-]+):|([\w]+[\s]+([\w()\[\]-]+)):'
    result = re.match(pattern, s)
    if result:
        return True
    return False

In [13]:
def getDataPoint(line):
    """
        Use to extract the date, time, author and message from line.
        
        Parameters: 
            line (from txt file)
        
        Returns:
            date, time, author, message        
    """
    try:
        splitLine = line.split(' - ') # splitLine = ['18/06/17, 22:47', 'Loki: Why do you have 2 numbers, Banner?']
    
        dateTime = splitLine[0] # dateTime = '18/06/17, 22:47'
    except:
        splitline = line.split(' ')
        dateTime = splitLine[0] # dateTime = '18/06/17, 22:47'
        pass
    
    date, time = dateTime.split(', ') # date = '18/06/17'; time = '22:47'
    
    message = ' '.join(splitLine[1:]) # message = 'Loki: Why do you have 2 numbers, Banner?'
    
    if startsWithAuthor(message): # True
        splitMessage = message.split(': ') # splitMessage = ['Loki', 'Why do you have 2 numbers, Banner?']
        author = splitMessage[0] # author = 'Loki'
        message = ' '.join(splitMessage[1:]) # message = 'Why do you have 2 numbers, Banner?'
    else:
        author = None
    return date, time, author, message

In [14]:
def read_data(file_contents):
    """
        This function is use to return the extracted data from txt file.
        
        Parameters:
            file_contents -> line by line contents from txt chat file
            
        Returns:
            data -> list of list having elements as date, time, author and message by the user.
    """
    
    data = [] # List to keep track of data so it can be used by a Pandas dataframe
    
    messageData = [] # to capture intermediate output for multi-line messages
    date, time, author = None, None, None # Intermediate variables to keep track of the current message being processed
    
    for line in file_contents:
        line = line.strip() # Guarding against erroneous leading and trailing whitespaces

        if startsWithDateTime(line): # If a line starts with a Date Time pattern, then this indicates the beginning of a new message
            if len(messageData) > 0: # Check if the message buffer contains characters from previous iterations
                data.append([date, time, author, ' '.join(messageData)]) # Save the tokens from the previous message in data
            messageData.clear() # Clear the messageData so that it can be used for the next message
            date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
            messageData.append(message) # Append message
        else:
            messageData.append(line) # If a line doesn't start with a Date Time pattern, then it is part of a multi-line message. So, just append to messageData
    return data

In [15]:
def return_df(filename):
    """
        This function reads the txt file of chat and return in dataframe format having columns Date, Time, Author and Message.
    """
    with open(filename) as f:
        file_contents = [x.rstrip() for x in f]
        
    return pd.DataFrame(read_data(file_contents), columns=['Date', 'Time', 'Author', 'Message'])

In [16]:
filename="test123.txt"
data = return_df(filename)
data["Date"] = pd.to_datetime(data["Date"])
data.head()

Unnamed: 0,Date,Time,Author,Message
