In [1]:
import pandas as pd
import numpy as np
import os
import json
# Step one directory up so that the relative 'data/...' paths used further down
# resolve against the parent of the notebook's start-up directory.
# NOTE: this is relative to the *current* directory, so running the cell a
# second time in the same kernel moves up one more level.
os.chdir(os.pardir)

In [2]:
# Folder that retrieve_data() scans, relative to the working directory.
dir_data = os.path.normpath('data/extracted')

In [3]:
def id_formatter(ts):
    """Turn an epoch timestamp into an integer id with 1/100 s resolution.

    The id is the number of hundredths of a second between ``ts`` and the
    fixed offset 1356998400 (2013-01-01 00:00:00 UTC), truncated toward zero.

    The arithmetic is done with ``Decimal`` instead of ``float``: at this
    magnitude a binary float cannot hold the fractional digits exactly, so
    e.g. ``'1356998400.29'`` used to come out as 28 instead of 29.

    Parameters
    ----------
    ts : str, int or float
        Epoch seconds, typically a string such as ``'1500000000.000123'``.

    Returns
    -------
    int
        Hundredths of a second elapsed since the offset.

    Raises
    ------
    ValueError
        If ``ts`` cannot be interpreted as a number.
    """
    # Local import keeps this cell self-contained (no change to the import cell).
    from decimal import Decimal, InvalidOperation

    epoch_offset = 1356998400  # 2013-01-01 00:00:00 UTC, in epoch seconds
    try:
        # str() first, so that floats go through their short repr
        # rather than their exact binary expansion.
        exact_ts = Decimal(str(ts).strip())
    except InvalidOperation:
        # float() raised ValueError for malformed input; keep that contract.
        raise ValueError('could not convert timestamp to a number: {!r}'.format(ts)) from None
    return int((exact_ts - epoch_offset) * 100)

def _ts_to_datetime(raw_ts):
    """Convert a ``'seconds.fraction'`` timestamp string to a whole-second Timestamp."""
    # Cast to int explicitly: recent pandas releases deprecate passing a
    # numeric *string* together with ``unit``.
    return pd.to_datetime(int(raw_ts.split('.')[0]), unit='s')


def parse(message, channel):
    """Flatten one exported chat message into a row dict.

    Field names such as ``thread_ts`` and ``reply_count`` follow Slack's
    message format.

    Parameters
    ----------
    message : dict
        A single decoded message. Must contain ``'user'``, ``'ts'`` and
        ``'text'`` unless it carries a ``'subtype'``.
    channel : str
        Channel label stored unchanged in the row.

    Returns
    -------
    dict or None
        ``None`` for any message that has a ``'subtype'`` key. Otherwise a dict
        with the keys ``msg_id``, ``user_id``, ``channel``, ``timestamp``,
        ``text``, ``main_msg``, ``thread_id``, ``thread_timestamp``,
        ``replies_count`` and ``reactions``.

        * ``main_msg`` is 1 for a message without ``thread_ts`` or one that
          carries ``reply_count``; it is 0 for a message with ``thread_ts``
          but no ``reply_count`` (a reply inside a thread).
        * ``reactions`` maps every reaction name to its count and is ``{}``
          when the message has none. (Previously only the last reaction of
          the list was kept, under fixed ``'name'``/``'count'`` keys.)

    Raises
    ------
    KeyError
        If a required field is missing from ``message``.
    """
    # Messages carrying a 'subtype' are not converted into rows.
    if 'subtype' in message:
        return None

    user_id = message['user']
    text = message['text']

    # message timestamp and the id derived from it
    msg_ts = message['ts']
    msg_id = id_formatter(msg_ts)
    ts = _ts_to_datetime(msg_ts)

    if 'thread_ts' in message:
        # thread timestamp and the id derived from it
        msg_thread_ts = message['thread_ts']
        thread_id = id_formatter(msg_thread_ts)
        thread_ts = _ts_to_datetime(msg_thread_ts)
        # if the message contains a reply count it is the main message of the
        # thread; otherwise it is a reply and its replies_count stays 0
        starts_thread = 'reply_count' in message
        main_msg = 1 if starts_thread else 0
        replies_count = message['reply_count'] if starts_thread else 0
    else:
        # no thread timestamp: the message is main and has no replies, so its
        # thread id and timestamp are its own
        main_msg = 1
        thread_id = msg_id
        thread_ts = ts
        replies_count = 0

    # collect *all* reactions as name -> count
    reactions = {
        reaction['name']: reaction['count']
        for reaction in message.get('reactions', ())
    }

    return {
        'msg_id': msg_id,
        'user_id': user_id,
        'channel': channel,
        'timestamp': ts,
        'text': text,
        'main_msg': main_msg,
        'thread_id': thread_id,
        'thread_timestamp': thread_ts,
        'replies_count': replies_count,
        'reactions': reactions,
    }

In [4]:
def retrieve_data(dir_data):
    """Read every message file below ``dir_data`` and return the parsed rows.

    Each immediate sub-folder of ``dir_data`` is treated as one channel; its
    name is passed to ``parse`` as the channel label. Every regular file in a
    channel folder whose name does not start with ``'.'`` is loaded as a JSON
    list of messages.

    Differences from a plain recursive walk, all intentional:

    * only the first directory level is scanned, so a folder nested inside a
      channel folder is neither opened as a file nor mistaken for a channel;
    * files are decoded as UTF-8 regardless of the platform default;
    * channels and files are visited in sorted name order, so the row order
      does not depend on the filesystem.

    Parameters
    ----------
    dir_data : str
        Directory containing one sub-folder per channel.

    Returns
    -------
    list of dict
        One entry per message for which ``parse`` returned a row. Empty when
        ``dir_data`` does not exist or has no sub-folders.
    """
    rows = []
    # First entry yielded by os.walk lists the top-level sub-folders; a missing
    # directory yields nothing, which is mapped to "no channels".
    _, channels, _ = next(os.walk(dir_data), (dir_data, [], []))
    for channel in sorted(channels):
        print('reading files from {}...'.format(channel))
        channel_dir = os.path.join(dir_data, channel)
        for f_name in sorted(os.listdir(channel_dir)):
            file_path = os.path.join(channel_dir, f_name)
            # skip hidden files and anything that is not a regular file
            if f_name.startswith('.') or not os.path.isfile(file_path):
                continue
            with open(file_path, encoding='utf-8') as json_data:
                messages = json.load(json_data)
            for message in messages:
                parsed = parse(message, channel)
                if parsed is not None:
                    rows.append(parsed)
    return rows

In [5]:
# Parse every message file under dir_data, then tabulate the resulting row
# dicts with an explicit column order.
values = retrieve_data(dir_data)
data = pd.DataFrame(
    data=values,
    columns=[
        'msg_id',
        'user_id',
        'channel',
        'timestamp',
        'text',
        'main_msg',
        'thread_id',
        'thread_timestamp',
        'replies_count',
        'reactions',
    ],
)


In [None]:
# Persist the parsed messages as CSV; the path is relative to the current
# working directory and the DataFrame index is written as the first column
# (to_csv default).
data.to_csv(path_or_buf='data/train_set.csv')