# Facebook Messenger Data
Hi all, this is my facebook messenger data parsing document.

In [2]:
import os, json
import pandas as pd

rootdir = './messages/inbox'
paths = []
for root, dirs, files in os.walk(rootdir):
    for name in files:
        if name.endswith((".json")):
            full_path = os.path.join(root, name)
            paths.append(full_path)
print(f'''
total: {len(paths)} files
[0]: {paths[0]}
[1]: {paths[1]}
...
      ''')



total: 1119 files
[0]: ./messages/inbox/stszuoyesan_igm9kgo29q-1/message_1.json
[1]: ./messages/inbox/yurong_lvvfeb8qfw-1/message_1.json
...
      


# Format
the message json file is formated like this:
Some tips can be found [here](https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/).

```json
{
    "participatns": [
        {
            "name": "Vibert Thio"
        },
    ...],
    "messages": [
        {
            "sender_id_INTERNAL": "",
            "sender_name": "Vibert Thio",
            "timestamp_ms": 1434350607403,
            "content": "\u00e6\u0088...",
            "type": "Generic"
        },
        ...
    ],
    "title": "...",
    "is_still_participant": true,
    "thread_type": "Regular", // "RegularGroup"
    "thread_path": "inbox/..."
}
```

In [3]:
import json

chatrooms = []
for path in paths:
    with open(path, encoding='utf-8') as json_file:
        data = json.load(json_file)
        chatrooms.append(data)

print(f'''
chatrooms[0] participants:
{chatrooms[0]['participants']}
''')


chatrooms[0] participants:
[{'name': 'è¨±å\x93²æ¦\x95'}, {'name': 'Vibert Thio'}, {'name': 'æ\x9d\x8eæ\x80¡å\x9d¤'}]



# Inspect inside the data

`encode('latin-1').decode('utf-8)`

This is a hack (found [this](https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded) on SO) to make chinese readable because facebook messed up the encoding.

Fields that need to be transform this way include:

```python
chatrooms['participatns'][0]['name']
chatrooms['messages'][0]['sender_name']
chatrooms['messages'][0]['content']
chatrooms['title']
chatrooms['thread_path']
```


In [4]:
people = chatrooms[1]['participants']
for i in range(len(people)):
    print(f'''[{i}]{people[i]['name'].encode('latin-1').decode('UTF-8')}''')

[0]宇容
[1]Vibert Thio


In [5]:
def transformFromLatinToUTF8 (string):
    return string.encode('latin-1').decode('UTF-8')

shinyi_chatrooms = []
for chatroom in chatrooms:
    if 'participants' in chatroom.keys():
        participants = chatroom['participants']
        for person in participants:
            person['name'] = transformFromLatinToUTF8(person['name'])
            if person['name'] == '何昕逸':
                shinyi_chatrooms.append(chatroom)
    if 'messages' in chatroom.keys():
        messages = chatroom['messages']
        for message in messages:
            if 'sender_name' in message.keys():
                message['sender_name'] = transformFromLatinToUTF8(message['sender_name'])
            if 'content' in message.keys():
                message['content'] = transformFromLatinToUTF8(message['content'])
    if 'title' in chatroom.keys():
        chatroom['title'] = transformFromLatinToUTF8(chatroom['title'])
    if 'thread_path' in chatroom.keys():
        chatroom['thread_path'] = transformFromLatinToUTF8(chatroom['thread_path'])
            

I found that the order of the chatrooms is kinda chaos. They are not sorted by any obvious parameters, such as the created time. Also, everytime the number of messages exceeds 10000, the database will create a new chatroom.

In [37]:
from datetime import datetime

total_messages_count = 0
for chatroom in shinyi_chatrooms:
    messages = chatroom['messages']
    count = len(messages)
    print(f'#messages count: {count}')
#     print(chatroom['title'])
#     for i in range(4):
#         if 'timestamp_ms' in messages[i].keys():
#         sec = datetime.fromtimestamp(messages[-i]['timestamp_ms'] / 1000.0) 
#         print(f"{sec} ({messages[-i]['type']})")
#         if 'content' in messages[-i].keys():
#             print(f"{messages[-i]['content']}")
#     print('-' * 20)
    total_messages_count += count
print('---------------------')
print(f'chatrooms count: {len(shinyi_chatrooms)}')
print(f'messages count: {total_messages_count}')

#messages count: 4
#messages count: 10000
#messages count: 10000
#messages count: 10000
#messages count: 6616
#messages count: 10000
#messages count: 570
#messages count: 375
#messages count: 340
#messages count: 201
#messages count: 386
#messages count: 396
#messages count: 12
#messages count: 616
---------------------
chatrooms count: 14
messages count: 49516


# Get all the messages from Shinyi and sort by time

In [32]:
messages = []
for i in range(1, 6):
    messages = messages + shinyi_chatrooms[i]['messages']
messages = sorted(messages, key=lambda message: message['timestamp_ms'])
for m in messages[:2]:
    if 'content' in m.keys():
        print(f'''
{m['sender_name']} {datetime.fromtimestamp(m['timestamp_ms'] / 1000.0).strftime("%Y/%d/%m")}
{m['content']}''')


Vibert Thio 2016/06/01
黑你今天會去subway嘛~？

何昕逸 2016/06/01
應該不會歐～爆炸期末中QQ


In [30]:
import jieba
jieba.set_dictionary('data/dict.txt.big.txt')

In [29]:
count = {}
for m in messages:
    if 'content' in m.keys():
        seg_list = jieba.cut(m['content'])
        for word in seg_list:
            if word in count.keys():
                count[word] = count[word] + 1
            else:
                count[word] = 1

count = {k: v for k, v in sorted(count.items(), key=lambda item: -item[1])}