# Messenger Analysis

## Instructions
- Export data from Facebook
- Copy `messages.htm` from the `html` folder of the exported zip into this directory

In [None]:
# Importing stuff
from html.parser import HTMLParser

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%pylab inline

import dateutil.parser

As of 8/1/2016, the format for messages is as follows:

```
<div class="thread">
    Person1, Person2

    <div class="message">
        <div class="message_header">
            <span class="user">Person1</span>
            <span class="meta">Thursday, July 7, 2016 at 6:28pm MST</span>
        </div>
    </div>
    
    <p>message1</p>
    <div class="message">
        <div class="message_header">
            <span class="user">Person2</span>
            <span class="meta">Thursday, July 7, 2016 at 6:29pm MST</span>
        </div>
    </div>
    <p>message2</p>
</div>
```

In [None]:
class MessagesHTMLParser(HTMLParser):
    """Collect messages from a Facebook ``messages.htm`` export.

    The parser remembers which element it is currently inside and appends one
    tuple per message to ``self.messages``::

        (participants, timestamp, sender, text)

    * ``participants`` - first text node after ``<div class="thread">``,
      with surrounding whitespace removed.
    * ``timestamp`` - text of ``<span class="meta">`` parsed with
      ``dateutil.parser.parse``; ``None`` if no such span preceded the message.
    * ``sender`` - text of ``<span class="user">``; ``None`` if missing.
    * ``text`` - all text inside an attribute-less ``<p>`` element, joined
      together (``<br>`` becomes a newline). An empty ``<p></p>`` yields an
      empty string.
    """

    def __init__(self):
        # convert_charrefs=True makes the base class decode entities such as
        # ``&amp;`` and deliver them through handle_data, so they end up in
        # the message text instead of being dropped.
        super().__init__(convert_charrefs=True)
        self.in_thread = False  # next data is the conversation participants
        self.in_user = False  # next data is the sender
        self.in_meta = False  # next data is the timestamp
        self.in_p = False  # data belongs to the current message body

        self.participants = None
        self.sender = None
        self.timestamp = None

        self._message_parts = []  # text chunks of the message being read
        self.messages = []  # list of tuples containing all messages

    def handle_starttag(self, tag, attrs):
        """Update the parser state for an opening tag."""
        if not attrs:
            # Only attribute-less tags are treated as message markup.
            if tag == 'p':
                if self.in_p:
                    # Previous <p> was never closed; do not merge the two.
                    self._flush_message()
                self.in_p = True
            elif tag == 'br' and self.in_p:
                self._message_parts.append('\n')
            return

        if tag == 'div':
            if self._contains_attr(attrs, 'thread'):
                self.in_thread = True
        elif tag == 'span':
            if self._contains_attr(attrs, 'user'):
                self.in_user = True
            elif self._contains_attr(attrs, 'meta'):
                self.in_meta = True

    def handle_endtag(self, tag):
        """Finish the current message when its ``</p>`` is reached."""
        if tag == 'p' and self.in_p:
            self._flush_message()

    def handle_data(self, data):
        """Route a text node to the field selected by the current state."""
        if self.in_thread:
            # The thread text node may be padded with layout whitespace.
            self.participants = data.strip()
            self.in_thread = False
        elif self.in_user:
            self.sender = data
            self.in_user = False
        elif self.in_meta:
            self.timestamp = dateutil.parser.parse(data)
            self.in_meta = False
        elif self.in_p:
            # A message can arrive in several chunks; keep all of them and
            # assemble the text once the element ends.
            self._message_parts.append(data)

    def close(self):
        """Flush buffered input and record a message left open at EOF."""
        super().close()
        if self.in_p:
            self._flush_message()

    def _flush_message(self):
        """Append the buffered message and reset the per-message state."""
        self.messages.append((self.participants,
                              self.timestamp,
                              self.sender,
                              ''.join(self._message_parts)))
        self._message_parts = []
        self.sender = None
        self.timestamp = None
        self.in_p = False

    def _contains_attr(self, attrs, attribute):
        """Return True if ``attribute`` equals a name or value in ``attrs``.

        ``attrs`` is the list of ``(name, value)`` pairs passed to
        ``handle_starttag``.
        """
        return any(attribute in pair for pair in attrs)


In [None]:
# Read in data
# This may take a while depending on the number of conversations

parser = MessagesHTMLParser()
# Open via a context manager so the file handle is closed as soon as the
# contents are read. The encoding is given explicitly so the result does not
# depend on the platform default.
# NOTE(review): assumes the export is UTF-8 encoded - confirm against the file.
with open('messages.htm', encoding='utf-8') as export_file:
    parser.feed(export_file.read())
# Force the parser to process anything it is still buffering at end of input.
parser.close()

# One row per parsed message tuple.
df = pd.DataFrame(parser.messages,
                  columns=['participants', 'timestamp',
                           'sender', 'message'])

In [None]:
# Show the first few rows of the DataFrame (one row per parsed message)
df.head()