In [None]:
import email.utils
import mailbox
import os
from datetime import datetime, timezone

import exifread
import pandas as pd
import pytz
from ipywidgets import IntProgress, IntText

In [None]:
# Location of the mailbox export to analyse.
filename = "tgthorley.mbox"
filepath = './data/test/'

# create=False: mailbox.mbox defaults to create=True, which silently creates
# an empty mailbox when the path is wrong. Failing loudly here avoids a run
# that "succeeds" after processing zero messages.
mbox = mailbox.mbox(filepath + filename, create=False)

In [None]:
def create_email_summary(message):
    """Build a small summary dict for one e-mail message.

    Parameters
    ----------
    message
        Object supporting header lookup via ``message[name]`` for the
        ``Date``, ``Message-ID`` and ``Subject`` headers.

    Returns
    -------
    dict
        ``email_date``: timezone-aware ``datetime`` in UTC, or ``None`` when
        the ``Date`` header is missing or cannot be parsed;
        ``_id``: the ``Message-ID`` header; ``subject``: the ``Subject`` header.
    """
    emaildate = None
    raw_date = message['Date']
    if raw_date is not None:
        try:
            parsed = email.utils.parsedate_to_datetime(str(raw_date))
        except (TypeError, ValueError):
            # Older Pythons raise TypeError, newer ones ValueError, for a
            # Date header that cannot be parsed.
            parsed = None
        if parsed is not None:
            if parsed.tzinfo is None:
                # A "-0000" zone yields a naive datetime that already
                # represents UTC; astimezone() would otherwise reinterpret it
                # in the local timezone of the machine running the notebook.
                parsed = parsed.replace(tzinfo=timezone.utc)
            emaildate = parsed.astimezone(timezone.utc)
    return {
        "email_date": emaildate,
        "_id": message['Message-ID'],
        "subject": message['Subject'],
    }

In [None]:
def create_file(part):
    """Save an attachment part to ``./attachements/`` and return its path.

    Parameters
    ----------
    part
        Message part providing ``get_filename()`` and
        ``get_payload(decode=True)``.

    Returns
    -------
    str
        Path of the written file (``"./attachements/<name>"``), or ``""``
        when the part has no usable filename or payload, or the file could
        not be written.
    """
    filename = part.get_filename()
    payload = part.get_payload(decode=True)
    if not filename or payload is None:
        return ""

    # The filename comes from the e-mail and is untrusted: keep only its last
    # path component so values like "../../x" cannot escape the output folder.
    # Backslashes are normalised first so Windows-style paths are handled too.
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        return ""

    filepath = "./attachements/" + safe_name
    try:
        # Create the output folder on first use instead of failing silently.
        os.makedirs("./attachements/", exist_ok=True)
        with open(filepath, 'wb') as fb:
            fb.write(payload)
    except OSError:
        return ""
    return filepath

In [None]:
def parse_img(imgpath):
    """Take an image path and return its EXIF data.

    Parameters
    ----------
    imgpath
        Path of the image file to read.

    Returns
    -------
    The result of ``exifread.process_file`` for the opened file, or an empty
    list when the file cannot be opened or processed (a message is printed in
    that case).
    """
    try:
        # The context manager closes the handle even if exifread raises.
        with open(imgpath, 'rb') as f:
            return exifread.process_file(f)
    except Exception:
        # Not a bare ``except``: interrupting the kernel must still work.
        print("Failed to extract exif %s" % imgpath)
        # Empty list kept as the failure value for backward compatibility.
        return []

In [None]:
def getexif_from_attached_image(image):
    """Take a message part holding an attached image and return its EXIF data.

    The part is first written to disk with ``create_file`` and the resulting
    file is then handed to ``parse_img``.

    Parameters
    ----------
    image
        Message part providing ``get_filename()``; passed on to
        ``create_file``.

    Returns
    -------
    Whatever ``parse_img`` returns for the saved file, or ``{}`` when the
    attachment could not be saved (a message is printed in that case).
    """
    try:
        filepath = create_file(image)
        # create_file signals failure with an empty path; do not try to parse it.
        if filepath:
            return parse_img(filepath)
    except Exception:
        # Fall through to the shared failure report below.
        pass
    # str() keeps the report itself from failing when the part has no filename.
    print ("File not found", "./attachements/"+str(image.get_filename()))
    return {}

In [None]:
def compare_sent_time_and_img_creation(tags, message):
    """Compare an e-mail's sent time with the capture time of an attached image.

    Parameters
    ----------
    tags
        Container of EXIF tags; the ``"EXIF DateTimeOriginal"`` entry, when
        present, must expose the timestamp text as ``.values`` in the form
        ``YYYY:MM:DD HH:MM:SS``.
    message
        Message passed to ``create_email_summary``; its ``Message-ID`` and
        ``Subject`` headers are used in printed messages.

    Returns
    -------
    dict
        The e-mail summary, extended with ``img_date`` and ``timediff``
        (sent time minus capture time) when the EXIF timestamp is present.
        ``{}`` when anything fails, including a missing Date header; an
        ``Error`` line is printed in that case.
    """
    seconds_per_day = 86400
    try:
        email_sum = create_email_summary(message)
        # Reuse the date parsed for the summary instead of parsing it twice.
        emaildate = email_sum["email_date"]
        if emaildate is None:
            raise ValueError("message has no usable Date header")
        if "EXIF DateTimeOriginal" in tags:
            # NOTE(review): the EXIF timestamp carries no timezone, so
            # astimezone() interprets it in the local timezone of the machine
            # running the notebook - confirm that matches the camera's zone.
            imgdate = datetime.strptime(
                tags["EXIF DateTimeOriginal"].values, '%Y:%m:%d %H:%M:%S'
            ).astimezone(timezone.utc)
            timediff = emaildate - imgdate
            # A positive difference means the image was taken BEFORE the mail
            # was sent; the warning text previously said "after".
            if 0 < timediff.total_seconds() < seconds_per_day:
                print ('WARNING: Image taken less than a day before sending\n', message['Message-ID'], message['Subject'] )
            email_sum["img_date"] = imgdate
            email_sum["timediff"] = timediff

        return email_sum
    except Exception:
        # Not a bare ``except``: interrupting the kernel must still work.
        print("Error", message['Subject'])
        return {}


In [None]:
# Collect one summary dict per message; messages without an image attachment
# contribute an empty dict.
emails = []

# Progress widgets: a bar plus a numeric counter, both sized to the mailbox.
message_count = len(mbox)
widget_style = {'description_width': 'initial'}
progressBar = IntProgress(
    description='Processing emails ' + str(message_count),
    min=0,
    max=message_count,
    style=widget_style,
)
progressText = IntText(
    value=0,
    description='Processed so far: ',
    style=widget_style,
)
display(progressBar, progressText)

i = 0
for i, message in enumerate(mbox, start=1):
    email_sum = {}
    if message.get_content_maintype() == 'multipart':
        for part in message.walk():
            if 'image' not in part.get_content_type():
                continue
            # Pull the EXIF tags from the attachment and merge the comparison
            # result into this message's summary.
            tags = getexif_from_attached_image(part)
            email_sum.update(compare_sent_time_and_img_creation(tags, message))
    emails.append(email_sum)
    # Keep both widgets in step with the number of messages handled so far.
    progressBar.value = progressText.value = i
            

In [None]:
df = pd.DataFrame(emails)
# "timediff" only exists when at least one message carried an image with an
# EXIF capture time; sorting by a missing column would raise KeyError, so
# show the unsorted frame in that case.
df.sort_values("timediff") if "timediff" in df.columns else df