In [1]:
import email.utils
import mailbox
import os
from datetime import datetime, timezone

import exifread
import pandas as pd
import pytz
from ipywidgets import IntProgress, IntText

In [2]:
# Location of the mbox archive to analyse.
filename = "tgthorley.mbox"
filepath = './data/test/'

# create=False: by default mailbox.mbox() creates an empty mailbox when the
# path does not exist, so a typo in the path would make the whole analysis
# "succeed" on zero messages. With create=False a missing file raises
# mailbox.NoSuchMailboxError instead.
mbox = mailbox.mbox(filepath + filename, create=False)

In [3]:
def create_email_summary(message):
    """Build a small summary record for one e-mail message.

    Parameters
    ----------
    message : mapping-like
        Any object supporting ``message['Date']``, ``message['Message-ID']``
        and ``message['Subject']``.

    Returns
    -------
    dict
        ``{"email_date": <timezone-aware datetime in UTC>,
        "_id": <Message-ID header>, "subject": <Subject header>}``.

    Raises
    ------
    TypeError or ValueError
        Propagated from ``email.utils.parsedate_to_datetime`` when the
        ``Date`` header is missing or cannot be parsed.
    """
    sent = email.utils.parsedate_to_datetime(message['Date'])
    if sent.tzinfo is None:
        # parsedate_to_datetime() yields a *naive* datetime when the header
        # carries a "-0000" zone (or none at all); such a value represents
        # UTC. Calling astimezone() on it would instead interpret it in the
        # local zone of whatever machine runs the notebook.
        emaildate = sent.replace(tzinfo=timezone.utc)
    else:
        emaildate = sent.astimezone(timezone.utc)

    return {
        "email_date": emaildate,
        "_id": message['Message-ID'],
        "subject": message['Subject'],
    }

In [4]:
def create_file(part):
    """Save an attachment to ``./attachements/`` and return the path written.

    Parameters
    ----------
    part : email.message.Message-like
        Must provide ``get_filename()`` and ``get_payload(decode=True)``.

    Returns
    -------
    str
        ``"./attachements/<name>"`` on success, or ``""`` when the part has no
        usable filename, no decodable payload, or the file cannot be written.
        The function is best-effort and does not raise for those cases.
    """
    try:
        raw_name = part.get_filename()
        if not raw_name:
            return ""

        # The filename comes from the e-mail itself and is untrusted: keep
        # only the last path component so a name such as "../../x" cannot
        # escape the attachment directory. Backslashes are normalised first
        # so Windows-style paths are reduced as well.
        safe_name = os.path.basename(str(raw_name).replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            return ""

        # Decode before opening the file so a part without a payload does not
        # leave an empty file behind.
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""

        filepath = "./attachements/" + safe_name
        os.makedirs("./attachements/", exist_ok=True)
        # The context manager closes the handle even if write() fails.
        with open(filepath, 'wb') as fb:
            fb.write(payload)
        return filepath
    except Exception as exc:
        # Best-effort: report the reason and signal failure with "".
        print("Failed to save attachment: %s" % exc)
        return ""

In [5]:
# Takes an image path and returns its EXIF data.
def parse_img(imgpath):
    """Read the EXIF tags of the image file at ``imgpath``.

    Parameters
    ----------
    imgpath : str
        Path of the image file to open in binary mode.

    Returns
    -------
    dict
        Whatever ``exifread.process_file`` returns for the file, or an empty
        dict when the file cannot be opened or parsed (a message is printed
        in that case). The failure value is a dict, not a list, so callers
        get the same container type on both paths.
    """
    try:
        # The context manager guarantees the handle is closed; the original
        # left one open file per processed attachment.
        with open(imgpath, 'rb') as f:
            return exifread.process_file(f)
    except Exception:
        print("Failed to extract exif %s" % imgpath)
        return {}

In [6]:
# Takes a message part holding an attached image and returns its EXIF data.
def getexif_from_attached_image(image):
    """Save an image attachment to disk and return its EXIF tags.

    Parameters
    ----------
    image : email.message.Message-like
        The attachment part; passed to ``create_file`` and queried with
        ``get_filename()`` for diagnostics.

    Returns
    -------
    dict
        The tags returned by ``parse_img`` for the saved file, or ``{}`` when
        the attachment could not be saved or no tags were extracted.
    """
    attachment_name = None
    try:
        attachment_name = image.get_filename()
        filepath = create_file(image)
        if filepath:
            # Normalise any empty/falsy result from parse_img to a dict so
            # this function has a single return type.
            return parse_img(filepath) or {}
    except Exception as exc:
        print("Failed to process attachment", attachment_name, exc)
        return {}

    # create_file signals failure with "": nothing was written, so there is
    # nothing to parse. str() keeps the message safe when the part has no
    # filename (concatenating None raised TypeError inside the old handler).
    print ("File not found", "./attachements/"+str(attachment_name))
    return {}

In [7]:
def compare_sent_time_and_img_creation(tags, message):
    """Summarise ``message`` and compare its send time with an image's EXIF capture time.

    Parameters
    ----------
    tags : mapping (or empty container)
        EXIF tags of one attached image. Only ``"EXIF DateTimeOriginal"`` is
        read, via its ``.values`` attribute.
    message : mapping-like
        The e-mail the image was attached to; passed to
        ``create_email_summary``.

    Returns
    -------
    dict
        * ``{}`` when the e-mail summary itself cannot be built.
        * The summary from ``create_email_summary`` when the tag is absent or
          its value cannot be parsed.
        * The summary plus ``"img_date"`` and ``"timediff"``
          (``email_date - img_date``) otherwise.

    A warning is printed when the image was taken less than a day before the
    e-mail was sent (``0 < timediff < 86400`` seconds).
    """
    try:
        email_sum = create_email_summary(message)
    except Exception:
        # Missing or unparsable headers: keep the best-effort contract.
        print("Error", message['Subject'])
        return {}

    if not tags or "EXIF DateTimeOriginal" not in tags:
        return email_sum

    try:
        raw_taken = tags["EXIF DateTimeOriginal"].values
        # NOTE(review): EXIF DateTimeOriginal carries no time zone, so
        # astimezone() interprets the naive value in the local zone of the
        # machine running the notebook. The camera's zone is unknown here;
        # confirm whether that assumption is acceptable for the analysis.
        imgdate = datetime.strptime(str(raw_taken).strip(), '%Y:%m:%d %H:%M:%S').astimezone(pytz.utc)
    except Exception:
        # A malformed capture date should not discard the e-mail summary.
        print("Error", message['Subject'])
        return email_sum

    # Reuse the already-parsed send time instead of parsing the header twice.
    timediff = email_sum["email_date"] - imgdate
    seconds_in_day = 86400
    if 0 < timediff.total_seconds() < seconds_in_day:
        # A positive difference means the e-mail was sent after the image was
        # taken, i.e. the image was taken *before* sending.
        print ('WARNING: Image taken less than a day before sending\n', message['Message-ID'], message['Subject'] )
    email_sum["img_date"] = imgdate
    email_sum["timediff"] = timediff
    return email_sum


In [8]:
# Walk every message in the mailbox, extract EXIF data from image attachments
# and collect one summary dict per message (left empty when nothing was
# extracted). Two widgets report progress while the loop runs.
emails = []
i = 0
total_messages = len(mbox)

progressBar = IntProgress(
    description='Processing emails ' + str(total_messages),
    min=0,
    max=total_messages,
    style={'description_width': 'initial'},
)
progressText = IntText(
    value=0,
    description='Processed so far: ',
    style={'description_width': 'initial'},
)
display(progressBar, progressText)

for message in mbox:
    email_sum = {}
    # Only multipart messages are walked; everything else yields no parts.
    parts = message.walk() if message.get_content_maintype() == 'multipart' else ()
    for part in parts:
        if 'image' not in part.get_content_type():
            continue
        tags = getexif_from_attached_image(part)
        # Later images overwrite the keys written by earlier ones.
        email_sum.update(compare_sent_time_and_img_creation(tags, message))
    emails.append(email_sum)
    i += 1
    progressBar.value = progressText.value = i
            

IntProgress(value=0, description='Processing emails 2364', max=2364, style=ProgressStyle(description_width='in…

IntText(value=0, description='Processed so far: ', style=DescriptionStyle(description_width='initial'))

 <CAPRuAkYabyKs-W4n1w+-7hYhDGM5qn=2VwUaNOUDTe81VEiHfQ@mail.gmail.com> nudies
 <CAPRuAkbb_-=Cx7t6=s-mt1_RqHQoE8o6pYRFEOrLU--BnVtDig@mail.gmail.com> 
 <CAPRuAkahivFm9Jwct-1nQWTWaTSoRghT9qWan9mW=Orcmtm8JA@mail.gmail.com> Fwd: Best B'day Present Ever
 <CABUDHgLp5sCYrzJG_DLaoypdDvG=zunXjK+uFddmviZRgFWsBg@mail.gmail.com> Happy one month, Julian James!
 <CABUDHgLp5sCYrzJG_DLaoypdDvG=zunXjK+uFddmviZRgFWsBg@mail.gmail.com> Happy one month, Julian James!
 <CABUDHgLp5sCYrzJG_DLaoypdDvG=zunXjK+uFddmviZRgFWsBg@mail.gmail.com> Happy one month, Julian James!
 <CAPRuAkYDBeSmUX965NwzLHFQUvU4LmRAc7Zb1ey9vLZSkg4UTw@mail.gmail.com> 
 <CAPRuAkYDBeSmUX965NwzLHFQUvU4LmRAc7Zb1ey9vLZSkg4UTw@mail.gmail.com> 
 <CAPRuAkaM9A_Rhveync6iTyH3NZyF-tuZwpAKn-HGGjnFgMLWpw@mail.gmail.com> None
 <CAPRuAkaM9A_Rhveync6iTyH3NZyF-tuZwpAKn-HGGjnFgMLWpw@mail.gmail.com> None
 <CAPRuAkaM9A_Rhveync6iTyH3NZyF-tuZwpAKn-HGGjnFgMLWpw@mail.gmail.com> None
 <CAPRuAkaM9A_Rhveync6iTyH3NZyF-tuZwpAKn-HGGjnFgMLWpw@mail.gmail.com> None
 <CAPRuA

 <CAPRuAkbFCZNHdWrCopkXhikhWOnfam9QuqhtJa8x+mARUbUGRg@mail.gmail.com> pt 5
 <CAPRuAkbFCZNHdWrCopkXhikhWOnfam9QuqhtJa8x+mARUbUGRg@mail.gmail.com> pt 5
 <CAPRuAkZA7=vE0YPSka6Xr0kE-3m9XCmiBES6AcPAB4Kkaws-0Q@mail.gmail.com> 
 <CABUDHgLjS9L2p_qgObOf48A26Zbqkft-Y2Diupe190jnYxCeJQ@mail.gmail.com> Fwd: Important: your claim is ready to be processed
 <CABUDHgLjS9L2p_qgObOf48A26Zbqkft-Y2Diupe190jnYxCeJQ@mail.gmail.com> Fwd: Important: your claim is ready to be processed
 <2f4f926c2d3e8ac6edfb4f121360e94e@doctors.org.uk> Re: Vegister postal votes early!
 <2f4f926c2d3e8ac6edfb4f121360e94e@doctors.org.uk> Re: Vegister postal votes early!
 <2f4f926c2d3e8ac6edfb4f121360e94e@doctors.org.uk> Re: Vegister postal votes early!
 <ad93480f4c0893d711e778667479fb12@doctors.org.uk> Re: Vegister postal votes early!
 <ad93480f4c0893d711e778667479fb12@doctors.org.uk> Re: Vegister postal votes early!
 <ad93480f4c0893d711e778667479fb12@doctors.org.uk> Re: Vegister postal votes early!
 <CAPRuAkZoEFX5pnw2otsaCqfb8k1n

 <CAPRuAkbdRWhj8f92cQNFjvLYAQ1SAvu1h1=tM7hvRsntnrykPA@mail.gmail.com> Dan
 <CAF+sTEz7t3JYFid=FQssnQ2FnHByYdHbhuRE3-ZwZER5oVO6qQ@mail.gmail.com> Re: Texas
 <BLUPR0401MB1602F773D02EF611E7EB70FDCFD40@BLUPR0401MB1602.namprd04.prod.outlook.com> Re: Hunmet driver's can't be trusted - got home and drove straight
 into a lake...
 <595c1535.0a59ca0a.17973.01f5.GMRIR@mx.google.com> Delivery Status Notification (Failure)
 <CAPRuAkbUiVwu+KO_Ez1dMuoMtJ6kHFpf=XakdPAjW1f9s-cOUg@mail.gmail.com> Hunmet driver's can't be trusted - got home and drove straight into a lake...
 <CAEYjUPQMTJYBooX+nNyCW81xK-Ttd_SxRcbAnSROHdAhHPSb1g@mail.gmail.com> Re: Merry Christmas
 <CAPRuAkYmO-fcyESxh-Ncs+qOsWrSy_aJ3ZiK7OMBw68LqjZjdg@mail.gmail.com> Re: PPAR system
 <CAPRuAkaRfaZp+zP1tnQacgfgVZiOzVAqWjUpK-7Xm9y=XAf6cQ@mail.gmail.com> Re: Ref:188731/FCO Healthline
 <CABUDHgKZdNpMnXPcjf832V=+pCgi5uVtKCgkka=qFVnZmpPFEg@mail.gmail.com> A little giggle for you...
 <CAPRuAkb77r0qtYt9ANFj1cFFv7zZxE7v4xSerTNmZ7+=4LOPTQ@mail.gmail.

In [9]:
# One row per processed message; messages without extracted data become
# all-NaN rows. The sorted frame is the cell's displayed result.
df = pd.DataFrame(data=emails)
df.sort_values(by="timediff", ascending=True)

Unnamed: 0,_id,email_date,img_date,subject,timediff
1219,<0638932e5c78c5a76969048271880819@doctors.org.uk>,2018-07-30 08:46:09+00:00,2018-07-31 00:31:25+00:00,News,-1 days +08:14:44
1227,<865c6df9255bd5a1229be60e25b229ed@doctors.org.uk>,2018-10-14 17:32:46+00:00,2018-10-15 04:11:53+00:00,Helman Tor,-1 days +13:20:53
1598,<CAPRuAkag_Kpe_=J1hc43x1sTj2N9b25NHXNBjQxS=huW...,2018-11-11 21:52:00+00:00,2018-11-12 03:47:37+00:00,,-1 days +18:04:23
1947,<CAPRuAkbbXm0NajNrCLR76189847EmdgUd1h1O089NxSz...,2013-11-10 15:39:51+00:00,2013-11-10 21:29:39+00:00,Mae and Buddy,-1 days +18:10:12
1553,<CAPRuAkZhG3JS27oP-8bRS5FNV706O6+b4nAXsCVjaPL6...,2018-10-08 00:56:49+00:00,2018-10-08 06:12:31+00:00,,-1 days +18:44:18
2287,<CAPRuAkZbsqzatpB+FrjTTMUKuogW9Phq9tnsLC_jERb=...,2018-12-05 03:30:20+00:00,2018-12-05 08:33:23+00:00,,-1 days +18:56:57
772,<CAPRuAkbEp87pMKrsph0-agGV=4Zx0PTpHiw18TvFk6z9...,2017-10-08 20:17:13+00:00,2017-10-09 01:18:06+00:00,,-1 days +18:59:07
1064,<CAPRuAkaykDRkoSxq7TXmXaDszt358es2OGJjckO7cMdn...,2015-03-06 22:02:30+00:00,2015-03-07 03:02:09+00:00,,-1 days +19:00:21
1929,<CAPRuAkaPhrOmFrsGP6KPmTfNM_KajEaRH7uPTkCOVyNC...,2014-10-02 18:45:12+00:00,2014-10-02 23:44:43+00:00,,-1 days +19:00:29
1208,<CAPRuAkY1q6G5gWyg_0s3WwGOekXVm1Eju9Hp_qzjuM-Y...,2015-02-10 16:34:57+00:00,2015-02-10 21:34:26+00:00,Nap 2,-1 days +19:00:31
