In [1]:
import mailbox
import email.utils
import pandas as pd
import exifread
from datetime import datetime, timezone
import pytz
from ipywidgets import IntProgress, IntText

In [2]:
# Location of the mailbox to analyse (path is relative to the notebook's working directory).
filename = "tgthorley.mbox"
filepath = './data/test/'

# create=False: mailbox.mbox() creates an empty mailbox by default when the path
# does not exist, which would make a mistyped path look like "0 e-mails found".
# With create=False a missing file raises mailbox.NoSuchFileError instead.
mbox = mailbox.mbox(filepath + filename, create=False)

In [3]:
def create_email_summary(message):
    """Build the base summary record for one e-mail message.

    Parameters
    ----------
    message : email.message.Message
        Message whose ``Date``, ``Message-ID`` and ``Subject`` headers are read.

    Returns
    -------
    dict
        ``{"email_date": ..., "_id": ..., "subject": ...}``. ``email_date`` is a
        timezone-aware ``datetime`` in UTC, or ``None`` when the ``Date`` header
        is missing or cannot be parsed. ``_id`` and ``subject`` are the raw
        header values (``None`` when the header is absent).
    """
    try:
        emaildate = email.utils.parsedate_to_datetime(message['Date'])
    except (TypeError, ValueError):
        # Missing header (None) or malformed date string: keep the record,
        # just without a usable date.
        emaildate = None
    else:
        if emaildate.tzinfo is None:
            # A "-0000" zone is parsed as a *naive* datetime that represents
            # UTC. Calling astimezone() on it directly would reinterpret it as
            # machine-local time, so tag it as UTC explicitly first.
            emaildate = emaildate.replace(tzinfo=timezone.utc)
        emaildate = emaildate.astimezone(timezone.utc)

    return {
        "email_date": emaildate,
        "_id": message['Message-ID'],
        "subject": message['Subject'],
    }

In [4]:
def create_file(part):
    """Save one e-mail attachment under ``./attachements/``.

    Parameters
    ----------
    part : email.message.Message
        A message part; its filename and decoded payload are used.

    Returns
    -------
    str
        Path of the written file (``"./attachements/<name>"``), or ``""`` when
        the part has no filename, has no decodable payload, or the file could
        not be written.
    """
    import os  # local import: the notebook's import cell does not provide ``os``

    attachment_dir = "./attachements/"
    try:
        filename = part.get_filename()
        if not filename:
            return ""
        # The filename comes from the e-mail and is untrusted: keep only the
        # last path component so "../../x" or "C:\\x" cannot escape the folder.
        filename = os.path.basename(filename.replace("\\", "/"))
        if not filename:
            return ""

        # Fetch the payload before opening the file so a part without a
        # decodable payload does not leave an empty file behind.
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""

        os.makedirs(attachment_dir, exist_ok=True)
        filepath = attachment_dir + filename
        with open(filepath, 'wb') as fb:
            fb.write(payload)
        return filepath
    except Exception:
        # Best effort: callers treat "" as "attachment could not be saved".
        return ""

In [5]:
# Takes an image path and returns its EXIF data.
def parse_img(imgpath):
    """Read the EXIF tags of an image file.

    Parameters
    ----------
    imgpath : str
        Path of the image file to open.

    Returns
    -------
    dict
        Whatever ``exifread.process_file`` returns for the file, or an empty
        dict (after printing a message) when the file cannot be opened or
        parsed. Callers only use ``in`` membership tests on the result, so an
        empty dict behaves like "no tags".
    """
    try:
        # ``with`` guarantees the handle is closed even if parsing fails.
        with open(imgpath, 'rb') as f:
            return exifread.process_file(f)
    except Exception:
        print("Failed to extract exif %s" % imgpath)
        return {}

In [6]:
# Takes a message part holding an attached image and returns its EXIF data.
def getexif_from_attached_image(image):
    """Save an attached image to disk and return its EXIF tags.

    Parameters
    ----------
    image : email.message.Message
        Message part containing the image attachment.

    Returns
    -------
    dict
        The result of ``parse_img`` for the saved file, or ``{}`` (after
        printing "File not found ...") when the attachment could not be saved.
    """
    try:
        filepath = create_file(image)
        # create_file signals failure with an empty path; there is nothing to
        # parse in that case.
        if filepath:
            return parse_img(filepath)
    except Exception:
        # Fall through to the shared failure report below.
        pass

    # str() keeps the report itself from raising when the part has no filename.
    print("File not found", "./attachements/" + str(image.get_filename()))
    return {}

In [31]:
def compare_sent_time_and_img_creation(tags, message):
    """Compare an e-mail's sent time with the capture time of an attached image.

    Parameters
    ----------
    tags : mapping
        EXIF tags of the image; only ``"EXIF DateTimeOriginal"`` is read.
    message : email.message.Message
        The e-mail the image was attached to.

    Returns
    -------
    dict
        The summary from ``create_email_summary``; when the image carries
        ``EXIF DateTimeOriginal`` it also contains ``"img_date"`` and
        ``"timediff"`` (sent time minus capture time). Returns ``{}`` after
        printing "Error <subject>" if anything fails.

    Notes
    -----
    EXIF ``DateTimeOriginal`` carries no time zone. ``astimezone`` on the naive
    value interprets it in the local time zone of the machine running the
    notebook, so ``timediff`` can be off by the zone difference between camera
    and machine.
    """
    try:
        emaildate = email.utils.parsedate_to_datetime(message['Date'])
        if emaildate.tzinfo is None:
            # "-0000" dates are parsed as naive datetimes that represent UTC;
            # tag them explicitly so they are not reinterpreted as local time.
            emaildate = emaildate.replace(tzinfo=timezone.utc)
        emaildate = emaildate.astimezone(timezone.utc)

        email_sum = create_email_summary(message)
        if "EXIF DateTimeOriginal" in tags:
            raw_date = str(tags["EXIF DateTimeOriginal"].values).strip()
            imgdate = datetime.strptime(raw_date, '%Y:%m:%d %H:%M:%S').astimezone(timezone.utc)
            timediff = emaildate - imgdate
            # Positive difference: the image was captured BEFORE the e-mail was
            # sent. Flag captures within the preceding 24 hours (86400 s).
            if 0 < timediff.total_seconds() < 86400:
                print('WARNING: Image taken less than a day before sending\n', message['Message-ID'], message['Subject'])
            email_sum["img_date"] = imgdate
            email_sum["timediff"] = timediff

        return email_sum
    except Exception:
        print("Error", message['Subject'])
        return {}


In [84]:
def _convert_to_degress(value, ref):
    if ref.values == "N" or ref.values == "E":
        d = float(value.values[0].num) / float(value.values[0].den)
        m = float(value.values[1].num) / float(value.values[1].den)
        s = float(value.values[2].num) / float(value.values[2].den)
    elif ref.values == "S" or ref.values == "W":
        d = -(float(value.values[0].num) / float(value.values[0].den))
        m = -(float(value.values[1].num) / float(value.values[1].den))
        s = -(float(value.values[2].num) / float(value.values[2].den))  
    else: print(ref.values)

    return d + (m / 60.0) + (s / 3600.0)

In [91]:
def extract_geo(tags, email_sum):
    """Add the image's GPS position (decimal degrees) to an e-mail summary.

    Parameters
    ----------
    tags : mapping
        EXIF tags of the image.
    email_sum : dict
        Summary to extend; it is modified in place.

    Returns
    -------
    dict
        The same ``email_sum`` object. ``'GPS GPSLatitude'`` and
        ``'GPS GPSLongitude'`` are set only when all four GPS tags (both values
        and both hemisphere references) are present and convert cleanly;
        otherwise the summary is returned unchanged.
    """
    required = (
        'GPS GPSLatitude',
        'GPS GPSLatitudeRef',
        'GPS GPSLongitude',
        'GPS GPSLongitudeRef',
    )
    # Images can carry an incomplete GPS block; require every tag that is
    # read below so a missing one cannot raise KeyError and abort the run.
    if all(key in tags for key in required):
        try:
            lat_decimal = _convert_to_degress(tags['GPS GPSLatitude'], tags['GPS GPSLatitudeRef'])
            long_decimal = _convert_to_degress(tags['GPS GPSLongitude'], tags['GPS GPSLongitudeRef'])
        except Exception as err:
            # Best effort, like the other helpers: report and skip this position.
            print("Failed to convert GPS position", err)
        else:
            email_sum['GPS GPSLatitude'] = lat_decimal
            email_sum['GPS GPSLongitude'] = long_decimal
    return email_sum


In [None]:
# One summary dict per message in the mailbox, in mailbox order.
emails = []
i = 0
progressBar = IntProgress(
    description='Processing emails ' + str(len(mbox)),
    min=0,
    max=len(mbox),
    style={'description_width': 'initial'},
)
progressText = IntText(
    value=0,
    description='Processed so far: ',
    style={'description_width': 'initial'},
)
display(progressBar, progressText)

for message in mbox:
    email_sum = {}
    # Only multipart messages are scanned for image parts; every other message
    # contributes an empty summary.
    for part in (message.walk() if message.get_content_maintype() == 'multipart' else ()):
        if 'image' not in part.get_content_type():
            continue
        tags = getexif_from_attached_image(part)
        email_sum.update(compare_sent_time_and_img_creation(tags, message))
        email_sum.update(extract_geo(tags, email_sum))
    emails.append(email_sum)
    # Advance the counter and mirror it in both widgets.
    i += 1
    progressBar.value = progressText.value = i
            

IntProgress(value=0, description='Processing emails 2364', max=2364, style=ProgressStyle(description_width='in…

IntText(value=0, description='Processed so far: ', style=DescriptionStyle(description_width='initial'))

https://www.geoplaner.com/?z=10;p=32.65623055555555,-16.91846388888889;
https://www.geoplaner.com/?z=10;p=51.867000555555556,-2.2303864722222224;


https://www.geoplaner.com/?z=10;p=51.71088027777778,-1.9603111666666666;
https://www.geoplaner.com/?z=10;p=51.71088027777778,-1.9603111666666666;


https://www.geoplaner.com/?z=10;p=51.867000555555556,-2.230389111111111;


https://www.geoplaner.com/?z=10;p=51.710880555555555,-1.9601722222222222;
https://www.geoplaner.com/?z=10;p=51.710880555555555,-1.9601722222222222;










https://www.geoplaner.com/?z=10;p=51.71088788888889,-1.9602635;
https://www.geoplaner.com/?z=10;p=51.71090316666667,-1.9602563055555555;
https://www.geoplaner.com/?z=10;p=51.71090316666667,-1.9602800277777777;
https://www.geoplaner.com/?z=10;p=51.710906972222226,-1.9602930555555556;
https://www.geoplaner.com/?z=10;p=51.710914611111114,-1.9602980277777777;
https://www.geoplaner.com/?z=10;p=51.710906972222226,-1.9602985277777778;
https://www.geoplaner.com/?z=10;p=51.71

In [None]:
# One row per processed message; messages without image data give all-NaN rows.
df = pd.DataFrame(emails)
# "timediff" only exists when at least one image yielded an EXIF capture time;
# without this guard sort_values raises KeyError and the cell shows nothing.
df.sort_values("timediff") if "timediff" in df.columns else df