Import the required modules. Several of them are not strictly necessary, but why reinvent the wheel? All the third-party ones can be installed with `pip install <package>` (or `python -m pip install <package>`).

In [None]:
import mailbox                                      # to read mbox (and others) format
from email_reply_parser import EmailReplyParser     # to split reply for the main message, see https://github.com/zapier/email-reply-parser
from collections import Counter                     # to accumulate, probably there are better solution based on real histograms
import email
from string import maketrans
from datetime import datetime
import calendar
import logging
logging.basicConfig(level=logging.INFO)
import re
from math import log, exp, sqrt
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer   # to parse text, it can do much more
import pandas as pd                                           # to make tables
#pd.core.format.set_option('notebook_repr_html', True)     # in HTML
from IPython.display import display_pretty, display_html, display_jpeg, display_png, display_json, display_latex, display_svg

%matplotlib inline

In [None]:
# Start from the stop-word set shipped with `wordcloud` and extend it with
# extra tokens: quoted-printable/charset residue (e.g. "c3", "a0", "utf"),
# common Italian and English words, and greeting/signature words.
stopwords = STOPWORDS.copy()

# Triple-quoted on purpose: the list spans more than one physical line, which a
# single-quoted literal cannot do (it is a SyntaxError). The embedded newline is
# harmless because the text is split on any whitespace below.
mystopwords = '''21 03 11 16 30 17 b9 05 14 04 13 19 06 12 a9 07 08 25 09 18 pj4 c3 a0 a3 ac 10 b2 della type 01 02 just 20 00 02c3 a8 utf alla del po 40miinfnit 2eit 40mi tra alle 3e 3a 2c questo ve al f2 gli su cosa quello nel pi ti sia e2 anche ec lo don anche com http https e1 ch sono ma ci se e8 e0 ruggero turra saluti cheers da salve una un le ho ciao hello hi non di che la plain text ha new mi 24 content il 2014 jan wed charset all six less being indeed over move anyway four not own through yourselves fify where mill only find before one whose system how somewhere with thick show had enough should to must whom seeming under ours has might thereafter latterly do them his around than get very de none cannot every whether they front during thus now him nor name several hereafter always who cry whither this someone either each become thereupon sometime side two therein twelve because often ten our eg some back up go namely towards are further beyond ourselves yet out even will what still for bottom mine since please forty per its everything behind un above between it neither seemed ever across she somehow be we full never sixty however here otherwise were whereupon nowhere although found alone re along fifteen by both about last would anything via many could thence put against keep etc amount became ltd hence onto or con among already co afterwards formerly within seems into others while whatever except down hers everyone done least another whoever moreover couldnt throughout anyhow yourself three from her few together top there due been next anyone eleven much call therefore interest then thru themselves hundred was sincere empty more himself elsewhere mostly on fire am becoming hereby amongst else part everywhere too herself former those he me myself made twenty these bill cant us until besides nevertheless below anywhere nine can of your toward my something and whereafter whenever give almost wherever is describe beforehand herein an as itself at have in seem
whence ie any fill again hasnt inc thereby thin no perhaps latter meanwhile when detail same wherein beside also that other take which becomes you if nobody see though may after upon most hereupon eight but serious nothing such why a off whereby third i whole noone sometimes well amoungst yours their rather without so five the first whereas once'''

# One bulk update instead of adding the words one at a time.
stopwords.update(mystopwords.split())

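The next step is to download your whole Gmail archive, which can be done here: https://www.google.com/settings/takeout. By the way, this code works with any `mbox` file, provided its messages carry the `X-Gmail-Labels` header that the loop below uses to select sent mail.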

In [None]:
import codecs  # cell-local: only needed to undo the ROT13 obfuscation below

# Addresses and display name that identify "me" as a sender. They are stored
# ROT13-obfuscated so they do not appear in clear text in the notebook source.
# codecs.decode(..., 'rot_13') works on both Python 2 and Python 3 (unlike
# str.encode('rot13'), which is Python 2 only); str() keeps the result a native
# string on Python 2, where the codec returns unicode.
my_identities = [
    str(codecs.decode(obfuscated, 'rot_13'))
    for obfuscated in (
        'ehttreb.gheen@zv.vasa.vg',
        'tvheereb@tznvy.pbz',
        'ehttreb.gheen@prea.pu',
        'e.gheen@prea.pu',
        'egheen@prea.pu',
        'Ehttreb Gheen',
    )
]

In [None]:
# Very slow on a large archive.
# create=False: by default mailbox.mbox() silently creates an empty mailbox when
# the path does not exist, which would turn a typo in the path into an
# "analysis" of zero emails. With create=False a missing file raises
# mailbox.NoSuchMailboxError instead.
mbox = mailbox.mbox('/home/turra/Posta inviata.mbox', create=False)
len(mbox)

In [None]:
# --- Configuration -----------------------------------------------------------
stop_after = 0   # number of mbox entries to scan. Set to 0 to scan all of them
year = 2017      # year to read, skip the others. Set to 0 to read all the years
box_name = "Posta inviata"   # value looked for in the X-Gmail-Labels header

# --- Accumulators ------------------------------------------------------------
recipients_name = Counter()    # display name of each recipient -> #emails
recipients_email = Counter()   # address of each recipient -> #emails
# The time histograms are pre-seeded with zeros so that slots without any email
# still show up (and in the right place) in the plots.
sent_hour = Counter(dict.fromkeys(range(24), 0))
sent_day = Counter(dict.fromkeys(range(1, 8), 0))      # isoweekday(): 1 = Monday ... 7 = Sunday
sent_month = Counter(dict.fromkeys(range(1, 13), 0))   # 1 = January ... 12 = December
len_replies = Counter()        # number of fragments found by EmailReplyParser -> #emails
words_count = Counter()        # word -> total occurrences in my own text

# --- Text clean-up helpers ---------------------------------------------------
# Header values may be folded over several lines: turn tabs/CR/LF into spaces.
header_ws = re.compile(r'[\t\r\n]')
# Attribution lines of quoted replies ("Il giorno ... ha scritto:", "On ... wrote:")
# and other residue that should not be counted as words I wrote.
r1 = re.compile(r'Il (giorno )?[0-9].*?@.*?scritto=?:', re.MULTILINE|re.DOTALL)
r7 = re.compile(r'On [0-9].*?@.*?wrote=?:', re.MULTILINE|re.DOTALL)
r2 = re.compile(r'From nobody.*?\n', re.DOTALL)
# NOTE(review): without re.MULTILINE the `$` only matches at the very end of the
# text, so this only strips a Content-Type found on the last line - confirm intent.
r3 = re.compile('Content-Type.*?$')
r4 = re.compile(r'[0-9].*?\@.*?:')
r5 = re.compile(r'\=[0-9]+[A-Z]')
r6 = re.compile(r' [0-9]+ ')

# Literal header/host fragments replaced by a blank before the regexes run.
to_remove = ('Content-Disposition: inline',
             'Content-Type: text/plain; charset=ISO-8859-1',
             'Content-Type: text/plain; charset=windows-1252',
             'Content-Transfer-Encoding: quoted-printable',
             '.cern.ch')
cv = CountVectorizer(min_df=0, stop_words=stopwords, max_features=500)

# --- Main loop ---------------------------------------------------------------
for i, message in enumerate(tqdm(mbox)):
    # Checked first so that skipped messages (the `continue`s below) cannot
    # bypass the limit, and so that exactly `stop_after` entries are scanned.
    if stop_after and i >= stop_after:
        break
    try:
        # Keep only messages labelled as sent mail; ignore chat transcripts.
        labels = message['X-Gmail-Labels']
        if not labels or "Chat" in labels or box_name not in labels:
            continue

        # Keep only messages actually sent by one of my identities.
        sender = message['From']
        if not sender or not any(my_id in sender for my_id in my_identities):
            continue

        # parsedate() gives None for a malformed header; skip such messages
        # instead of aborting the whole scan. The timezone offset is not
        # applied, so hours are the ones written in the Date header.
        raw_date = message['date']
        parsed_date = email.utils.parsedate(raw_date) if raw_date else None
        if parsed_date is None:
            logging.warning('Skipping message %d: missing or unparseable Date header %r', i, raw_date)
            continue
        # Only the first six fields (year ... second) belong to datetime().
        date = datetime(*parsed_date[:6])
        if year and date.year != year:
            continue
        sent_hour[date.hour] += 1
        sent_day[date.isoweekday()] += 1
        sent_month[date.month] += 1

        # Let getaddresses() split the recipient list: a plain split(',') breaks
        # display names that contain a comma, e.g. "Doe, John" <jd@example.org>.
        recipients = message['To']
        if recipients:
            unfolded = header_ws.sub(' ', str(recipients))
            for name, address in email.utils.getaddresses([unfolded]):
                recipients_name[name.lower()] += 1
                recipients_email[address.lower()] += 1

        # For multipart messages only the first part is analysed; str() of a
        # part includes its headers, hence the Content-* clean-up below.
        if message.is_multipart():
            payload = str(message.get_payload()[0])
        else:
            payload = message.get_payload()

        # The first fragment is taken as the text I wrote, the rest as quoted replies.
        fragments = EmailReplyParser.read(payload).fragments
        len_replies[len(fragments)] += 1
        my_message = fragments[0].content if fragments else ''

        for literal in to_remove:
            my_message = my_message.replace(literal, ' ')
        for pattern in (r1, r2, r3, r4, r5, r6, r7):
            my_message = pattern.sub('', my_message)
        my_message = my_message.strip()

        if my_message:
            try:
                counts = cv.fit_transform([my_message]).toarray().ravel()
                words = np.array(cv.get_feature_names())
                words_count.update(dict(zip(words, counts)))
            except Exception as err:
                # Best effort: a message that cannot be vectorised is reported
                # and skipped.
                logging.error('Cannot parse (%s): "%s"', err, my_message)
    except Exception:
        # Report which message broke the scan (same channel as the other
        # diagnostics), then let the error propagate.
        logging.exception('problem with message %d:\n%s', i, message)
        raise

In [None]:
def _counter_to_frame(counts):
    """Turn a Counter into a one-column DataFrame of email counts.

    The Counter keys become the index, sorted so that bars and lines are drawn
    in key order, and the single column is named '#emails'. An empty Counter
    gives an empty frame that still has the '#emails' column. A DataFrame is
    returned unchanged, so re-running this cell after the names below have
    already been converted is harmless.
    """
    if isinstance(counts, pd.DataFrame):
        return counts
    frame = pd.DataFrame({'#emails': list(counts.values())},
                         index=list(counts.keys()))
    return frame.sort_index()


sent_hour = _counter_to_frame(sent_hour)
sent_day = _counter_to_frame(sent_day)
sent_month = _counter_to_frame(sent_month)
len_replies = _counter_to_frame(len_replies)

In [None]:
# One figure per histogram. Every figure gets a title and an x label so it can
# be understood on its own when the notebook is skimmed.
ax = sent_hour.plot(kind='bar', figsize=(14, 6), title='Emails sent per hour of the day')
ax.set_xlabel('hour (as written in the Date header)')

ax = sent_day.plot(kind='bar', figsize=(14, 6), title='Emails sent per day of the week')
ax.set_xlabel('ISO weekday (1 = Monday ... 7 = Sunday)')

ax = sent_month.plot(kind='bar', figsize=(14, 6), title='Emails sent per month')
ax.set_xlabel('month (1 = January ... 12 = December)')

ax = len_replies.plot(figsize=(14, 6), xlim=(0, 50), title='Number of fragments per email')
ax.set_xlabel('fragments found by EmailReplyParser');

In [None]:
# List the 300 most frequent words with their counts, leaving out stop words.
top_words = (
    (word, count)
    for word, count in words_count.most_common(300)
    if word not in stopwords
)
for word, count in top_words:
    print("%s: %d" % (word, count))

In [None]:
from imageio import imread
import random

def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour callback for WordCloud: fixed hue and saturation, random lightness.

    Returns an ``hsl(20, 80%, L%)`` string with L drawn uniformly from 30..90
    (both included). ``word``, ``font_size``, ``position`` and ``orientation``
    are accepted for the callback signature but do not influence the colour.

    ``random_state`` is expected to offer ``randint(a, b)`` like
    ``random.Random``. When given, it is the source of randomness, so a seeded
    generator makes the colours reproducible; when None, the global ``random``
    module is used.
    """
    rng = random if random_state is None else random_state
    return "hsl(20, 80%%, %d%%)" % rng.randint(30, 90)

# Image used as the mask for the word cloud.
mask = imread("/home/turra/bitmap.png")
#col = imread("/home/turra/2016_col.png")
#image_colors = ImageColorGenerator(col)

if not words_count:
    raise ValueError("words_count is empty: run the mbox scan before building the word cloud")

# Count of the most frequent word, computed once instead of once per word.
top_count = float(words_count.most_common(1)[0][1])

# Relative frequency of each word, squared to emphasise the most used ones.
freq = [(word, (count / top_count) ** 2)
        for word, count in words_count.most_common(700)
        if word not in stopwords]

wordcloud = WordCloud(width=800, height=1080, max_words=700, scale=2,
                      background_color="black",
                      mask=mask,
                      stopwords=stopwords).generate_from_frequencies(dict(freq))
wordcloud.recolor(color_func=grey_color_func, random_state=3)
#wordcloud.recolor(color_func=image_colors)
wordcloud.to_image()

In [None]:
# Write the rendered word cloud to a PNG file in the current working directory.
output_file = 'temp.png'
wordcloud.to_file(output_file)