In [None]:
# Notebook parameters: the meeting being analysed, the saved HTML page listing its
# revisions, and the title used for the exported figure files.
meeting = 'SA2-141E'
revisions_file = '2020.10.30 SA2-141E Revisions.html'
fig_title = "{0} Revisions".format(meeting)

In [None]:
# Dependencies: pandas/numpy for the first part of the script, html2text to convert the HTML input to text, and plotly for the second part (pip install plotly).
# If the PNG export does not work, orca is most likely not installed. Install it from the Anaconda prompt with: conda install -c plotly plotly-orca
import pandas as pd
import numpy as np
import plotly.express as px
import html2text
import re

In [None]:
import os
import os.path
# Output layout: one folder per meeting, with an 'html' sub-folder for the
# interactive plots and a 'png' sub-folder for the static images.
plot_folders = meeting
plot_folder_html = os.path.join(plot_folders, 'html')
plot_folder_png  = os.path.join(plot_folders, 'png')

# makedirs also creates the parent meeting folder when it is missing, and
# exist_ok=True makes the cell safe to re-run without a separate existence check.
for folder in (plot_folder_html, plot_folder_png):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Load the saved revisions page into memory as a single string.
# NOTE(review): no encoding is passed, so the platform default is used — confirm
# that it matches how the HTML file was saved.
with open(revisions_file, 'r') as html_file:
    tdoc_revisions_html = html_file.read()

In [None]:
# Convert the HTML page to plain text; links are not converted, so only the
# text content is searched below.
h = html2text.HTML2Text()
h.ignore_links = True
tdoc_revisions_text = h.handle(tdoc_revisions_html)

# A revision reference has the form "S2-" + 7 digits + "r" + 2 digits.
# Capture the TDoc number and the revision number as separate groups instead of
# slicing the matched string by position.
revision_matches = re.findall(r'(S2-\d{7})r(\d{2})', tdoc_revisions_text)

# Drop repeated mentions and sort, so the DataFrame rows come out in the same
# order on every run (a bare set has no defined iteration order).
revision_list = sorted(set(revision_matches))

# Unique full references (e.g. "S2-1234567r01"), kept under the original name.
tdocs = [tdoc + 'r' + revision for tdoc, revision in revision_list]

df = pd.DataFrame(revision_list, columns=['Tdoc', 'Revision'])
# The regex yields revision numbers as strings; convert them so they can be
# aggregated numerically.
df["Revision"] = pd.to_numeric(df["Revision"])

In [None]:
# Highest revision number seen for each TDoc, ordered from most to least revised.
df_per_tdoc = df.groupby("Tdoc")
maximums = (
    df_per_tdoc
    .max()
    .sort_values(by='Revision', ascending=False)
    .reset_index()
)

# Show the first 21 rows of the ranking.
display(maximums.head(21))

In [None]:
# Histogram of the highest revision number reached per TDoc.
# The title is set on the figure itself so the exported HTML/PNG files are
# self-describing, not only identified by their file name.
fig = px.histogram(
    maximums,
    x="Revision",
    title=fig_title,
    labels={
        "Revision": "Number of revisions",
        "count": "Number of TDocs"
    },
)
fig.show()

# Export the figure as an interactive HTML page and as a static PNG, both named
# after the figure title. The PNG export relies on an external image-export
# engine (see the installation note next to the imports).
fig.write_html(os.path.join(plot_folder_html, fig_title + ".html"))
fig.write_image(os.path.join(plot_folder_png, fig_title + ".png"))

# Average of the per-TDoc maximum revision numbers.
print('Mean: {0}'.format(maximums.Revision.mean()))
