In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re

In [2]:
class JiraTickets:
    """Extract ticket entries from a saved HTML page into a pandas DataFrame.

    The file is parsed once, at construction time. Every element whose
    ``data-test-id`` attribute matches the issue-link pattern is treated as one
    ticket, and the children of its ``<span>`` tags are labelled positionally
    with the names in ``_dict_keys``.

    Parameters
    ----------
    file : str or path-like
        Path of the HTML file to parse.
    encoding : str, optional
        Text encoding used to read ``file``. ``None`` (the default) keeps the
        platform default that ``open`` would choose.

    Attributes
    ----------
    file : the path that was passed in.
    encoding : the encoding that was passed in.
    tickets_df : pandas.DataFrame with one row per matched ticket and the
        columns listed in ``_dict_keys``.
    tickets_count : int, number of rows in ``tickets_df``.
    """

    def __init__(self, file, encoding=None):
        self.file = file
        self.encoding = encoding
        # Defined before any parsing so no later step can hit a missing attribute.
        self._dict_keys = ['span_class','small_class','title', 'ticket_ref','other','group','status']
        self._tickets_soup = self._make_soup()
        self._all_tickets = self._get_all_ticket_info()
        self.tickets_df = self._create_tickets_df()
        self.tickets_count = len(self.tickets_df)

    def _make_soup(self):
        """Read ``self.file`` and return it parsed with the ``lxml`` parser."""
        # The context manager closes the handle once BeautifulSoup has consumed it.
        with open(self.file, encoding=self.encoding) as handle:
            return BeautifulSoup(handle, "lxml")

    def _get_all_ticket_info(self):
        """Return every element whose ``data-test-id`` looks like an issue link."""
        # Dots are escaped so they match literally rather than "any character".
        attr_value_regex = re.compile(
            r"global-pages\.home\.ui\.tab-container\.tab\.item-list\.item-link#issue-[0-9]{6}"
        )
        return self._tickets_soup.find_all(attrs={"data-test-id": attr_value_regex})

    def _create_ticket_dict(self, x):
        """Build one ticket record from element ``x``.

        The direct children of every ``<span>`` found under ``x`` are flattened
        into a single list of strings and paired, in order, with
        ``self._dict_keys``. ``zip`` stops at the shorter sequence, so surplus
        values are dropped and a short element yields a dict with fewer keys.
        """
        # Find all span tags
        spans = x.find_all('span')
        # Flatten list of lists into a single list of string items
        flat_data = [str(child) for span in spans for child in span]
        # Create dict
        output = dict(zip(self._dict_keys, flat_data))
        #     output['url'] = TODO add format url in here e.g. f'https://{domain}.atlassian.net/browse/{ticket_ref}'
        return output

    def _create_tickets_df(self):
        """Return a DataFrame with one row per ticket and columns ``_dict_keys``.

        Passing ``columns=`` to the constructor (instead of selecting columns
        afterwards) keeps the column order and also works when there are no
        tickets or when a key never appears: those cells are filled with NaN
        rather than raising ``KeyError``.
        """
        records = [self._create_ticket_dict(ticket) for ticket in self._all_tickets]
        return pd.DataFrame(records, columns=self._dict_keys)
    

In [3]:
# Relative path of the saved HTML page to parse; resolved against the notebook's working directory.
my_file = 'jira_your_work.htm'

In [4]:
# Constructing the object reads and parses the file immediately and builds tickets_df.
x = JiraTickets(my_file)

In [5]:
# Show the path the parser was constructed with.
x.file

'jira_your_work.htm'

In [6]:
# Preview only the first parsed ticket row rather than the whole frame.
x.tickets_df.head(1)

Unnamed: 0,span_class,small_class,title,ticket_ref,other,group,status
0,"<span class=""sc-iTlrqL biRwIE"">Analysis into t...","<small class=""sc-fcLdzD fHVdRH""><span>PA-1194<...",Analysis into typing time in Zendesk,PA-1194,·,Product Analytics,On Hold (Action)


In [9]:
# tickets_count is len(tickets_df), computed once in __init__.
print(x.tickets_count)

17
