# Key Term Search

This notebook contains Python code that searches Hansard text for the key terms of three audit teams: Performance, Local Government, and ICT.

## Import Data

In [73]:
# Import data from spreadsheet
import re

import numpy as np
import pandas as pd

# Read the "Text" sheet of the Hansard workbook. Forward slashes are used so the
# relative path resolves on Windows, macOS and Linux (the backslash form only
# works on Windows).
data = pd.read_excel("../data/Hansard1102019.xlsx", sheet_name="Text")

# Keep only the two columns used by the searches below and force both to strings.
text = pd.DataFrame(data, columns=['HansardID', 'Text']).astype({"HansardID": 'str', "Text": 'str'})

# Lowercase the text so it can be compared with the lowercased term lists.
text = text.assign(Text=text.Text.str.lower())

text.head(2)

Unnamed: 0,HansardID,Text
0,HANSARD-10-26147.xml,climate change
1,HANSARD-10-26147.xml,the hon. m.c. parnell (14:55): i seek leave t...


In [76]:
# Read the Performance Audit term list. Forward slashes keep the relative path
# portable across operating systems (the backslash form only works on Windows).
data = pd.read_excel("../data/AuditTeamTerms.xlsx", sheet_name="Performance Audit")

# Lowercase the terms so they can be compared with the lowercased Hansard text.
performance = pd.DataFrame(data).assign(Term=lambda frame: frame.Term.str.lower())
performance.head(2)

Unnamed: 0,Term
0,misappropriation
1,maladministration


In [47]:
# Read the Local Government Audit term list. Forward slashes keep the relative
# path portable across operating systems (the backslash form only works on Windows).
data = pd.read_excel("../data/AuditTeamTerms.xlsx", sheet_name="Local Government Audit")

# Lowercase the terms so they can be compared with the lowercased Hansard text.
# Only Term is lowercased; the Alternate column is kept exactly as read from the sheet.
government = pd.DataFrame(data).assign(Term=lambda frame: frame.Term.str.lower())
government.head(2)

Unnamed: 0,Term,Alternate
0,council,
1,local government association,LGA


In [48]:
# Read the IT Audit term list. Forward slashes keep the relative path portable
# across operating systems (the backslash form only works on Windows).
data = pd.read_excel("../data/AuditTeamTerms.xlsx", sheet_name="IT Audit")

# Lowercase the terms so they can be compared with the lowercased Hansard text.
it = pd.DataFrame(data).assign(Term=lambda frame: frame.Term.str.lower())
it.head(2)

Unnamed: 0,Term
0,ict
1,it project


## Search for Key Terms in Hansard Text

In [77]:
# Performance Audit Team Terms
# Blank cells are dropped so a missing term cannot turn into the pattern "nan".
# Terms are ordered longest-first so that, where one term is a prefix of
# another, the longer term is the one reported.
search_terms = sorted(performance.Term.dropna().unique(), key=len, reverse=True)

# re.escape makes punctuation inside a term match literally instead of being
# interpreted as regex syntax.
pattern = '|'.join(re.escape(term) for term in search_terms)

# extractall returns every non-overlapping match in each row (str.extract keeps
# only the first one), indexed by (row label of `text`, match number).
# `text` itself is left unmodified.
matches = text.Text.str.extractall('(' + pattern + ')')[0].rename('Term')
hits = (
    matches.reset_index(level=1, drop=True)   # drop the match number, keep the row label
    .rename_axis('row')
    .reset_index()
    .drop_duplicates()                        # report each term once per text row
    .join(text.HansardID.rename('FileName'), on='row')
)

# Inner merge with the term list as the left frame keeps the results in
# term-list order.
performance_terms = pd.merge(
    performance[['Term']].dropna().drop_duplicates(),
    hits[['Term', 'FileName']],
    on='Term',
)
performance_terms['AuditTeam'] = "Performance"
performance_terms.head(5)

Unnamed: 0,Term,FileName,AuditTeam
0,misappropriation,HANSARD-10-21593.xml,Performance
1,misappropriation,HANSARD-10-21593.xml,Performance
2,maladministration,HANSARD-11-33230.xml,Performance
3,maladministration,HANSARD-11-29185.xml,Performance
4,maladministration,HANSARD-11-29185.xml,Performance


In [88]:
# Local Government Audit Team Terms
# Blank cells are dropped so a missing term cannot turn into the pattern "nan".
# Terms are ordered longest-first so that, where one term is a prefix of
# another, the longer term is the one reported.
# NOTE(review): only the Term column is searched; values in the Alternate
# column are not part of the pattern - confirm whether they should be.
search_terms = sorted(government.Term.dropna().unique(), key=len, reverse=True)

# re.escape makes punctuation inside a term match literally instead of being
# interpreted as regex syntax.
pattern = '|'.join(re.escape(term) for term in search_terms)

# extractall returns every non-overlapping match in each row (str.extract keeps
# only the first one), indexed by (row label of `text`, match number).
# `text` itself is left unmodified.
matches = text.Text.str.extractall('(' + pattern + ')')[0].rename('Term')
hits = (
    matches.reset_index(level=1, drop=True)   # drop the match number, keep the row label
    .rename_axis('row')
    .reset_index()
    .drop_duplicates()                        # report each term once per text row
    .join(text.HansardID.rename('FileName'), on='row')
)

# Inner merge with the term list as the left frame keeps the results in
# term-list order. Only Term is taken from the term list, so the Alternate
# column never reaches the results.
government_terms = pd.merge(
    government[['Term']].dropna().drop_duplicates(),
    hits[['Term', 'FileName']],
    on='Term',
)
government_terms['AuditTeam'] = "Local Government"
government_terms.head(5)

Unnamed: 0,Term,FileName,AuditTeam
0,council,HANSARD-11-31978.xml,Local Government
1,council,HANSARD-11-28679.xml,Local Government
2,council,HANSARD-10-24234.xml,Local Government
3,council,HANSARD-10-24722.xml,Local Government
4,council,HANSARD-11-33354.xml,Local Government


In [90]:
# IT Audit Team Terms
# Blank cells are dropped so a missing term cannot turn into the pattern "nan".
# Terms are ordered longest-first so that, where one term is a prefix of
# another, the longer term is the one reported.
# NOTE(review): terms are matched as substrings, so a short term such as "ict"
# also matches inside longer words (e.g. "district") - confirm whether
# word-boundary matching is wanted.
search_terms = sorted(it.Term.dropna().unique(), key=len, reverse=True)

# re.escape makes punctuation inside a term match literally instead of being
# interpreted as regex syntax.
pattern = '|'.join(re.escape(term) for term in search_terms)

# extractall returns every non-overlapping match in each row (str.extract keeps
# only the first one), indexed by (row label of `text`, match number).
# `text` itself is left unmodified.
matches = text.Text.str.extractall('(' + pattern + ')')[0].rename('Term')
hits = (
    matches.reset_index(level=1, drop=True)   # drop the match number, keep the row label
    .rename_axis('row')
    .reset_index()
    .drop_duplicates()                        # report each term once per text row
    .join(text.HansardID.rename('FileName'), on='row')
)

# Inner merge with the term list as the left frame keeps the results in
# term-list order.
it_terms = pd.merge(
    it[['Term']].dropna().drop_duplicates(),
    hits[['Term', 'FileName']],
    on='Term',
)
it_terms['AuditTeam'] = "IT"
it_terms.head(5)

Unnamed: 0,Term,FileName,AuditTeam
0,ict,HANSARD-11-30055.xml,IT
1,ict,HANSARD-11-30055.xml,IT
2,ict,HANSARD-11-28679.xml,IT
3,ict,HANSARD-11-34502.xml,IT
4,ict,HANSARD-11-29713.xml,IT


In [97]:
# Merge results and output to Excel
# Stack the three per-team result frames into one table with a fresh 0..n-1 index.
merged_data = pd.concat([performance_terms, government_terms, it_terms], ignore_index=True)

# Write next to the notebook. A plain relative name is used because the
# backslash form '.\\TermSearch.xlsx' is only a path on Windows; elsewhere it
# would create a file whose name contains the backslash.
merged_data.to_excel('TermSearch.xlsx', sheet_name='TermSearch', index=False)