<a href="https://colab.research.google.com/github/taavip/extraxt_docx_comments/blob/main/extract_docx_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
from lxml import etree
import pandas as pd
import io
from io import BytesIO
from google.colab import files

!pip install xlsxwriter

# Prefix-to-URI map handed to every lxml XPath call below via `namespaces=`;
# the "w" prefix is bound to the WordprocessingML main namespace.
ooXMLns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

def _commented_text_by_id(document):
    """Map each comment id to the document text enclosed by its range markers.

    Walks *document* (the parsed ``word/document.xml`` root) once, in document
    order, tracking which ``w:commentRangeStart`` markers are currently open.
    Every ``w:t`` text node seen while a range is open is attributed to that
    range, so ranges spanning several runs or paragraphs, and overlapping
    ranges, are all captured.
    """
    w = '{%s}' % ooXMLns['w']
    start_tag, end_tag = w + 'commentRangeStart', w + 'commentRangeEnd'
    text_tag, para_tag = w + 't', w + 'p'
    id_attr = w + 'id'

    open_ids = set()
    pieces = {}
    for node in document.iter(start_tag, end_tag, text_tag, para_tag):
        tag = node.tag
        if tag == start_tag:
            comment_id = node.get(id_attr)
            open_ids.add(comment_id)
            pieces.setdefault(comment_id, [])
        elif tag == end_tag:
            open_ids.discard(node.get(id_attr))
        elif tag == text_tag:
            if node.text:
                for comment_id in open_ids:
                    pieces[comment_id].append(node.text)
        else:
            # A new paragraph starts inside an open range: keep the paragraph
            # break so words from adjacent paragraphs do not run together.
            for comment_id in open_ids:
                if pieces[comment_id]:
                    pieces[comment_id].append('\n')

    # A range may close at the very start of a paragraph, leaving a dangling
    # separator; drop it.
    return {cid: ''.join(parts).rstrip('\n') for cid, parts in pieces.items()}


def get_comments(docxFileBytes):
    """Extract the review comments of a .docx file into a DataFrame.

    Args:
        docxFileBytes: Raw bytes of the .docx (zip) file.

    Returns:
        A DataFrame with the columns ``Comment ID``, ``Commented Text``,
        ``Comment``, ``Author`` and ``Date`` (one row per comment), or
        ``None`` when the archive has no ``word/comments.xml`` or no
        ``word/document.xml`` (a message is printed in either case).

        ``Commented Text`` is empty for comments whose range markers are not
        present in ``word/document.xml``; ``Author`` is empty and ``Date`` is
        ``NaT`` when the corresponding attribute is missing.
    """
    w = '{%s}' % ooXMLns['w']

    # The context manager guarantees the archive is closed on every path.
    with zipfile.ZipFile(BytesIO(docxFileBytes)) as docxZip:
        try:
            commentsXML = docxZip.read('word/comments.xml')
        except KeyError:
            print("No comments found")
            return None

        comments = etree.XML(commentsXML).xpath('//w:comment', namespaces=ooXMLns)

        try:
            documentXML = docxZip.read('word/document.xml')
        except KeyError:
            print("No document found")
            return None

    # One pass over the document instead of a full scan per comment.
    commented_text = _commented_text_by_id(etree.XML(documentXML))

    comment_data = []
    for c in comments:
        comment_id = c.get(w + 'id')
        raw_date = c.get(w + 'date')
        # Dates are normalised to UTC and then made naive, because Excel
        # cannot store timezone-aware datetimes.
        if raw_date:
            date = pd.to_datetime(raw_date, utc=True).tz_localize(None)
        else:
            date = pd.NaT

        comment_data.append({
            'Comment ID': comment_id,
            'Commented Text': commented_text.get(comment_id, ''),
            'Comment': c.xpath('string(.)', namespaces=ooXMLns),
            'Author': c.get(w + 'author', ''),
            'Date': date,
        })

    # Explicit columns keep the layout stable even when there are no comments.
    return pd.DataFrame(
        comment_data,
        columns=['Comment ID', 'Commented Text', 'Comment', 'Author', 'Date'],
    )



# Upload one or more .docx files in Colab and export each file's comments
# to an Excel workbook that is then offered for download.
uploaded = files.upload()

for filename, filebytes in uploaded.items():
    comments_df = get_comments(filebytes)
    if comments_df is None:
        continue

    print(comments_df.head())

    # Replace only a trailing ".docx" (case-insensitively). A plain
    # str.replace would leave names such as "REPORT.DOCX" unchanged, making
    # the Excel output overwrite the uploaded document.
    if filename.lower().endswith('.docx'):
        stem = filename[:-len('.docx')]
    else:
        stem = filename
    output_file = f"{stem}_comments.xlsx"

    # pandas writes datetime cells with its own cell format, which takes
    # precedence over a column format set afterwards; the desired format
    # therefore has to be passed to the writer itself.
    with pd.ExcelWriter(
        output_file,
        engine='xlsxwriter',
        datetime_format='dd/mm/yyyy hh:mm:ss',
    ) as writer:
        comments_df.to_excel(writer, index=False, sheet_name='Comments')
        # Widen the Date column so the formatted timestamp is not rendered
        # as "########".
        writer.sheets['Comments'].set_column('E:E', 20)

    print(f"Comments saved to '{output_file}'")

    # Download the generated Excel file
    files.download(output_file)
