In [1]:
import openai  # This imports the openai package.
import nltk  # This imports the nltk package.
import re  # This imports the re package.

# The above packages are third-party dependencies that will need to be installed.
# They can be listed in a requirements.txt file to make it easier to manage and
# install project dependencies.



In [4]:
import platform  # Module to check the current Python version
import os  # Module to interact with the operating system
import docx2txt  # A library to extract text from Word documents
import openai  # The OpenAI API client library
import re  # Module for regular expressions
from os.path import splitext, exists  # Functions for file paths and existence checks
import nltk  # The Natural Language Toolkit
nltk.download('punkt')  # Download the required Punkt tokenizer data
from nltk.tokenize import word_tokenize  # Tokenization function from NLTK
import datetime  # Module for working with dates and times

# Report the interpreter and library versions in use, to help with debugging
for label, version in (
    ('Python: ', platform.python_version()),
    ('re: ', re.__version__),
    ('nltk: ', nltk.__version__),
):
    print(label, version)

# Method for cleaning up the content of a docx file
def clean_docx(file_path: str) -> str:
    """Extract the text of a docx file and normalise it into one clean string.

    The extracted text goes through three regex substitutions, in order:
    timestamp ranges are removed, whitespace runs are collapsed, and a space
    is inserted after sentence punctuation that is glued to the next word.

    Args:
        file_path (str): path to docx file

    Returns:
        str: the cleaned text
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        # Timestamp ranges such as "0:00:01.000 --> 0:00:04.000". The "." before
        # the fractional part is an unescaped regex dot, so it matches any
        # single character there.
        (r"\d+:\d+:\d+.\d+\s-->\s\d+:\d+:\d+.\d+", ""),
        # Any run of whitespace (including newlines) becomes a single space.
        (r"\s+", r" "),
        # ".", "!" or "?" immediately followed by a word character gets a
        # space inserted between the two.
        (r"([\.!?])(\w)", r"\1 \2"),
    )

    text = docx2txt.process(file_path)
    for regex, replacement in substitutions:
        text = re.sub(regex, replacement, text)
    return text

# Method for converting a docx file to a clean text file
def docx_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Clean a docx file and store the result as a UTF-8 text file.

    When no output path is given, one is derived from ``file_in`` by swapping
    its extension for ``.txt``. If that file already exists, ``_1``, ``_2``,
    ... is appended to the base name until an unused name is found.

    Args:
        file_in (str): path to docx file
        file_out (None, optional): path to text file
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
                                 Default is False

    Returns:
        str: path to text file
    """
    quiet = kwargs.get("no_message", False)

    if not file_out:
        stem = splitext(file_in)[0]
        candidate = "%s.txt" % stem
        attempt = 1
        # Keep trying numbered names until one is free.
        while exists(candidate):
            candidate = "%s_%s.txt" % (stem, attempt)
            attempt += 1
        file_out = candidate

    cleaned = clean_docx(file_in)

    with open(file_out, "w+", encoding="utf-8") as handle:
        handle.write(cleaned)

    if quiet:
        return file_out

    print("clean content is written to file: %s" % file_out)
    return file_out

# Method for counting the number of tokens in a file
def count_tokens(filename):
    """Count the NLTK word tokens in a text file.

    Args:
        filename (str): path to a UTF-8 encoded text file

    Returns:
        int: number of tokens produced by ``word_tokenize`` for the file's text
    """
    # docx_to_clean_file writes its output as UTF-8, so read it back as UTF-8
    # rather than with the platform default encoding (e.g. cp1252 on Windows).
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()

    return len(word_tokenize(text))

# Method for breaking up a file into chunks
def break_up_file(tokens, chunk_size, overlap_size):
    """Yield consecutive chunks of ``tokens`` that overlap each other.

    Each chunk except the last holds ``chunk_size`` tokens and starts
    ``chunk_size - overlap_size`` tokens after the previous one, so adjacent
    chunks share ``overlap_size`` tokens. The last chunk holds whatever
    remains (at most ``chunk_size`` tokens). An input that already fits in one
    chunk, including an empty one, is yielded as a single chunk.

    Implemented iteratively: the input is never re-sliced, and the number of
    chunks is not limited by Python's recursion depth.

    Args:
        tokens (list): sequence of tokens to split
        chunk_size (int): maximum number of tokens per chunk
        overlap_size (int): number of tokens shared by adjacent chunks

    Yields:
        list: the next chunk of tokens

    Raises:
        ValueError: if the input needs more than one chunk and
            ``overlap_size >= chunk_size`` (the window could never advance).
    """
    total = len(tokens)
    step = chunk_size - overlap_size
    start = 0

    while total - start > chunk_size:
        if step <= 0:
            # Without this check the window would never move forward.
            raise ValueError(
                "overlap_size must be smaller than chunk_size "
                f"(got chunk_size={chunk_size}, overlap_size={overlap_size})"
            )
        yield tokens[start:start + chunk_size]
        start += step

    # Whatever is left fits in one final chunk.
    yield tokens[start:]
def break_up_file_to_chunks(filename, chunk_size=2000, overlap_size=100):
    """Tokenize a text file and split the tokens into overlapping chunks.

    Args:
        filename (str): path to a UTF-8 encoded text file
        chunk_size (int, optional): maximum tokens per chunk. Default is 2000
        overlap_size (int, optional): tokens shared by adjacent chunks.
            Default is 100

    Returns:
        list: list of chunks, each chunk being a list of token strings
    """
    # The cleaned file is written as UTF-8 by docx_to_clean_file; read it the
    # same way instead of relying on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))

def check_overlap(chunks, overlap_size=100):
    """
    Checks if the first and second chunks overlap.

    The last ``overlap_size`` items of the first chunk are compared with the
    first ``overlap_size`` items of the second chunk.

    Args:
        chunks: A list of chunks of text.
        overlap_size: Number of trailing/leading items expected to be shared.
            Defaults to 100; pass the same value that was used to build the
            chunks.

    Returns:
        A string indicating if the overlap is good or not. With fewer than
        two chunks there is no pair to compare, and the overlap is reported
        as good.
    """
    if len(chunks) < 2:
        # Nothing to compare: the whole input fit into a single chunk.
        return 'Overlap is Good'

    first, second = chunks[0], chunks[1]
    # An explicit start index avoids the `seq[-0:]` pitfall (which would
    # select the whole chunk) and clamps at 0 for chunks shorter than
    # overlap_size, matching what `seq[-n:]` does.
    tail = first[max(len(first) - overlap_size, 0):]
    head = second[:overlap_size]

    if tail == head:
        return 'Overlap is Good'
    return 'Overlap is Not Good'
    

# Convert each chunk of tokens to prompt text for OpenAI
def convert_to_prompt_text(tokenized_text):
    """Turn a list of tokens back into a single prompt string.

    Tokens are joined with single spaces, then every ``" 's"`` is replaced by
    ``"'s"`` so a possessive/contraction token re-attaches to the word before
    it.

    Args:
        tokenized_text: iterable of token strings

    Returns:
        str: the reassembled text
    """
    return " ".join(tokenized_text).replace(" 's", "'s")


def generate_mom(meeting_details):
    """Write the minutes of meeting to ``items_discussed.docx``.

    Besides its argument, this function reads four module-level names that
    must be defined before it is called: ``filepath`` (the source document;
    a ``YYYY-MM-DD`` date is taken from its file name), ``meeting_summary``,
    ``items_discussed`` and ``action_items`` (text blocks written to the
    document).

    Args:
        meeting_details (str): text with one ``Key: value`` pair per line.
            The keys ``Program Name``, ``Chair`` and ``Attendees`` are looked
            up case-insensitively; a missing key is written as
            ``Not specified``.

    Returns:
        None. The document is saved to ``items_discussed.docx`` in the
        current working directory.
    """
    # python-docx is imported locally so the function does not depend on a
    # later notebook cell having run `import docx` first.
    import docx

    # Extract meeting information. partition() splits on the first colon only,
    # so values that contain colons (e.g. "Time: 10:30") and lines without a
    # space after the colon no longer raise ValueError. Keys are lower-cased
    # because the model's capitalisation ("Program name" vs "Program Name")
    # is not under our control.
    meeting_info = {}
    for line in meeting_details.splitlines():
        key, separator, value = line.partition(":")
        if separator:
            meeting_info[key.strip().lower()] = value.strip()

    fallback = "Not specified"
    program_name = meeting_info.get("program name", fallback)
    chair = meeting_info.get("chair", fallback)
    attendees = meeting_info.get("attendees", fallback)

    # Extract the date from the filename
    filename = os.path.basename(filepath)
    match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    date = match.group() if match else "No date found in filename."

    # Create a new Word document
    doc = docx.Document()

    # Add the meeting information to the document
    doc.add_paragraph(f"Program name: {program_name}")
    doc.add_paragraph(f"Chair: {chair}")
    doc.add_paragraph(f"Date: {date}")
    doc.add_paragraph(f"Attendees: {attendees}")

    # Add the meeting summary to the document
    doc.add_paragraph("Meeting Summary:")
    doc.add_paragraph(meeting_summary)

    # Add the items discussed to the document, one paragraph per line
    for item in items_discussed.split("\n"):
        doc.add_paragraph(item.strip())

    # Add the action items to the document, one paragraph per line
    for item in action_items.split("\n"):
        doc.add_paragraph(item)

    # Save the document
    doc.save("items_discussed.docx")

Python:  3.9.12
re:  2.2.1
nltk:  3.7


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Source transcript (Word document) to process
filepath = "././FW_ Important_ PAC Minutes/Project Management PAC_2022-12-02.docx"

# Clean the document and store the result as a text file; the call returns
# the path of the text file it wrote.
cleaned_file = docx_to_clean_file(filepath)

# Tokenize the cleaned text file and report how many tokens it contains.
token_count = count_tokens(cleaned_file)
print(f"Number of tokens: {token_count}")

# Tokenize the cleaned file again and split the tokens into overlapping
# chunks, using the default chunk and overlap sizes.
chunks = break_up_file_to_chunks(cleaned_file)

# Report the size of every chunk.
for index, piece in enumerate(chunks):
    print(f"Chunk {index}: {len(piece)} tokens")

# Sanity check: the last 100 tokens of the first chunk must equal the first
# 100 tokens of the second chunk. Prints "Overlap is Good" when they match,
# "Overlap is Not Good" otherwise.
overlap_status = check_overlap(chunks)
print(overlap_status)


clean content is written to file: ././FW_ Important_ PAC Minutes/Project Management PAC_2022-12-02_1.txt
Number of tokens: 12580
Chunk 0: 2000 tokens
Chunk 1: 2000 tokens
Chunk 2: 2000 tokens
Chunk 3: 2000 tokens
Chunk 4: 2000 tokens
Chunk 5: 2000 tokens
Chunk 6: 1180 tokens
Overlap is Good


In [None]:
import os
import re
import openai
import docx

# Read the OpenAI API key from the environment. Secrets must never be
# hard-coded in the notebook: set OPENAI_API_KEY in the shell/OS (or a local,
# untracked .env file) before starting the kernel.
# NOTE(review): a key was previously committed in this cell. Treat it as
# leaked and revoke it in the OpenAI dashboard.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment before running this cell."
    )

prompt_response = []
# Break up the file into chunks of text
chunks = break_up_file_to_chunks(cleaned_file)

# Summarize every chunk with OpenAI's API, one request per chunk
for chunk in chunks:
    # Convert the chunk (a list of tokens) to prompt text
    prompt_request = "Summarize this meeting transcript: " + convert_to_prompt_text(chunk)

    # Use OpenAI's API to summarize the text
    response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt_request,
            temperature=.5,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
    )

    # Keep the generated summary; the summaries are combined in the next cell
    prompt_response.append(response["choices"][0]["text"])

# Print the generated prompt responses
print(prompt_response)


In [None]:
  
# prompt_request = "Consolidate these meeting summaries: " + str(prompt_response)

# response = openai.Completion.create(
#         model="text-davinci-003",
#         prompt=prompt_request,
#         temperature=.5,
#         max_tokens=1000,
#         top_p=1,
#         frequency_penalty=0,
#         presence_penalty=0
#     )
# items_discussed = response["choices"][0]["text"]
# print(items_discussed)  

def _complete(prompt, max_tokens=1000):
    """Send ``prompt`` to the completion endpoint and return the raw response.

    All requests in this cell share the same model and sampling settings, so
    they are defined once here instead of being repeated per request.
    """
    return openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=.5,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )


# The per-chunk summaries from the previous cell, passed to every request below
summaries = str(prompt_response)

# 1. Overall meeting summary
prompt_request = "Prepare a meeting summary to put it in a minutes of meeting: " + summaries
response_1 = _complete(prompt_request)
meeting_summary = response_1["choices"][0]["text"]
print(meeting_summary)

# 2. Main items discussed
prompt_request = "Prepare a list of main items discussed from the given transcript to put it in a minutes of meeting: " + summaries
response_2 = _complete(prompt_request)
items_discussed = response_2["choices"][0]["text"]
print(items_discussed)

# 3. Action items / recommendations
prompt_request = "Provide a list of action items/recommendations from the provided meeting transcript text: " + summaries
response_3 = _complete(prompt_request)
action_items = response_3["choices"][0]["text"]
print(action_items)

# 4. Program name, chair and attendees
prompt_request = "Please provide Program name, Chair, Attendees from the provided meeting transcript text: " + summaries
response_4 = _complete(prompt_request)
meeting_details = response_4["choices"][0]["text"]
print(meeting_details)

# generate_mom reads all four texts, so require every one of them to be
# non-empty. (The API returns strings, so an `is not None` test would always
# pass and would not catch an empty completion.)
required_texts = {
    "meeting_summary": meeting_summary,
    "items_discussed": items_discussed,
    "action_items": action_items,
    "meeting_details": meeting_details,
}
empty_texts = [name for name, text in required_texts.items() if not text or not text.strip()]
if empty_texts:
    print(f"Minutes not generated; empty model output for: {', '.join(empty_texts)}")
else:
    # Call the generate_mom function with meeting_details as an argument
    generate_mom(meeting_details)
