In [71]:
import numpy as np
import os
import json
import re
import pandas as pd
from langchain_community.llms import Ollama
from langchain_core.output_parsers import JsonOutputParser
from langchain.prompts import PromptTemplate

In [72]:
def Availability(full_scraped_data):
    """Parse the "Program Availability" table out of scraped page text.

    The table is expected as one cell per line: four header lines
    ("Program Availability", "Semester", "Domestic", "International")
    followed by rows of four lines each (semester, year, domestic status,
    international status). Up to three rows (12 lines) after the header are
    read; the text gives no end-of-table marker, so a shorter table will pull
    in whatever lines follow it.

    Parameters
    ----------
    full_scraped_data : str
        Raw text of one scraped program page.

    Returns
    -------
    list[dict] | None
        One record per (row, intake) pair with the keys ``Semester``,
        ``Year``, ``Intake`` ("Domestic" / "International") and
        ``Availability`` (lower-cased status). ``None`` when no header group
        is found. If several header groups match, the last one wins.
    """
    header_names = ["Program Availability", "Semester", "Domestic", "International"]
    row_width = 4   # cells per table row (and number of header lines)
    max_rows = 3    # rows read after the header (16-line chunk in total)

    # Split the full scraped data into lines
    lines = full_scraped_data.strip().split('\n')

    # Indices of every line that is exactly one of the header cells
    header_indices = [i for i, line in enumerate(lines) if line in header_names]

    json_output = None

    # Header cells are assumed to come in groups of four, so every fourth hit
    # is treated as the start of a candidate table.
    for start_index in header_indices[::4]:
        header_window = lines[start_index:start_index + row_width]
        if not all(name in header_window for name in header_names[1:]):
            continue

        first_cell = start_index + row_width
        cells = lines[first_cell:first_cell + max_rows * row_width]

        records = []
        for offset in range(0, len(cells), row_width):
            row = cells[offset:offset + row_width]
            if len(row) < row_width:
                # The text ended in the middle of a row; skip the partial row
                # instead of failing with IndexError.
                break

            # Collapse whitespace inside each cell. Records are built directly
            # from the cells so multi-word or blank cells cannot shift columns.
            semester, year, domestic, international = (' '.join(cell.split()) for cell in row)
            for intake, status in (("Domestic", domestic), ("International", international)):
                records.append({
                    'Semester': semester,
                    'Year': year,
                    'Intake': intake,
                    'Availability': status.lower(),
                })

        json_output = records

    return json_output



In [73]:
def program_info1(scraped_data):
    """Collect ``key: value`` pairs from the program-details part of a page.

    The part searched runs from the first "Program Name:" up to the *last*
    "Program Availability" in the text (the ``.*`` is greedy and, with
    ``re.DOTALL``, spans newlines). Inside it, every ``key: value`` pair is
    captured; the value may sit on the line after the colon because ``\\s*``
    also consumes a newline. When a key repeats, the later value wins.

    Returns a dict of the pairs, or ``None`` when the section is not found.
    """
    section = re.search(r'Program Name:.*Program Availability', scraped_data, re.DOTALL)
    if section is None:
        return None

    pairs = re.findall(r'([^:\n]+):\s*([^\n]+)', section.group())
    return {key: value for key, value in pairs}

In [74]:
def extract_contact_chunk(full_scraped_data):
    """Cut the "Contact Us" section out of the scraped page text.

    The section starts at the earliest occurrence of any start marker and
    ends just before the earliest end marker found at or after that start.
    Two of the start markers begin one line *before* the "Contact Us"
    heading, so a leading line that does not contain "Contact Us" is dropped
    from the result.

    Returns the section text, or ``None`` when either a start or an end
    marker is missing.
    """
    start_markers = (
        ".\nContact Us",
        'English assessment\nContact Us',
        'Contact Us\nContact',
    )
    start_hits = [pos for pos in map(full_scraped_data.find, start_markers) if pos != -1]
    if not start_hits:
        return None
    begin = min(start_hits)

    end_markers = ("Visit", "For more information about George Brown College")
    end_hits = [
        pos
        for pos in (full_scraped_data.find(marker, begin) for marker in end_markers)
        if pos != -1
    ]
    if not end_hits:
        return None
    finish = min(end_hits)

    section = full_scraped_data[begin:finish].strip()

    # Keep the section as-is when it already opens with the heading;
    # otherwise discard the marker line that precedes the heading.
    first_line, _, remainder = section.partition('\n')
    if 'Contact Us' in first_line:
        return section
    return remainder


In [75]:
def extract_contact(base_url1,chunk):
    """Ask an Ollama-served model to turn a "Contact Us" text chunk into JSON.

    Parameters
    ----------
    base_url1 : str
        Base URL of the Ollama server hosting the ``mistral`` model.
    chunk : str | None
        Contact section text (as produced by ``extract_contact_chunk``).

    Returns
    -------
    The value produced by ``JsonOutputParser`` for the model's reply, or
    ``None`` when ``chunk`` is ``None`` or blank (no model call is made).

    Notes
    -----
    Errors raised by the model call or by the parser are not caught here and
    propagate to the caller.
    """
    # extract_contact_chunk returns None when the page has no contact section.
    # Forwarding that would send the literal text "None" to the model and
    # invite a made-up answer, so bail out before building the chain.
    if chunk is None or not chunk.strip():
        return None

    # Few-shot prompt. It is passed to the template as a *variable* (not as
    # template text) so the JSON braces in the examples are not interpreted
    # as template placeholders.
    prompt_temp1 = """
    You are a helpful AI model who can create JSON from text. Your task is to extract all key information from the given text and present it as in JSON (structured format). Output should only be JSON object no other text required. Here are some examples:

    Example 1:
    Input:
    Contact Us
    School of Deaf and Deafblind Studies
    Email:
    communityservices@georgebrown.ca
    Our office hours are 8 a.m. – 4 p.m.
    Erika Stebbings, ASL & Deaf Studies Program Co-ordinator
    Email:
    erika.stebbings@georgebrown.ca

    Output:
    {
    "Contact": "School of Deaf and Deafblind Studies",
    "Contact email": "communityservices@georgebrown.ca",
    "Office Hours": "8 a.m. – 4 p.m",
    "Program Co-ordinator": "Erika Stebbings",
    "Co-ordinator email": "erika.stebbings@georgebrown.ca"
    }

    Example 2:
    Input:
    Contact Us
    School of Computer Technology
    Phone: 416-415-5000, ext. 4287
    Email:
    computertechnology@georgebrown.ca
    The office hours are:
    Monday – Friday: 9 a.m. – 4 p.m.
    Program Co-ordinator: Moe Fadaee
    Email:
    Moe.Fadaee@georgebrown.ca
    Phone: 416-415-5000, ext. 3229

    Output:
    {
    "Contact": "School of Computer Technology",
    "Contact email": "computertechnology@georgebrown.ca",
    "Phone": "416-415-5000, ext. 4287",
    "Office Hours": "Monday – Friday: 9 a.m. – 4 p.m.",
    "Program Co-ordinator": "Moe Fadaee",
    "Co-ordinator email": "Moe.Fadaee@georgebrown.ca",
    "Co-ordinator Phone": "416-415-5000, ext. 3229"
    }

    Now, extract the key information from the following text:"""
    llm1 = Ollama(
        model="mistral",
        base_url=base_url1,
    )

    parser = JsonOutputParser()
    prompt = PromptTemplate(
        template="{prompt_temp}.\n{query}\n\n {format_instructions}",
        # Both placeholders supplied at invoke() time are declared here.
        input_variables=["prompt_temp", "query"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    chain = prompt | llm1 | parser

    contact_info = chain.invoke({"prompt_temp": prompt_temp1, "query": chunk})
    return contact_info


In [76]:
# Function to extract information from the scraped data
def extract_info(data,base_url1):
    """Run all extractors over one page's text and bundle the results.

    ``data`` is the scraped page text; ``base_url1`` is forwarded to
    ``extract_contact``. The contact chunk is printed before it is sent on.
    Returns a dict with the keys "Program Information", "Availability" and
    "Contact Related Information".
    """
    details = program_info1(data)
    availability = Availability(data)

    contact_text = extract_contact_chunk(data)
    print(contact_text)
    contact_details = extract_contact(base_url1, contact_text)

    return {
        "Program Information": details,
        'Availability': availability,
        'Contact Related Information': contact_details,
    }

In [77]:
def process_text_file(file_path, metadata_folder,base_url1):
    """Extract metadata from one scraped text file and save it as JSON.

    Reads ``file_path`` as UTF-8, passes the text and ``base_url1`` to
    ``extract_info``, and writes the result to
    ``<metadata_folder>/<input file name without extension>.json``
    (UTF-8, non-ASCII characters kept as-is, 2-space indent).
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    # Output file shares the input's base name, with a .json extension.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    destination = os.path.join(metadata_folder, f"{stem}.json")

    extracted = extract_info(raw_text, base_url1)

    with open(destination, 'w', encoding='utf-8') as sink:
        json.dump(extracted, sink, ensure_ascii=False, indent=2)

In [78]:
# Base URL of the Ollama server. The default is an ngrok tunnel address, which
# is presumably short-lived, so it can be overridden through the
# OLLAMA_BASE_URL environment variable without editing the notebook.
base_url1 = os.environ.get("OLLAMA_BASE_URL", "https://e590-35-247-83-61.ngrok-free.app")
# Folder holding the scraped .txt files to process.
folder_path = "test1"

In [79]:
if __name__ == "__main__":
    # `folder_path` and `base_url1` come from the configuration cell above;
    # they are not re-assigned here so that cell stays the single place to
    # change them.
    metadata_folder = "metadata_" + folder_path

    # Create metadata folder if it doesn't exist
    os.makedirs(metadata_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for a reproducible run.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Only regular files with a .txt extension (any letter case) are processed.
        if not (os.path.isfile(file_path) and filename.lower().endswith('.txt')):
            continue

        # splitext drops only the trailing extension, whatever its case;
        # str.replace('.txt', '') missed '.TXT' and also removed '.txt'
        # appearing in the middle of a name.
        print(os.path.splitext(filename)[0])
        process_text_file(file_path, metadata_folder, base_url1)
        print('****************************************************************************************')
 

Applied A.I. Solutions Development Program Postgraduate T431  George Brown College2023
****************************************************************************************
Architectural Technician Program T132  George Brown College2023
****************************************************************************************
Architectural Technology Program T109  George Brown College2023
****************************************************************************************
Art and Design Foundation Program G108  George Brown College2023
****************************************************************************************
Autism and Behavioural Science Program Postgraduate C405  George Brown College2023
****************************************************************************************
Automation Technician Program Distance Education T950  George Brown College2023
****************************************************************************************
Bachelor of Business Ad