# Load the unstructured data

In [None]:
from bs4 import BeautifulSoup
import pickle
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from helpers.lookups import Path

In [None]:
# Read back the pickled collection of links saved by an earlier step.
# Only unpickle files produced by this project: pickle can execute code on load.
scraped_links_path = '../data/scraped_links.pkl'
with open(scraped_links_path, 'rb') as pickle_file:
    scraped_links = pickle.load(pickle_file)


In [None]:
def extract_text(link, timeout=30):
    """Download a page and return the text of every ``<p>`` element on it.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a bound, a single stalled server would block the whole
            scraping run indefinitely.

    Returns:
        A list of strings, one per ``<p>`` tag, in the order
        ``soup.find_all`` returns them.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out. The HTTP status code is not checked, so the body of an
            error page is parsed like any other page.
    """
    response = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect the text of every paragraph tag found in the document.
    return [paragraph.text for paragraph in soup.find_all("p")]

In [None]:
# Clean the list of strings
def clean_text(data, language='english'):
    """Remove stopwords from each string and drop strings that end up empty.

    Args:
        data: Iterable of strings (for example the paragraph texts of a page).
        language: Name of the NLTK stopword list to filter with. Defaults to
            'english', which is what this function has always used.
            NOTE(review): an earlier comment described the list as French;
            confirm the language of the scraped pages and pass
            ``language='french'`` if that is what is intended.

    Returns:
        A list of strings. Each one is the tokens of an input string that are
        not stopwords, joined by single spaces. Tokens keep their original
        casing; lowercasing is only used for the stopword comparison.
    """
    stop_words = set(stopwords.words(language))

    cleaned_data = []
    for sentence in data:
        # Tokenize, then keep only tokens whose lowercase form is not a stopword.
        kept_tokens = [
            token
            for token in word_tokenize(sentence)
            if token.lower() not in stop_words
        ]
        cleaned_sentence = " ".join(kept_tokens)
        # Skip sentences that consisted only of stopwords (or were empty).
        if cleaned_sentence:
            cleaned_data.append(cleaned_sentence)

    return cleaned_data


In [None]:
    
# Site sections that get their own output file; every other link is
# appended to "institutions.txt". Built once instead of on every iteration.
known_sections = {"admission", "international", "aide", "servicesocial", "sio", "sip"}

for link in scraped_links:
    text = extract_text(link)
    cleaned_text = clean_text(text)

    # The first path segment of the URL ("scheme://host/<segment>/...") picks
    # the output file. A link with no path has no fourth element, so fall
    # back to the catch-all file instead of raising IndexError.
    url_parts = link.split("/")
    filename = url_parts[3] if len(url_parts) > 3 else ""
    if filename not in known_sections:
        filename = "institutions"

    # Append mode: several links share one file. Use an explicit UTF-8
    # encoding so accented or other non-ASCII text is written the same way
    # on every platform rather than with the locale's default codec.
    with open("../data/scraped_text/" + filename + ".txt", 'a', encoding="utf-8") as file:
        file.write(link + "\n")

        for t in cleaned_text:
            file.write(t + "\n")

        # Separator between the records of consecutive links.
        file.write("------------------------------------------------------\n")


## Implement API

In [None]:
from web_scraping import web_scrape

# Hand the loaded links to `web_scrape` from the project's `web_scraping` module.
# NOTE(review): presumably this packages the extract/clean/write steps done
# cell-by-cell earlier in this notebook — confirm against web_scraping.
web_scrape(scraped_links)