Import all libraries

In [64]:
import os
import zipfile
import requests
import nltk
import matplotlib.pyplot as plt
import fasttext as ft
import fasttext.util as ftutil
import numpy as np
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

Implement download functions to download the data.

In [None]:
def download_and_extract(url, zip_path, extract_path):
    """Download a zip archive and unpack it into ``extract_path``.

    Nothing is done when ``extract_path`` already exists. Otherwise the
    archive is streamed to ``zip_path`` with a progress bar, unpacked with
    the members' shared leading path removed, and the zip file is deleted.

    Args:
        url: Location of the zip archive.
        zip_path: Temporary local path for the downloaded archive.
        extract_path: Directory that receives the archive content.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If an archive member would be written outside
            ``extract_path``.
    """
    if os.path.exists(extract_path):
        print(f"File {extract_path} already exists.")
        return

    print(f"Downloading and extracting the {zip_path} to {extract_path}.")
    block_size = 1024
    with requests.get(url, stream=True) as response:
        # Fail fast: without this an HTTP error page would be saved as the
        # "zip" and only surface later as a confusing BadZipFile error.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        with open(zip_path, 'wb') as file, tqdm(
            desc=zip_path,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(block_size):
                file.write(data)
                bar.update(len(data))

    os.makedirs(extract_path, exist_ok=True)
    _extract_without_common_root(zip_path, extract_path)
    os.remove(zip_path)
    print(f"Download and extraction of {extract_path} complete.")


def _extract_without_common_root(zip_path, extract_path):
    """Unpack ``zip_path`` into ``extract_path``, dropping the members' shared leading path."""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        members = zip_ref.namelist()
        if not members:
            # os.path.commonpath() rejects an empty sequence.
            return

        # Computed once instead of once per member.
        common_root = os.path.commonpath(members)
        if common_root in members:
            # The shared path is itself a file (e.g. a single-file archive);
            # strip only its directory so the file keeps its name.
            common_root = os.path.dirname(common_root)

        destination_root = os.path.realpath(extract_path)
        for member in members:
            relative = os.path.relpath(member, start=common_root or os.curdir)
            member_path = os.path.join(extract_path, relative)

            # Refuse members such as "../../x" that escape the target folder.
            resolved = os.path.realpath(member_path)
            if os.path.commonpath([destination_root, resolved]) != destination_root:
                raise ValueError(f"Unsafe path in archive: {member}")

            if member.endswith('/'):
                os.makedirs(member_path, exist_ok=True)
            else:
                os.makedirs(os.path.dirname(member_path), exist_ok=True)
                with open(member_path, 'wb') as f:
                    f.write(zip_ref.read(member))

def download(url, file_path):
    """Download ``url`` to ``file_path`` with a progress bar.

    Nothing is done when ``file_path`` already exists. The data is first
    written to ``<file_path>.part`` and renamed on success, so an
    interrupted or failed download is not mistaken for a finished file on
    the next run.

    Args:
        url: Location of the file to fetch.
        file_path: Local destination path.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if os.path.exists(file_path):
        print(f"File {file_path} already exists.")
        return

    print(f"Downloading the file {file_path}")

    # open() does not create missing directories.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    block_size = 1024
    partial_path = f"{file_path}.part"
    with requests.get(url, stream=True) as response:
        # Do not store an HTTP error page as if it were the requested file.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        with open(partial_path, 'wb') as file, tqdm(
            desc=file_path,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(block_size):
                file.write(data)
                bar.update(len(data))

    # Publish the file only once it has been written completely.
    os.replace(partial_path, file_path)
    print(f"Download of {file_path} complete.")

In [66]:
# ---- Where each resource comes from and where it is stored ----

# Wikipedia dataset
wiki_url = "https://www.kielipankki.fi/download/wikipedia-fi/wikipedia-fi-2017-src/wikipedia-fi-2017-src.zip"
wiki_zip_path = "wikipedia-fi-2017-src.zip"
wiki_extract_path = "data/wikipedia-fi-2017-src"

# Finnish Stopwords
stopword_url = "http://members.unine.ch/jacques.savoy/clef/finnishST.txt"
stopwords_path = "data/finnishST.txt"

# Finnish Stemmer
stemmer_url = "http://members.unine.ch/jacques.savoy/clef/finnishStemmer.txt"
stemmer_path = "data/finnishStemmer.txt"

# FinnWordNet
finnwordnet_url = "https://www.kielipankki.fi/download/FinnWordNet/v2.0/FinnWordNet-2.0.zip"
finnwordnet_zip_path = "FinnWordNet-2.0.zip"
finnwordnet_extract_path = "data/FinnWordNet"

# ---- Fetch the resources, in the same order as before ----
download_and_extract(url=wiki_url, zip_path=wiki_zip_path, extract_path=wiki_extract_path)
download(url=stopword_url, file_path=stopwords_path)
download(url=stemmer_url, file_path=stemmer_path)
download_and_extract(
    url=finnwordnet_url,
    zip_path=finnwordnet_zip_path,
    extract_path=finnwordnet_extract_path,
)

### Task 1:

Consider the wordings: “climate change”, “emission”, “resilience”, “sustainability” (need to consider their Finnish translations). Suggest a script where you input each of these wordings and it outputs the corresponding Wikipedia pages, highlighting all the (linked) entities in these pages.

In [67]:
# Lower-case page titles to look up in the Finnish Wikipedia dump.
# The English glosses are the Task 1 wordings these presumably translate.
terms = {
    "ilmastonmuutos",        # climate change
    "päästö",                # emission
    "joustavuus",            # resilience
    "ekologinen kestävyys",  # sustainability
}

In [68]:
# Function to load the extracted Wikipedia dataset
def load_wikipedia_dataset(directory, max_print=5, wanted_titles=None):
    """Collect the wanted Wikipedia pages from the VRT files in ``directory``.

    Only files named ``wiki_part*.VRT`` are read. Each ``<doc ...>`` element
    whose title (compared case-insensitively) is one of the wanted titles is
    stored under its casefolded title.

    Args:
        directory: Folder containing the extracted ``wiki_part*.VRT`` files.
        max_print: Currently unused; kept so existing calls keep working.
        wanted_titles: Iterable of page titles to keep. Defaults to the
            module-level ``terms`` collection.

    Returns:
        dict mapping casefolded title -> ``{'id', 'url', 'content'}``.
    """
    if wanted_titles is None:
        wanted_titles = terms
    # Casefold both sides so a capitalised entry in the wanted titles cannot
    # silently fail to match.
    wanted = {title.casefold() for title in wanted_titles}

    # Compiled once instead of on every file.
    doc_pattern = re.compile(
        r'<doc id="(.*?)" url="(.*?)" title="(.*?)">(.*?)</doc>', re.DOTALL
    )

    data = {}

    # Sorted so that, if the same title occurs in several parts, the entry
    # that wins does not depend on the file-system listing order.
    for filename in sorted(os.listdir(directory)):
        if not (filename.startswith("wiki_part") and filename.endswith(".VRT")):
            continue

        file_path = os.path.join(directory, filename)
        print(f"Processing file: {file_path}")

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Each match is (id, url, title, body) of one <doc>...</doc> element.
        for doc_id, url, title, doc_content in doc_pattern.findall(content):
            key = title.casefold()
            if key in wanted:
                data[key] = {
                    'id': doc_id,
                    'url': url,
                    'content': doc_content
                }
    return data


# Read the Finnish stopword list (one entry per line)
def load_stopwords(file_path):
    """Return the lines of the ISO-8859-1 encoded file ``file_path`` as a list.

    Also prints how many entries were read.
    """
    with open(file_path, encoding='ISO-8859-1') as handle:
        raw_text = handle.read()

    stopwords = raw_text.splitlines()
    print(f"Loaded {len(stopwords)} stopwords from {file_path}")
    return stopwords

# Function to highlight linked entities
def highlight_linked_entities(content):
    """Wrap the body of every ``<link entity="...">...</link>`` element in ``**``.

    The link body is matched with ``[\\s\\S]*?`` so that it may span several
    lines. The other parsers in this file split page content on newlines and
    skip lines starting with ``<``, so link tags presumably sit on their own
    lines around the token rows; with a plain ``.`` such links were never
    matched and the content came back unchanged.

    Args:
        content: Page content containing ``<link>`` markup.

    Returns:
        The content with each link body surrounded by ``**``; the tags
        themselves are kept.
    """
    highlighted_content = re.sub(
        r'(<link entity=")(.*?)(">)([\s\S]*?)(</link>)',
        r'\1\2\3**\4**\5',
        content,
    )
    return highlighted_content

# Function to tokenize a sentence into words and remove stopwords
def remove_stopwords(sentence, stopwords, language = 'finnish'):
    """Return ``sentence`` with every stopword token removed.

    Args:
        sentence: Text to filter.
        stopwords: Collection of lower-case stopwords.
        language: Language name passed to NLTK's ``word_tokenize``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A list/tuple would be scanned once per token; a set gives constant-time
    # lookups with the same membership result.
    if isinstance(stopwords, (list, tuple)):
        stopwords = set(stopwords)

    words = word_tokenize(text=sentence, language=language)
    return ' '.join(word for word in words if word.lower() not in stopwords)

# Function to remove '#' from the middle of a word
def remove_hash_from_words(text):
    """Delete every ``#`` that has a word character directly on both sides.

    Look-around assertions are used so the neighbouring characters are not
    consumed by the match. With the consuming pattern ``(\\w)#(\\w)`` the
    string ``a#b#c`` became ``ab#c``, because ``b`` was already part of the
    first match and could not serve as the left neighbour of the second ``#``.

    Args:
        text: Input string.

    Returns:
        The string without word-internal ``#`` characters. A ``#`` at the
        start or end of a word, or next to another ``#``, is kept.
    """
    return re.sub(r'(?<=\w)#(?=\w)', '', text)

Load the wikipedia dataset

In [69]:
# Path to the extracted dataset
dataset_directory = "data/wikipedia-fi-2017-src"

# Keep only the pages whose titles are listed in `terms`
wikipedia_data = load_wikipedia_dataset(directory=dataset_directory)

Load the finnish stopwords

In [70]:
# Stopword list used below to filter the tokenized sentences
finnish_stopwords = load_stopwords(file_path=stopwords_path)

Highlight the entities in the wikipedia pages

In [71]:
# Keep the highlighted text of every page, keyed by page title. Previously
# each result was overwritten by the next iteration and never used.
highlighted_pages = {}
for title, info in wikipedia_data.items():
    highlighted_content = highlight_linked_entities(info['content'])
    highlighted_pages[title] = highlighted_content

# The pages are long, so nothing is printed here; inspect a single page with
# e.g. print(highlighted_pages['ilmastonmuutos']).

Get the pages for the given wordings and get the sentences

In [72]:
# Pull the third tab-separated field out of every token line of a sentence
def extract_third_column(sentence):
    """Return the third column of each data line in ``sentence``.

    Blank lines, lines that start with ``<`` (markup), and lines with fewer
    than three tab-separated fields are ignored.
    """
    split_rows = (
        raw_line.split('\t')
        for raw_line in sentence.strip().split('\n')
        if raw_line.strip() and not raw_line.startswith('<')
    )
    return [fields[2] for fields in split_rows if len(fields) > 2]

def process_paragraphs(input_text):
    """Return one string per ``<sentence>`` found inside ``<paragraph>`` elements.

    Each string is the space-joined third-column values of that sentence, in
    document order.
    """
    joined_sentences = []
    for paragraph_body in re.findall(r'<paragraph>(.*?)</paragraph>', input_text, re.DOTALL):
        sentence_bodies = re.findall(r'<sentence>(.*?)</sentence>', paragraph_body, re.DOTALL)
        joined_sentences.extend(
            ' '.join(extract_third_column(body)) for body in sentence_bodies
        )
    return joined_sentences

# Print every sentence of each loaded page with the stopwords removed
for page_title, page_info in wikipedia_data.items():
    third_column_data = process_paragraphs(page_info['content'])
    print(f"Item {page_title} third column data:")
    for sentence_number, sentence_text in enumerate(third_column_data, start=1):
        filtered_sentence = remove_stopwords(sentence_text, finnish_stopwords)
        print(f"Sentence {sentence_number}: {filtered_sentence}")
    # Visual separator between pages
    print("\n" + "#" * 80 + "\n")




# Collect the third-column value of every token on the 'ilmastonmuutos' page.
# The page is read from `wikipedia_data`: the name `data` used here before
# is a leftover loop variable holding a plain string, not the page dictionary.
climate_page = wikipedia_data.get('ilmastonmuutos')
if climate_page is None:
    # Page was not found in the dataset; fall through to the message below.
    sentences = []
else:
    sentences = re.findall(r'<sentence>(.*?)</sentence>', climate_page['content'], re.DOTALL)
all_words = []

if sentences:
    for idx, sentence_content in enumerate(sentences):
        print(f"Processing sentence {idx + 1}...")

        # Extract the word (3rd column) from each token line. The shared
        # helper skips markup lines and rows with fewer than three columns,
        # which would otherwise raise IndexError.
        all_words.extend(extract_third_column(sentence_content))
else:
    print("No sentences found in the document.")

print(f"Words: {all_words}")

Download the Finnish Wikipedia 2017 dataset, Finnish Stopwords, Stemmer and FinnWordNet.

### Task 2

Assume the content of each webpage is a single document. Use a relevant NLTK script to create a corpus consisting of the four documents, apply appropriate preprocessing and lemmatization, construct the TfidfVectorizer representation of each document, and then calculate the cosine similarity of each pair of these documents. Provide the result in a table and comment on the findings.