Import all libraries

In [1]:
import os
import zipfile
import requests
import nltk
import matplotlib.pyplot as plt
import fasttext as ft
import fasttext.util as ftutil
import numpy as np
import re
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

Implement download functions to download the data.

In [2]:
def download_and_extract(url, zip_path, extract_path):
    if os.path.exists(extract_path):
        print(f"File {extract_path} already exists.")
    else:
        print(f"Downloading and extracting the {zip_path} to {extract_path}.")
        response = requests.get(url, stream=True)
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024
        with open(zip_path, 'wb') as file, tqdm(
            desc=zip_path,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(block_size):
                file.write(data)
                bar.update(len(data))
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for member in zip_ref.namelist():
                member_path = os.path.join(extract_path, os.path.relpath(member, start=os.path.commonpath(zip_ref.namelist())))
                if member.endswith('/'):
                    os.makedirs(member_path, exist_ok=True)
                else:
                    os.makedirs(os.path.dirname(member_path), exist_ok=True)
                    with open(member_path, 'wb') as f:
                        f.write(zip_ref.read(member))
        os.remove(zip_path)
        print(f"Download and extraction of {extract_path} complete.")

def download(url, file_path):
    if os.path.exists(file_path):
        print(f"File {file_path} already exists.")
    else:
        print(f"Downloading the file {file_path}")
        response = requests.get(url, stream=True)
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024

        with open(file_path, 'wb') as file, tqdm(
            desc=file_path,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(block_size):
                file.write(data)
                bar.update(len(data))
        print(f"Download of {file_path} complete.")

In [3]:
# Wikipedia dataset
wiki_url = "https://www.kielipankki.fi/download/wikipedia-fi/wikipedia-fi-2017-src/wikipedia-fi-2017-src.zip"
wiki_zip_path = "wikipedia-fi-2017-src.zip"
wiki_extract_path = "data/wikipedia-fi-2017-src"
download_and_extract(wiki_url, wiki_zip_path, wiki_extract_path)

# Finnish Stopwords
stopword_url = "http://members.unine.ch/jacques.savoy/clef/finnishST.txt"
stopwords_path = "data/finnishST.txt"
download(stopword_url, stopwords_path)

# Finnish Stemmer
# stemmer_url = "http://members.unine.ch/jacques.savoy/clef/finnishStemmer.txt"
# stemmer_path = "data/finnishStemmer.txt"
# download(stemmer_url, stemmer_path)

# FinnWordNet
finnwordnet_url = "https://www.kielipankki.fi/download/FinnWordNet/v2.0/FinnWordNet-2.0.zip"
finnwordnet_zip_path = "FinnWordNet-2.0.zip"
finnwordnet_extract_path = "data/FinnWordNet"
download_and_extract(finnwordnet_url, finnwordnet_zip_path, finnwordnet_extract_path)

File data/wikipedia-fi-2017-src already exists.
File data/finnishST.txt already exists.
File data/FinnWordNet already exists.


### Task 1:

Consider the wordings: “climate change”, “emission”, “resilience”, “sustainability” (need to consider their Finnish translations). Suggest a script where your input each of these wordings and output the corresponding Wikipedia pages, highlighting all the (linked) entities in these pages.

In [4]:
terms = {
    'ilmastonmuutos',
    'päästö',
    'joustavuus',
    'ekologinen kestävyys'
}

In [5]:
# Function to load the extracted Wikipedia dataset
def load_wikipedia_dataset(directory):
    data = {}
    data_titles = {}
    # Iterate over all dataset parts
    files = [f for f in os.listdir(directory) if f.startswith("wiki_part") and f.endswith(".VRT")]
    for filename in tqdm(files, desc="Processing files"):
        if filename.startswith("wiki_part") and filename.endswith(".VRT"):
            file_path = os.path.join(directory, filename)
            #print(f"Processing file: {file_path}")

            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

                # Use regex to find all documents within the <doc>...</doc> tags
                docs = re.findall(r'<doc id="(.*?)" url="(.*?)" title="(.*?)">(.*?)</doc>', content, re.DOTALL)

                # Process each document found
                for doc_id, url, title, doc_content in docs:
                    data_titles[title.casefold()] = {
                            'id': doc_id,
                        }
                    # Only store the data if the title matches one of the specified terms
                    if title.casefold() in terms:
                        data[title.casefold()] = {
                            'id': doc_id,
                            'url': url,
                            'content': doc_content
                        }
    return data, data_titles


# Load Finnish stopwords
def load_stopwords(file_path):
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        stopwords = f.read().splitlines()
        print(f"Loaded {len(stopwords)} stopwords from {file_path}")
    return stopwords

# Function to highlight linked entities
def highlight_linked_entities(content):
    highlighted_content = re.sub(r'(<link entity=")(.*?)(">)(.*?)(</link>)', r'\1\2\3**\4**\5', content)
    return highlighted_content

# Function to tokenize text into sentences and remove stopwords
def remove_stopwords(sentence, stopwords, language = 'finnish'):
    words = word_tokenize(text = sentence, language = language)
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(filtered_words)

# Function to remove '#' from the middle of a word
def remove_hash_from_words(text):
    return re.sub(r'(\w)#(\w)', r'\1\2', text)

Load the wikipedia dataset

In [6]:
dataset_directory = "data/wikipedia-fi-2017-src"  # Path to the extracted dataset
wikipedia_data, wikipedia_title_data = load_wikipedia_dataset(dataset_directory)
print(f"number of wikipedia titles", len(wikipedia_title_data))

Processing files: 100%|██████████| 66/66 [00:55<00:00,  1.20it/s]

number of wikipedia titles 425318





Load the finnish stopwords

In [7]:
finnish_stopwords = load_stopwords(stopwords_path)

Loaded 747 stopwords from data/finnishST.txt


Highlight the entities in the wikipedia pages

In [8]:
for title, info in wikipedia_data.items():
    highlighted_content = highlight_linked_entities(info['content'])
    #print(f"Title: {title}\nHighlighted Content: {highlighted_content}\n")

Get the pages for the given wordings and get the sentences

In [9]:
# Function to extract third column data from sentences
def extract_third_column(sentence):
    third_column_data = []
    lines = sentence.strip().split('\n')
    for line in lines:
        if line.strip() and not line.startswith('<'):
            columns = line.split('\t')
            if len(columns) > 2:
                third_column_data.append(columns[2])
    return third_column_data

def process_paragraphs(input_text):
    #print(f"Input Text: {input_text}")
    paragraphs = re.findall(r'<paragraph>(.*?)</paragraph>', input_text, re.DOTALL)
    all_third_column_data = []
    #print(f"Paragraphs: {paragraphs}")
    for paragraph in paragraphs:
        sentences = re.findall(r'<sentence>(.*?)</sentence>', paragraph, re.DOTALL)
        #print(f"Sentences: {sentences}")
        for sentence in sentences:
            #print(f"Sentence: {sentence}")
            third_column_data = extract_third_column(sentence)
            all_third_column_data.append(' '.join(third_column_data))
    return all_third_column_data

#print(f"Wikipedia data: {wikipedia_data['ilmastonmuutos']['content']}")

# Process each item in wikipedia_data
for item in wikipedia_data.items():
    third_column_data = process_paragraphs(item[1]['content'])
    #print(f"Item {item[0]} third column data:")
    for j, data in enumerate(third_column_data):
        filtered_sentence = remove_stopwords(data, finnish_stopwords)
        #print(f"Sentence {j+1}: {filtered_sentence}")
        # Save the filtered sentence to a variable if needed
        # For example: sentence_var = filtered_sentence
    #print("\n" + "#" * 80 + "\n")


all_words = {}

print(wikipedia_data)
for title, document in wikipedia_data.items():
    sentences = re.findall(r'<sentence>(.*?)</sentence>', document['content'], re.DOTALL)
    document_words = set()
    if sentences:
        for idx, sentence_content in enumerate(sentences):
            # print(f"Processing sentence {idx + 1}...")

            # Split each sentence into individual lines
            lines = sentence_content.strip().split('\n')

            # cleaned_lines = re.sub(r'[^\w\s]', '', lines)
            # Extract the word (3nd column) from each line
            words = [line.split('\t')[2] for line in lines if line.strip()]
            for word in words:
                cleaned_word = re.sub(r'[^\w\s]', '', word)
                if not cleaned_word or cleaned_word in finnish_stopwords:
                    continue
                document_words.add(cleaned_word.casefold())

    all_words[title] = document_words

for title, words in all_words.items():
    print(f"\nWords for document '{title}': {words}")

{'joustavuus': {'id': '62900', 'url': 'https://fi.wikipedia.org/wiki?curid=62900', 'content': '\n<paragraph>\n<sentence>\n1\tJoustavuus\tjoustavuus\tNOUN\t_\tCase=Nom|Number=Sing\t0\troot\t_\t_\n</sentence>\n</paragraph>\n<paragraph>\n<sentence>\n1\tJoustavuus\tjoustavuus\tNOUN\t_\tCase=Nom|Number=Sing\t4\tnsubj:cop\t_\t_\n2\ton\tolla\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act\t4\tcop\t_\t_\n3\tyrityksen\tyritys\tNOUN\t_\tCase=Gen|Number=Sing\t4\tnmod:poss\t_\t_\n4\tominaisuus\tominaisuus\tNOUN\t_\tCase=Nom|Number=Sing\t0\troot\t_\t_\n5\t,\t,\tPUNCT\t_\t_\t7\tpunct\t_\t_\n6\tjoka\tjoka\tPRON\t_\tCase=Nom|Number=Sing|PronType=Rel\t7\tnsubj\t_\t_\n7\tkertoo\tkertoa\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act\t4\tacl:relcl\t_\t_\n8\tsiitä\tse\tPRON\t_\tCase=Ela|Number=Sing|PronType=Dem\t7\tnmod\t_\t_\n9\t,\t,\tPUNCT\t_\t_\t12\tpunct\t_\t_\n10\tmiten\tmiten\tADV\t_\t_\t12\tadvmod\t_\t_\n11\tyritys\tyritys\tNOUN\t_\tCase=Nom|Numbe

In [10]:
entities_found = {}

for i, topic_words in enumerate(all_words):
    found_words = []
    for word in all_words[topic_words]:
        if word.casefold() in [title.casefold() for title in wikipedia_title_data]:
            found_words.append(word.casefold())
    entities_found[topic_words] = found_words


# Example usage of the entities_found dictionary
for topic, found_entities in entities_found.items():
    print(f"Document '{topic}' contains the following entities: {found_entities}")

Document 'joustavuus' contains the following entities: ['tuote', 'vuokratyövoima', 'johtaja', 'piiri', 'yritys', 'ilmiö', 'jousto', 'asema', 'anna', 'strategia', 'voima', 'havainto', 'ominaisuus', 'projekti', 'vaara', 'suomi', '2015', 'ennakointi', 'ns', 'työntekijä', 'malli', 'idea', 'ydin', 'työttömyysvakuutus', 'pohja', 'mahdollisuus', 'määrä', 'muutos', 'tarve', 'varjo', 'käsite', 'toimiala', 'työ', 'laji', 'joustavuus', 'atkinson']
Document 'ilmastonmuutos' contains the following entities: ['suhde', 'metsä', 'no', 'hiki', 'celsiusaste', 'muoto', 'syy', 'liike', 'kivihiili', 'reaktio', 'teollisuusmaa', 'tulivuorenpurkaus', 'tila', 'aurinko', 'tieto', 'miljoona', 'päivä', 'tutkimus', 'havainto', 'kalkkikivi', 'sadanta', 'dynamiikka', 'suomi', 'elinkaari', 'mannerlaatta', 'v', 'valo', 'suunta', '0', 'jäätikkö', 'pohja', 'määrä', 'hiili', 'pohjoinen', 'lämpötila', 'suure', 'maakaasu', 'ilmakehä', 'siitepöly', 'voimakkuus', 'maatalous', 'rata', 'meri', 'energia', 'merkitys', 'ympäristö

Download the Finnish Wikipedia 2017 dataset, Finnish Stopwords, Stemmer and FinnWordNet.

### Task 2

Assume the content of each webpage is a single document. Use relevant NLTK script to create a corpus constituted of the four document, and appropriate proprocessing and lemmatization, to construct the TfIdfVectorizer of each document and then calculate the cosine similarity of each pair of these documents. Provide the result in a table and comment on the findings.

In [18]:
def extract_second_column(sentence):
    second_column_data = []
    lines = sentence.strip().split('\n')
    for line in lines:
        if line.strip() and not line.startswith('<'):
            columns = line.split('\t')
            if len(columns) > 2:
                second_column_data.append(columns[1])
    return second_column_data


def process_sentences(input_text):
    paragraphs = re.findall(r'<paragraph>(.*?)</paragraph>', input_text, re.DOTALL)
    all_second_column_data = []
    for paragraph in paragraphs:
        sentences = re.findall(r'<sentence>(.*?)</sentence>', paragraph, re.DOTALL)
        #print(f"Sentences: {sentences}")
        for sentence in sentences:
            #print(f"Sentence: {sentence}")
            second_column_data = extract_second_column(sentence)
            all_second_column_data.append(' '.join(second_column_data))
    return all_second_column_data

# Preprocessing function to remove stopwords, stemming, and tokenize the document
def preProcess(doc, stopwords):
    clean_stopwords = [word.casefold() for word in stopwords]
    stemmer = SnowballStemmer("finnish")
    sentences = process_sentences(doc)
    print(f"Sentences: {sentences}")
    #sentences = sent_tokenize(text=sentences.casefold(), language='finnish')

    tokens = []
    for sentence in sentences:
        words = word_tokenize(sentence)

        words = [stemmer.stem(word) for word in words]

        words = [word for word in words if word.isalpha() and word not in clean_stopwords]

        words_set = list(set(words))

        tokens.extend(words_set)

    return ' '.join(tokens)


In [19]:
# Preprocess the document
corpus = []

for title, document in wikipedia_data.items():
    processed_doc = preProcess(document['content'], finnish_stopwords)
    print(f"Processed Document: {processed_doc}")
    corpus.append(processed_doc)

# Print the processed document
print("\nCorpus:")
for doc in corpus:
    print(doc)

Sentences: ['Joustavuus', 'Joustavuus on yrityksen ominaisuus , joka kertoo siitä , miten yritys pystyy sopeutumaan kysynnässä tapahtuviin määrällisiin ja laadullisiin muutoksiin siten , että yritys ei muutosten myötä joutuisi kilpailijoitaan epäedullisempaan asemaan .', 'Anna Pollert on arvostellut käsitettä siitä , että joustavuuden varjolla tehdään toisistaan hyvinkin paljon poikkeavia , työntekijöiden asemaan vaikuttavia toimenpiteitä .', 'Joustavuudella on pyritty oikeuttamaan jopa keskenään ristiriitaisia toimenpiteitä .', 'Täsmällisemmässä kielenkäytössä joustavuus jaetaan kahteen lajiin : " joustavuus lyhytaikaisessa ennakoinnissa " ( " flexibility for short-term predicatability " tai määrällinen jousto ) ja " joustavuus pitkäaikaisessa sopeutumisessa " ( " flexibility for long-term adaptability " tai toiminnallinen jousto ) .', 'Joustavuus lyhytaikaisessa ennakoinnissa merkitsee yritykselle sitä , että se voi nopeasti sopeuttaa tuotantokustannuksensa kysynnän mukaan .', 'Tekni

In [20]:
# TF-IDF Vectorization of each document
tf = TfidfVectorizer(use_idf=True, min_df=1)
tfidf_matrix = tf.fit_transform(corpus)

In [21]:

# Compute cosine similarity of each pair of these documents
cosine_sim = cosine_similarity(tfidf_matrix)

In [22]:
# Provide the result in a table
cosine_sim_df = pd.DataFrame(cosine_sim,
    index=[f'Doc{i+1}' for i in range(len(cosine_sim))],
    columns=[f'Doc{i+1}' for i in range(len(cosine_sim))])

print(cosine_sim_df)

# Comment on the findings


          Doc1      Doc2      Doc3      Doc4
Doc1  1.000000  0.062607  0.011806  0.000000
Doc2  0.062607  1.000000  0.058166  0.030478
Doc3  0.011806  0.058166  1.000000  0.012154
Doc4  0.000000  0.030478  0.012154  1.000000


### Task 3

Repeat 2) when the documents are restricted only to clickable entities of each document

In [41]:
# Preprocessing function to remove stopwords, stemming
def preProcess_entites(words, stopwords):
    clean_stopwords = [word.casefold() for word in stopwords]
    stemmer = SnowballStemmer("finnish")

    words = [stemmer.stem(word) for word in words]

    words = [word for word in words if word.isalpha() and word not in clean_stopwords]

    words = list(set(words))

    return ' '.join(words)

In [42]:
# Preprocess the document
corpus_entity = []
for document, found_entities in entities_found.items():
    processed_doc = preProcess_entites(found_entities, finnish_stopwords)
    corpus_entity.append(processed_doc)

# Print the processed document
print("\Corpus_entity:")
for doc in corpus_entity:
    print(doc)


Corpus:
asem ennakoint vaara tuote projekt suomi mahdollisuus määrä piiri tarv johtaj voima yritys strateg muutos ilmiö käsit havainto pohj jousto atkinso an mal ns ide toimial varj työ vuokratyövoim työntekij laji yd joustavuus ominaisuus työttömyysvakuutus
tekij aksel maatalous troposfäär rata energ meri merkitys maaper ympäristö ilmasto aktiivisuus no tutk hiki muoto syy liike toimi pohj meta reaktio tarkoitus tulivuorenpurkaus ilmastonmuutos sää neliökilometr vuosis meriv siper jakso mets lap tila kosteiko tieto lämpökapasiteet öljy ppm tulevaisuus takaisinkytken ikirou c maa kalkkikiv dynamiik teollisuusm päivä tutkimus havainto suhd eteenp aurinkokun säteily elinkaar järv pudo päästö kasvihuoneilmiö pala nitraat hiilidioksid toimin milj todistusaineisto biodiversiteet co vesi kiertor suomi pelto pilvisyys palo pohjoin yleiskiel poltoa v merivir valo arvo käsit maailmanhistor artikkel tapahtum ihmin ilmastomal kivihiilikau epälineaarin suo puu suun proses vuorovaikutus kasvihuone

In [47]:
# TF-IDF Vectorization of each document
tf = TfidfVectorizer(use_idf=True, min_df=1)
tfidf_matrix_entity = tf.fit_transform(corpus_entity)

In [48]:
# Compute cosine similarity of each pair of these documents
cosine_sim_entity = cosine_similarity(tfidf_matrix_entity)

In [49]:
# Provide the result in a table
cosine_sim_df_entity = pd.DataFrame(cosine_sim_entity,
    index=[f'Doc{i+1}' for i in range(len(cosine_sim_entity))],
    columns=[f'Doc{i+1}' for i in range(len(cosine_sim_entity))])

print(cosine_sim_df_entity)

# Comment on the findings


          Doc1      Doc2      Doc3      Doc4
Doc1  1.000000  0.066520  0.022306  0.000000
Doc2  0.066520  1.000000  0.152217  0.045353
Doc3  0.022306  0.152217  1.000000  0.000000
Doc4  0.000000  0.045353  0.000000  1.000000


### Task 4

Write a script that explores the clickable entities in each of the four concepts (exploring only once), and generates new extended vocabulary for each concepts, and compute the new similarity between each pair of concepts using extent of overlapping of overall vocabulary and reduced vocabulary (when restricting the vocabulary to clickable entities). We shall refer to the case where the clickable entities are further explored as extended vocabulary case.

### Task 5


We want to assess the importance of each of the four concepts by the number of clickable entities including in the webpage of the given concept. Write a script that implements this strategy to determine the importance of each concept. Next, we want to use the extended vocabulary by quantifying the importance of individual entity in the original webpage by the number of clickable links it generates. Summarize a table highlighting the order of importance of the four concepts according to each of the original and extended vocabulary.

### Task 6

We want to assess the similarity between the concepts is reflected in the their webpage content. Use a script to calculate Wu and Palmer WordNet semantic similarity between each pair of the four concepts and then compare result with the Jaccard similarity obtained by both original vocabulary in 2) and extended vocabulary 4), and comment on the compatibility between the semantic similarity and the above constructed Jaccard similarity measures

### Task 7


Now we want to evaluate the emotion of the text in each dataframe. For this purpose, use WordnetAffect (https://github.com/clemtoy/WNAffect), which generates an emotion state and the part of speech tag of each word matched in the lexicon, and then compile the overall output for each concept. Repeat this process for both restricted and extended vocabulary cases, and comment on the compatibility between the emotion gained from the corresponding Wikipedia page and the intuitive emotion from individual concept.

### Task 8


We want to repeat this process for sentiment polarity. For this purpose, use the Vader sentiment https://github.com/cjhutto/vaderSentiment to assess the sentiment of each the Wikipedia page associated to each concept (both with original vocabulary and extended vocabulary, by aggregating the sentiment of individual clickable entity’s page) to compute the sentiment associated with each concept. Comment on the sentiment of each concept in restricted and extended vocabulary and its compatibility with the intuitive sentiment gained from the inherent definition of these concepts.

### Task 9


We want to investigate the similarity using the embedding representation. For this purpose, write a program that uses word2vec embedding to evaluate 

i) the consistency of the Wikipedia page by calculating the cosine similarity between the underlined concept and the average word2vec embedding of every words contained in the page

ii) the similarity between the various concepts by computing the cosine similarity between the aggregated embedding vector of the corresponding pages. Comment on the compatibility between webpage content and concept and similarity between the various concepts.

### Task 10


Repeat 9) when the extended vocabulary is considered.

### Task 11


Use appropriate literature to comment on the findings. Also, identify any additional input that would allow you to further elucidate any of the preceding, and use appropriate literature of corpus linguistic literature to justify your findings and comment on the obtained results. Finally, comment on the limitations and structural weakness of the data processing pipeline.