In [1]:
import string
import numpy as np
import pandas as pd
import re
import pickle as pkl

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, WordNetLemmatizer, PorterStemmer, pos_tag, RegexpParser
nltk.download('averaged_perceptron_tagger')
wn = WordNetLemmatizer()
ps = PorterStemmer()
stopwords = stopwords.words('english')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def cleantxt(text):
    text = text.replace('\r', ' ')
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.strip()
    text = ''.join([wrd.lower() for wrd in text if wrd not in string.punctuation])
    tokens = re.split('\W+', text)
    tokens = [wrd for wrd in tokens if wrd not in stopwords]
    tokens = [wn.lemmatize(wrd) for wrd in tokens]
    return tokens

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [3]:
df = pd.read_csv("../files/preprocessed_articles.csv")

In [4]:
df

Unnamed: 0,link,title,text
0,https://news.un.org//en/story/2023/10/1142552,"World News in Brief: Sandstorm alert, albinism...",19 october 2023 the un world meteorological or...
1,https://news.un.org//en/story/2023/10/1142552,"World News in Brief: Sandstorm alert, albinism...",19 october 2023 wmo chief petteri taalas said ...
2,https://news.un.org//en/story/2023/10/1142552,"World News in Brief: Sandstorm alert, albinism...",19 october 2023 wmo said that exposure to dust...
3,https://news.un.org//en/story/2023/10/1142552,"World News in Brief: Sandstorm alert, albinism...",19 october 2023 according to wmo in 2022 hotsp...
4,https://news.un.org//en/story/2023/10/1142552,"World News in Brief: Sandstorm alert, albinism...",19 october 2023 prof taalas stressed wmos comm...
...,...,...,...
15862,https://news.un.org//en/story/2004/09/116712-c...,Concerns of island nations top final session o...,30 september 2004 in highlighting the devastat...
15863,https://news.un.org//en/story/2004/09/116712-c...,Concerns of island nations top final session o...,30 september 2004 talbak nazarov foreign minis...
15864,https://news.un.org//en/story/2004/09/116712-c...,Concerns of island nations top final session o...,30 september 2004 surinames foreign minister m...
15865,https://news.un.org//en/story/2004/09/116712-c...,Concerns of island nations top final session o...,30 september 2004 for carlos morales troncoso ...


In [5]:
sents = list(df["text"])

In [6]:
sent_tokens = [cleantxt(sent) for sent in sents]

In [7]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(sent_tokens)]

In [13]:
tagged_data[1]

TaggedDocument(words=['19', 'october', '2023', 'wmo', 'chief', 'petteri', 'taalas', 'said', 'partly', 'due', 'poor', 'water', 'land', 'management', 'phenomenon', 'also', 'exacerbated', 'higher', 'temperature', 'drought', 'brought', 'warming', 'climate', 'leading', 'higher', 'evaporation', 'drier', 'soil'], tags=[1])

In [14]:
## Train doc2vec model
model = Doc2Vec(tagged_data,
                 vector_size = 20,
                 window = 2,
                 min_count = 1,
                 epochs = 100)

'''
vector_size = Dimensionality of the feature vectors.
window = The maximum distance between the current and predicted word within a sentence.
min_count = Ignores all words with total frequency lower than this.
'''

'\nvector_size = Dimensionality of the feature vectors.\nwindow = The maximum distance between the current and predicted word within a sentence.\nmin_count = Ignores all words with total frequency lower than this.\n'

In [15]:
embeddings_sent2vec = model.wv.vectors
embeddings_rows = model.wv.vectors.shape[0]
embeddings_cols = model.wv.vectors.shape[1]

In [16]:
with open(f'../files/embeddings_sent2vec_{embeddings_rows}_{embeddings_cols}.pkl', 'wb') as f:
    pkl.dump(model, f)

In [20]:
new_sent = input("Enter your query: ")
test_doc = cleantxt(new_sent)
test_doc_vector = model.infer_vector(test_doc)
text_ind = model.dv.most_similar(positive = [test_doc_vector])[0][0]

print(new_sent)
print(df['text'][text_ind])


2 july 2022 hazards and shocks can emerge from outside and within the system exposure to them can be indirect meaning that effects can be felt in places that are not directly affected by the hazard  in this case covid19  but end up being affected as a result of interconnectedness finally the vulnerability of one system can also turn into a hazard or shock for other interdependent systems 
provide information about drought 


In [18]:
df.loc[text_ind]

link         https://news.un.org//en/story/2022/05/1118942
title    Deputy UN chief praises resilience of Bali stu...
text     24 may 2022 thankfully the alarm that prompted...
Name: 5209, dtype: object