In [34]:
import pandas as pd
df = pd.read_csv("articles_paragraphs.csv")

In [35]:
df_eng = df[df['language_code'] =='en'].reset_index()

In [36]:
import pandas as pd
import string
import spacy

# load spacy nlp model
nlp = spacy.load('en_core_web_sm')

# define function for pre-processing and tokenization
def preprocess_text_lemma(text):
    # lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # lemmatize
    doc = nlp(text)
    lemmatized_text = [token.lemma_ for token in doc]
    # remove stopwords and short words
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    tokens = [token for token in lemmatized_text if token not in stopwords and len(token) > 2]
    return tokens

# apply pre-processing and tokenization to the 'content' column of each row
tokenized_paragraphs_lemma = []
for paragraph in df_eng['content']:
    tokens = preprocess_text_lemma(paragraph)
    tokenized_paragraphs_lemma.append(tokens)

# print the resulting list of lists of tokens
print(tokenized_paragraphs_lemma)


[['decade', 'transform', 'multiple', 'field', 'knowledge', 'medicine', 'transformation', 'different', 'way', 'enhance', 'medicine', 'use', 'article', 'introduce', 'help', 'discover', 'new', 'drug', 'understand', 'mystery', 'cancer', 'learn', 'billion', 'relation', 'different', 'research', 'resource'], ['time', 'help', 'human', 'research', '2007', 'adam', 'robot', 'generate', 'hypothesis', 'gene', 'code', 'critical', 'enzyme', 'catalyze', 'reaction', 'yeast', 'saccharomyce', 'cerevisiae', 'adam', 'use', 'robotic', 'test', 'prediction', 'lab', 'physically', 'researcher', 'university', 'aberystwyth', 'cambridge', 'independently', 'test', 'adamsadam', 'hypothese', 'function', 'gene', 'new', 'accurate', 'wrong', 'example', 'multiple', 'application', 'field', 'ready', 'learn'], ['turn', 'drugdiscovery', 'paradigm', 'upside', 'use', 'patientdriven', 'biology', 'datum', 'derive', 'morepredictive', 'hypothesis', 'traditional', 'trialanderror', 'approach', 'example', 'boston', 'berg', 'biotechno

In [119]:
import gensim
import numpy as np


# Train Word2Vec model
lemmaModel = gensim.models.Word2Vec(tokenized_paragraphs_lemma,vector_size=40, window=15, min_count=2)

# Calculate the meaning vector per paragraph
paragraph_vectors_lemma = []
for paragraph_tokens in tokenized_paragraphs_lemma:
    vectors = []
    for token in paragraph_tokens:
        if token in lemmaModel.wv.key_to_index:
            vectors.append(lemmaModel.wv[token])
    if len(vectors) > 0:
        paragraph_vectors_lemma.append(np.mean(vectors, axis=0))
    else:
        paragraph_vectors_lemma.append(np.zeros(lemmaModel.vector_size))

print(paragraph_vectors_lemma[383])

[-0.00327893  0.03944102 -0.09980662  0.0522343   0.02751541 -0.03238024
  0.04721755  0.04451482 -0.04670327  0.05308595  0.06214707 -0.08413063
  0.01783952  0.05278116  0.07953061  0.10683276 -0.05686279 -0.01607859
 -0.00771376 -0.12513117  0.00335613 -0.02242343  0.05100606 -0.01361149
 -0.19957817 -0.03677157 -0.05388326  0.13539499 -0.09421708  0.07989203
  0.16561136 -0.02799485  0.04595849 -0.05684455 -0.01471273  0.07824632
  0.24570285 -0.19210239 -0.18640898 -0.20190234]


In [120]:
df_eng['vector'] = paragraph_vectors_lemma

In [121]:
import numpy as np
from gensim.models import KeyedVectors


def cosine_similarity_list(vectors_list, query_vector):
    #Compute the cosine similarity between the vector representation of the input and the vector representations of each sentence in the text
    similarity_scores = []
    for vector in vectors_list:
        score = query_vector.dot(vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
        similarity_scores.append(score)

    # Sort the sentences in descending order of their cosine similarity to the input and return the top-N most similar sentences
    n = 100
    most_similar_sentences = [[vectors_list[idx],idx] for idx in np.argsort(similarity_scores)[::-1][:n] if np.sum(vectors_list[idx]) != 0]

    return most_similar_sentences[:20]



## Prompt from user

In [142]:
userPrompt = "future"
tokenized_prompt = preprocess_text_lemma(userPrompt)
print(tokenized_prompt)

promptVector_lemma = np.zeros((lemmaModel.vector_size,))
word_count = 0

for token in tokenized_prompt:
    if token in lemmaModel.wv.key_to_index:
        promptVector_lemma += lemmaModel.wv[token]
        word_count += 1
        print(token)

if word_count > 0:
    promptVector_lemma /= word_count
    
print(promptVector_lemma)
   


['future']
future
[-0.01898879  0.04280987 -0.06714597  0.02819885  0.03896099  0.00096235
  0.02034589  0.02433257 -0.06274707  0.04873357  0.03925842 -0.06325546
  0.03639759  0.02193121  0.03932838  0.08782372 -0.06756749 -0.03353357
 -0.03148305 -0.09792775 -0.00348939 -0.01787145  0.03879825  0.0158415
 -0.13689144 -0.04410299 -0.01510664  0.07906873 -0.08300583  0.06586024
  0.12888141 -0.03131858  0.03210738 -0.0392584  -0.0345674   0.04660358
  0.17833264 -0.13864037 -0.15131871 -0.17896739]


In [143]:
var = cosine_similarity_list(df_eng['vector'],promptVector_lemma)

  score = query_vector.dot(vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))


In [144]:
var=np.array(var)
var[:,1]
possible_solutions=df_eng.iloc[var[:,1]]

  var=np.array(var)


In [145]:
for paragraph in possible_solutions["content"]:
    print(paragraph, "\n")

"People who wear an Apple Watch know that an abnormally high or low heart rate or rhythm may suggest atrial fibrillation" (Future Today Institute, 2021). 

AI is going to be an indispensable tool for retails, Equinox AI lab wants to be your partner in this important step. AI helps you in almost every aspect of your business, from logistics to user experience. Retailers that don't adapt to new technologies are more prone to disappear in the near future. 

Thanks to smartphones and smartwatches use, blood pressure readings and electrocardiograms are just a click away. In addition, data recorded and stored in the cloud daily has made it possible to monitor and diagnose people's health status in real-time. (Future Today Institute, 2021). 

Although AI is rising and its long-term implications are uncertain, its future applications in healthcare delivery and how each of us thinks about our health may be transformative. 

For much of human history, random numbers were only used in games of ch