# Search Engine Experiments

Test of different tokenization methods and Word Embedding for the search engine of the articles of Equinox by Asesoftware

## CSV of Articles

CSV Columns: “article_name”, “enumeration_in_article”, “content” 
“stringWithAllText”

In [1]:
import pandas as pd
df = pd.read_csv("articles_paragraphs.csv")

## Data Preprocessing and Tokenization

### Whitespace Tokenization

##### Per Paragraph

In [2]:
import pandas as pd
import string
import spacy

'''
In this example, we use the Spacy library to preprocess and tokenize the text, 
lowercasing the text, removing punctuation, lemmatizing the words, and removing stopwords 
and short words. We then apply this function to each paragraph in the 'content' column of the CSV file using a for loop, 
and append the resulting list of tokens to a list of lists. The final result is a list of lists, where each 
sublist contains the tokens of each paragraph.

'''

# load spacy nlp model
nlp = spacy.load('en_core_web_sm')

# define function for pre-processing and tokenization
def preprocess_text(text):
    # lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # lemmatize
    doc = nlp(text)
    lemmatized_text = [token.lemma_ for token in doc]
    # remove stopwords and short words
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    tokens = [token for token in lemmatized_text if token not in stopwords and len(token) > 2]
    return tokens

# apply pre-processing and tokenization to the 'content' column of each row
tokenized_paragraphs_lemma = []
for paragraph in df['content']:
    tokens = preprocess_text(paragraph)
    tokenized_paragraphs_lemma.append(tokens)

# print the resulting list of lists of tokens
print(tokenized_paragraphs_lemma)


[['decade', 'transform', 'multiple', 'field', 'knowledge', 'medicine', 'transformation', 'different', 'way', 'enhance', 'medicine', 'use', 'article', 'introduce', 'help', 'discover', 'new', 'drug', 'understand', 'mystery', 'cancer', 'learn', 'billion', 'relation', 'different', 'research', 'resource'], ['time', 'help', 'human', 'research', '2007', 'adam', 'robot', 'generate', 'hypothesis', 'gene', 'code', 'critical', 'enzyme', 'catalyze', 'reaction', 'yeast', 'saccharomyce', 'cerevisiae', 'adam', 'use', 'robotic', 'test', 'prediction', 'lab', 'physically', 'researcher', 'university', 'aberystwyth', 'cambridge', 'independently', 'test', 'adamsadam', 'hypothesis', 'function', 'gene', 'new', 'accurate', 'wrong', 'example', 'multiple', 'application', 'field', 'ready', 'learn'], ['understand', 'cancer', 'discover', 'new', 'drug'], ['turn', 'drugdiscovery', 'paradigm', 'upside', 'use', 'patientdriven', 'biology', 'datum', 'derive', 'morepredictive', 'hypothesis', 'traditional', 'trialanderror

## Con Stemmer


In [None]:
import pandas as pd
import string
import spacy
from nltk.stem import SnowballStemmer

# load spacy nlp model
nlp = spacy.load('en_core_web_sm')
# load stemmer
stemmer = SnowballStemmer('english')

# define function for pre-processing and tokenization
def preprocess_text(text):
    # lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # stem
    doc = nlp(text)
    stemmed_text = [stemmer.stem(token.text) for token in doc]
    # remove stopwords and short words
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    tokens = [token for token in stemmed_text if token not in stopwords and len(token) > 2]
    return tokens

# apply pre-processing and tokenization to the 'content' column of each row
tokenized_paragraphs_stem = []
for paragraph in df['content']:
    tokens = preprocess_text(paragraph)
    tokenized_paragraphs_stem.append(tokens)

# print the resulting list of lists of tokens
print(tokenized_paragraphs_stem)


## Sin Stemmer ni lemma, solamente lapabras completas

In [None]:
import pandas as pd
import string
import spacy

# load spacy nlp model
nlp = spacy.load('en_core_web_sm')

# define function for pre-processing and tokenization
def preprocess_text(text):
    # lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # split into words
    words = text.split()
    # remove stopwords and short words
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    tokens = [word for word in words if word not in stopwords and len(word) > 2]
    return tokens

# apply pre-processing and tokenization to the 'content' column of each row
tokenized_paragraphs_word = []
for paragraph in df['content']:
    tokens = preprocess_text(paragraph)
    tokenized_paragraphs_word.append(tokens)

# print the resulting list of lists of tokens
print(tokenized_paragraphs_word)


##### Full String

### Word-based Tokenization

##### Per Paragraph

##### Full String

### Sub-word Tokenization

##### Per paragrpah

##### Full String

## Word Embedding

##### Per Paragraph

In [6]:
import gensim
import numpy as np

'''
Here we train the Word2Vec model with a list of lists where each sublist is a tokenized paragraph.
After we get the word vectors per paragraph, we compute our paragraph meaning vector as the mean
of its word vectors.
'''

# Train Word2Vec model
model = gensim.models.Word2Vec(tokenized_paragraphs_lemma, window=5, min_count=1, workers=4)

# Calculate the meaning vector per paragraph
paragraph_vectors = []
for paragraph_tokens in tokenized_paragraphs_lemma:
    vectors = []
    for token in paragraph_tokens:
        if token in model.wv.key_to_index:
            vectors.append(model.wv[token])
    if len(vectors) > 0:
        paragraph_vectors.append(np.mean(vectors, axis=0))
    else:
        paragraph_vectors.append(np.zeros(model.vector_size))

print(paragraph_vectors[0])

[-2.9649467e-03  6.1455765e-03  3.0737326e-03  2.4381594e-03
  7.8803702e-04 -7.4613574e-03  1.6645188e-03  1.3484553e-02
 -5.2193855e-03 -3.2514653e-03 -2.7808370e-03 -1.2025104e-02
 -1.8585963e-03  2.5773924e-03  4.8169520e-04 -2.7539867e-03
  1.7280449e-03 -9.4014630e-03  1.8742986e-03 -9.9097360e-03
  2.8030234e-04  2.0274494e-03  8.3815577e-03 -1.9289101e-03
 -2.7193563e-04 -6.3419191e-04 -6.8623936e-03 -7.4281506e-03
 -4.4748746e-03  3.9913733e-03  5.6347428e-03 -6.5132353e-04
  5.3469220e-04 -4.1943751e-03 -2.3671319e-03  5.4560262e-03
 -1.0591987e-04 -5.7991245e-03 -2.9220181e-03 -8.4385145e-03
  4.0685423e-03 -7.3179421e-03 -1.8550502e-03 -9.5589185e-04
  7.5655547e-03 -5.4572904e-03 -3.7190970e-03 -3.2205796e-03
  2.9629858e-03  8.3316751e-03  1.5776495e-03 -5.4622698e-03
 -3.4658706e-03 -2.1657587e-03 -3.9475830e-03  2.9198665e-03
  6.9798324e-03  7.2516687e-04 -3.4835101e-03 -9.4130053e-04
 -1.3690722e-03  1.4428118e-03 -2.1867219e-03  1.9970911e-03
 -9.2395339e-03  8.57307

##### Full String

## Prompt Embedding

## Similarity