## Preparing our database

**This part the code must be executed in your own local machine with an active elasticsearch database**


In [1]:
!pip install elasticsearch==7.9.1
!pip install beautifulsoup4 requests lxml



In [3]:
from elasticsearch import Elasticsearch
from pprint import pprint

# This url (http://localhost) means your own machine in port 9200
es = Elasticsearch(["http://localhost:9200"])

# Checking
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch")

Connected to Elasticsearch!


First try

In [4]:
doc1 = {
   "title": "document 1",
   "text": "In this tutorial we will learn about NLP"
}

doc2 = {
   "title": "document 2",
   "text": "In these tutorials , people learned the power of nlp"
}
response = es.index(index="documents", body=doc1)
pprint(response)
response = es.index(index="documents", body=doc2)
pprint(response)

{'_id': 'CkJ5YZkBj6Sdu-Bf9zER',
 '_index': 'documents',
 '_primary_term': 1,
 '_seq_no': 0,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': '_doc',
 '_version': 1,
 'result': 'created'}
{'_id': 'C0J5YZkBj6Sdu-Bf9zFX',
 '_index': 'documents',
 '_primary_term': 1,
 '_seq_no': 1,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': '_doc',
 '_version': 1,
 'result': 'created'}


In [8]:
query = {
   "query": {
       "match": {
           "text": "people"
       }
   }
}
results = es.search(index="documents", body=query)
pprint(results["hits"]["hits"])


[{'_id': 'C0J5YZkBj6Sdu-Bf9zFX',
  '_index': 'documents',
  '_score': 0.6768591,
  '_source': {'text': 'In these tutorials , people learned the power of nlp',
              'title': 'document 2'},
  '_type': '_doc'}]


Second

In [9]:
index_name = "documents_new"

settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "english_stemmer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "porter_stem"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                "analyzer": "english_stemmer"
            }
        }
    }
}

# Create index with custom stemming analyzer
es.indices.create(index=index_name, body=settings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents_new'}

In [10]:
doc1 = {
    "title": "document 1",
    "text": "In this tutorial we will learn about NLP"
}


doc2 = {
    "title": "document 2",
    "text": "In these tutorials , people learned the power of nlp"
}
response = es.index(index="documents_new", body=doc1)
pprint(response)
response = es.index(index="documents_new", body=doc2)
pprint(response)

{'_id': 'EEJ7YZkBj6Sdu-BffTFu',
 '_index': 'documents_new',
 '_primary_term': 1,
 '_seq_no': 0,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': '_doc',
 '_version': 1,
 'result': 'created'}
{'_id': 'EUJ7YZkBj6Sdu-BffTGj',
 '_index': 'documents_new',
 '_primary_term': 1,
 '_seq_no': 1,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': '_doc',
 '_version': 1,
 'result': 'created'}


## Query processing


In [11]:
from elasticsearch import Elasticsearch
from pprint import pprint

# This url (http://localhost) means your own machine in port 9200
es = Elasticsearch(["http://localhost:9200"])

# Checking
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch")

Connected to Elasticsearch!


Which is the query processing in an index? how my analyzer is working?

In [13]:
analyze_result = es.indices.analyze(
    index="documents_new",
    body={"text": "tutorials NLP learned", "analyzer": "english_stemmer"}
)

# Print tokens
pprint(analyze_result)

{'tokens': [{'end_offset': 9,
             'position': 0,
             'start_offset': 0,
             'token': 'tutori',
             'type': '<ALPHANUM>'},
            {'end_offset': 13,
             'position': 1,
             'start_offset': 10,
             'token': 'nlp',
             'type': '<ALPHANUM>'},
            {'end_offset': 21,
             'position': 2,
             'start_offset': 14,
             'token': 'learn',
             'type': '<ALPHANUM>'}]}


## Retrieval ranking in elasticsearch
With explain=True we can see the process of BM25 over the index

In [16]:
query = {
    "query": {
        "match": {
            "text": "learn people"
        }
    },
    "explain": True  # This enables scoring explanation
}

results = es.search(index="documents_new", body=query)
pprint(results["hits"]["hits"])

[{'_explanation': {'description': 'sum of:',
                   'details': [{'description': 'weight(text:learn in 1) '
                                               '[PerFieldSimilarity], result '
                                               'of:',
                                'details': [{'description': 'score(freq=1.0), '
                                                            'computed as boost '
                                                            '* idf * tf from:',
                                             'details': [{'description': 'boost',
                                                          'details': [],
                                                          'value': 2.2},
                                                         {'description': 'idf, '
                                                                         'computed '
                                                                         'as '
                                  

# Exercise of the module

Create the index

In [17]:
settings={
  "settings": {
    "analysis": {
      "filter": {
        "spanish_stemmer": {
          "type": "stemmer", "language": "spanish"}
      },
      "analyzer": {"spanish_analyzer": {"tokenizer": "standard","filter": ["lowercase","spanish_stemmer"]}
      }
    }
  },
  "mappings": {
    "properties": {
      "content": {"type": "text","analyzer": "spanish_analyzer"},
      "user": {"type": "keyword"}
    }
  }
 }
# Create index with custom stemming analyzer
es.indices.create(index='posts', body=settings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'posts'}

In [18]:

import requests
from bs4 import BeautifulSoup


url_page= 'https://www.losviajeros.com/index.php?name=Forums&file=viewtopic&t=73370&postdays=0&postorder=asc&start='

def obtain_posts_from_page(index):
    # URL of the web
    url = url_page +str(index)  # Web index

    #  HTTP Get to URL
    response = requests.get(url)

    posts_lst=[]

    if response.status_code == 200:
        # Parse
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find 'div' with class = 'table1'
        table1_div = soup.find('div', {'class': 'table1'})

        if table1_div:
            # search all the tables inside 'table1'
            tables = table1_div.find_all('table')

            # Search all posts

            for table in tables:
                postbody_div = table.find('div', {'class': 'postbody'})
                if postbody_div:
                    # Extract the content of the post
                    post_content = postbody_div.get_text(strip=True)
                    #print(post_content)
                else:
                    continue
                user_div = table.find('a', {'rel': 'author'})
                if user_div:
                    # Extract the user
                    user = user_div.get_text(strip=True)
                    print(user)
                else:
                    continue
                #<a href="/index.php?name=Your_Account&amp;profile=25777" rel="author" style="word-break: break-word;">ANEROL27</a>
                posts_lst.append((post_content,user))

    return posts_lst


def obtain_posts_from_forum():

    iterator=0
    aumenter=20

    lst_posts=[]
    condition=True
    while(condition):
        new_posts= obtain_posts_from_page(iterator)
        if len(new_posts)==0:

            break

        lst_posts.extend(new_posts)
        iterator=iterator+aumenter



    return lst_posts

myposts= obtain_posts_from_forum()


Hinschberger
Hinschberger
Hofcat
Agni_Mani
Hinschberger
Moralinda
Miki
Moralinda
Inmvidmol
Hinschberger
Inmvidmol
Arweny45
MACNUFELIA
M2ra
Arweny45
Rosazul
Morgana666
Morgana666
Morgana666
Bartomeu
Arale
MEMI
Arale
Viruss
Sergiom23
Xampa
Gadir37
La_marquesina
Jymar
Morgana666
Jymar
Chamiceru
Hector
Miki
Hector
Miki
Hector
Chamiceru
Phpu
Carlosblay
Bartomeu
ANEROL27
Miki
Amarala
Diosaisis
Jandrus
Miki
Rubemaradsl
ANEROL27
Relos
Combustio
Miki
Combustio
Miki
Canariaviajera
Bretema
Canariaviajera
Canariaviajera
Bretema
Canariaviajera
Arale
Bartomeu
Canariaviajera
Reydmus
Bretema
Reydmus
Bretema
VANESSAO
Bretema
Reydmus
Dovima
Bretema
VANESSAO
Goran.fiodoric
Bretema
SaraBcn
Fugaz2
Pcanavate
Mansur
Mansur
VANESSAO
Roilu
Bartomeu
VANESSAO
Majaviajera
H2OMADRID
EneritzyAlex
H2OMADRID
Goran.fiodoric
Martuxi
Orion49
Aran+
Murdock
Sergio77
Murdock
Ronh
Miki
Ronh
Ronh
Panda
Elvirang
Dovima
Panda
Bartomeu
Rowina
Brnc
Monlis
Brnc
Orion49
Orion49
Laura_angel
Dabra
Rowina
Yennefer
Xana65
Orion49
Carm

Put the posts into the database

In [22]:

### POST INTO ELASTICSEARCH

from elasticsearch import Elasticsearch

def post_content(es, element):
    post = {
        "content": element[0],
        "user": element[1]
    }

    response = es.index(index="posts", body=post)

    # Mostrar el resultado de la inserción
    print(f"Result: {response}")


# This url (http://localhost) means your own machine in port 9200
es = Elasticsearch(["http://localhost:9200"])

# Checking
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch")


for post in myposts:
    post_content(es, post)

Connected to Elasticsearch!
Result: {'_index': 'posts', '_type': '_doc', '_id': 'FUJ-YZkBj6Sdu-BfvTFv', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
Result: {'_index': 'posts', '_type': '_doc', '_id': 'FkJ-YZkBj6Sdu-BfvTGN', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
Result: {'_index': 'posts', '_type': '_doc', '_id': 'F0J-YZkBj6Sdu-BfvTGS', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
Result: {'_index': 'posts', '_type': '_doc', '_id': 'GEJ-YZkBj6Sdu-BfvTGW', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}
Result: {'_index': 'posts', '_type': '_doc', '_id': 'GUJ-YZkBj6Sdu-BfvTGa', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '

In [23]:
## Do some queries!!
query = {
    "query": {
        "match": {
            "content": "piramide"
        }
    }
}

results = es.search(index="posts", body=query)
pprint(results["hits"]["hits"])

[{'_id': '9EJ-YZkBj6Sdu-BfxTOj',
  '_index': 'posts',
  '_score': 3.2134707,
  '_source': {'content': 'Nuevos precios en la zona de GizaNuevos precios de '
                         'las entradas al yacimiento arqueológico de la meseta '
                         'de las Pirámides de Giza.Con vigencia del 1 de mayo '
                         'de 2022.* Para las pirámides de Giza es de 240 EGP '
                         'por persona* Para entrar en la Gran Pirámide de King '
                         'Khufu/Cheops 440 EGP por persona* Billete combinado '
                         'para la entrada en la gran pirámide y el ticket de '
                         'la zona por sólo 600 EGP por persona.* Para entrar '
                         'en la segunda pirámide de King Khafra/Chephren son '
                         '100 EGP por persona.Ver imagen:',
              'user': 'Bartomeu'},
  '_type': '_doc'},
 {'_id': 'sEJ-YZkBj6Sdu-BfxTMV',
  '_index': 'posts',
  '_score': 3.0866895,
  '_source': {

**MY WORK:**
let´s add this to the World2Vec and create our ouwn word embedding

In [None]:
%pip install gensim
%pip install numpy
%pip install pandas
%pip install nltk

In [144]:
import re
from gensim.models import Word2Vec
from tqdm import tqdm

tqdm.pandas()

import nltk
nltk.download('punkt')  #
from nltk.tokenize import sent_tokenize

def process_text(post):
    lower = post[0].lower()
    # Remove line breaks
    elements = lower.replace('\n', ' ').split()

    # Remove elements <1
    filtered_elements = [element for element in elements if len(element) > 1]

    # Join
    text = ' '.join(filtered_elements)

    sentences = sent_tokenize(text)

    return sentences

def preprocessing(titles_array):

    """
    Take in an array of titles, and return the processed titles.

    (e.g. input: 'i am a boy', output - 'am boy')  -> since I remove those words with length 1
    """

    processed_array = []

    for title in tqdm(titles_array):
        # remove other non-alphabets symbols with space (i.e. keep only alphabets and whitespaces).
        processed = re.sub('[^a-zA-Z ]', '', title)

        words = processed.split()

        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_array.append((' '.join([word for word in words if len(word) > 1])).split())

    return processed_array

# Let's remove stopwords from tokens. The `stopwords` package from the 'nltk.corpus' is used.
def clean_stopwords(input_array):
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    if len(input_array) == 0:
        return [""]
    # create a set with stopwords
    stop_words = set(stopwords.words('spanish'))

    for sentence in input_array:
        words = sentence.split(" ")
        filtered_tokens = [word for word in words if word not in stop_words]  # remove stopwords
    # Rebuild string and return wrapped in list
    return [" ".join(filtered_tokens)]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/federicosvendsen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [145]:
train_sentences=[]

# Leer cada archivo en la lista
for post in myposts:
    result = process_text(post)
    cleaned = clean_stopwords(result)
    result = preprocessing(cleaned)
    train_sentences.extend(result)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/federicosvendsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1/1 [00:00<00:00, 2786.91it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/federicosvendsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1/1 [00:00<00:00, 13486.51it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/federicosvendsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1/1 [00:00<00:00, 13148.29it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/federicosvendsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1/1 [00:00<00:00, 8848.74it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/federicosvendsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1/1 [00:00<00:00, 15827.

In [146]:
model = Word2Vec(sentences=train_sentences,
                 sg=0,
                 vector_size=400,
                 workers=4,
                 epochs=6

                 )

In [84]:
# Now that we have the model, let´ssee the clusters
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [147]:
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.cluster import KMeans
import numpy as np


word_vectors = model.wv.vectors
kmeans = KMeans(n_clusters=50)  # Number of clusters
kmeans.fit(word_vectors)
cluster_assignments = kmeans.labels_

# Link words to their clusters
word_clusters = {}
for word, cluster in zip(model.wv.index_to_key, cluster_assignments):
    if cluster not in word_clusters:
        word_clusters[cluster] = []
    word_clusters[cluster].append(word)

# Print the words of each cluster
for cluster, words in word_clusters.items():
    print(f"Cluster {cluster + 1}:")
    print(", ".join(words))

Cluster 11:
si, entrada
Cluster 33:
museo
Cluster 21:
precios, entradas, luxor, estudiante, tumbas, solo, temple
Cluster 20:
segn, compra, caso, pago
Cluster 12:
gracias, le, nuevo, egipcios, aswan
Cluster 39:
mas, da
Cluster 35:
cairo, museos, sitios, egp, ir, hacer, giza
Cluster 37:
valle, lugares
Cluster 26:
reyes, karnak, visitar, recinto, lugar
Cluster 13:
precio, templo, egipto, libras, puede, carnet, creo, pagar, das, web, dos, hace, aos, pirmide, partir, bien
Cluster 6:
tarjeta, descuento, simbel, euros, estan, comprar, entrar, ahora, antigedades, momias, oficial
Cluster 19:
no, taquillas, hora
Cluster 4:
ver, egipcio, as, tumba, ser, ministerio, of
Cluster 48:
turismo
Cluster 14:
online, mayormente, favoreciendo
Cluster 46:
ejemplo, tipo
Cluster 40:
semana, viernes
Cluster 18:
piramides
Cluster 25:
local, fin, sabado, haycompensa
Cluster 2:
bartomeu, pass, monumentos, nefertari, visitas, ramses, fotos, vale, fecha, edfu, cada, cuenta, mejor, foto, barrio
Cluster 36:
bancaria
C

In [148]:
model.wv.most_similar('turismo')

[('da', 0.9995073080062866),
 ('mas', 0.9994552135467529),
 ('viernes', 0.9989041090011597),
 ('semana', 0.9987962245941162),
 ('tipo', 0.9985004663467407),
 ('lugar', 0.9984822273254395),
 ('visitar', 0.99802565574646),
 ('recinto', 0.9966112375259399),
 ('ejemplo', 0.9965640902519226),
 ('fin', 0.9965439438819885)]