In [1]:
import os, arrow
from elasticsearch import Elasticsearch
from datetime import datetime
from config import SOC_ES_HOST,SOC_ES_USER, SOC_ES_PASSWORD, SOC_ES_PORT

import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import numpy as np
import nltk
import pandas as pd

from datetime import timezone

In [2]:
from elasticsearch import Elasticsearch

es = Elasticsearch([SOC_ES_HOST], http_auth=(SOC_ES_USER, SOC_ES_PASSWORD), port=SOC_ES_PORT, timeout=60)

if es.ping():
    print("Elasticsearch is connected.")
else:
    print("Elasticsearch is not connected.")


Elasticsearch is connected.


In [3]:
from datetime import datetime

def date_function(day, month, year):
    date_obj = datetime(year, month, day)
    date_str = date_obj.isoformat(timespec='milliseconds')
    return date_str

print(date_function(15, 1, 2020)) # Output = 2020-01-15T00:00:00.000


2020-01-15T00:00:00.000


In [5]:
from datetime import datetime


def date_function(month, day, year):
    date_object = datetime(year, month, day, tzinfo=timezone.utc)
    return date_object.strftime('%d/%m/%Y')

def set_el_string(query: str) -> str:
    return "%s" % query.replace('|', 'OR') \
        .replace('/', '') \
        .replace('&', ' AND ')

doc = {
    "size": 1,
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "full_text": "covid"
                    }
                }
            ],
            "filter": [
                {
                    "term": {
                        "lang": "en"
                    }
                },
                {
                    "range": {
                        "created_at": {
                            "gte": "01/03/2020",
                            "lte": "01/04/2020",
                            "format": "dd/MM/yyyy"
                        }
                    }
                }
            ]
        }
    }
}




def get_news(index, keyword=None, query=None, start=None, end=None, doc=None, size=10):
    if doc is None:
        doc = {
            "size": size,
            "query": {
                "bool": {
                    "must": [
                        {
                            "query_string": {
                                "query": "*"
                            }
                        }
                    ],
                    "filter": [
                        {
                            "range": {
                                "created_at": {
                                    "gte": "",
                                    "lte": "",
                                    "format": "dd/MM/yyyy"
                                }
                            }
                        }
                    ]
                }
            }
        }

        if keyword is not None:
            query = set_el_string(keyword['keywords']) + " AND " + query

        doc["query"]["bool"]["must"][0]["query_string"]["query"] = query

        if start is not None and end is not None:
            doc["query"]["bool"]["filter"][0]["range"]["created_at"]["gte"] = start
            doc["query"]["bool"]["filter"][0]["range"]["created_at"]["lte"] = end

    if es.indices.exists(index=index):
        res = es.search(index=index, body=doc, scroll="1m")
        print("Elasticsearch response:", res)

        data = []
        scroll_id = res["_scroll_id"]

        while len(res["hits"]["hits"]):
            for hit in res["hits"]["hits"]:
                data.append(hit)

            res = es.scroll(scroll_id=scroll_id, scroll="1m")
            scroll_id = res["_scroll_id"]

        return data
    else:
        print("Index not found")
        return []




In [8]:
# Get the data 

index = 'covidtweets'
data = []

doc = {
    "size": 1, # Will be extended later.
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "full_text": "covid"
                    }
                }
            ],
            "filter": [
                {
                    "term": {
                        "lang": "en"
                    }
                },
                {
                    "range": {
                        "created_at": {
                            "gte": "01/03/2020", # Note that those dates can be changed. First, we're trying to get for 2 days, later it will be extended. 
                            "lte": "02/03/2020",
                            "format": "dd/MM/yyyy"
                        }
                    }
                }
            ]
        }
    }
}



dataset = []
df = pd.DataFrame()
tmp = pd.DataFrame()
data = list()
index = 'covidtweets'
data = get_news(index=index, doc=doc, start=date_function(3, 1, 2020), end=date_function(4, 1, 2020))




Elasticsearch response: {'_scroll_id': 'FGluY2x1ZGVfY29udGV4dF91dWlkDnF1ZXJ5VGhlbkZldGNoAhRHSlVHN0ljQnZIU1Y2bHdSZGs1awAAAAAAAAAlFlIzd1JSTWRkVG9DOVV3Wmc0TUE5b1EUR1pVRzdJY0J2SFNWNmx3UmRrNWsAAAAAAAAAJhZSM3dSUk1kZFRvQzlVd1pnNE1BOW9R', 'took': 16, 'timed_out': False, '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 4918, 'relation': 'eq'}, 'max_score': 2.4995708, 'hits': [{'_index': 'covidtweets', '_type': '_doc', '_id': '49609778023624449796863802106702267079932631352518639618.0', '_score': 2.4995708, '_source': {'lang': 'en', 'full_text': '#covid-19 #economy\nCovid-19 est un Black Swan https://t.co/OxSQ2pYEh3', 'created_at': 'Sun Mar 01 05:16:04 +0000 2020', 'coordinates': None, 'in_reply_to_user_id_str': None, 'entities': {'symbols': [], 'user_mentions': [], 'hashtags': [{'indices': [0, 6], 'text': 'covid'}, {'indices': [10, 18], 'text': 'economy'}], 'urls': [{'url': 'https://t.co/OxSQ2pYEh3', 'indices': [46, 69], 'expanded_url': 'https://w

In [None]:
# Convert data to dataframe

for item in data:
    dataset.append(item['_source'])

tmp = pd.DataFrame.from_dict(dataset)
df = pd.concat([df, tmp])