In [None]:
from elasticsearch import Elasticsearch, helpers
from datetime import datetime
import pandas as pd 
import json 
import csv
import uuid
import time
from dateutil import parser

Read data from JSON

In [None]:
# Load the tweet dump; lines=True makes pandas read one JSON object per line.
tweets_path = '../data/boulder_flood_geolocated_tweets.json'
data = pd.read_json(tweets_path, lines=True)

In [None]:
data.head()

In [None]:
# Narrow the frame to the three columns that are indexed later on.
kept_columns = ["created_at", "text", "coordinates"]
data = data.loc[:, kept_columns]
data.head()

In [None]:
data.isna().sum()

In [None]:
# Keep only the rows whose 'coordinates' value is present.
has_coordinates = data['coordinates'].notna()
data = data.loc[has_coordinates]

In [None]:
data.shape

In [None]:
data.dtypes

In [None]:
# Pull the inner "coordinates" entry out of every value in the column and
# store each one as a plain list.
long_lat = [list(point["coordinates"]) for point in data["coordinates"]]

In [None]:
len(long_lat)

In [None]:
# Replace the original 'coordinates' column with the flattened lists collected
# in `long_lat`, placed as the first column.
# Dropping into a new frame (instead of inplace=True) and doing the drop and
# the insert in the same cell keeps this step re-runnable: a second run drops
# the list column again before inserting it, rather than failing because the
# column is missing (drop) or already present (insert).
assert len(long_lat) == len(data), "long_lat must hold exactly one entry per row"
data = data.drop(columns="coordinates", errors="ignore")
data.insert(0, 'coordinates', long_lat)

In [None]:
data.head()

In [None]:
# Client for the Elasticsearch instance listening on localhost:9200.
es_url = "http://localhost:9200"
client = Elasticsearch(hosts=es_url)

In [None]:
# Index mapping for the tweets.
# - text:        full-text field; fielddata is enabled on it.
# - created_at:  date field, used by the range filter in make_query.
# - coordinates: geo_point. The documents carry a bare [lon, lat] array, which
#                is the array form geo_point accepts; geo_shape expects a
#                GeoJSON object or WKT string instead.
settings = {
    "mappings": {
        "properties": {
            "text": {"type": "text", "fielddata": True},
            "created_at": {"type": "date"},
            "coordinates": {"type": "geo_point"}
        }
    }
}

In [None]:
# Create the index only when it is missing, so re-running this cell does not
# fail because the index already exists.
# NOTE: an existing index keeps whatever mapping it was created with; delete it
# first if `settings` has changed.
if not client.indices.exists(index="tweets_index"):
    client.indices.create(index="tweets_index", body=settings)

In [None]:
index_name = 'tweets_index'

# Index every row of `data` exactly once using the bulk helper.
# The previous version built 100 copies of each row, kept only the copies of
# the last row, overwrote the `data` DataFrame with a dict in its insert loop,
# and slept one second per document.
tweet_actions = (
    {
        '_op_type': 'create',          # same semantics as client.create: never overwrite
        '_index': index_name,
        '_id': str(uuid.uuid4()),      # unique string id for the document
        '_source': row.to_dict()       # the row's columns become the document body
    }
    for _, row in data.iterrows()
)

# helpers.bulk returns (number of successful actions, list of errors).
indexed_count, bulk_errors = helpers.bulk(client, tweet_actions)
indexed_count

In [None]:
def make_query(text, distance=None, corrs=None, sdate=None, edate=None):
    """Build the Elasticsearch search body for a filtered tweet search.

    Every clause goes into a single ``bool``/``must`` list, so a document has
    to satisfy all of them.

    Args:
        text: Term for the ``fuzzy`` query on the ``text`` field.
        distance: Radius for the ``geo_distance`` filter (e.g. ``"200km"``).
        corrs: Centre point for the ``geo_distance`` filter on ``coordinates``.
        sdate: Lower bound (``gte``) for ``created_at``.
        edate: Upper bound (``lte``) for ``created_at``.

    Returns:
        dict: The search body. Optional clauses are added only when their
        arguments are supplied, so the defaults no longer yield clauses with
        null values.

    Raises:
        ValueError: If only one of ``distance`` / ``corrs`` is given.
    """
    if (distance is None) != (corrs is None):
        raise ValueError("distance and corrs must be provided together")

    must_clauses = [
        {"exists": {"field": "coordinates"}},
        {"fuzzy": {"text": text}},
    ]

    if distance is not None:
        must_clauses.append(
            {"geo_distance": {"distance": distance, "coordinates": corrs}}
        )

    date_bounds = {}
    if sdate is not None:
        date_bounds["gte"] = sdate
    if edate is not None:
        date_bounds["lte"] = edate
    if date_bounds:
        must_clauses.append({"range": {"created_at": date_bounds}})

    body = {"query": {"bool": {"must": must_clauses}}}

    # Echo the parameters, as the original did.
    print(text, distance, corrs, sdate, edate)

    # The original never returned the body, so callers received None.
    return body


In [None]:
# Query for "Flood" within 200km of the given point, between 2013-01-01 and 2014-12-31.
query = make_query(
    text="Flood",
    distance="200km",
    corrs=[-105.3375, 40.6112],
    sdate="2013-01-01",
    edate="2014-12-31",
)

In [None]:
def search_res(query):
    """Run `query` against the tweets index and return the hit locations.

    Args:
        query: Search body passed to ``client.search``.

    Returns:
        list[dict]: One ``{"score", "lat", "lng"}`` dict per hit that has
        coordinates. Hits without coordinates are skipped.
    """
    # Uses the notebook-level `client` and `index_name`.
    response = client.search(index=index_name, body=query)

    results = []
    for hit in response["hits"]["hits"]:
        # .get() so a hit with no 'coordinates' key is skipped, not a KeyError.
        coords = hit["_source"].get("coordinates")
        if coords is None:
            continue
        # Coordinates are stored as [longitude, latitude] (see `long_lat`);
        # the original had the two labels swapped.
        longitude, latitude = coords[0], coords[1]
        results.append({
            "score": hit["_score"],
            "lat": latitude,
            "lng": longitude,
        })
    return results

In [None]:
search_res(query)