# 1. Elasticsearch: Create twitter_covid index

# 2. Elasticsearch: Create twitter_covid index pattern

# 3. import preprocessed data to Elasticsearch

In [13]:
import csv
from elasticsearch import Elasticsearch
from datetime import datetime
import ast

# Connect to Elasticsearch
es = Elasticsearch(['http://localhost:9200'])

# Date format conversion function
def convert_date(date_string):
    return datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S').isoformat()

# Safely parse hashtags
def parse_hashtags(hashtags_string):
    try:
        # Attempt to safely parse the string using ast.literal_eval
        return ast.literal_eval(hashtags_string)
    except:
        # If parsing fails, return an empty list
        return []

# Open and read the CSV file
with open('covid19_tweets.csv', 'r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    
    for row in csv_reader:
        # Convert date fields
        row['user_created'] = convert_date(row['user_created'])
        row['date'] = convert_date(row['date'])
        
        # Convert numerical fields
        row['user_followers'] = int(row['user_followers'])
        row['user_friends'] = int(row['user_friends'])
        row['user_favourites'] = int(row['user_favourites'])
        
        # Convert boolean fields
        row['user_verified'] = row['user_verified'].lower() == 'true'
        row['is_retweet'] = row['is_retweet'].lower() == 'true'
        
        # Safely handle the hashtags field
        row['hashtags'] = parse_hashtags(row['hashtags'])
        
        # Use user_name as the document ID
        doc_id = row['user_name']
        
        # Index the row data as a document into Elasticsearch
        es.index(index='twitter_covid', id=doc_id, body=row)

print("Data import completed")

Data import completed


# 4. Analyse ES Data

## 4.1 Geographic Characteristics

In [23]:
import pandas as pd
from elasticsearch import Elasticsearch
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import folium
from collections import defaultdict
import re

# Connect to Elasticsearch
es = Elasticsearch(['http://localhost:9200'])

def get_data_from_es(index_name, size=10000):
    query = {
        "query": {
            "match_all": {}
        },
        "size": size
    }
    response = es.search(index=index_name, body=query)
    return [hit['_source'] for hit in response['hits']['hits']]

# Fetch data
data = get_data_from_es('twitter_covid')
df = pd.DataFrame(data)

In [25]:
import pandas as pd
import re
from elasticsearch import Elasticsearch
from collections import Counter

# Connect to Elasticsearch
es = Elasticsearch(['http://localhost:9200'])

def get_data_from_es(index_name, size=10000):
    """
    Fetch data from Elasticsearch
    """
    query = {
        "query": {
            "match_all": {}
        },
        "size": size
    }
    response = es.search(index=index_name, body=query)
    return [hit['_source'] for hit in response['hits']['hits']]

# Get data
data = get_data_from_es('twitter_covid')
df = pd.DataFrame(data)

def clean_location(location):
    """
    Clean location data by removing special characters and standardizing format
    """
    if pd.isna(location) or not isinstance(location, str):
        return None
    
    # Convert to lowercase and remove special characters
    location = re.sub(r'[^\w\s,]', '', location.lower()).strip()
    
    # Remove common prefixes/suffixes that don't add geographic information
    prefixes_suffixes = ['the', 'a', 'an', 'in', 'at', 'on', 'from']
    location_parts = location.split()
    location_parts = [part for part in location_parts if part not in prefixes_suffixes]
    location = ' '.join(location_parts)
    
    # Remove very short or likely non-geographic locations
    if len(location) < 2 or location in ['earth', 'world', 'global', 'international']:
        return None
    
    return location

# Apply the cleaning function to the location column
df['cleaned_location'] = df['user_location'].apply(clean_location)

# Remove rows with None values in cleaned_location
df = df.dropna(subset=['cleaned_location'])

# Count occurrences of each unique location
location_counts = Counter(df['cleaned_location'])

# Get the top N locations
def print_top_locations(n=20):
    """
    Print the top N most common locations
    """
    print(f"Top {n} locations:")
    for location, count in location_counts.most_common(n):
        print(f"{location}: {count}")

# Print top 20 locations
print_top_locations(20)

# Optionally, save the cleaned and unique locations to a file
unique_locations = pd.DataFrame(location_counts.items(), columns=['Location', 'Count'])
unique_locations = unique_locations.sort_values('Count', ascending=False)
unique_locations.to_csv('unique_locations.csv', index=False)
print("\nUnique locations saved to 'unique_locations.csv'")

Top 20 locations:
united states: 135
india: 120
london, england: 107
united kingdom: 91
lagos, nigeria: 87
london: 78
new delhi, india: 72
australia: 55
nairobi, kenya: 53
usa: 53
nigeria: 52
south africa: 50
uk: 50
new york, ny: 49
johannesburg, south africa: 44
canada: 41
chicago, il: 41
los angeles, ca: 41
mumbai, india: 38
washington, dc: 36

Unique locations saved to 'unique_locations.csv'


In [27]:
from elasticsearch import Elasticsearch
import pandas as pd
from collections import Counter
import re

# Connect to Elasticsearch
es = Elasticsearch(['http://localhost:9200'])

def get_long_covid_data(index_name, size=10000):
    """
    Fetch data from Elasticsearch with specified long COVID related hashtags
    """
    query = {
        "query": {
            "bool": {
                "should": [
                    {"match": {"hashtags": "longhaulers"}},
                    {"match": {"hashtags": "longcovid"}},
                    {"match": {"hashtags": "longhauler"}},
                    {"match": {"hashtags": "longtermcare"}},
                    {"match": {"hashtags": "longhaul"}}
                ]
            }
        },
        "size": size
    }
    response = es.search(index=index_name, body=query)
    return [hit['_source'] for hit in response['hits']['hits']]

# Get long COVID related data
data = get_long_covid_data('twitter_covid')
df = pd.DataFrame(data)

def clean_location(location):
    """
    Clean location data by removing special characters and standardizing format
    """
    if pd.isna(location) or not isinstance(location, str):
        return None
    
    # Convert to lowercase and remove special characters
    location = re.sub(r'[^\w\s,]', '', location.lower()).strip()
    
    # Remove common prefixes/suffixes that don't add geographic information
    prefixes_suffixes = ['the', 'a', 'an', 'in', 'at', 'on', 'from']
    location_parts = location.split()
    location_parts = [part for part in location_parts if part not in prefixes_suffixes]
    location = ' '.join(location_parts)
    
    # Remove very short or likely non-geographic locations
    if len(location) < 2 or location in ['earth', 'world', 'global', 'international']:
        return None
    
    return location

# Apply the cleaning function to the location column
df['cleaned_location'] = df['user_location'].apply(clean_location)

# Remove rows with None values in cleaned_location
df = df.dropna(subset=['cleaned_location'])

# Count occurrences of each unique location
location_counts = Counter(df['cleaned_location'])

def print_top_locations(n=20):
    """
    Print the top N most common locations
    """
    print(f"Top {n} locations for long COVID related tweets:")
    for location, count in location_counts.most_common(n):
        print(f"{location}: {count}")

# Print top 20 locations
print_top_locations(20)

# Save the cleaned and unique locations to a file
unique_locations = pd.DataFrame(location_counts.items(), columns=['Location', 'Count'])
unique_locations = unique_locations.sort_values('Count', ascending=False)
unique_locations.to_csv('long_covid_locations.csv', index=False)
print("\nUnique locations for long COVID related tweets saved to 'long_covid_locations.csv'")

# Print some statistics
print(f"\nTotal number of tweets with long COVID related hashtags: {len(df)}")
print(f"Number of unique locations: {len(location_counts)}")

Top 20 locations for long COVID related tweets:
nyc: 2
oslo, norway: 1
wwwlaurienadelcom: 1
barcelona: 1
nc, usa: 1
sigtuna, sverige: 1
cheltenham, uk: 1
menlo park sfo world: 1
den haag, nederland: 1
north america: 1
bengaluru, india: 1
west sussex: 1
indonesia: 1
university of leicester: 1
cleveland ohio: 1
los angeles, ca: 1
everett, washington: 1
fierce be with youalways: 1
maryland metro dc: 1

Unique locations for long COVID related tweets saved to 'long_covid_locations.csv'

Total number of tweets with long COVID related hashtags: 20
Number of unique locations: 19


In [37]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from collections import defaultdict

# Load data from CSV files
# all_locations = pd.read_csv('unique_locations.csv')
# long_covid_locations = pd.read_csv('long_covid_locations.csv')


# Create DataFrame
long_covid_data = pd.DataFrame([
    ('New York City', 2), ('Oslo', 1), ('Barcelona', 1), ('North Carolina', 1),
    ('Sigtuna', 1), ('Cheltenham', 1), ('Menlo Park', 1), ('The Hague', 1),
    ('North America', 1), ('Bengaluru', 1), ('West Sussex', 1), ('Indonesia', 1),
    ('Leicester', 1), ('Cleveland', 1), ('Los Angeles', 1), ('Everett', 1),
    ('Maryland', 1)
], columns=['Location', 'Count'])

covid_data = pd.DataFrame([
    ('United States', 135), ('India', 120), ('London', 185), ('United Kingdom', 141),
    ('Lagos', 87), ('New Delhi', 72), ('Australia', 55), ('Nairobi', 53),
    ('Nigeria', 52), ('South Africa', 94), ('New York', 49), ('Canada', 41),
    ('Chicago', 41), ('Los Angeles', 41), ('Mumbai', 38), ('Washington DC', 36)
], columns=['Location', 'Count'])

# Location Dictionary
coordinates = {
    'New York City': (40.7128, -74.0060), 'Oslo': (59.9139, 10.7522),
    'Barcelona': (41.3851, 2.1734), 'North Carolina': (35.7596, -79.0193),
    'Sigtuna': (59.6173, 17.7231), 'Cheltenham': (51.8979, -2.0744),
    'Menlo Park': (37.4538, -122.1822), 'The Hague': (52.0705, 4.3007),
    'North America': (54.5260, -105.2551), 'Bengaluru': (12.9716, 77.5946),
    'West Sussex': (50.9280, -0.4617), 'Indonesia': (-0.7893, 113.9213),
    'Leicester': (52.6369, -1.1398), 'Cleveland': (41.4993, -81.6944),
    'Los Angeles': (34.0522, -118.2437), 'Everett': (47.9790, -122.2021),
    'Maryland': (39.0458, -76.6413), 'United States': (37.0902, -95.7129),
    'India': (20.5937, 78.9629), 'London': (51.5074, -0.1278),
    'United Kingdom': (55.3781, -3.4360), 'Lagos': (6.5244, 3.3792),
    'New Delhi': (28.6139, 77.2090), 'Australia': (-25.2744, 133.7751),
    'Nairobi': (-1.2921, 36.8219), 'Nigeria': (9.0820, 8.6753),
    'South Africa': (-30.5595, 22.9375), 'New York': (40.7128, -74.0060),
    'Canada': (56.1304, -106.3468), 'Chicago': (41.8781, -87.6298),
    'Mumbai': (19.0760, 72.8777), 'Washington DC': (38.9072, -77.0369)
}

# Create Map
m = folium.Map(location=[20, 0], zoom_start=2)

# Add COVID-19 Location
covid_cluster = MarkerCluster(name="COVID-19 Tweets").add_to(m)
for _, row in covid_data.iterrows():
    if row['Location'] in coordinates:
        folium.CircleMarker(
            location=coordinates[row['Location']],
            radius=min(int(row['Count']/5), 20),
            popup=f"{row['Location']}: {row['Count']}",
            color='blue',
            fill=True,
            fill_opacity=0.7
        ).add_to(covid_cluster)

# Add Long COVID Location
for _, row in long_covid_data.iterrows():
    if row['Location'] in coordinates:
        folium.CircleMarker(
            location=coordinates[row['Location']],
            radius=10,  # 固定大小以突出显示
            popup=f"Long COVID - {row['Location']}: {row['Count']}",
            color='red',
            fill=True,
            fill_opacity=0.9,
            weight=2
        ).add_to(m)

# Add tag
legend_html = '''
<div style="position: fixed; bottom: 50px; left: 50px; width: 220px; height: 90px; 
    border:2px solid grey; z-index:9999; font-size:14px; background-color:white;
    ">&nbsp; <b>Legend</b> <br>
    &nbsp; COVID-19 tweets &nbsp; <i class="fa fa-circle fa-1x" style="color:blue"></i><br>
    &nbsp; Long COVID tweets &nbsp; <i class="fa fa-circle fa-1x" style="color:red"></i>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

# Add Layer control
folium.LayerControl().add_to(m)

# Save map
m.save('covid_tweet_map.html')

print("Map saved as 'covid_tweet_map.html'")

Map saved as 'covid_tweet_map.html'


In [47]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster

# Load data from CSV files
# all_locations = pd.read_csv('unique_locations.csv')
# long_covid_locations = pd.read_csv('long_covid_locations.csv')

# Create dataframes for long COVID and general COVID data
long_covid_data = pd.DataFrame([
    ('New York City', 2), ('Oslo', 1), ('Barcelona', 1), ('North Carolina', 1),
    ('Sigtuna', 1), ('Cheltenham', 1), ('Menlo Park', 1), ('The Hague', 1),
    ('North America', 1), ('Bengaluru', 1), ('West Sussex', 1), ('Indonesia', 1),
    ('Leicester', 1), ('Cleveland', 1), ('Los Angeles', 1), ('Everett', 1),
    ('Maryland', 1)
], columns=['Location', 'Count'])

covid_data = pd.DataFrame([
    ('United States', 135), ('India', 120), ('London', 185), ('United Kingdom', 141),
    ('Lagos', 87), ('New Delhi', 72), ('Australia', 55), ('Nairobi', 53),
    ('Nigeria', 52), ('South Africa', 94), ('New York', 49), ('Canada', 41),
    ('Chicago', 41), ('Los Angeles', 41), ('Mumbai', 38), ('Washington DC', 36)
], columns=['Location', 'Count'])

# Dictionary of coordinates for each location
coordinates = {
    'New York City': (40.7128, -74.0060), 'Oslo': (59.9139, 10.7522),
    'Barcelona': (41.3851, 2.1734), 'North Carolina': (35.7596, -79.0193),
    'Sigtuna': (59.6173, 17.7231), 'Cheltenham': (51.8979, -2.0744),
    'Menlo Park': (37.4538, -122.1822), 'The Hague': (52.0705, 4.3007),
    'North America': (54.5260, -105.2551), 'Bengaluru': (12.9716, 77.5946),
    'West Sussex': (50.9280, -0.4617), 'Indonesia': (-0.7893, 113.9213),
    'Leicester': (52.6369, -1.1398), 'Cleveland': (41.4993, -81.6944),
    'Los Angeles': (34.0522, -118.2437), 'Everett': (47.9790, -122.2021),
    'Maryland': (39.0458, -76.6413), 'United States': (37.0902, -95.7129),
    'India': (20.5937, 78.9629), 'London': (51.5074, -0.1278),
    'United Kingdom': (55.3781, -3.4360), 'Lagos': (6.5244, 3.3792),
    'New Delhi': (28.6139, 77.2090), 'Australia': (-25.2744, 133.7751),
    'Nairobi': (-1.2921, 36.8219), 'Nigeria': (9.0820, 8.6753),
    'South Africa': (-30.5595, 22.9375), 'New York': (40.7128, -74.0060),
    'Canada': (56.1304, -106.3468), 'Chicago': (41.8781, -87.6298),
    'Mumbai': (19.0760, 72.8777), 'Washington DC': (38.9072, -77.0369)
}

# Create map
m = folium.Map(location=[20, 0], zoom_start=2)

# Function to create circle marker with count inside
def create_circle_marker(location, count, color, radius):
    folium.CircleMarker(
        location=location,
        radius=radius,
        popup=f"{location}: {count}",
        color=color,
        fill=True,
        fill_opacity=0.7
    ).add_to(m)
    folium.Marker(
        location=location,
        icon=folium.DivIcon(html=f"""
            <div style="font-family: courier new; color: black">
            <strong>{count}</strong>
            </div>""")
    ).add_to(m)

# Add COVID-19 points
for _, row in covid_data.iterrows():
    if row['Location'] in coordinates:
        create_circle_marker(
            location=coordinates[row['Location']],
            count=row['Count'],
            color='blue',
            radius=min(int(row['Count']/5), 20)
        )

# Add Long COVID points
for _, row in long_covid_data.iterrows():
    if row['Location'] in coordinates:
        create_circle_marker(
            location=coordinates[row['Location']],
            count=row['Count'],
            color='red',
            radius=10  # Fixed size to highlight
        )

# Add legend
legend_html = '''
<div style="position: fixed; bottom: 50px; left: 50px; width: 220px; height: 90px; 
    border:2px solid grey; z-index:9999; font-size:14px; background-color:white;
    ">&nbsp; <b>Legend</b> <br>
    &nbsp; COVID-19 tweets &nbsp; <i class="fa fa-circle fa-1x" style="color:blue"></i><br>
    &nbsp; Long COVID tweets &nbsp; <i class="fa fa-circle fa-1x" style="color:red"></i>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

# Save map
m.save('covid_tweet_map.html')

print("Map saved as 'covid_tweet_map.html'")

Map saved as 'covid_tweet_map.html'
