In [1]:
import pyspark

# Reuse an already-running SparkContext when this cell is executed again in
# the same kernel. Calling the SparkContext constructor a second time fails
# because only one context may be active per process, which makes the cell
# non-re-runnable.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

# Smoke test: distribute 0..999 and draw 5 elements without replacement.
# No seed is passed, so the sample differs from run to run.
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)

[804, 898, 487, 923, 413]

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import pandas as pd
from geopy.geocoders import Nominatim
import numpy as np
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
import folium
from folium.plugins import MarkerCluster
# import geopandas as gpd
# import seaborn as sns
# from fuzzywuzzy import process

print('Modules are imported.')

Modules are imported.


In [2]:
# Obtain (or reuse) the Spark session for this analysis.
spark = (
    SparkSession.builder
    .appName("Covid19Analysis")
    .getOrCreate()
)

# Read the CSV export; the first row is the header and column types are
# inferred by Spark.
covid_data = spark.read.csv(
    'part-00000-d15c0132-834c-4976-ad28-7286de542a86-c000.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Add calendar fields derived from the `date` column.
covid_data = (
    covid_data
    .withColumn("year", F.year("date"))
    .withColumn("month", F.month("date"))
)

# Keep only the rows that fall in 2021.
covid_data_2021 = covid_data.where(F.col("year") == 2021)

# Sum `confirmed` per (country, month) and sort for readable display.
# NOTE(review): this adds up `confirmed` over every row in the month. If
# `confirmed` is a running (cumulative) count rather than daily new cases,
# the sum overstates the monthly figure -- confirm against the dataset
# definition.
total_confirmed_2021_monthly = (
    covid_data_2021
    .groupBy("country", "month")
    .agg(F.sum("confirmed").alias("total_confirmed"))
    .orderBy("country", "month")
)

# Preview the first rows of the aggregate.
total_confirmed_2021_monthly.show()

+-----------+-----+---------------+
|    country|month|total_confirmed|
+-----------+-----+---------------+
|afghanistan|    1|        1671259|
|afghanistan|    2|        1553102|
|afghanistan|    3|        1737068|
|afghanistan|    4|        1733260|
|afghanistan|    5|        1991950|
|afghanistan|    6|        2848717|
|afghanistan|    7|        4241348|
|afghanistan|    8|        4696019|
|afghanistan|    9|        4628009|
|afghanistan|   10|        4827238|
|afghanistan|   11|        4701215|
|afghanistan|   12|        4889195|
|    albania|    1|        2088163|
|    albania|    2|        2610775|
|    albania|    3|        3652195|
|    albania|    4|        3865660|
|    albania|    5|        4089156|
|    albania|    6|        3973546|
|    albania|    7|        4113962|
|    albania|    8|        4262873|
+-----------+-----+---------------+
only showing top 20 rows



In [4]:
# Entries in the `country` column that are excluded from the per-country table.
non_countries = ['winter_olympics_2022', 'summer_olympics_2020', 'ms_zaandam', 'diamond_princess', 'holy_see']

# Collapse the monthly totals to one row per country, sort by name, then
# drop the excluded entries.
total_confirmed_2021 = (
    total_confirmed_2021_monthly
    .groupBy("country")
    .agg(F.sum("total_confirmed").alias("total_confirmed"))
    .orderBy("country")
    .filter(~F.col("country").isin(non_countries))
)

# Bring the result into pandas and turn snake_case names into Title Case
# (e.g. 'west_bank_and_gaza' -> 'West Bank And Gaza').
df = total_confirmed_2021.toPandas().assign(
    country=lambda frame: frame['country'].str.replace('_', ' ').str.title()
)

# Show the resulting frame.
print(df)

                country  total_confirmed
0           Afghanistan         39518380
1               Albania         51106141
2               Algeria         57172988
3               Andorra          5084330
4                Angola         14797145
..                  ...              ...
191             Vietnam        135539309
192  West Bank And Gaza        115974691
193               Yemen          2403947
194              Zambia         52593029
195            Zimbabwe         30065437

[196 rows x 2 columns]


In [5]:
def get_continent(country):
    """Look up the alpha-2 country code and continent code for a country name.

    Parameters
    ----------
    country : str
        Country name passed to ``country_name_to_country_alpha2``.

    Returns
    -------
    tuple
        ``(alpha2_code, continent_code)``. When the name cannot be resolved
        both elements are the string ``'Unknown'``; when only the continent
        lookup fails, the continent element alone is ``'Unknown'``.
    """
    try:
        cn_a2_code = country_name_to_country_alpha2(country)
    except Exception:
        # Best-effort lookup: an unresolvable name is reported as 'Unknown'
        # instead of aborting the whole column. `Exception` (rather than a
        # bare `except:`) lets KeyboardInterrupt / SystemExit propagate.
        # Without a code the continent lookup cannot succeed, so stop here.
        return ('Unknown', 'Unknown')
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except Exception:
        cn_continent = 'Unknown'
    return (cn_a2_code, cn_continent)

In [6]:
# Resolve every country name to an (alpha-2 code, continent code) pair and
# store the two parts in their own columns.
df[['country_alpha2', 'continent']] = pd.DataFrame(
    df['country'].map(get_continent).tolist(),
    index=df.index,
)

print(df)

                country  total_confirmed country_alpha2 continent
0           Afghanistan         39518380             AF        AS
1               Albania         51106141             AL        EU
2               Algeria         57172988             DZ        AF
3               Andorra          5084330             AD        EU
4                Angola         14797145             AO        AF
..                  ...              ...            ...       ...
191             Vietnam        135539309             VN        AS
192  West Bank And Gaza        115974691        Unknown   Unknown
193               Yemen          2403947             YE        AS
194              Zambia         52593029             ZM        AF
195            Zimbabwe         30065437             ZW        AF

[196 rows x 4 columns]


In [7]:
# Geocoder client used by `geolocate` below. It is created once and
# identifies itself to the Nominatim service with a custom user-agent string.
custom_user_agent = "my-application"
geolocator = Nominatim(user_agent=custom_user_agent)

def geolocate(country):
    """Return the ``(latitude, longitude)`` the geocoder reports for a name.

    Parameters
    ----------
    country : str
        Free-text query handed to ``geolocator.geocode``.

    Returns
    -------
    tuple of float
        ``(latitude, longitude)`` of the geocoder's result, or
        ``(nan, nan)`` when the lookup raises or yields no result. The
        failure value has the same two-element shape as a success, so
        expanding the result into two columns works even if every lookup
        fails.
    """
    missing = (np.nan, np.nan)
    try:
        loc = geolocator.geocode(country)
    except Exception:
        # Best effort: a failed request (timeout, service error, ...) is
        # recorded as missing coordinates instead of aborting the run.
        # `Exception` keeps KeyboardInterrupt / SystemExit propagating.
        return missing
    if loc is None:
        # The geocoder found nothing for this query.
        return missing
    return (loc.latitude, loc.longitude)

In [10]:
# Geocode each country name; every lookup is expanded into one row of the
# two coordinate columns.
df[['latitude', 'longitude']] = df['country'].apply(
    lambda name: pd.Series(geolocate(name))
)

# Manual (latitude, longitude) overrides, keyed by the exact value in
# df['country']. Names that are not present in the frame are simply ignored
# by the update loop below.
corrections = {
    'Albania': (41.153332, 20.168331),
    'Antarctica': (-90.0, 0.0),
    'Bosnia And Herzegovina': (43.915886, 17.679076),
    'Brazil': (-14.235004, -51.92528),
    'China': (35.86166, 104.195397),
    'Germany': (51.165691, 10.451526),
    #'Guinea-Bissau': (11.803749, -15.180413),
    'Iran': (32.427908, 53.688046),
    'Korea North': (40.339852, 127.510093),
    'Kuwait': (29.375859, 47.977405),
    'Moldova': (47.411631, 28.369885),
    'Morocco': (31.791702, -7.09262),
    'Russia': (61.52401, 105.318756),
    'Seychelles': (-4.679574, 55.491977),
    #'Timor-Leste': (-8.874217, 125.727539),
    'Us': (37.09024, -95.712891),
    'Egypt': (26.820553, 30.802498),
    'Georgia': (42.3154, 43.3569),
    'Mongolia': (46.8625, 103.8467),
    'Sweden': (60.1282, 18.6435),
    'Greece': (39.0742, 21.8243),
    'Hungary': (47.1625, 19.5033),
    'Lebanon': (33.8547, 35.8623),
    'Israel': (31.0461, 34.8516),
    'Colombia': (4.5709, -74.2973),
    'Cambodia': (12.5657, 104.9910)
}

# Overwrite the geocoded coordinates wherever an override exists.
for name, coords in corrections.items():
    df.loc[df['country'] == name, ['latitude', 'longitude']] = coords

# Show the frame with its coordinate columns.
print(df)


                country  total_confirmed country_alpha2 continent   latitude  \
0           Afghanistan         39518380             AF        AS  33.768006   
1               Albania         51106141             AL        EU  41.153332   
2               Algeria         57172988             DZ        AF  28.000027   
3               Andorra          5084330             AD        EU  42.540717   
4                Angola         14797145             AO        AF -11.877577   
..                  ...              ...            ...       ...        ...   
191             Vietnam        135539309             VN        AS  15.926666   
192  West Bank And Gaza        115974691        Unknown   Unknown  31.904966   
193               Yemen          2403947             YE        AS  16.347124   
194              Zambia         52593029             ZM        AF -14.518912   
195            Zimbabwe         30065437             ZW        AF -18.455496   

      longitude  
0     66.238514  
1  

In [11]:
# Base map plus a cluster layer that collects the per-country markers.
world_map = folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)

# One circle marker per row that has usable coordinates.
for row in df.itertuples(index=False):
    lat, long = row.latitude, row.longitude

    # Rows whose geocoding failed carry NaN coordinates and are skipped.
    if np.isnan(lat) or np.isnan(long):
        continue

    radius = 5
    popup_text = """country : {}<br>
                        total_confirmed : {}<br>""".format(row.country, row.total_confirmed)

    folium.CircleMarker(
        location=[lat, long],
        radius=radius,
        popup=popup_text,
        fill=True,
    ).add_to(marker_cluster)

# Render the map as the cell output.
world_map