In [1]:
###Installing required libraries

#!pip install country_converter --upgrade
#!pip install plotly
#!pip install world_bank_data
#!pip install 'umap-learn==0.3.10'

In [2]:
# Importing Libraries

import pandas as pd
import numpy as np
import re
import country_converter as coco
from datetime import datetime
from os.path import isfile
import plotly.express as px
import world_bank_data as wb
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns    
from sklearn.cluster import DBSCAN
import json as json
import os
import umap.umap_ as umap
from google.cloud import storage
from sklearn.metrics import silhouette_score

In [3]:
fileName = "allData.pkl"
URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"

PLOTLY_COLORS=['rgb(255, 127, 14)','rgb(31, 119, 180)', 
                       'rgb(148, 103, 189)', 'rgb(140, 86, 75)',
                       'rgb(44, 160, 44)', 'rgb(214, 39, 40)',
                       'rgb(188, 189, 34)', 'rgb(23, 190, 207)',
                      'rgb(227, 119, 194)', 'rgb(127, 127, 127)']


In [4]:
%%html
<style>
g.pointtext {
    display: none;
    traceorder: reversed;
}

In [5]:
def global_demographics(data):
    data_copy = data.copy()
    ## Reading the global Population from World Bank.
    pop = wb.get_series('SP.POP.TOTL', date='2018', id_or_value='id', simplify_index=True)
    countries = data_copy['Country'].unique()
    codes_IOS3 = coco.convert(list(countries), to='ISO3')
    map_ISO3 = dict(zip(countries, codes_IOS3))
    data_copy.insert(4, 'Population', \
        [pop[c] if c in pop else 0 for c in [map_ISO3[country] for country in data_copy.Country]]
    )
    ## Chinese provinces reading from Wikipedia.
    pop_china = china_demographics().set_index('Province')['Population']
    indx = (data_copy.Country == 'China') & (data_copy.State != '<all>')
    data_copy.loc[indx, 'Population'] = [pop_china[p] if p in pop_china else 0 for p in data_copy.loc[indx, 'State']]
    return data_copy


# Reading global data
def load_global_data(file, featureName): 
    dict_agg = { featureName:sum, 'Lat':np.median, 'Long':np.median }
    data_all = pd.read_csv(URL + file) \
             .rename(columns={ 'Province/State':'State','Country/Region':'Country' }) \
             .melt(id_vars=['Lat', 'Long','Country', 'State'], var_name='date', value_name=featureName) \
             .astype({'date':'datetime64[ns]', featureName:'Int64'}, errors='ignore')

    data_china = data_all[data_all.Country == 'China']
    data_all = data_all.groupby(['Country', 'date']).agg(dict_agg).reset_index()
    data_all.loc[data_all.Country == 'United Kingdom', 'Lat'] = 55.3781
    data_all.loc[data_all.Country == 'United Kingdom', 'Long'] = -3.4360
    data_all.loc[data_all.Country == 'Denmark', 'Lat'] = 56.2639
    data_all.loc[data_all.Country == 'Denmark', 'Long'] = 9.5018
    data_all.loc[data_all.Country == 'Netherlands', 'Lat'] = 52.1326
    data_all.loc[data_all.Country == 'Netherlands', 'Long'] = 5.2913
    data_all.loc[data_all.Country == 'France', 'Lat'] = 46.2276
    data_all.loc[data_all.Country == 'France', 'Long'] = 2.2137

    data_all['State'] = '<all>'
    return pd.concat([data_all, data_china]) 

# Reading the USA data
def load_us_data(file, featureName, keepPopulation=False): 
    id_vars=['Country', 'State', 'Lat', 'Long']
    dict_agg = { featureName:sum, 'Lat':np.median, 'Long':np.median }
    if keepPopulation:
        id_vars.append('Population')
        dict_agg['Population'] = sum 
    data_all = pd.read_csv(URL + file).iloc[:, 6:] \
             .drop('Combined_Key', axis=1) \
             .rename(columns={ 'Long_':'Long','Province_State':'State','Country_Region':'Country' }) \
             .melt(id_vars=id_vars, var_name='date', value_name=featureName) \
             .astype({'date':'datetime64[ns]', featureName:'Int64'}, errors='ignore') \
             .groupby(['date','Country', 'State']).agg(dict_agg).reset_index()
    return data_all

def china_demographics():
    
    client = storage.Client()
    bucket = client.get_bucket('cluster_hotspot')
    blob = bucket.get_blob('input/china_population_wiki.tsv')
    stage = blob.download_to_filename("china_population.tsv")  
    china=open('china_population.tsv', 'r') 
    
    ls = []
    for i, text in enumerate(china.readlines()):
        if i % 3 == 0:
            l = ''
        l += text.strip()
        if i % 3 == 2:
            ls = ls + [l.split('\t')]
    df_china = pd.DataFrame.from_records(ls).iloc[:, [2,5,1]]
    df_china.columns = ['Province_Orig','Population','ISO']
    df_china.Population = [int(re.sub(',|\[8\]', '', p)) for p in df_china.Population]
    df_china['Province'] = [ \
        re.sub("Uyghur|Municipality.*|Province.*|Autonomous.*|Special.*|Zhuang.*|Hui", "", s).strip() \
        for s in df_china['Province_Orig']]
    return df_china.sort_values('Province')


# Refreshes the data and saves in pickle file
def refresh_data():
    data_all = load_global_data("time_series_covid19_confirmed_global.csv", "CumConfirmed") \
        .merge(load_global_data("time_series_covid19_deaths_global.csv", "CumDeaths"))
    data_all = global_demographics(data_all)
    data_US = load_us_data("time_series_covid19_confirmed_US.csv", "CumConfirmed") \
        .merge(load_us_data("time_series_covid19_deaths_US.csv", "CumDeaths", keepPopulation=True))
    data_final = pd.concat([data_all, data_US])
    data_final.to_pickle(fileName)
    return data_final

def all_data():
    if not os.path.isfile(fileName):
        refresh_data()
    data_all = pd.read_pickle(fileName)
    return data_all

In [7]:
def read_conditional_date(indx, df):
    if len(indx) == 0:
        indx = np.nan
        date = np.nan
        days_until = np.nan
    else:
        indx = indx[0]
        date = df.iloc[indx]['date']
        days_until = (datetime.now() - date).days
    return indx, date, days_until

def prepare_data(df):
    cummulative_cases = df.select_dtypes(include='Int64').astype('float')
    new_cases = cummulative_cases.diff()  
    new_cases = new_cases.apply(fix_na)
    cummulative_cases = new_cases.cumsum()
    new_cases.columns = [column.replace('Cum', 'New') for column in new_cases.columns]
    df_all = cummulative_cases.join(new_cases)
    gma7_df = moving_avg(df_all, len=7)
    return cummulative_cases, new_cases, df_all, gma7_df

# Fixing NAs and zeros in the data
def fix_na(column_df):
    ind = np.where(column_df[1:] == 0.0)[0] + 1
    for i in ind[ind < column_df.size - 1]:
        column_df.iloc[i] = column_df.iloc[i+1] = 0.5 * column_df.iloc[i+1]
    return column_df

def moving_avg(df, len=7):
    return df.apply(np.log).rolling(len).mean().apply(np.exp)

In [8]:
#Reading existing features and calculating new features
def get_features(df):
    if df.size < 29+7:
        return { }
    ## Remove last row if it seems broken (confirmed cases dropped by >85%).
    if df.iloc[-1]['CumConfirmed'] < 0.15 * df.iloc[-2]['CumConfirmed']:
        df = df[:-1]
    last = df.iloc[-1]
    cummulative_cases, new_cases, df_all, gma7_df = prepare_data(df)

    ## Index of Outbreak Date (cases > 100/20M).
    outbreak_ind, outbreak_date, days_since_outbreak = read_conditional_date(np.where(cummulative_cases.CumConfirmed / df.Population > 5 / 1E6)[0], df)
    ## Index of Outbreak Date (cases > 1000/20M).
    ind_10X, date_10X, _ = read_conditional_date(np.where(cummulative_cases.CumConfirmed / df.Population > 30 / 1E6)[0], df)
    ## Index of Peak week.
    peak_ind = np.argmax(gma7_df.NewDeaths)
    date_peak = df.iloc[peak_ind]['date']
    ## Early Motality.
    earlyMortality = gma7_df.NewDeaths.iloc[outbreak_ind + 17] / gma7_df.NewConfirmed.iloc[outbreak_ind + 3] \
        if (gma7_df.shape[0] > outbreak_ind + 17) else np.nan
    
    new_conf_w0tow2 = gma7_df.NewConfirmed.iloc[-1] / gma7_df.NewConfirmed.iloc[-15]
    new_conf_w2tow4 = gma7_df.NewConfirmed.iloc[-15] / gma7_df.NewConfirmed.iloc[-29]
    return {
        'Lat':last.Lat,'Long':last.Long,'Population':last.Population,'OutbreakDate':outbreak_date,
        'DaysSinceOutbreak':days_since_outbreak, 'DaysSincePeak':(datetime.now() - date_peak).days,
        'Current_Accel':new_conf_w0tow2 / new_conf_w2tow4,'DaysTo10X':ind_10X - outbreak_ind,
        'CasesPerMm':last.CumConfirmed / last.Population * 1E6,'DeathsPerMm':last.CumDeaths / last.Population * 1E6,
        'Early_Mortality':earlyMortality,'New_Conf_W0toW2':new_conf_w0tow2, 'New_Conf_W2toW4':new_conf_w2tow4
    }

In [9]:
data = all_data()[['Country', 'State', 'date', 'Lat', 'Long', 'Population', 'CumConfirmed', 'CumDeaths']]

In [10]:
countries = data['Country'].unique()
countries.sort()
data

Unnamed: 0,Country,State,date,Lat,Long,Population,CumConfirmed,CumDeaths
0,Afghanistan,<all>,2020-01-22,33.939110,67.709953,37172386.0,0,0
1,Afghanistan,<all>,2020-01-23,33.939110,67.709953,37172386.0,0,0
2,Afghanistan,<all>,2020-01-24,33.939110,67.709953,37172386.0,0,0
3,Afghanistan,<all>,2020-01-25,33.939110,67.709953,37172386.0,0,0
4,Afghanistan,<all>,2020-01-26,33.939110,67.709953,37172386.0,0,0
...,...,...,...,...,...,...,...,...
27661,US,Virginia,2021-05-12,37.373732,-78.158270,8535519.0,668147,10934
27662,US,Washington,2021-05-12,47.125212,-120.738013,7614893.0,418020,5614
27663,US,West Virginia,2021-05-12,38.843154,-80.665911,1792147.0,157215,2748
27664,US,Wisconsin,2021-05-12,44.397070,-89.560937,5822434.0,667637,7687


In [11]:
# Filtering data and viewing
data[(data.date == '2021-05-12') & (data.Population > 1E6)].sample(frac=2, replace=True)

Unnamed: 0,Country,State,date,Lat,Long,Population,CumConfirmed,CumDeaths
107774,China,Guangxi,2021-05-12,23.829800,108.788100,46026629.0,275,2
107793,China,Shanghai,2021-05-12,31.202000,121.449100,23019148.0,2025,7
27661,US,Virginia,2021-05-12,37.373732,-78.158270,8535519.0,668147,10934
63917,Panama,<all>,2021-05-12,8.538000,-80.782100,4176873.0,368930,6285
80612,Syria,<all>,2021-05-12,34.802075,38.996815,16906283.0,23543,1676
...,...,...,...,...,...,...,...,...
41975,Japan,<all>,2021-05-12,36.204824,138.252924,126529100.0,660884,11148
28619,Ethiopia,<all>,2021-05-12,9.145000,40.489700,109224559.0,264367,3938
88244,Uzbekistan,<all>,2021-05-12,41.377491,64.585262,32956100.0,95467,666
5246,Azerbaijan,<all>,2021-05-12,40.143100,47.576900,9939771.0,328159,4726


In [12]:
## Calculate features
features_select = data.groupby(['Country', 'State']).apply(get_features)
features_select = pd.DataFrame(list(features_select), index=features_select.index)
features_select.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Lat,Long,Population,OutbreakDate,DaysSinceOutbreak,DaysSincePeak,Current_Accel,DaysTo10X,CasesPerMm,DeathsPerMm,Early_Mortality,New_Conf_W0toW2,New_Conf_W2toW4
Country,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Afghanistan,<all>,33.93911,67.709953,37172386.0,2020-04-01,408.0,301,0.757504,21.0,1687.220185,72.984285,,1.595709,2.106535
Albania,<all>,41.1533,20.1683,2866376.0,2020-03-12,428.0,69,0.916769,10.0,45997.105753,845.318269,0.202997,0.465673,0.507951
Algeria,<all>,28.0339,1.6596,42228429.0,2020-03-23,417.0,399,0.693388,13.0,2952.560703,79.330443,0.685575,1.007431,1.452911
Andorra,<all>,42.5063,1.5218,77006.0,2020-03-01,439.0,406,0.723701,16.0,174921.434693,1649.222139,,0.562773,0.777632
Angola,<all>,-11.2027,17.8739,30809762.0,2020-06-17,331.0,4,0.977546,39.0,954.405295,20.934923,0.23942,1.471433,1.505232


In [13]:
features_select['Region'] = features_select.index.get_level_values('Country')
is_region = (features_select.index.get_level_values('State') != '<all>')
features_select.loc[is_region, 'Region'] = features_select.index.get_level_values('Country')[is_region] + ':' + \
    features_select.index.get_level_values('State')[is_region]

In [14]:
features_select[features_select.index.get_level_values('Country') == 'US'].sort_values("DeathsPerMm", ascending=False).head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Lat,Long,Population,OutbreakDate,DaysSinceOutbreak,DaysSincePeak,Current_Accel,DaysTo10X,CasesPerMm,DeathsPerMm,Early_Mortality,New_Conf_W0toW2,New_Conf_W2toW4,Region
Country,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
US,Grand Princess,0.0,0.0,0.0,2020-03-15,425.0,2,,0.0,inf,inf,,,,US:Grand Princess
US,New Jersey,40.28047,-74.61683,8882190.0,2020-03-13,427.0,389,,5.0,113608.6,2913.921004,2.620131,,,US:New Jersey
US,New York,42.617764,-74.964273,19453561.0,2020-03-08,432.0,397,0.983767,7.0,106796.0,2715.441147,3.451553,0.562779,0.572066,US:New York
US,Massachusetts,42.134404,-71.078504,6892503.0,2020-03-07,433.0,384,0.893924,6.0,101441.7,2569.748609,,0.58158,0.650593,US:Massachusetts
US,Rhode Island,41.564203,-71.286523,1059361.0,2020-03-13,427.0,152,,5.0,141832.7,2544.930387,,,,US:Rhode Island
US,Mississippi,32.754156,-89.530957,2976149.0,2020-03-17,423.0,117,,4.0,105554.9,2434.01792,,,,US:Mississippi
US,Arizona,33.214988,-110.847909,7278717.0,2020-03-19,421.0,118,0.871641,4.0,119612.3,2394.652794,0.510678,0.90438,1.03756,US:Arizona
US,Connecticut,41.474231,-72.433998,3565287.0,2020-03-14,426.0,383,,5.0,96525.47,2289.016284,0.821099,,,US:Connecticut
US,Louisiana,30.775978,-91.762533,4648794.0,2020-03-13,427.0,391,,4.0,99810.19,2250.691255,1.927014,,,US:Louisiana
US,South Dakota,44.071478,-98.816762,884659.0,2020-03-11,429.0,172,,12.0,139622.2,2247.193551,,,0.598215,US:South Dakota


In [15]:
data_plot = features_select.merge(data, how='left').sort_values(['Region', 'date'])
data_plot['days'] = (data_plot.date - data_plot.OutbreakDate).dt.days
data_plot = data_plot[(data_plot.days >= 0) & (data_plot.days <=50)]
data_plot.sample(5)

Unnamed: 0,Lat,Long,Population,OutbreakDate,DaysSinceOutbreak,DaysSincePeak,Current_Accel,DaysTo10X,CasesPerMm,DeathsPerMm,Early_Mortality,New_Conf_W0toW2,New_Conf_W2toW4,Region,Country,State,date,CumConfirmed,CumDeaths,days
24348,27.6104,111.7088,65683722.0,2020-02-01,468.0,2,,,15.924798,0.060898,,,,China:Hunan,China,Hunan,2020-02-12,946,2,11.0
98343,60.128161,18.643501,10175214.0,2020-03-04,436.0,385,,5.0,101023.329829,1402.132673,0.343191,,,Sweden,Sweden,<all>,2020-04-12,10912,899,39.0
79776,17.607789,8.081666,22442948.0,2020-04-03,406.0,122,,21.0,237.268295,8.555026,,0.469585,,Niger,Niger,<all>,2020-05-18,909,55,45.0
60186,48.0196,66.9237,18276452.0,2020-03-26,414.0,263,0.432327,10.0,22123.987741,185.320433,,0.481658,1.114106,Kazakhstan,Kazakhstan,<all>,2020-04-15,1295,16,20.0
25299,32.9711,119.455,78659903.0,2020-02-07,462.0,2,,,9.166042,0.0,,,,China:Jiangsu,China,Jiangsu,2020-02-09,468,0,2.0


In [16]:
features_select.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Lat,Long,Population,OutbreakDate,DaysSinceOutbreak,DaysSincePeak,Current_Accel,DaysTo10X,CasesPerMm,DeathsPerMm,Early_Mortality,New_Conf_W0toW2,New_Conf_W2toW4,Region
Country,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Afghanistan,<all>,33.93911,67.709953,37172386.0,2020-04-01,408.0,301,0.757504,21.0,1687.220185,72.984285,,1.595709,2.106535,Afghanistan
Albania,<all>,41.1533,20.1683,2866376.0,2020-03-12,428.0,69,0.916769,10.0,45997.105753,845.318269,0.202997,0.465673,0.507951,Albania
Algeria,<all>,28.0339,1.6596,42228429.0,2020-03-23,417.0,399,0.693388,13.0,2952.560703,79.330443,0.685575,1.007431,1.452911,Algeria
Andorra,<all>,42.5063,1.5218,77006.0,2020-03-01,439.0,406,0.723701,16.0,174921.434693,1649.222139,,0.562773,0.777632,Andorra
Angola,<all>,-11.2027,17.8739,30809762.0,2020-06-17,331.0,4,0.977546,39.0,954.405295,20.934923,0.23942,1.471433,1.505232,Angola


In [17]:
def get_features(features, names=[ 'Current_Accel','Lat','Long','DaysTo10X', 'Early_Mortality']):
    data = features[names + ['Region']].set_index('Region')
    data = data.replace([np.inf, -np.inf], np.nan)
    data = data.dropna()
    return data

#Plotting cluster using UMAP
def plot_clusters_umap(data, random_state=7):
    standard_embedding = umap.UMAP(random_state=random_state
    ).fit_transform(data)
    
    clusterable_embedding = umap.UMAP(random_state=random_state, min_dist=0.0, n_neighbors=10,
    ).fit_transform(data)
    
    #Clustering using DBSCAN
    labels = DBSCAN(min_samples=2).fit_predict(clusterable_embedding)
    
    print("Silhouette Score")
    print(silhouette_score(clusterable_embedding, labels))
    
    clustered = (labels >= 0)
    print(np.unique(labels, return_counts=True))
    
   
    return labels

In [18]:
f = get_features(features_select, names=['DaysTo10X', 'Early_Mortality', 'Current_Accel'])
labels = plot_clusters_umap(f, random_state=14)
f

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([ 5, 13, 12, 12,  5,  8, 11,  3,  4]))


Unnamed: 0_level_0,DaysTo10X,Early_Mortality,Current_Accel
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Albania,10.0,0.202997,0.916769
Algeria,13.0,0.685575,0.693388
Angola,39.0,0.239420,0.977546
Argentina,13.0,0.094783,0.868009
Austria,6.0,0.117335,0.674255
...,...,...,...
US:Washington,6.0,0.547704,0.673569
US:Wisconsin,4.0,0.447470,0.970534
Ukraine,11.0,0.136408,0.716558
Vietnam,273.0,0.058346,14.178697


In [19]:
f['cluster'] = labels
f.reset_index(inplace=True)
f[['Country','State']] = f.Region.str.split(':',expand=True)
f

Unnamed: 0,Region,DaysTo10X,Early_Mortality,Current_Accel,cluster,Country,State
0,Albania,10.0,0.202997,0.916769,0,Albania,
1,Algeria,13.0,0.685575,0.693388,1,Algeria,
2,Angola,39.0,0.239420,0.977546,2,Angola,
3,Argentina,13.0,0.094783,0.868009,1,Argentina,
4,Austria,6.0,0.117335,0.674255,3,Austria,
...,...,...,...,...,...,...,...
68,US:Washington,6.0,0.547704,0.673569,3,US,Washington
69,US:Wisconsin,4.0,0.447470,0.970534,5,US,Wisconsin
70,Ukraine,11.0,0.136408,0.716558,1,Ukraine,
71,Vietnam,273.0,0.058346,14.178697,2,Vietnam,


In [20]:
final=data_plot.merge(f, left_on=['Country'], right_on=['Country'])
pd.set_option('display.max_rows', final.shape[0]+1)
final.head()

Unnamed: 0,Country,State_x,date,Lat,Long,Population,CumConfirmed,CumDeaths,Region,DaysTo10X,Early_Mortality,Current_Accel,cluster,State_y
0,Albania,<all>,2020-01-22,41.1533,20.1683,2866376.0,0,0,Albania,10.0,0.202997,0.916769,0,
1,Albania,<all>,2020-01-23,41.1533,20.1683,2866376.0,0,0,Albania,10.0,0.202997,0.916769,0,
2,Albania,<all>,2020-01-24,41.1533,20.1683,2866376.0,0,0,Albania,10.0,0.202997,0.916769,0,
3,Albania,<all>,2020-01-25,41.1533,20.1683,2866376.0,0,0,Albania,10.0,0.202997,0.916769,0,
4,Albania,<all>,2020-01-26,41.1533,20.1683,2866376.0,0,0,Albania,10.0,0.202997,0.916769,0,


In [21]:
final.drop_duplicates(subset = ['Lat','Long'], keep = 'first', inplace = True) 
final.drop(columns=['Population','State_x','State_y','Region','date'],inplace=True)
final.dropna()
final.head()

Unnamed: 0,Country,Lat,Long,CumConfirmed,CumDeaths,DaysTo10X,Early_Mortality,Current_Accel,cluster
0,Albania,41.1533,20.1683,0,0,10.0,0.202997,0.916769,0
477,Algeria,28.0339,1.6596,0,0,13.0,0.685575,0.693388,1
954,Angola,-11.2027,17.8739,0,0,39.0,0.23942,0.977546,2
1431,Argentina,-38.4161,-63.6167,0,0,13.0,0.094783,0.868009,1
1908,Austria,47.5162,14.5501,0,0,6.0,0.117335,0.674255,3


In [22]:
final.to_json(r'clusters.json',orient='records')

In [23]:
with open('clusters.json') as f:
    data = json.load(f)

geojson = {
    "type": "FeatureCollection",
    "features": [
    {
        "type": "Feature",
        "geometry" : {
            "type": "Point",
            "coordinates": [d["Long"], d["Lat"]],
            },
        "properties" : d,
     } for d in data]
}

#with open('clusters.json', 'w') as outfile:
 #json.dump(geojson, outfile)
    
client = storage.Client()
bucket = client.get_bucket('cluster_hotspot')
# Set our bucket 

blob = bucket.blob('output/clusters.json')
blob.upload_from_string(
data=json.dumps(geojson),
content_type='application/json'
 )
   