In [1]:
import pandas as pd
import numpy as np

#Cases&Deaths
data1=pd.read_csv('case+deaths.csv')
data1=data1.drop(labels=['Country_code','WHO_region','Cumulative_cases','Cumulative_deaths'], axis=1)
#print(data1.info())
#print(data1.columns)

#Vacc
data2=pd.read_csv('vaccinations.csv')
data2=data2[['location','date','daily_vaccinations']]
#print(data2.info())
#print(data2.columns)

data2=data2.fillna(0) #change NA cells to 0
#print(data1.duplicated())
#print(data2.duplicated())

#Reorder columns
data1=data1[['Country','Date_reported','New_cases','New_deaths']]
data2=data2[['location','date','daily_vaccinations']]
data1=data1.rename(columns={'Date_reported':'date'})
data2=data2.rename(columns={'location':'Country'})
#print(data1.head())
#print(data2.head())

#Some countries appear in data1 and not data2. Want to take intersection
set1=set()
for row in data1.index:
  if data1.iloc[row, 0] not in set1:
    set1.add(data1.iloc[row, 0])

set2=set()
for row in data2.index:
  if data2.iloc[row, 0] not in set2:
    set2.add(data2.iloc[row,0])

finalset=set1.intersection(set2) #set of countries
#print(len(set1),len(set2),len(finalset))



In [2]:
popn={}
with open('countries_population.txt','r') as read:
    for line in read:
        line = line.split(': ')
        popn[line[0]]=int(line[1])
#print(popn)

In [3]:
count=0
countries_dict={}
PYTHONHASHSEED = 123
for country in finalset:
  countries_dict[country]=count
  count+=1

with open('countries.txt', 'w') as writefile:
  for country in countries_dict:
    writefile.write(f'{country}: {countries_dict[country]}\n')

timestep=14 
predict=3

#Our input is 14*3 datapoints (of cases/deaths/vacs) and we hope to predict 3 days of deaths. Adjust above if necc

#We want our input to be 14*(no. of vars=3) length vector and our desired output to be the the next 3 death counts
#So each row should be [<cases for 14 days><vac><deaths for 14 days><deaths for 3 days>]  

readable=pd.DataFrame() #Full data in readable format
train=pd.DataFrame()
valid=pd.DataFrame()
test=pd.DataFrame()
#Merge tables
for country in finalset:
    temp1=data1.loc[lambda data1: data1['Country']==country]
    temp2=data2.loc[lambda data2: data2['Country']==country]
    comb=pd.merge(temp1,temp2,on=['Country','date'],how='outer')
    
    comb=comb.fillna(0) #note that cumsum fills na with na, i.e. 1 na 2 -> 1 na 3
    total=comb.loc[:,'daily_vaccinations'].cumsum()
    comb.loc[:,'daily_vaccinations']=total #want total vaccinations
    
    total=comb.loc[:, 'New_cases'].cumsum()
    comb.loc[:, 'New_cases'] = total
    
    comb=comb.rename(columns={'daily_vaccinations':'totalvac','New_cases':'totalcases'})
    comb=comb[['Country','date','totalcases','totalvac','New_deaths']]
    
    comb[['totalcases','totalvac','New_deaths']] = comb[['totalcases','totalvac','New_deaths']].div(popn[country])

    readable=pd.concat([readable,comb])
    temp=pd.DataFrame()

    for row in comb.index[:-predict-timestep]: #we take consecutive blocks of timestep+predict as our data point
        input=np.array(comb.iloc[row:row+timestep,-3:]) #1 block of input, shape is (timestep,3) 
        input=np.transpose(input) #(3,timestep)
        input=np.reshape(input,[1,3*timestep]) #flatten it; shape= (1,3*timestep)
        
        output=np.array(comb.iloc[row+timestep:row+timestep+predict,-1]) #shape = (predict,1)
        output=np.reshape(output,[1,predict]) #shape = (1,predict)
        unitdata=np.concatenate([[[countries_dict[country]]],input,output],axis=1) #Format: [<country num><timestep days of cases><... of deaths><... of vac><predict days of deaths>]
    
        unitdata=pd.DataFrame(unitdata)
        temp=pd.concat([temp,unitdata]) #add unitdata to temp

    temp = temp.sample(frac=1, random_state=123).reset_index(drop=True) #shuffle, use random state for reproducibility 
    num_rows=len(temp.index)
    train_temp=temp.iloc[:int(num_rows*0.8),:] #80% of data will be train and 10% will be validation and 10% will be test
    valid_temp=temp.iloc[int(num_rows*0.8):int(num_rows*0.9),:]
    test_temp=temp.iloc[int(num_rows*0.9):,:]
    train=pd.concat([train,train_temp])
    valid=pd.concat([valid,valid_temp])
    test=pd.concat([test,test_temp])
    print(country)

readable.to_csv('norm_full_data_readable.csv')
#IMPORTANT: Format: [<encoding of country><timestep days of cases><... of deaths><... of vac><predict days of deaths>]
train.to_csv('norm_train.csv')
valid.to_csv('norm_valid.csv')
test.to_csv('norm_test.csv')

Canada
Niger
Tokelau
Zambia
Germany
Myanmar
Tonga
Grenada
Haiti
Maldives
Niue
San Marino
Seychelles
Ukraine
Portugal
United Arab Emirates
Cameroon
Papua New Guinea
Uganda
Malawi
Thailand
Pakistan
Costa Rica
Benin
Anguilla
Saint Lucia
Qatar
Congo
Cuba
Barbados
Norway
Trinidad and Tobago
Albania
Mauritius
Mozambique
Uzbekistan
Lebanon
Nauru
Afghanistan
Dominican Republic
Guinea
Finland
Aruba
Bahrain
Equatorial Guinea
Botswana
Brazil
Isle of Man
El Salvador
Liechtenstein
Dominica
Kyrgyzstan
Algeria
Gibraltar
Kazakhstan
Oman
Paraguay
Saudi Arabia
Slovakia
Belgium
Libya
Guinea-Bissau
Belize
Guernsey
Uruguay
Latvia
Chad
Azerbaijan
French Polynesia
Senegal
Ethiopia
Luxembourg
Tajikistan
Bangladesh
Suriname
Sierra Leone
Bulgaria
Central African Republic
Ghana
Kiribati
Netherlands
Turks and Caicos Islands
Sweden
Lithuania
Philippines
Austria
Bermuda
Romania
Singapore
Namibia
South Africa
Greece
Cambodia
Czechia
Zimbabwe
France
Bhutan
Sudan
Iceland
Croatia
Armenia
Yemen
Cayman Islands
Slovenia
T