<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-pandas-and-numpy,-and-load-the-COVID-19-and-land-temperature-data" data-toc-modified-id="Import-pandas-and-numpy,-and-load-the-COVID-19-and-land-temperature-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import pandas and numpy, and load the COVID-19 and land temperature data</a></span></li><li><span><a href="#Sort-data-by-location-and-date" data-toc-modified-id="Sort-data-by-location-and-date-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Sort data by location and date</a></span></li><li><span><a href="#Iterate-over-rows-with-itertuples" data-toc-modified-id="Iterate-over-rows-with-itertuples-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Iterate over rows with itertuples</a></span></li><li><span><a href="#Create-a-DataFrame-from-the-list-of-summary-values,-rowlist" data-toc-modified-id="Create-a-DataFrame-from-the-list-of-summary-values,-rowlist-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create a DataFrame from the list of summary values, rowlist</a></span></li><li><span><a href="#Sort-the-land-temperature-data" data-toc-modified-id="Sort-the-land-temperature-data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Sort the land temperature data</a></span></li><li><span><a href="#Exclude-rows-where-there-is-a-large-change-from-one-period-to-the-next" data-toc-modified-id="Exclude-rows-where-there-is-a-large-change-from-one-period-to-the-next-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Exclude rows where there is a large change from one period to the next</a></span></li><li><span><a href="#Create-a-DataFrame-from-the-summary-values" data-toc-modified-id="Create-a-DataFrame-from-the-summary-values-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Create a DataFrame from the summary values</a></span></li></ul></div>

# Import pandas and numpy, and load the COVID-19 and land temperature data

In [41]:
import pandas as pd
import numpy as np

In [42]:
# Optional display tuning, left disabled. Uncomment to widen the console
# output or to show more columns / rows.
# pd.set_option('display.width', 200)
# pd.set_option('display.max_columns', 35)
# pd.set_option('display.max_rows', 200)

# Show floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [43]:
# Importing the package binds `watermark` in the notebook namespace; presumably
# that is why it appears in the version listing printed by this cell.
import watermark
# Load the IPython extension that provides the %watermark magic.
%load_ext watermark

# Record the environment this notebook ran in. Flag meanings are taken from
# the watermark documentation (-n / -i date stamps, -iv imported-package
# versions) -- TODO confirm against the installed watermark release.
%watermark -n -i -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
watermark: 2.1.0
pandas   : 1.2.1
json     : 2.0.9
numpy    : 1.19.2



In [44]:
# Daily COVID-19 records; 'casedate' is parsed into datetimes on load.
coviddaily = pd.read_csv(
    'data/coviddaily720.csv',
    parse_dates=['casedate'],
)
# Land temperature readings for Brazil, loaded with default parsing.
ltbrazil = pd.read_csv('data/ltbrazil.csv')

# Sort data by location and date

In [45]:
# Group each location's rows together, in date order, so the row-by-row pass
# below can detect a change of location by comparing with the previous row.
coviddaily = coviddaily.sort_values(by=['location', 'casedate'])

# Iterate over rows with itertuples

In [46]:
# Peek at the last two rows of the sorted frame.
coviddaily.tail(n=2)

Unnamed: 0,iso_code,casedate,location,continent,new_cases,new_deaths,population,pop_density,median_age,gdp_per_capita,hosp_beds,region
29211,ZWE,2020-07-11,Zimbabwe,Africa,16.0,1.0,14862927.0,42.73,19.6,1899.78,1.7,Southern Africa
29212,ZWE,2020-07-12,Zimbabwe,Africa,40.0,5.0,14862927.0,42.73,19.6,1899.78,1.7,Southern Africa


In [47]:
# Running-total pass over coviddaily. It relies on the rows of each location
# being contiguous: a total is emitted whenever the location changes.
prevloc = 'ZZZ'  # sentinel: no location has been seen yet
rowlist = []     # one {'location', 'case_count'} dict per finished location

for row in coviddaily.itertuples():
    if row.location != prevloc:
        # A new location starts here. Emit the total of the one just finished;
        # on the very first row there is nothing to emit yet.
        if prevloc != 'ZZZ':
            rowlist.append(dict(location=prevloc, case_count=case_count))
        prevloc, case_count = row.location, 0
    case_count += row.new_cases

# Note: the total for the final location is still pending in prevloc /
# case_count when the loop ends; this loop never appends it.

In [48]:
# The loop only emits a total when the location changes, so the last location
# is still pending in prevloc / case_count and has to be appended here.
# Two guards keep this cell safe to run:
#   * prevloc is still the 'ZZZ' sentinel when no rows were processed, in which
#     case case_count was never assigned and there is nothing to append;
#   * if the cell is re-run, the last entry already holds prevloc, and
#     appending again would duplicate that location.
if prevloc != 'ZZZ' and (not rowlist or rowlist[-1]['location'] != prevloc):
    rowlist.append({'location': prevloc, 'case_count': case_count})

In [49]:
# Number of per-location summary dicts collected in rowlist.
len(rowlist)

209

In [50]:
# First four summary dicts, as a quick sanity check of the structure.
rowlist[:4]

[{'location': 'Afghanistan', 'case_count': 34451.0},
 {'location': 'Albania', 'case_count': 3371.0},
 {'location': 'Algeria', 'case_count': 18712.0},
 {'location': 'Andorra', 'case_count': 855.0}]

# Create a DataFrame from the list of summary values, rowlist

In [51]:
# Each dict in rowlist becomes one row; the dict keys become the columns.
covidtotals = pd.DataFrame(data=rowlist)
covidtotals.head(n=5)

Unnamed: 0,location,case_count
0,Afghanistan,34451.0
1,Albania,3371.0
2,Algeria,18712.0
3,Andorra,855.0
4,Angola,483.0


# Sort the land temperature data

In [52]:
# Put each station's readings in chronological order and discard rows with no
# temperature. 'year' is part of the sort key so that, if the file ever holds
# more than one year, months of different years are not interleaved -- the
# row-by-row pass that follows compares each reading with the one immediately
# before it. With a single year of data the resulting order is unchanged.
ltbrazil = (
    ltbrazil
    .sort_values(['station', 'year', 'month'])
    .dropna(subset=['temperature'])
)

In [53]:
# Peek at the last two rows after sorting and dropping missing temperatures.
ltbrazil.tail(n=2)

Unnamed: 0,locationid,year,month,temperature,latitude,longitude,elevation,station,countryid,country,latabs
938,BR00B6-0360,2019,11,25.15,-20.42,-49.98,503.0,VOTUPORANGA,BR,Brazil,20.42
1030,BR00B6-0360,2019,12,24.85,-20.42,-49.98,503.0,VOTUPORANGA,BR,Brazil,20.42


# Exclude rows where there is a large change from one period to the next

In [54]:
# Bookkeeping for the station-by-station pass over ltbrazil.
templist = []        # collects one summary dict per station
prevstation = 'ZZZ'  # sentinel: no station has been processed yet
prevtemp = 0         # temperature of the previously visited row
# temp_count and station_count are deliberately not initialised here: the
# loop assigns both as soon as it meets the first station.

In [55]:
# Walk the readings station by station, averaging only those that stay close
# to the reading immediately before them.
for row in ltbrazil.itertuples():
    if row.station != prevstation:
        # Station changed: emit the summary of the one just finished (nothing
        # to emit before the first station), then reset the accumulators.
        if prevstation != 'ZZZ':
            templist.append(dict(
                station=prevstation,
                avgtemp=temp_count / station_count,
                station_count=station_count,
            ))
        temp_count = station_count = 0
        prevstation = row.station
    # A station's first reading is always kept; later readings are kept only
    # when they are within 3 degrees of the previous row's temperature.
    if station_count == 0 or abs(row.temperature - prevtemp) <= 3:
        temp_count += row.temperature
        station_count += 1
    # Updated for every row, kept or not, so the next comparison is always
    # against the immediately preceding reading.
    prevtemp = row.temperature

In [56]:
# The loop only emits a summary when the station changes, so the last station
# is still pending and has to be appended here.
# Two guards keep this cell safe to run:
#   * prevstation is still the 'ZZZ' sentinel when no rows were processed, in
#     which case temp_count / station_count were never assigned;
#   * if the cell is re-run, the last entry already holds prevstation, and
#     appending again would duplicate that station.
if prevstation != 'ZZZ' and (not templist or templist[-1]['station'] != prevstation):
    templist.append({
        'station': prevstation,
        'avgtemp': temp_count / station_count,
        'station_count': station_count
    })
# Show the first five station summaries.
templist[0:5]

[{'station': 'ALTAMIRA', 'avgtemp': 28.310000000000002, 'station_count': 5},
 {'station': 'ALTA_FLORESTA_AERO',
  'avgtemp': 29.433636363636367,
  'station_count': 11},
 {'station': 'ARAXA', 'avgtemp': 21.612499999999997, 'station_count': 4},
 {'station': 'BACABAL', 'avgtemp': 29.75, 'station_count': 4},
 {'station': 'BAGE', 'avgtemp': 20.366666666666664, 'station_count': 9}]

# Create a DataFrame from the summary values

In [57]:
# One row per station summary; the dict keys become the columns.
ltbrazilavgs = pd.DataFrame(data=templist)

In [58]:
# First two station averages.
ltbrazilavgs.head(n=2)

Unnamed: 0,station,avgtemp,station_count
0,ALTAMIRA,28.31,5
1,ALTA_FLORESTA_AERO,29.43,11
