In [56]:
import numpy as np
import pandas as pd
import seaborn as sns
import os

import matplotlib.pyplot as plt
import plotly.graph_objects as go

from datetime import datetime

# Read data

In [3]:
# Directory holding the temperature CSV files, under the user's home directory.
# os.path.expanduser('~') replaces os.environ['HOME'] so this cell does not
# raise KeyError when HOME is unset (e.g. on Windows); when HOME is set on
# POSIX it resolves to the same directory as before.
global_temperature_dir = os.path.join(
    os.path.expanduser('~'),
    'data',
    'global-temperature')

# Path of the per-city temperature CSV.
city_temp_file = os.path.join(
    global_temperature_dir,
    'GlobalLandTemperaturesByCity.csv')
# Path of the global temperature CSV.
global_temp_file = os.path.join(
    global_temperature_dir,
    'GlobalTemperatures.csv')

In [4]:
# Load both CSV files into DataFrames using pandas' default parsing options
# (city table first, then the global table).
csv_city_temp, global_temp = (
    pd.read_csv(csv_path)
    for csv_path in (city_temp_file, global_temp_file)
)

In [4]:
# Preview the first five rows of the per-city table.
csv_city_temp.head(5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [5]:
# Preview the first five rows of the global table.
global_temp.head(5)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


# Explore distributions

## Global temperature

In [40]:
# Parse the date column and derive calendar fields used for grouping below.
# The vectorized .dt accessors replace the earlier per-row lambdas, and the
# frame is rebound instead of mutated with inplace=True.
global_temp = (
    global_temp
    .assign(
        # pd.to_datetime leaves an already-parsed datetime column unchanged,
        # so re-running this cell is safe.
        dt=lambda frame: pd.to_datetime(frame['dt']),
        # Later keyword arguments see the freshly parsed 'dt' column.
        year=lambda frame: frame['dt'].dt.year,
        month=lambda frame: frame['dt'].dt.month,
    )
    .reset_index(drop=True)
)

In [41]:
# Show the column labels of global_temp, including the derived 'year' and
# 'month' columns.
global_temp.columns

Index(['dt', 'LandAverageTemperature', 'LandAverageTemperatureUncertainty',
       'LandMaxTemperature', 'LandMaxTemperatureUncertainty',
       'LandMinTemperature', 'LandMinTemperatureUncertainty',
       'LandAndOceanAverageTemperature',
       'LandAndOceanAverageTemperatureUncertainty', 'year', 'month'],
      dtype='object')

In [42]:
# Range of years present in the global table, shown as (smallest, largest).
years = global_temp.year
(min(years), max(years))

(1750, 2015)

In [43]:
# Per-year minimum, mean and maximum of LandAverageTemperature, used by the
# plot below. The result has two-level column labels: (column, statistic).
annual_global_temp = (
    global_temp
    .groupby('year')
    .agg({'LandAverageTemperature': ['min', 'mean', 'max']})
    .reset_index()  # turn the 'year' group key back into a regular column
)
annual_global_temp.head(3)

Unnamed: 0_level_0,year,LandAverageTemperature,LandAverageTemperature,LandAverageTemperature
Unnamed: 0_level_1,Unnamed: 1_level_1,min,mean,max
0,1750,2.772,8.719364,15.868
1,1751,0.963,7.976143,14.405
2,1752,0.348,5.779833,8.265


In [68]:
# One candlestick per year: the wick spans the yearly min..max of
# LandAverageTemperature; open and close are both set to the yearly mean, so
# the candle body collapses to a single tick at the mean.
land_stats = annual_global_temp['LandAverageTemperature']
yearly_candles = go.Candlestick(
    x=annual_global_temp['year'],
    open=land_stats['mean'],
    high=land_stats['max'],
    low=land_stats['min'],
    close=land_stats['mean'],
)

fig = go.Figure(data=[yearly_candles])
fig.update_layout(
    title='Annual Global Temperature',
    xaxis_title='year',
    yaxis_title='°C',
)
fig.show()

## Temperature by city

In [9]:
# Show the column labels of the per-city table.
csv_city_temp.columns

Index(['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City',
       'Country', 'Latitude', 'Longitude'],
      dtype='object')

In [10]:
# Number of distinct cities in the per-city table.
# Series.nunique() counts without a Python-level pass over every row and
# ignores missing values, whereas len(set(...)) could count NaN entries as
# separate cities.
csv_city_temp['City'].nunique()

3448

# TODOs
- Stratified sampling
- Re-explore distributions after sampling
- Fill missing values (N/As)