In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
import glob, os

import plotly.offline
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected = True)


In [None]:
# Reference: meteoblue model-climate page for Nuenen (Netherlands)
# https://www.meteoblue.com/en/weather/forecast/modelclimate/nuenen_netherlands_2749780


In [52]:
# Every .xlsx workbook found one folder level below ./weather_data,
# kept in sorted order so the list is the same on every run.
file_list = glob.glob('./weather_data/*/*.xlsx')
file_list.sort()

def read_weather_data(file_dir):
    """Load one weather workbook and return it with normalised column names.

    Parameters
    ----------
    file_dir : path of the Excel file, passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The sheet without its first 12 rows (10 leading rows, then 2 more
        rows starting at the header row), with the previous index kept as
        the first column ``idx`` and the remaining 14 columns renamed.
    """
    column_names = [
        'idx', 'year', 'month', 'day', 'hour', 'minute', 'temperature',
        'total_precipitation_high_resolution',
        'total_precipitation_low_resolution',
        'snowfall_amount_high_resolution',
        'snowfall_amount_low_resolution',
        'total_cloud_cover', 'sunshine_duration', 'wind_speed',
        'wind_direction',
    ]

    sheet = pd.read_excel(file_dir)

    # Skip the first 10 rows; the next row is used as the header.
    trimmed = sheet.iloc[10:, :]
    trimmed.columns = trimmed.iloc[0, :]

    # Drop the header row and the row after it, then move the old index
    # into a regular column (it becomes 'idx' below).
    records = trimmed.iloc[2:, :].reset_index()
    records.columns = column_names

    return records

In [53]:
# One DataFrame per workbook, in the same order as file_list
# (df_list[k] is loaded from file_list[k]).
df_list = [read_weather_data(weather_file) for weather_file in file_list]
    

In [41]:
# Preview the first rows of the first loaded DataFrame as a sanity check.
df_list[0].head()


Unnamed: 0,idx,year,month,day,hour,minute,temperature,total_precipitation_high_resolution,total_precipitation_low_resolution,snowfall_amount_high_resolution,snowfall_amount_low_resolution,total_cloud_cover,sunshine_duration,wind_speed,wind_direction
0,12,2019,3,8,1,0,6.06,0,0,0,0,27,0,20.55,258.89
1,13,2019,3,8,2,0,6.16,0,0,0,0,100,0,21.19,260.22
2,14,2019,3,8,3,0,5.68,0,0,0,0,100,0,20.7,256.93
3,15,2019,3,8,4,0,5.04,0,0,0,0,1,0,18.45,252.98
4,16,2019,3,8,5,0,4.67,0,0,0,0,65,0,17.32,249.3


In [56]:
# find average sunshine duration, cloud cover, precipitation with high resolution for all cities

def find_statistics(df, city_name=None):
    """Print the per-row averages of sunshine, cloud cover and precipitation.

    Each average is the column sum divided by the number of rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sunshine_duration``,
        ``total_cloud_cover`` and ``total_precipitation_high_resolution``.
    city_name : str, optional
        Label printed in front of the averages. When omitted, the
        notebook-level variable ``city`` is used, so existing calls of the
        form ``find_statistics(df)`` keep working.

    Returns
    -------
    None
        The result is only printed.
    """
    if city_name is None:
        # Backward compatibility: older cells set a global `city` right
        # before calling this function instead of passing the label in.
        city_name = city

    n_rows = df.shape[0]
    avg_sun = df['sunshine_duration'].sum() / n_rows
    avg_cloud = df['total_cloud_cover'].sum() / n_rows
    avg_rain = df['total_precipitation_high_resolution'].sum() / n_rows

    print('{}: avg_sun {}, avg_cloud {}, avg_rain {}'.format(city_name, avg_sun, avg_cloud, avg_rain))
    

In [57]:
# file_list and df_list are parallel lists, so walk them together.
for weather_file, city_df in zip(file_list, df_list):
    # The city is the folder that directly contains the workbook
    # (./weather_data/<city>/<file>.xlsx). Using os.path instead of
    # split('/') keeps this working when paths use another separator.
    # find_statistics picks up this notebook-level `city` for its printout.
    city = os.path.basename(os.path.dirname(weather_file))
    find_statistics(city_df)
    
    

Arles: avg_sun 24.60659685863874, avg_cloud 23.834031413612568, avg_rain 0.0
Auvers-sur-Oise: avg_sun 6.2598952879581145, avg_cloud 82.02356020942409, avg_rain 0.09738219895287957
Nuenen: avg_sun 5.44544502617801, avg_cloud 85.59895287958115, avg_rain 0.2214659685863875
Paris: avg_sun 6.217329842931936, avg_cloud 81.82198952879581, avg_rain 0.08481675392670154
St.Remy: avg_sun 24.064816753926696, avg_cloud 23.648167539267025, avg_rain 0.0005235602094240838


In [96]:
def get_non_zero_weather_data(df, weather_column):
    """Return one weather column together with a time label for every row.

    Despite the function name, nothing is filtered out: rows whose weather
    value is zero are returned as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``year``, ``month``, ``day``, ``hour``, ``minute`` and
        the column named by ``weather_column``.
    weather_column : str
        Name of the column to return.

    Returns
    -------
    tuple
        ``(df[weather_column], labels)`` where ``labels`` is a list with one
        ``'year_month_day_hour_minute'`` string per row, in row order.
    """
    time_fields = ('year', 'month', 'day', 'hour', 'minute')

    # Build the label from each positional row; the parts are joined with
    # underscores after converting every value with str().
    rows = (df.iloc[position] for position in range(len(df)))
    time_full = ['_'.join(str(row[field]) for field in time_fields)
                 for row in rows]

    return df[weather_column], time_full
    

In [109]:

# Bar colour of each city's trace, written as an 'rgb(r, g, b)' string
# and looked up by city name when the charts are built.
color_dict = {
    'Nuenen': 'rgb(77, 61, 0)',
    'Auvers-sur-Oise': 'rgb(77, 77, 0)',
    'Paris': 'rgb(153, 153, 0)',
    'St.Remy': 'rgb(255, 255, 51)',
    'Arles': 'rgb(230, 230, 0)',
}


In [113]:
# bar chart - sunlight duration

data = []

# file_list and df_list are parallel lists: one bar trace is built per city.
for weather_file, df in zip(file_list, df_list):
    sun_duration, time_labels = get_non_zero_weather_data(df, 'sunshine_duration')
    # The city is the folder that directly contains the workbook
    # (./weather_data/<city>/<file>.xlsx). os.path is used instead of
    # split('/') so the lookup does not depend on the path separator.
    city = os.path.basename(os.path.dirname(weather_file))

    trace = go.Bar(x = time_labels,
                   y = sun_duration,
                   name = city,
                   marker = dict(color = color_dict[city])
                  )
    data.append(trace)

# Stacked bars: the cities' values are piled on top of each other per time label.
layout = dict(title = 'sunlight duration in five cities',
              barmode = 'stack',
              xaxis = dict(title = 'time'),
              yaxis = dict(title = 'sunlight duration (min)'),
              width = 2400,
              height = 600,
              margin = dict(l = 50, r = 10, b = 200, t = 100, pad = 4))

fig = dict(data = data, layout = layout)

plotly.offline.iplot(fig, filename = 'sunlight_duration_five_cities.html')


In [112]:
# bar chart - cloud cover

data = []

# file_list and df_list are parallel lists: one bar trace is built per city.
for weather_file, df in zip(file_list, df_list):
    cloud_cover, time_labels = get_non_zero_weather_data(df, 'total_cloud_cover')
    # The city is the folder that directly contains the workbook
    # (./weather_data/<city>/<file>.xlsx). os.path is used instead of
    # split('/') so the lookup does not depend on the path separator.
    city = os.path.basename(os.path.dirname(weather_file))

    trace = go.Bar(x = time_labels,
                   y = cloud_cover,
                   name = city,
                   marker = dict(color = color_dict[city])
                  )
    data.append(trace)

# Stacked bars: the cities' values are piled on top of each other per time label.
layout = dict(title = 'cloud cover in five cities',
              barmode = 'stack',
              xaxis = dict(title = 'time'),
              yaxis = dict(title = 'cloud cover (%)'),
              width = 2400,
              height = 600,
              margin = dict(l = 50, r = 10, b = 200, t = 100, pad = 4))

fig = dict(data = data, layout = layout)

plotly.offline.iplot(fig, filename = 'cloud_cover_five_cities.html')
