### Python + pandas is useful for processing multiple files

Can you create an Excel spreadsheet with cells containing Excel formulas? - Edward

This notebook shows an example of combining data from multiple files, analyzing it, and generating an Excel output file.

In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime

In [2]:
# Timestamp used to date-stamp the report file name
today = datetime.today()

# Both locations are resolved relative to the current working directory:
# the input CSV files live under data/raw/weather, the report goes to reports/
base_dir = Path.cwd()
src_dir = base_dir.joinpath('data', 'raw', 'weather')
report_file = base_dir.joinpath('reports', f'weather_summary_{today:%m_%d_%Y}.xlsx')

In [3]:
# Use a recursive glob to look for all CSV files under src_dir.
# The paths are sorted because rglob does not guarantee an order; sorting keeps
# dataframes[0] and the row order of all_data the same on every run.
csv_files = sorted(src_dir.rglob('*.csv'))
if not csv_files:
    # Without this check pd.concat([]) fails with an opaque
    # "No objects to concatenate" error that does not mention the directory.
    raise FileNotFoundError(f'No CSV files found under {src_dir}')

dataframes = []
for csv_file in csv_files:
    # Convert the 'time' column to datetimes while reading each file
    df = pd.read_csv(csv_file, index_col=False, converters={'time': pd.to_datetime})
    dataframes.append(df)

# Stack the per-file frames into one; row labels are not renumbered,
# so they repeat across files.
all_data = pd.concat(dataframes)

# Optional: uncomment to reset every timestamp to midnight (drop the time of day)
#all_data['time'] = all_data['time'].dt.normalize()

In [4]:
# Preview the first per-file DataFrame that was loaded
dataframes[0]

Unnamed: 0,Country/Region,Province/State,time,summary,icon,moonPhase,precipIntensity,precipIntensityMax,precipProbability,precipType,...,temperatureLow,dewPoint,humidity,pressure,cloudCover,uvIndex,temperatureMin,temperatureMax,Lat,Long
0,Afghanistan,,2019-12-31,Rain (with a chance of 1–3 in. of snow) until ...,rain,0.20,0.0156,0.1515,0.71,rain,...,32.13,23.88,0.60,1019.1,0.99,2,32.96,48.36,33.0,65.0
1,Afghanistan,,2020-01-01,Light rain throughout the day.,rain,0.23,0.0235,0.0985,0.95,rain,...,28.90,33.61,0.90,1021.2,0.99,2,32.33,40.42,33.0,65.0
2,Afghanistan,,2020-01-02,Clear throughout the day.,rain,0.26,0.0016,0.0062,0.25,rain,...,28.80,29.86,0.76,1022.7,0.22,3,28.90,46.53,33.0,65.0
3,Afghanistan,,2020-01-03,Partly cloudy throughout the day.,partly-cloudy-day,0.30,0.0003,0.0012,0.14,rain,...,32.84,26.61,0.69,1021.9,0.32,3,28.80,45.77,33.0,65.0
4,Afghanistan,,2020-01-04,Light rain throughout the day.,rain,0.33,0.0145,0.0310,0.83,rain,...,37.25,33.14,0.88,1016.1,1.00,2,32.84,40.84,33.0,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Afghanistan,,2020-04-16,Possible light rain in the morning.,rain,0.84,0.0055,0.0187,0.65,rain,...,47.94,45.32,0.71,1012.0,0.76,7,49.71,65.06,33.0,65.0
108,Afghanistan,,2020-04-17,Clear throughout the day.,clear-day,0.87,0.0014,0.0114,0.09,rain,...,48.01,43.26,0.62,1014.3,0.23,8,47.94,67.68,33.0,65.0
109,Afghanistan,,2020-04-18,Mostly cloudy throughout the day.,partly-cloudy-day,0.90,0.0002,0.0014,0.04,rain,...,47.27,40.84,0.57,1013.4,0.61,7,48.01,66.55,33.0,65.0
110,Afghanistan,,2020-04-19,Clear throughout the day.,clear-day,0.93,0.0001,0.0007,0.05,rain,...,48.90,42.42,0.58,1013.4,0.24,11,47.27,68.84,33.0,65.0


In [5]:
# Summarize the combined data: row count, columns, non-null counts and dtypes
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30688 entries, 0 to 111
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Country/Region      30688 non-null  object        
 1   Province/State      13776 non-null  object        
 2   time                30688 non-null  datetime64[ns]
 3   summary             30685 non-null  object        
 4   icon                30688 non-null  object        
 5   moonPhase           30688 non-null  float64       
 6   precipIntensity     30688 non-null  float64       
 7   precipIntensityMax  30688 non-null  float64       
 8   precipProbability   30688 non-null  float64       
 9   precipType          28444 non-null  object        
 10  temperatureHigh     30688 non-null  float64       
 11  temperatureLow      30688 non-null  float64       
 12  dewPoint            30688 non-null  float64       
 13  humidity            30688 non-null  float64     

In [6]:
# Show the first few rows of the combined data
all_data.head()

Unnamed: 0,Country/Region,Province/State,time,summary,icon,moonPhase,precipIntensity,precipIntensityMax,precipProbability,precipType,...,temperatureLow,dewPoint,humidity,pressure,cloudCover,uvIndex,temperatureMin,temperatureMax,Lat,Long
0,Afghanistan,,2019-12-31,Rain (with a chance of 1–3 in. of snow) until ...,rain,0.2,0.0156,0.1515,0.71,rain,...,32.13,23.88,0.6,1019.1,0.99,2,32.96,48.36,33.0,65.0
1,Afghanistan,,2020-01-01,Light rain throughout the day.,rain,0.23,0.0235,0.0985,0.95,rain,...,28.9,33.61,0.9,1021.2,0.99,2,32.33,40.42,33.0,65.0
2,Afghanistan,,2020-01-02,Clear throughout the day.,rain,0.26,0.0016,0.0062,0.25,rain,...,28.8,29.86,0.76,1022.7,0.22,3,28.9,46.53,33.0,65.0
3,Afghanistan,,2020-01-03,Partly cloudy throughout the day.,partly-cloudy-day,0.3,0.0003,0.0012,0.14,rain,...,32.84,26.61,0.69,1021.9,0.32,3,28.8,45.77,33.0,65.0
4,Afghanistan,,2020-01-04,Light rain throughout the day.,rain,0.33,0.0145,0.031,0.83,rain,...,37.25,33.14,0.88,1016.1,1.0,2,32.84,40.84,33.0,65.0


In [7]:
# Average both daily temperature columns within each group
agg_func = {'temperatureMin': 'mean', 'temperatureMax': 'mean'}

# Group by country and by month-end bucket of the 'time' column
monthly = pd.Grouper(key='time', freq='M')
grouped = all_data.groupby(['Country/Region', monthly])

# Aggregate, pivot the month level into columns, and round to 2 decimals
monthly_means = grouped.agg(agg_func)
report = monthly_means.unstack().round(2)

report

Unnamed: 0_level_0,temperatureMin,temperatureMin,temperatureMin,temperatureMin,temperatureMin,temperatureMax,temperatureMax,temperatureMax,temperatureMax,temperatureMax
time,2019-12-31,2020-01-31,2020-02-29,2020-03-31,2020-04-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31,2020-04-30
Country/Region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Afghanistan,32.96,25.95,32.85,40.46,47.84,48.36,42.40,51.02,58.01,66.86
Albania,32.80,36.80,37.52,42.63,46.84,51.43,53.78,56.55,59.75,68.44
Algeria,37.67,45.05,52.11,54.95,64.24,61.83,70.55,79.77,83.05,89.09
Andorra,40.70,33.70,36.88,35.52,41.06,59.15,52.57,60.02,54.84,60.30
Antigua and Barbuda,75.92,74.73,74.31,73.81,76.02,83.07,81.80,81.58,81.49,83.65
...,...,...,...,...,...,...,...,...,...,...
Uruguay,71.88,63.99,63.23,64.12,54.50,84.88,87.19,87.14,87.13,74.34
Uzbekistan,29.70,25.75,32.85,40.81,49.10,47.72,41.34,51.22,63.17,70.04
Venezuela,67.85,67.59,69.22,69.90,69.86,87.02,87.38,90.06,90.51,88.49
Vietnam,58.90,54.36,54.49,61.25,61.03,64.79,68.39,71.47,80.92,79.14


In [8]:
# Force the date time format to not show the full time info
writer = pd.ExcelWriter(report_file,
                        engine='xlsxwriter',
                        datetime_format='mmm d yyyy',
                        date_format='mmmm dd yyyy')
report.to_excel(writer, sheet_name='report')

# Work with XlsxWriter
workbook = writer.book
worksheet = writer.sheets['report']
worksheet.set_column('A:K', 18)

# Show example of putting in a formula in a specific cell
formula = "=AVERAGE(B4:B164)"
worksheet.write_formula("B165", formula)

# Save the file
writer.save()

In [9]:
# Display the path of the report file passed to the Excel writer
report_file

WindowsPath('c:/Users/chris/win_dev/excel-to-python-course/webcast_materials/reports/weather_summary_09_29_2020.xlsx')