### Read meteorological data
Folder with [original data](https://drive.google.com/drive/folders/1me2IpDY3om6IRKv_WMT5W6Qw0r9DI74C)

In [144]:
import os
import zipfile
import pandas as pd
import glob
import keplergl
import geopandas as gpd
import matplotlib.pyplot as plt
import csv
from datetime import datetime
import re
import numpy as np
from pyproj import Proj
%run Meteo_utils.ipynb

In [2]:
os.listdir('data')

['.DS_Store',
 '07-HGPT-MT-07.- Estacion Aeropuerto-20210505T205931Z-001.zip',
 'Estaciones_meteorologicas.csv',
 'Estaciones_meteorologicas_SW.csv',
 '02-HGPT-MT-06.- Estacion Baños-20210426T162523Z-001.zip',
 'Aeropuerto_5min.csv',
 '07-HGPT-MT-07-EstacionAeropuerto.csv',
 'Inventario Estaciones Meteo.xls',
 '.ipynb_checkpoints',
 'Estaciones_meteorologicas_SW.numbers',
 'Aeropuerto_1min.csv']

## Automate data extraction from Drive downloads

In [90]:
path = '/Users/tamarahuete/Documents/Github_repos/TFM21/data'
ziplist = glob.glob(f'{path}/*.zip')
#ziplist = glob.glob(f'data/*.zip')

In [91]:
ziplist

['/Users/tamarahuete/Documents/Github_repos/TFM21/data/24-INAMHI-M0380.- HUambalo-20210610T152020Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/07-HGPT-MT-07.- Estacion Aeropuerto-20210505T205931Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/02-HGPT-MT-06.- Estacion Baños-20210426T162523Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/11-HGPT-PV-04.- Estacion Rio Verde-20210610T152008Z-001.zip']

In [98]:
zf = zipfile.ZipFile(f'{ziplist[0]}')
zf.namelist()[0:5]

['24-INAMHI-M0380.- HUambalo/Todo.xls',
 '24-INAMHI-M0380.- HUambalo/2015/julio_diciembre.csv',
 '24-INAMHI-M0380.- HUambalo/2016/Enero_Abril.csv',
 '24-INAMHI-M0380.- HUambalo/Mayo-Octubre_subir.csv',
 '24-INAMHI-M0380.- HUambalo/Anual.xls']

In [140]:
files_by_date = order_meteo_zip(path, folder =0)

In [136]:
df = read_meteo_csv(path =path,folder = 0,file = files_by_date[0])

Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/julio_diciembre.csv, cols = 4


In [137]:
df.head()

Unnamed: 0,Fecha,PAvg,Freq,file_name
0,01/07/2015 0:00,0,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
1,02/07/2015 0:00,0,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
2,03/07/2015 0:00,7,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
3,04/07/2015 0:00,25,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
4,05/07/2015 0:00,14,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...


In [10]:
all_var = get_unique_variables(files_by_date[0:5], export_name='summary_aeropuerto1.csv',path =path,folder = 0)

Success regular csv, no header, latin encoding
file =07-HGPT-MT-07.- Estacion Aeropuerto/2013-03-26/8310_1min_20170404.csv, cols = 6
Success sep ";", latin encoding
file =07-HGPT-MT-07.- Estacion Aeropuerto/2013-03-26/Aeropuerto_1hora_20130517.csv, cols = 6
Success regular csv, no header, latin encoding
file =07-HGPT-MT-07.- Estacion Aeropuerto/2013-05-07/8310_1min_20170404.csv, cols = 6
Success sep ";", latin encoding
file =07-HGPT-MT-07.- Estacion Aeropuerto/2013-05-07/Aeropuerto_5min_20140417.csv, cols = 6
Success sep ";", latin encoding
file =07-HGPT-MT-07.- Estacion Aeropuerto/2013-06-13/Aeropuerto_5min_20130624.csv, cols = 6


In [114]:
## Variable names:
# What are 'TB1hrAcc','TB1minAcc','TB5minAcc','BAT'?

replace_values ={
# Temperature
    'ATAvg' :['AT1HrAvg', 'AT5minAvg', 'ATAvg','Temperatura'],
    'ATMin' : ['AT1HrMin', 'AT5minMin','T_Min'],
    'ATMax' : ['AT1HrMax','AT5minMax', 'T_Max'],
 
 # Relative Humidity
    'RHAvg' : ['Humedad','RH5minAvg','RHAvg','RelHumidAvg'],
    'RHMin' : ['H_Min','RH5minMin','RelHumidMin'],
    'RHMax' : ['H_Max', 'RH5minMax', 'RelHumidMax'],

 # Wind Speed
    'WSAvg' : ['Velocidad','WS5minAvg','WSAvg','WindMnSpdSclr'],
    'WSMin' : ['WindMinSpdSclr','WS5minMin'],
    'WSMax' : ['WindMaxSpdSclr','WS5minMax'],

 # Wind Direction
    'WDAvg' : ['Direccion', 'WDAvg','WD5minAvg','WindMnDirUnit'],
    'WDMin' : ['WD5minMin'],
    'WDMax' : ['WD5minMax','WindMaxDir'],

 # Rain
    'PAvg' : ['Precipitacion']
}

var_list = ['ATAvg','ATMin','ATMax','RHAvg','RHMin','RHMax', 'WSAvg','WSMin','WSMax','WDAvg','WDMin', 'WDMax','PAvg','TB1hrAcc','TB1minAcc','TB5minAcc','BAT']
sorted(var_list)

['ATAvg',
 'ATMax',
 'ATMin',
 'BAT',
 'PAvg',
 'RHAvg',
 'RHMax',
 'RHMin',
 'TB1hrAcc',
 'TB1minAcc',
 'TB5minAcc',
 'WDAvg',
 'WDMax',
 'WDMin',
 'WSAvg',
 'WSMax',
 'WSMin']

In [12]:
files_by_date[1].split('/')[0].replace(" ","").replace(".","")

'07-HGPT-MT-07-EstacionAeropuerto'

In [391]:
df = read_meteo_csv(path =path,folder = 0,file = files_by_date[0])
df2 = reformat_df(df=df, replace_values=replace_values)
df2.dtypes

Success regular csv, no header, latin encoding
file =07-HGPT-MT-07.- Estacion Aeropuerto/2013-03-26/8310_1min_20170404.csv, cols = 7


Variable
Date_Time    datetime64[ns]
Type                 object
Freq                 object
file_name            object
ATAvg               float64
ATMax               float64
ATMin               float64
BAT                 float64
PAvg                float64
RHAvg               float64
RHMax               float64
RHMin               float64
TB1hrAcc            float64
TB1minAcc           float64
TB5minAcc           float64
WDAvg               float64
WDMax               float64
WDMin               float64
WSAvg               float64
WSMax               float64
WSMin               float64
dtype: object

In [395]:
len(files_by_date)

77

In [None]:
#test with 0, 26, 41
master_df =pd.DataFrame()
for file in range(0, len(files_by_date)):
    print(f'{file}/{len(files_by_date)}')
    df = read_meteo_csv(path =path,folder = 0,file = files_by_date[file])
    df2 = reformat_df(df=df, replace_values=replace_values)
    master_df = master_df.append(df2)
master_df.to_csv(f'{files_by_date[file].split("/")[0].replace(" ","").replace(".","")}.csv')

In [399]:
master_df.to_csv(f'{files_by_date[file].split("/")[0].replace(" ","").replace(".","")}.csv')

In [398]:
master_df.dtypes

Variable
Date_Time    datetime64[ns]
Type                 object
Freq                 object
file_name            object
ATAvg               float64
ATMax               float64
ATMin               float64
BAT                 float64
PAvg                float64
RHAvg               float64
RHMax               float64
RHMin               float64
TB1hrAcc            float64
TB1minAcc           float64
TB5minAcc           float64
WDAvg               float64
WDMax               float64
WDMin               float64
WSAvg               float64
WSMax               float64
WSMin               float64
dtype: object

In [401]:
len(master_df)

4627314

### Analisis of data

In [314]:
df= pd.read_csv(f'{files_by_date[file].split("/")[0].replace(" ","").replace(".","")}.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [403]:
master_df.columns

Index(['Date_Time', 'Type', 'Freq', 'file_name', 'ATAvg', 'ATMax', 'ATMin',
       'BAT', 'PAvg', 'RHAvg', 'RHMax', 'RHMin', 'TB1hrAcc', 'TB1minAcc',
       'TB5minAcc', 'WDAvg', 'WDMax', 'WDMin', 'WSAvg', 'WSMax', 'WSMin'],
      dtype='object', name='Variable')

In [319]:
master_df.head()

Variable,Date_Time,Type,Freq,file_name,ATAvg,ATMax,ATMin,BAT,PAvg,RHAvg,...,RHMin,TB1hrAcc,TB1minAcc,TB5minAcc,WDAvg,WDMax,WDMin,WSAvg,WSMax,WSMin
0,2013-03-19 21:18:00,G,1min,07-HGPT-MT-07.- Estacion Aeropuerto/2013-03-26...,,,,,,,...,,,,,,,,,,
1,2013-03-19 21:19:00,G,1min,07-HGPT-MT-07.- Estacion Aeropuerto/2013-03-26...,,,,,,,...,,,,,,,,,,
2,2013-03-19 21:20:00,G,1min,07-HGPT-MT-07.- Estacion Aeropuerto/2013-03-26...,,,,,,,...,,,,,,,,,,
3,2013-03-19 21:21:00,G,1min,07-HGPT-MT-07.- Estacion Aeropuerto/2013-03-26...,,,,,,,...,,,,,,,,,,
4,2013-03-19 21:22:00,G,1min,07-HGPT-MT-07.- Estacion Aeropuerto/2013-03-26...,,,,,,,...,,,,,,,,,,


#### Other type of stations

In [99]:
path = '/Users/tamarahuete/Documents/Github_repos/TFM21/data'
ziplist = glob.glob(f'{path}/*.zip')
#ziplist = glob.glob(f'data/*.zip')

In [100]:
ziplist

['/Users/tamarahuete/Documents/Github_repos/TFM21/data/24-INAMHI-M0380.- HUambalo-20210610T152020Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/07-HGPT-MT-07.- Estacion Aeropuerto-20210505T205931Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/02-HGPT-MT-06.- Estacion Baños-20210426T162523Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/11-HGPT-PV-04.- Estacion Rio Verde-20210610T152008Z-001.zip']

In [147]:
zf = zipfile.ZipFile(f'{ziplist[2]}')
zf.namelist()[0:5]

['02-HGPT-MT-06.- Estacion Baños/2013-10-14/Banios_5min_20140116_subir.csv',
 '02-HGPT-MT-06.- Estacion Baños/2016-07-28/BANIOS_5MIN_20160728_subir.csv',
 '02-HGPT-MT-06.- Estacion Baños/2013-10-14/Banios_5min_20140116.csv',
 '02-HGPT-MT-06.- Estacion Baños/2015-02-06/8310_5min_20150211__.csv',
 '02-HGPT-MT-06.- Estacion Baños/2016-07-28/BANIOS_5MIN_20160728___.csv']

In [145]:
### Revise this in the fucntion
files_by_date = order_meteo_zip(path, folder =0)
files_by_date

[]

In [148]:
## order files and skip resumen and xls sheets
r = re.compile(f'.*/*/.*.csv') # only csv files
files_by_date = list(filter(r.match, zf.namelist())) 
files_by_date

['02-HGPT-MT-06.- Estacion Baños/2013-10-14/Banios_5min_20140116_subir.csv',
 '02-HGPT-MT-06.- Estacion Baños/2016-07-28/BANIOS_5MIN_20160728_subir.csv',
 '02-HGPT-MT-06.- Estacion Baños/2013-10-14/Banios_5min_20140116.csv',
 '02-HGPT-MT-06.- Estacion Baños/2015-02-06/8310_5min_20150211__.csv',
 '02-HGPT-MT-06.- Estacion Baños/2016-07-28/BANIOS_5MIN_20160728___.csv',
 '02-HGPT-MT-06.- Estacion Baños/2016-01-14/8310_5min_20160125_subir.csv',
 '02-HGPT-MT-06.- Estacion Baños/2013-06-13/8310_5min_20200203.csv',
 '02-HGPT-MT-06.- Estacion Baños/2016-01-14/8310_5min_20160125__.csv',
 '02-HGPT-MT-06.- Estacion Baños/2015-02-06/8310_5min_20150211_subir.csv',
 '02-HGPT-MT-06.- Estacion Baños/2013-06-13/8310_1min_20170717.csv',
 '02-HGPT-MT-06.- Estacion Baños/2018-08-16/BANIOS_5MIN_20180816.csv',
 '02-HGPT-MT-06.- Estacion Baños/2015-02-06/8310_5min_20150211.csv',
 '02-HGPT-MT-06.- Estacion Baños/2013-10-14/8310_1min_20170717.csv',
 '02-HGPT-MT-06.- Estacion Baños/2016-07-28/BANI

In [149]:
df = read_meteo_csv(path =path,folder = 0,file = files_by_date[0])

Failed


UnboundLocalError: local variable 'df' referenced before assignment

In [123]:
df

Unnamed: 0,Fecha,PAvg,Freq,file_name
0,01/07/2015 0:00,0,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
1,02/07/2015 0:00,0,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
2,03/07/2015 0:00,07,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
3,04/07/2015 0:00,25,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
4,05/07/2015 0:00,14,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
...,...,...,...,...
179,27/12/2015 0:00,240,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
180,28/12/2015 0:00,000,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
181,29/12/2015 0:00,000,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
182,30/12/2015 0:00,070,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...


In [124]:
len(list(set(['Date']) - set(df.columns)))

1

In [125]:
reformat_df(df, replace_values)

Unnamed: 0,Date_Time,Type,Freq,file_name,ATAvg,ATMax,ATMin,BAT,PAvg,RHAvg,...,RHMin,TB1hrAcc,TB1minAcc,TB5minAcc,WDAvg,WDMax,WDMin,WSAvg,WSMax,WSMin
0,2015-01-07,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.0,,...,,,,,,,,,,
1,2015-02-07,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.0,,...,,,,,,,,,,
2,2015-03-07,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.7,,...,,,,,,,,,,
3,2015-04-07,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,2.5,,...,,,,,,,,,,
4,2015-05-07,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,1.4,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,2015-12-27,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,2.4,,...,,,,,,,,,,
180,2015-12-28,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.0,,...,,,,,,,,,,
181,2015-12-29,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.0,,...,,,,,,,,,,
182,2015-12-30,,1hora,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.7,,...,,,,,,,,,,


In [None]:
def reformat_df(df, replace_values):
    
    ## delete any empty rows
    df.dropna(inplace=True)
        
    ## merge date + time
    ## replace variable names
    ## convert to long format with same column names and order
    
    #1. Wide format datasets
    if len(list(set(['Date']) - set(df.columns))) == 1:
        df['Date_Time'] = pd.to_datetime(df['Fecha'])
        df.drop(columns =['Fecha'],inplace =True)
        
        for variable in list(replace_values.keys()):
            try:
                df = df.rename(columns={list(set(df.columns) & set(replace_values[variable]))[0]: variable})
            except:
                continue
        
        df['Type'] = np.nan
    
    #2. Long format datasets
    else:
        df['Date_Time']= pd.to_datetime(df['Date'] + ' ' + df['Time'])
        df.drop(columns =['Date','Time'],inplace =True)
        
        for variable in list(replace_values.keys()):
            df =df.replace(to_replace =replace_values[variable], value = variable)
    
        #Convert to long format and keep all variables
        df.drop_duplicates(inplace=True)
        df = df.reset_index().drop(columns = 'index')
        freq = df.Freq.unique()[0]
        file_name = df.file_name.unique()[0]
        try:
            df2 =df.pivot(index='Date_Time', columns='Variable').reset_index()
            df = df.pivot(index='Date_Time', columns='Variable', values='Value').reset_index()
        except:
            df2 =pd.pivot_table(df,index='Date_Time', columns='Variable',aggfunc='first').reset_index()
            df =pd.pivot_table(df,index='Date_Time', columns='Variable',values = 'Value',aggfunc='first').reset_index()
        df['Freq'] = freq
        df['Type'] = df2.Type.iloc[:,0]
        df['file_name'] = file_name
    
    
    ## Add variables that are not in the df but are in the general list
    not_in_df = list(set(var_list)-set(df.columns))
    for i in not_in_df:
        df[i]=np.nan
    
    ## Order columns so all dfs have the same structure
    var_order = ['Date_Time', 'Type', 'Freq','file_name']
    var_order.extend(sorted(df.loc[:,list(set(df.columns) - set(['Date_Time', 'Type', 'Freq','file_name']))]))
    df = df.reindex(var_order, axis=1)
    
    ## convert any ',' decimals to '.'
    try:
        df.iloc[:,4:] = df.iloc[:,4:].astype(float)
    except:
        cols= np.where(df.dtypes[4:]=='object')[0]+4
        df.iloc[:,cols] = df.iloc[:,cols].apply(lambda x: x.str.replace(',','.')) 
        df.iloc[:,cols] = df.iloc[:,cols].apply(pd.to_numeric, errors='coerce')
    
    return df