### Read meteorological data
Folder with [original data](https://drive.google.com/drive/folders/1me2IpDY3om6IRKv_WMT5W6Qw0r9DI74C)

In [110]:
import os
import zipfile
import pandas as pd
import glob
import keplergl
import geopandas as gpd
import matplotlib.pyplot as plt
import csv
from datetime import datetime
import re
import numpy as np
from pyproj import Proj
%run Meteo_utils.ipynb

In [48]:
os.listdir('data')

['24-INAMHI-M0380.- HUambalo-20210610T152020Z-001.zip',
 '.DS_Store',
 '10-HGPT-MT-14.- Estacion Pisayambo-20210613T114618Z-001.zip',
 '07-HGPT-MT-07.- Estacion Aeropuerto-20210505T205931Z-001.zip',
 'Estaciones_meteorologicas.csv',
 'Estaciones_meteorologicas_SW.csv',
 '02-HGPT-MT-06.- Estacion Baños-20210426T162523Z-001.zip',
 '08-HGPT-MT-01.- Estacion Chiquiurcu-20210613T114648Z-002.zip',
 'summary_aeropuerto.csv',
 'Aeropuerto_5min.csv',
 '08-HGPT-MT-01.- Estacion Chiquiurcu-20210613T114648Z-001.zip',
 '07-HGPT-MT-07-EstacionAeropuerto.csv',
 'Inventario Estaciones Meteo.xls',
 '.ipynb_checkpoints',
 'Estaciones_meteorologicas_SW.numbers',
 '11-HGPT-PV-04.- Estacion Rio Verde-20210610T152008Z-001.zip',
 '08-HGPT-MT-01.- Estacion Chiquiurcu.zip',
 'Aeropuerto_1min.csv']

## Automate data extraction from Drive downloads

In [185]:
path = '/Users/tamarahuete/Documents/Github_repos/TFM21/data'
ziplist = glob.glob(f'{path}/*.zip')
#ziplist = glob.glob(f'data/*.zip')

In [186]:
ziplist

['/Users/tamarahuete/Documents/Github_repos/TFM21/data/24-INAMHI-M0380.- HUambalo-20210610T152020Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/10-HGPT-MT-14.- Estacion Pisayambo-20210613T114618Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/07-HGPT-MT-07.- Estacion Aeropuerto-20210505T205931Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/02-HGPT-MT-06.- Estacion Baños-20210426T162523Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/11-HGPT-PV-04.- Estacion Rio Verde-20210610T152008Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/08-HGPT-MT-01.- Estacion Chiquiurcu.zip']

In [99]:
folder = 0

In [100]:
zf = zipfile.ZipFile(f'{ziplist[folder]}')
zf.namelist()[0:5]

['24-INAMHI-M0380.- HUambalo/Todo.xls',
 '24-INAMHI-M0380.- HUambalo/2015/julio_diciembre.csv',
 '24-INAMHI-M0380.- HUambalo/2016/Enero_Abril.csv',
 '24-INAMHI-M0380.- HUambalo/Mayo-Octubre_subir.csv',
 '24-INAMHI-M0380.- HUambalo/Anual.xls']

In [101]:
files_by_date = order_meteo_zip(path, folder =folder)

In [102]:
df = read_meteo_csv(path =path,folder = folder,file = files_by_date[0])

Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/julio_diciembre.csv, cols = 4


In [103]:
df.head()

Unnamed: 0,Fecha,PAvg,Freq,st_name
0,01/07/2015 0:00,0,1day,24-INAMHI-M0380-HUambalo
1,02/07/2015 0:00,0,1day,24-INAMHI-M0380-HUambalo
2,03/07/2015 0:00,7,1day,24-INAMHI-M0380-HUambalo
3,04/07/2015 0:00,25,1day,24-INAMHI-M0380-HUambalo
4,05/07/2015 0:00,14,1day,24-INAMHI-M0380-HUambalo


In [104]:
files_by_date[0].split('/')[-1].replace(" ","")

'julio_diciembre.csv'

In [105]:
station = files_by_date[0].split('/')[0].replace(" ","").replace(".","")
all_var = get_unique_variables(files_by_date,path =path,folder = folder)

Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/julio_diciembre.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2016/Enero_Abril.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/Mayo-Octubre_subir.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/Julio- Octubre_subir.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2013/2013_subir.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2014/2014_huambalo_subir.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/Enero_Julio.csv, cols = 4


In [2]:
## Variable names:
# BAT  is  battery
# Not very clear: 'H_M Min','WindMnSpdSclr','WindMnSpdS','WindMnDirUnit'
replace_values ={
# Temperature
    'ATAvg' :['AT1HrAvg', 'AT5minAvg', 'ATAvg','Temperatura'],
    'ATMin' : ['AT1HrMin', 'AT5minMin','T_Min', 'T Min'],
    'ATMax' : ['AT1HrMax','AT5minMax', 'T_Max', 'T Max'],
 
 # Relative Humidity
    'RHAvg' : ['Humedad','RH5minAvg','RHAvg','RelHumidAvg', 'Humedad R', 'RH5m2015'],
    'RHMin' : ['H_Min','RH5minMin','RelHumidMin','H_M Min'],
    'RHMax' : ['H_Max', 'RH5minMax', 'RelHumidMax','H_R Max'],

 # Wind Speed
    'WSAvg' : ['Velocidad','WS5minAvg','WSAvg','WindMnSpdSclr','WindMnSpdS'],
    'WSMin' : ['WindMinSpdSclr','WS5minMin'],
    'WSMax' : ['WindMaxSpdSclr','WS5minMax'],

 # Wind Direction
    'WDAvg' : ['Direccion', 'WDAvg','WD5minAvg','WindMnDirUnit','Dirección'],
    'WDMin' : ['WD5minMin'],
    'WDMax' : ['WD5minMax','WindMaxDir'],

 # Rain
    'PAvg' : ['Precipitacion','TB1hrAcc','TB1minAcc','TB5minAcc','Precipitación']
}

var_list = ['ATAvg','ATMin','ATMax','RHAvg','RHMin','RHMax', 'WSAvg','WSMin','WSMax','WDAvg','WDMin', 'WDMax','PAvg']
#sorted(var_list)

In [76]:
df = read_meteo_csv(path =path,folder = 0,file = files_by_date[0])
df2 = reformat_df(df=df, replace_values=replace_values)
df2.dtypes

Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/julio_diciembre.csv, cols = 4


Date_Time    datetime64[ns]
Type                float64
Freq                 object
file_name           float64
ATAvg               float64
ATMax               float64
ATMin               float64
BAT                 float64
PAvg                float64
RHAvg               float64
RHMax               float64
RHMin               float64
TB1hrAcc            float64
TB1minAcc           float64
TB5minAcc           float64
WDAvg               float64
WDMax               float64
WDMin               float64
WSAvg               float64
WSMax               float64
WSMin               float64
st_name             float64
dtype: object

In [77]:
len(files_by_date)

7

## Process files in loop

In [111]:
## Variable names:
# BAT  is  battery
# Not very clear: 'H_M Min','WindMnSpdSclr','WindMnSpdS','WindMnDirUnit'
replace_values ={
# Temperature
    'ATAvg' :['AT1HrAvg', 'AT5minAvg', 'ATAvg','Temperatura'],
    'ATMin' : ['AT1HrMin', 'AT5minMin','T_Min', 'T Min'],
    'ATMax' : ['AT1HrMax','AT5minMax', 'T_Max', 'T Max'],
 
 # Relative Humidity
    'RHAvg' : ['Humedad','RH5minAvg','RHAvg','RelHumidAvg', 'Humedad R', 'RH5m2015'],
    'RHMin' : ['H_Min','RH5minMin','RelHumidMin','H_M Min'],
    'RHMax' : ['H_Max', 'RH5minMax', 'RelHumidMax','H_R Max'],

 # Wind Speed
    'WSAvg' : ['Velocidad','WS5minAvg','WSAvg','WindMnSpdSclr','WindMnSpdS'],
    'WSMin' : ['WindMinSpdSclr','WS5minMin'],
    'WSMax' : ['WindMaxSpdSclr','WS5minMax'],

 # Wind Direction
    'WDAvg' : ['Direccion', 'WDAvg','WD5minAvg','WindMnDirUnit','Dirección'],
    'WDMin' : ['WD5minMin'],
    'WDMax' : ['WD5minMax','WindMaxDir'],

 # Rain
    'PAvg' : ['Precipitacion','TB1hrAcc','TB1minAcc','TB5minAcc','Precipitación']
}

var_list = ['ATAvg','ATMin','ATMax','RHAvg','RHMin','RHMax', 'WSAvg','WSMin','WSMax','WDAvg','WDMin', 'WDMax','PAvg']
#sorted(var_list)

In [None]:
path = '/Users/tamarahuete/Documents/Github_repos/TFM21/data'
folder = 4
files_by_date = order_meteo_zip(path, folder =folder)
all_var = get_unique_variables(files_by_date,path =path,folder = folder)

In [112]:
path = '/Users/tamarahuete/Documents/Github_repos/TFM21/data'
ziplist = glob.glob(f'{path}/*.zip')
for folder in range(0, len(ziplist)):
    files_by_date = order_meteo_zip(path, folder =folder)
    get_unique_variables(files_by_date,path =path,folder = folder)

    master_df =pd.DataFrame()
    for file in range(0, len(files_by_date)):
        print(f'{file}/{len(files_by_date)}')
        df = read_meteo_csv(path =path,folder = folder,file = files_by_date[file])    
        df2 = reformat_df(df=df, replace_values=replace_values)
        master_df = master_df.append(df2)
    master_df.to_csv(f'data/{files_by_date[file].split("/")[0].replace(" ","").replace(".","")}.csv')

Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/julio_diciembre.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2016/Enero_Abril.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/Mayo-Octubre_subir.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/Julio- Octubre_subir.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2013/2013_subir.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2014/2014_huambalo_subir.csv, cols = 4
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/Enero_Julio.csv, cols = 4
0/7
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2015/julio_diciembre.csv, cols = 4
1/7
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/2016/Enero_Abril.csv, cols = 4
2/7
Success sep ";", latin encoding
file =24-INAMHI-M0380.- HUambalo/Mayo-Octubre_subir.csv, cols = 4


KeyError: 'Fecha'

In [113]:
folder, file

(5, 1)

In [116]:
df = read_meteo_csv(path =path,folder = folder,file = files_by_date[25])
df

Success regular csv, no header, latin encoding
file =__MACOSX/08-HGPT-MT-01.- Estacion Chiquiurcu/2014-06-30/._Chuquiurcu_5min_20140708.csv, cols = 2


Unnamed: 0,Freq,file_name
0,5min,__MACOSX/08-HGPT-MT-01.- Estacion Chiquiurcu/2...


In [115]:
## delete any complete empty rows
df.dropna(inplace=True, how = 'all')
        
    ## merge date + time
    ## replace variable names
    ## convert to long format with same column names and order
    
    #1. Wide format datasets
if len(list(set(['Date']) - set(df.columns))) == 1:
    df['Date_Time'] = pd.to_datetime(df['Fecha'])
    df.drop(columns =['Fecha'],inplace =True)
        
    for variable in list(replace_values.keys()):
        try:
            df = df.rename(columns={list(set(df.columns) & set(replace_values[variable]))[0]: variable})
        except:
            continue
        
    df['Type'] = np.nan
    
    #2. Long format datasets
else:
    df['Date_Time']= pd.to_datetime(df['Date'] + ' ' + df['Time'])
    df.drop(columns =['Date','Time'],inplace =True)
        
    for variable in list(replace_values.keys()):
        df =df.replace(to_replace =replace_values[variable], value = variable)
    
        #Convert to long format and keep all variables
    df.drop_duplicates(inplace=True)
    df = df.reset_index().drop(columns = 'index')
    freq = df.Freq.unique()[0]
    file_name = df.file_name.unique()[0]
    try:
        df2 =df.pivot(index='Date_Time', columns='Variable').reset_index()
        df = df.pivot(index='Date_Time', columns='Variable', values='Value').reset_index()
    except:
        df2 =pd.pivot_table(df,index='Date_Time', columns='Variable',aggfunc='first').reset_index()
        df =pd.pivot_table(df,index='Date_Time', columns='Variable',values = 'Value',aggfunc='first').reset_index()
    df['Freq'] = freq
    df['Type'] = df2.Type.iloc[:,0]
    df['file_name'] = file_name
df

KeyError: 'Fecha'

In [101]:
keep = list(replace_values.keys())+ ['Date_Time', 'Type', 'Freq','file_name']
keep

['ATAvg',
 'ATMin',
 'ATMax',
 'RHAvg',
 'RHMin',
 'RHMax',
 'WSAvg',
 'WSMin',
 'WSMax',
 'WDAvg',
 'WDMin',
 'WDMax',
 'PAvg',
 'Date_Time',
 'Type',
 'Freq',
 'file_name']

In [105]:
## Add variables that are not in the df but are in the general list
not_in_df = list(set(replace_values.keys())-set(df.columns))
for i in not_in_df:
    df[i]=np.nan

    #df.dropna(axis=1,inplace= True, how = 'all')
keep = list(replace_values.keys())+ ['Date_Time', 'Type', 'Freq','file_name']
empty_cols = [col for col in df.columns if df[col].isnull().all() or round(df[col].isnull().value_counts()[0]/len(df[col]),2) < 0.01]
if len(list(set(empty_cols) - set(keep))) >0:
    df.drop(list(set(empty_cols) - set(keep)),
        axis=1,
        inplace=True)
    
    
df.head()

Unnamed: 0,ATAvg,RHAvg,PAvg,WDAvg,WSAvg,Freq,file_name,Date_Time,Type,ATMax,WSMax,WSMin,ATMin,WDMax,RHMax,WDMin,RHMin
0,,9584,4,14853,592,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,2017-06-16 10:20:00,,,,,,,,,
1,,9531,0,1436,518,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,2017-06-16 10:25:00,,,,,,,,,
2,,943,0,1393,451,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,2017-06-16 10:30:00,,,,,,,,,
3,,9365,0,14013,488,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,2017-06-16 10:35:00,,,,,,,,,
4,,9427,1,13553,447,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,2017-06-16 10:40:00,,,,,,,,,


In [106]:
## Order columns so all dfs have the same structure
var_order = ['Date_Time', 'Type', 'Freq','file_name']
var_order.extend(sorted(df.loc[:,list(set(df.columns) - set(['Date_Time', 'Type', 'Freq','file_name']))]))
df = df.reindex(var_order, axis=1)

In [107]:
list(set(keep) - set(df.columns))

[]

In [108]:
## make sure only the accepted variables are kept
not_good = list(set(keep) - set(df.columns))
df.drop(not_good)
    
## convert any ',' decimals to '.'
try:
    df.iloc[:,4:] = df.iloc[:,4:].astype(float)
except:
    cols= np.where(df.dtypes[4:]=='object')[0]+4
    df.iloc[:,cols] = df.iloc[:,cols].apply(lambda x: x.str.replace(',','.')) 
    df.iloc[:,cols] = df.iloc[:,cols].apply(pd.to_numeric, errors='coerce')

In [109]:
df

Unnamed: 0,Date_Time,Type,Freq,file_name,ATAvg,ATMax,ATMin,PAvg,RHAvg,RHMax,RHMin,WDAvg,WDMax,WDMin,WSAvg,WSMax,WSMin
0,2017-06-16 10:20:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.4,95.84,,,148.53,,,5.92,,
1,2017-06-16 10:25:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.0,95.31,,,143.60,,,5.18,,
2,2017-06-16 10:30:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.0,94.30,,,139.30,,,4.51,,
3,2017-06-16 10:35:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.0,93.65,,,140.13,,,4.88,,
4,2017-06-16 10:40:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.1,94.27,,,135.53,,,4.47,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60200,2018-11-01 11:00:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.0,56.89,,,155.11,,,4.94,,
60201,2018-11-01 11:05:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.0,56.83,,,163.23,,,5.04,,
60202,2018-11-01 11:10:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.0,59.50,,,154.36,,,6.01,,
60203,2018-11-01 11:15:00,,5min,10-HGPT-MT-14.- Estacion Pisayambo/2018-01-11/...,,,,0.0,58.41,,,153.96,,,5.25,,


In [18]:
df['Date_Time']= pd.to_datetime(df['Date'] + ' ' + df['Time'])
df.drop(columns =['Date','Time'],inplace =True)
        
for variable in list(replace_values.keys()):
    df =df.replace(to_replace =replace_values[variable], value = variable)
    
        #Convert to long format and keep all variables
df.drop_duplicates(inplace=True)
df = df.reset_index().drop(columns = 'index')
freq = df.Freq.unique()[0]
file_name = df.file_name.unique()[0]
try:
    df2 =df.pivot(index='Date_Time', columns='Variable').reset_index()
    df = df.pivot(index='Date_Time', columns='Variable', values='Value').reset_index()
except:
    df2 =pd.pivot_table(df,index='Date_Time', columns='Variable',aggfunc='first').reset_index()
    df =pd.pivot_table(df,index='Date_Time', columns='Variable',values = 'Value',aggfunc='first').reset_index()
df['Freq'] = freq
df['Type'] = df2.Type.iloc[:,0]
df['file_name'] = file_name

In [32]:
not_good = list(set(['Date_Time', 'Type', 'Freq','file_name']+ var_list) - set(df.columns))

In [35]:
not_good

[]

In [33]:
df.dropna(axis=1,inplace= True, how = 'all')
not_in_df = list(set(var_list)-set(df.columns))
for i in not_in_df:
    df[i]=np.nan
df.drop(not_good)

Unnamed: 0,Variable,Value,Type,Freq,file_name,Date_Time,RHAvg,ATMax,ATMin,WSMax,ATAvg,RHMax,WSMin,RHMin,WDMin,WSAvg,WDAvg,PAvg,WDMax
1,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-02-05 18:10:00,,,,,,,,,,,,,
2,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-02-05 18:15:00,,,,,,,,,,,,,
3,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-02-05 18:20:00,,,,,,,,,,,,,
4,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-02-05 18:25:00,,,,,,,,,,,,,
5,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-02-05 18:30:00,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12606,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-03-21 12:35:00,,,,,,,,,,,,,
12607,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-03-21 12:40:00,,,,,,,,,,,,,
12608,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-03-21 12:45:00,,,,,,,,,,,,,
12609,TB5minAcc,0.00,G,5min,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,2013-03-21 12:50:00,,,,,,,,,,,,,


In [22]:
var_order = ['Date_Time', 'Type', 'Freq','file_name']
var_order.extend(sorted(df.loc[:,list(set(df.columns) - set(['Date_Time', 'Type', 'Freq','file_name']))]))
df = df.reindex(var_order, axis=1)
df

Variable,Date_Time,Type,Freq,file_name,BAT,PAvg
0,2013-02-05 18:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,12.68,11.00
1,2013-02-05 19:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,12.73,8.00
2,2013-02-05 20:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,12.72,0.00
3,2013-02-05 21:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,12.72,0.00
4,2013-02-05 22:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,12.72,0.00
...,...,...,...,...,...,...
1046,2013-03-21 08:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,13.73,0.00
1047,2013-03-21 09:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,14.28,1.00
1048,2013-03-21 10:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,14.16,0.00
1049,2013-03-21 11:00:00,G,1hora,11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/...,14.09,0.00


In [245]:
df.drop(columns =['Date','Time'],inplace =True)

In [246]:
for variable in list(replace_values.keys()):
    df =df.replace(to_replace =replace_values[variable], value = variable)
    
        #Convert to long format and keep all variables
df.drop_duplicates(inplace=True)
df = df.reset_index().drop(columns = 'index')
freq = df.Freq.unique()[0]
st_name = df.st_name.unique()[0]

In [247]:
df

Unnamed: 0,Variable,Value,Type,Freq,st_name,Date_Time
0,RHAvg,-1.#J,B,1min,02-HGPT-MT-06-EstacionBaños,2013-05-03 22:13:00
1,ATAvg,-1.#J,B,1min,02-HGPT-MT-06-EstacionBaños,2013-05-03 22:13:00
2,WSAvg,-1.$,B,1min,02-HGPT-MT-06-EstacionBaños,2013-05-03 22:13:00
3,WDAvg,-1,B,1min,02-HGPT-MT-06-EstacionBaños,2013-05-03 22:13:00
4,PAvg,0.00,G,1min,02-HGPT-MT-06-EstacionBaños,2013-05-03 22:13:00
...,...,...,...,...,...,...
1554647,RHAvg,0.13,G,1min,02-HGPT-MT-06-EstacionBaños,2017-08-16 10:58:00
1554648,WSAvg,6.5,G,1min,02-HGPT-MT-06-EstacionBaños,2017-08-16 10:58:00
1554649,WDAvg,94,G,1min,02-HGPT-MT-06-EstacionBaños,2017-08-16 10:58:00
1554650,PAvg,0.00,G,1min,02-HGPT-MT-06-EstacionBaños,2017-08-16 10:59:00


In [249]:
df2 =df.pivot(index='Date_Time', columns='Variable').reset_index()
df2

Unnamed: 0_level_0,Date_Time,Value,Value,Value,Value,Value,Value,Type,Type,Type,...,Freq,Freq,Freq,Freq,st_name,st_name,st_name,st_name,st_name,st_name
Variable,Unnamed: 1_level_1,NaN,ATAvg,PAvg,RHAvg,WDAvg,WSAvg,NaN,ATAvg,PAvg,...,PAvg,RHAvg,WDAvg,WSAvg,NaN,ATAvg,PAvg,RHAvg,WDAvg,WSAvg
0,NaT,,,,,,,,,,...,,,,,02-HGPT-MT-06-EstacionBaños,,,,,
1,2013-05-03 22:13:00,,-1.#J,0.00,-1.#J,-1,-1.$,,B,G,...,1min,1min,1min,1min,,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños
2,2017-01-12 12:50:00,,23.08,0.00,0.12,110,7.5,,G,G,...,1min,1min,1min,1min,,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños
3,2017-01-12 12:51:00,,23.26,0.20,0.13,103,7.4,,G,G,...,1min,1min,1min,1min,,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños
4,2017-01-12 12:52:00,,23.45,0.00,0.12,115,4.7,,G,G,...,1min,1min,1min,1min,,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310927,2017-08-16 10:55:00,,20.46,0.00,0.13,93,5.2,,G,G,...,1min,1min,1min,1min,,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños
310928,2017-08-16 10:56:00,,20.47,0.00,0.16,88,4.1,,G,G,...,1min,1min,1min,1min,,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños
310929,2017-08-16 10:57:00,,20.48,0.00,0.14,96,6.2,,G,G,...,1min,1min,1min,1min,,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños
310930,2017-08-16 10:58:00,,20.42,0.00,0.13,94,6.5,,G,G,...,1min,1min,1min,1min,,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños,02-HGPT-MT-06-EstacionBaños


In [250]:
df = df.pivot(index='Date_Time', columns='Variable', values='Value').reset_index()

In [251]:
df

Variable,Date_Time,NaN,ATAvg,PAvg,RHAvg,WDAvg,WSAvg
0,NaT,,,,,,
1,2013-05-03 22:13:00,,-1.#J,0.00,-1.#J,-1,-1.$
2,2017-01-12 12:50:00,,23.08,0.00,0.12,110,7.5
3,2017-01-12 12:51:00,,23.26,0.20,0.13,103,7.4
4,2017-01-12 12:52:00,,23.45,0.00,0.12,115,4.7
...,...,...,...,...,...,...,...
310927,2017-08-16 10:55:00,,20.46,0.00,0.13,93,5.2
310928,2017-08-16 10:56:00,,20.47,0.00,0.16,88,4.1
310929,2017-08-16 10:57:00,,20.48,0.00,0.14,96,6.2
310930,2017-08-16 10:58:00,,20.42,0.00,0.13,94,6.5


In [252]:
df['Freq'] = freq
df['Type'] = df2.Type.iloc[:,0]
df['st_name'] = st_name

In [261]:
not_in_df = list(set(var_list)-set(df.columns))
for i in not_in_df:
    df[i]=np.nan
df.dropna(axis=1,inplace= True, how = 'all')

In [262]:
var_order = ['Date_Time', 'Type', 'Freq','st_name']
var_order.extend(sorted(df.loc[:,list(set(df.columns) - set(['Date_Time', 'Type', 'Freq','st_name']))]))

In [266]:
master_df.dtypes

Date_Time        datetime64[ns]
Type                     object
Freq                     object
st_name                  object
ATAvg                   float64
ATMax                   float64
ATMin                   float64
PAvg                    float64
RHAvg                   float64
RHMax                   float64
RHMin                   float64
WDAvg                   float64
WDMax                   float64
WDMin                   float64
WSAvg                   float64
WSMax                   float64
WSMin                   float64
BAT                     float64
Dirección               float64
Precipitación           float64
Humedad R               float64
RH5m2015                float64
2015                    float64
dtype: object

In [267]:
len(master_df)

5380059

In [271]:
master_df.head()

Unnamed: 0,Date_Time,Type,Freq,st_name,ATAvg,ATMax,ATMin,PAvg,RHAvg,RHMax,...,WDMin,WSAvg,WSMax,WSMin,BAT,Dirección,Precipitación,Humedad R,RH5m2015,2015
0,2013-03-14 22:27:00,G,1min,02-HGPT-MT-06-EstacionBaños,17.3,,,0.0,95.39,,...,,,,,,,,,,
1,2013-03-14 22:28:00,G,1min,02-HGPT-MT-06-EstacionBaños,17.3,,,0.0,95.16,,...,,,,,,,,,,
2,2013-03-14 22:29:00,G,1min,02-HGPT-MT-06-EstacionBaños,17.31,,,0.0,95.0,,...,,,,,,,,,,
3,2013-03-14 22:30:00,G,1min,02-HGPT-MT-06-EstacionBaños,17.34,,,0.0,94.95,,...,,,,,,,,,,
4,2013-03-14 22:31:00,G,1min,02-HGPT-MT-06-EstacionBaños,17.34,,,0.0,95.06,,...,,,,,,,,,,


#### Other type of stations

In [18]:
%run Meteo_utils.ipynb

In [3]:
path = '/Users/tamarahuete/Documents/Github_repos/TFM21/data'
ziplist = glob.glob(f'{path}/*.zip')
#ziplist = glob.glob(f'data/*.zip')

In [4]:
ziplist

['/Users/tamarahuete/Documents/Github_repos/TFM21/data/24-INAMHI-M0380.- HUambalo-20210610T152020Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/07-HGPT-MT-07.- Estacion Aeropuerto-20210505T205931Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/02-HGPT-MT-06.- Estacion Baños-20210426T162523Z-001.zip',
 '/Users/tamarahuete/Documents/Github_repos/TFM21/data/11-HGPT-PV-04.- Estacion Rio Verde-20210610T152008Z-001.zip']

In [37]:
zf = zipfile.ZipFile(f'{ziplist[3]}')
zf.namelist()[0:5]

['11-HGPT-PV-04.- Estacion Rio Verde/2020-01-13/8310_5min_20200128.xlsx',
 '11-HGPT-PV-04.- Estacion Rio Verde/2015-07-08/8310_5min_20150715_subir.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2015-02-06/8310_5min_20150211_subir.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2015-02-06/8310_5min_20150211.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2015-07-08/8310_5min_20150715.csv']

In [39]:
### Revise this in the fucntion
files_by_date = order_meteo_zip(path, folder =3)
files_by_date

['11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/RIO VERDE_1hora_20130321.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/RIO VERDE_5min_20130321.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2013-05-09/RIO VERDE_30seg_20130509.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2013-05-09/RIO VERDE_1hora_20130509.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2013-05-09/RIO VERDE_5min_20130509.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2013-06-13/Rio Verde_5min_20130626.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2013-08-01/Rio Verde_5min_20130808.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2013-10-04/Rio Verde_5min_20131017_subir.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2013-10-04/Rio Verde_5min_20131017.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2014-01-09/Rio Verde_5min_20140115_subir.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2014-01-09/Rio Verde_5min_20140115.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2014-05-08/Rio Verde_5min_20140522.csv',
 '11-HGPT-PV-04.- Estacion Rio Verde/2014-07-24/R

In [33]:
file = '02-HGPT-MT-06.- Estacion Baños/2013-03-21/8310_1min_20170717.csv'
df = pd.read_csv(zf.open(file),header=None, encoding = 'latin-1')
#df = pd.read_csv(zf.open('02-HGPT-MT-06.- Estacion Baños/2013-03-21/8310_1min_20170717.csv'))

In [34]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,03/14/2013,22:27:00,TB1minAcc,0.0,,G
1,03/14/2013,22:27:00,ATAvg,17.3,,G
2,03/14/2013,22:27:00,RHAvg,95.39,,G
3,03/14/2013,22:28:00,TB1minAcc,0.0,,G
4,03/14/2013,22:28:00,ATAvg,17.3,,G


In [40]:
df = read_meteo_csv(path =path,folder = 3,file = files_by_date[0])

Success sep ";", latin encoding
file =11-HGPT-PV-04.- Estacion Rio Verde/2013-03-21/RIO VERDE_1hora_20130321.csv, cols = 6


In [21]:
df

Unnamed: 0,Fecha,PAvg,Freq,file_name
0,01/07/2015 0:00,0,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
1,02/07/2015 0:00,0,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
2,03/07/2015 0:00,07,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
3,04/07/2015 0:00,25,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
4,05/07/2015 0:00,14,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
...,...,...,...,...
179,27/12/2015 0:00,240,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
180,28/12/2015 0:00,000,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
181,29/12/2015 0:00,000,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...
182,30/12/2015 0:00,070,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...


In [124]:
len(list(set(['Date']) - set(df.columns)))

1

In [24]:
reformat_df(df, replace_values)

Unnamed: 0,Date_Time,Type,Freq,file_name,ATAvg,ATMax,ATMin,BAT,PAvg,RHAvg,...,RHMin,TB1hrAcc,TB1minAcc,TB5minAcc,WDAvg,WDMax,WDMin,WSAvg,WSMax,WSMin
0,2015-01-07,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.0,,...,,,,,,,,,,
1,2015-02-07,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.0,,...,,,,,,,,,,
2,2015-03-07,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.7,,...,,,,,,,,,,
3,2015-04-07,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,2.5,,...,,,,,,,,,,
4,2015-05-07,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,1.4,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,2015-12-27,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,2.4,,...,,,,,,,,,,
180,2015-12-28,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.0,,...,,,,,,,,,,
181,2015-12-29,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.0,,...,,,,,,,,,,
182,2015-12-30,,1day,24-INAMHI-M0380.- HUambalo/2015/julio_diciembr...,,,,,0.7,,...,,,,,,,,,,
