## 8. ConcatenacionDF_Final

### Objetivo

Generar el dataframe final de movimientos que se utilizará en las etapas posteriores de machine learning. Este DF contendrá la identificación del cluster al cual pertenece cada estación según el tipo de clusterización (demanda, geográfica con 5 o 7 clusters, o full).

### Descripción general del notebook

    1. Carga de datos de movimientos históricos y de clusterización
    2. Concatenación de dataframes
    3. Export de datos finales

## 1. Carga de datos de entrada

In [1]:
import pandas as pd
import numpy as np
import time
import datetime

%run "../1. Librerias Mongo/MongoDB_Connections.ipynb"
%run "../1. Librerias Mongo/MongoDB_Funciones_Consultas.ipynb"

In [2]:
# Movements: historical records; FECHA is parsed into datetimes while loading.
df_tracks = pd.read_csv(
    '../../data/DataFrame_Final_Cierre_2017_2019.csv',
    parse_dates=['FECHA'],
)

# Clustering results: one CSV per clustering variant, as named by each file
# (full data, geographic-only with 5 and 7 clusters, monthly-demand-only).
df_clusterFull = pd.read_csv(
    '../../data/Clusterizada_DataFull.csv'
)
df_clusterGeog5 = pd.read_csv(
    '../../data/Clusterizada_soloGeografico_5Cluster.csv'
)
df_clusterGeog7 = pd.read_csv(
    '../../data/Clusterizada_soloGeografico_7Cluster.csv'
)
df_clusterDmnd = pd.read_csv(
    '../../data/Clusterizada_soloDemandaMensual.csv'
)

In [3]:
# Preview the table loaded from the monthly-demand clustering file.
df_clusterDmnd

Unnamed: 0,ESTACION,CLUSTER
0,1,0
1,2,3
2,3,4
3,4,3
4,5,3
...,...,...
167,139,4
168,87,1
169,105,1
170,146,1


## 2. Concatenación de dataframes

In [4]:
# Attach the full-data cluster label of each station to every movement row.
#
# * The lookup column is renamed to CLUSTER_FullCols *before* merging, so it
#   cannot clash with a `CLUSTER` column on the left frame (which would yield
#   CLUSTER_x / CLUSTER_y and make a post-merge rename a silent no-op).
# * validate='many_to_one' makes pandas raise MergeError when a station is
#   listed more than once in the cluster table, instead of silently
#   duplicating movement rows.
# * how='left' keeps every movement row; stations missing from the cluster
#   table get NaN in the new column.
df_final = pd.merge(
    df_tracks,
    df_clusterFull[['ESTACION', 'CLUSTER']].rename(columns={'CLUSTER': 'CLUSTER_FullCols'}),
    how='left',
    on='ESTACION',
    validate='many_to_one',
)
df_final

Unnamed: 0,ESTACION,ANIO,MES,DIA,HORA,FECHA,DIA_SEMANA,AM_PM,TEMPORADA,TEMPORADA_NUM,...,TEMPERATURA,VIENTO,PRESION,HUMEDAD,PRECIPITACION_1h,PRECIPITACION_3h,DESC_TIEMPO,DESC_TIEMPO_detalle,DEMANDA,CLUSTER_FullCols
0,1,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,12.58,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,7,4
1,2,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,12.58,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,3,4
2,3,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,12.58,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,1,3
3,4,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,12.58,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,1,2
4,5,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,12.58,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912133,168,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,9.60,6.20,1015.0,69.0,0.0,0.0,Clouds,few clouds,2,0
2912134,169,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,9.60,6.20,1015.0,69.0,0.0,0.0,Clouds,few clouds,8,0
2912135,171,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,9.60,6.20,1015.0,69.0,0.0,0.0,Clouds,few clouds,2,2
2912136,172,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,9.60,6.20,1015.0,69.0,0.0,0.0,Clouds,few clouds,1,1


In [5]:
# Attach the 5-cluster geographic label of each station to every movement row.
#
# This cell reads and reassigns df_final, so a plain re-run would add the
# label a second time. Dropping a column left over from a previous execution
# keeps the cell safe to re-run.
if 'CLUSTER_soloGeo5' in df_final.columns:
    df_final = df_final.drop(columns='CLUSTER_soloGeo5')

# * The lookup column is renamed before merging so it cannot clash with a
#   `CLUSTER` column on the left frame.
# * validate='many_to_one' makes pandas raise MergeError when a station is
#   listed more than once in the cluster table, instead of silently
#   duplicating movement rows.
# * how='left' keeps every movement row; stations missing from the cluster
#   table get NaN in the new column.
df_final = pd.merge(
    df_final,
    df_clusterGeog5[['ESTACION', 'CLUSTER']].rename(columns={'CLUSTER': 'CLUSTER_soloGeo5'}),
    how='left',
    on='ESTACION',
    validate='many_to_one',
)
df_final

Unnamed: 0,ESTACION,ANIO,MES,DIA,HORA,FECHA,DIA_SEMANA,AM_PM,TEMPORADA,TEMPORADA_NUM,...,VIENTO,PRESION,HUMEDAD,PRECIPITACION_1h,PRECIPITACION_3h,DESC_TIEMPO,DESC_TIEMPO_detalle,DEMANDA,CLUSTER_FullCols,CLUSTER_soloGeo5
0,1,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,7,4,4
1,2,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,3,4,4
2,3,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,1,3,0
3,4,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,1,2,0
4,5,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,7.72,1020.0,44.0,0.0,0.0,Clouds,few clouds,2,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912133,168,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,6.20,1015.0,69.0,0.0,0.0,Clouds,few clouds,2,0,0
2912134,169,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,6.20,1015.0,69.0,0.0,0.0,Clouds,few clouds,8,0,0
2912135,171,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,6.20,1015.0,69.0,0.0,0.0,Clouds,few clouds,2,2,1
2912136,172,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,6.20,1015.0,69.0,0.0,0.0,Clouds,few clouds,1,1,2


In [6]:
# Attach the 7-cluster geographic label of each station to every movement row.
#
# This cell reads and reassigns df_final, so a plain re-run would add the
# label a second time. Dropping a column left over from a previous execution
# keeps the cell safe to re-run.
if 'CLUSTER_soloGeo7' in df_final.columns:
    df_final = df_final.drop(columns='CLUSTER_soloGeo7')

# * The lookup column is renamed before merging so it cannot clash with a
#   `CLUSTER` column on the left frame.
# * validate='many_to_one' makes pandas raise MergeError when a station is
#   listed more than once in the cluster table, instead of silently
#   duplicating movement rows.
# * how='left' keeps every movement row; stations missing from the cluster
#   table get NaN in the new column.
df_final = pd.merge(
    df_final,
    df_clusterGeog7[['ESTACION', 'CLUSTER']].rename(columns={'CLUSTER': 'CLUSTER_soloGeo7'}),
    how='left',
    on='ESTACION',
    validate='many_to_one',
)
df_final

Unnamed: 0,ESTACION,ANIO,MES,DIA,HORA,FECHA,DIA_SEMANA,AM_PM,TEMPORADA,TEMPORADA_NUM,...,PRESION,HUMEDAD,PRECIPITACION_1h,PRECIPITACION_3h,DESC_TIEMPO,DESC_TIEMPO_detalle,DEMANDA,CLUSTER_FullCols,CLUSTER_soloGeo5,CLUSTER_soloGeo7
0,1,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,1020.0,44.0,0.0,0.0,Clouds,few clouds,7,4,4,1
1,2,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,1020.0,44.0,0.0,0.0,Clouds,few clouds,3,4,4,1
2,3,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,1020.0,44.0,0.0,0.0,Clouds,few clouds,1,3,0,1
3,4,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,1020.0,44.0,0.0,0.0,Clouds,few clouds,1,2,0,4
4,5,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,1020.0,44.0,0.0,0.0,Clouds,few clouds,2,3,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912133,168,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,1015.0,69.0,0.0,0.0,Clouds,few clouds,2,0,0,6
2912134,169,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,1015.0,69.0,0.0,0.0,Clouds,few clouds,8,0,0,4
2912135,171,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,1015.0,69.0,0.0,0.0,Clouds,few clouds,2,2,1,2
2912136,172,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,1015.0,69.0,0.0,0.0,Clouds,few clouds,1,1,2,0


In [7]:
# Attach the demand-only cluster label of each station to every movement row.
#
# This cell reads and reassigns df_final, so a plain re-run would add the
# label a second time. Dropping a column left over from a previous execution
# keeps the cell safe to re-run.
if 'CLUSTER_soloDemanda' in df_final.columns:
    df_final = df_final.drop(columns='CLUSTER_soloDemanda')

# * The lookup column is renamed before merging so it cannot clash with a
#   `CLUSTER` column on the left frame.
# * validate='many_to_one' makes pandas raise MergeError when a station is
#   listed more than once in the cluster table, instead of silently
#   duplicating movement rows.
# * how='left' keeps every movement row; stations missing from the cluster
#   table get NaN in the new column.
df_final = pd.merge(
    df_final,
    df_clusterDmnd[['ESTACION', 'CLUSTER']].rename(columns={'CLUSTER': 'CLUSTER_soloDemanda'}),
    how='left',
    on='ESTACION',
    validate='many_to_one',
)
df_final

Unnamed: 0,ESTACION,ANIO,MES,DIA,HORA,FECHA,DIA_SEMANA,AM_PM,TEMPORADA,TEMPORADA_NUM,...,HUMEDAD,PRECIPITACION_1h,PRECIPITACION_3h,DESC_TIEMPO,DESC_TIEMPO_detalle,DEMANDA,CLUSTER_FullCols,CLUSTER_soloGeo5,CLUSTER_soloGeo7,CLUSTER_soloDemanda
0,1,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,7,4,4,1,0
1,2,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,3,4,4,1,3
2,3,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,1,3,0,1,4
3,4,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,1,2,0,4,3
4,5,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,2,3,0,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912133,168,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,69.0,0.0,0.0,Clouds,few clouds,2,0,0,6,2
2912134,169,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,69.0,0.0,0.0,Clouds,few clouds,8,0,0,4,0
2912135,171,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,69.0,0.0,0.0,Clouds,few clouds,2,2,1,2,3
2912136,172,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,69.0,0.0,0.0,Clouds,few clouds,1,1,2,0,3


## 3. Export de datos finales

In [8]:
# Print the index range, column list and dtypes of the merged frame before export.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2912138 entries, 0 to 2912137
Data columns (total 25 columns):
 #   Column               Dtype         
---  ------               -----         
 0   ESTACION             int64         
 1   ANIO                 int64         
 2   MES                  int64         
 3   DIA                  int64         
 4   HORA                 int64         
 5   FECHA                datetime64[ns]
 6   DIA_SEMANA           int64         
 7   AM_PM                object        
 8   TEMPORADA            object        
 9   TEMPORADA_NUM        int64         
 10  Es_Festivo           int64         
 11  Es_FinSemana         int64         
 12  TEMPERATURA          float64       
 13  VIENTO               float64       
 14  PRESION              float64       
 15  HUMEDAD              float64       
 16  PRECIPITACION_1h     float64       
 17  PRECIPITACION_3h     float64       
 18  DESC_TIEMPO          object        
 19  DESC_TIEMPO_detalle  

In [9]:
# Column reordering prior to export: the four cluster labels are placed right
# before DEMANDA, which becomes the last column.
export_columns = [
    'ESTACION', 'ANIO', 'MES', 'DIA', 'HORA', 'FECHA', 'DIA_SEMANA',
    'AM_PM', 'TEMPORADA', 'TEMPORADA_NUM', 'Es_Festivo', 'Es_FinSemana',
    'TEMPERATURA', 'VIENTO', 'PRESION', 'HUMEDAD',
    'PRECIPITACION_1h', 'PRECIPITACION_3h',
    'DESC_TIEMPO', 'DESC_TIEMPO_detalle',
    'CLUSTER_FullCols', 'CLUSTER_soloGeo5', 'CLUSTER_soloGeo7', 'CLUSTER_soloDemanda',
    'DEMANDA',
]
df_final_cierre = df_final.loc[:, export_columns]
df_final_cierre

Unnamed: 0,ESTACION,ANIO,MES,DIA,HORA,FECHA,DIA_SEMANA,AM_PM,TEMPORADA,TEMPORADA_NUM,...,HUMEDAD,PRECIPITACION_1h,PRECIPITACION_3h,DESC_TIEMPO,DESC_TIEMPO_detalle,CLUSTER_FullCols,CLUSTER_soloGeo5,CLUSTER_soloGeo7,CLUSTER_soloDemanda,DEMANDA
0,1,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,4,4,1,0,7
1,2,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,4,4,1,3,3
2,3,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,3,0,1,4,1
3,4,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,2,0,4,3,1
4,5,2017,3,31,23,2017-03-31 23:00:00,6,PM,INVIERNO,1,...,44.0,0.0,0.0,Clouds,few clouds,3,0,4,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912133,168,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,69.0,0.0,0.0,Clouds,few clouds,0,0,6,2,2
2912134,169,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,69.0,0.0,0.0,Clouds,few clouds,0,0,4,0,8
2912135,171,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,69.0,0.0,0.0,Clouds,few clouds,2,1,2,3,2
2912136,172,2020,2,29,23,2020-02-29 23:00:00,7,PM,INVIERNO,1,...,69.0,0.0,0.0,Clouds,few clouds,1,2,0,3,1


In [10]:
# Print the index range, column list and dtypes of the reordered frame that will be exported.
df_final_cierre.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2912138 entries, 0 to 2912137
Data columns (total 25 columns):
 #   Column               Dtype         
---  ------               -----         
 0   ESTACION             int64         
 1   ANIO                 int64         
 2   MES                  int64         
 3   DIA                  int64         
 4   HORA                 int64         
 5   FECHA                datetime64[ns]
 6   DIA_SEMANA           int64         
 7   AM_PM                object        
 8   TEMPORADA            object        
 9   TEMPORADA_NUM        int64         
 10  Es_Festivo           int64         
 11  Es_FinSemana         int64         
 12  TEMPERATURA          float64       
 13  VIENTO               float64       
 14  PRESION              float64       
 15  HUMEDAD              float64       
 16  PRECIPITACION_1h     float64       
 17  PRECIPITACION_3h     float64       
 18  DESC_TIEMPO          object        
 19  DESC_TIEMPO_detalle  

In [11]:
# Write the reordered frame to CSV; index=False keeps the row index out of the file.
df_final_cierre.to_csv(
    '../../data/DataFrame_Final_Cierre_Cluster_2017_2019.csv',
    index=False,
)

In [12]:
# Total DEMANDA per (ANIO, MES) in the exported frame, returned as a one-column DataFrame.
df_final_cierre.groupby(['ANIO', 'MES'])[['DEMANDA']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,DEMANDA
ANIO,MES,Unnamed: 2_level_1
2017,3,270
2017,4,229141
2017,5,304879
2017,6,348622
2017,7,313819
2017,8,254717
2017,9,372590
2017,10,380083
2017,11,308364
2017,12,229176
