**This notebook explores weather data from the Max Planck Institute for Biogeochemistry in Jena, Germany.**

[www.bgc-jena.mpg.de/wetter](http://www.bgc-jena.mpg.de/wetter)

In [15]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
#import statsmodels.tsa.stattools as tss
#import statsmodels.tsa.statespace.tools as ssm
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_pacf;


In [4]:
# Relative path (with trailing slash) of the directory that holds the weather archives.
loc = 'data/'

**1-** Let's make a list of all the file names 

In [5]:
# os.listdir returns directory entries in arbitrary order, so sort them to make
# the listing (and the order the files are read in) reproducible across runs.
filenames = sorted(os.listdir(loc))
filenames

['mpi_roof.zip',
 'mpi_roof_2010a.zip',
 'mpi_roof_2010b.zip',
 'mpi_roof_2011a.zip',
 'mpi_roof_2011b.zip',
 'mpi_roof_2012a.zip',
 'mpi_roof_2012b.zip',
 'mpi_roof_2013a.zip',
 'mpi_roof_2013b.zip',
 'mpi_roof_2014a.zip',
 'mpi_roof_2014b.zip',
 'mpi_roof_2015a.zip',
 'mpi_roof_2015b.zip',
 'mpi_roof_2016a.zip',
 'mpi_roof_2016b.zip',
 'mpi_roof_2017a.zip',
 'mpi_roof_2017b.zip']

In [6]:
# Read every file listed in `filenames` into its own DataFrame and collect the
# frames in a list (one frame per file, in the order of `filenames`).
# NOTE(review): the column labels shown in this notebook's outputs contain
# 'W/m˛' and 'ľmol', which is what ISO-8859-1 bytes for '²' and 'µ' look like
# when decoded as ISO-8859-2 -- the files are probably Latin-1. Changing the
# encoding renames those columns, so check every cell that refers to them by
# name before switching.
weather_list = [
    pd.read_csv(os.path.join(loc, name), header=0, encoding="ISO-8859-2")
    for name in filenames
]


In [7]:
# Stack the per-file frames on top of each other into one frame. The row labels
# of the individual files are dropped and replaced by a single continuous index.
weather_df = pd.concat(objs=weather_list, axis=0, ignore_index=True)

In [8]:
# Summarise the combined frame: number of rows, column dtypes and non-null counts.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445210 entries, 0 to 445209
Data columns (total 22 columns):
Date Time               445210 non-null object
p (mbar)                445210 non-null float64
T (degC)                445210 non-null float64
Tpot (K)                445210 non-null float64
Tdew (degC)             445210 non-null float64
rh (%)                  445210 non-null float64
VPmax (mbar)            445210 non-null float64
VPact (mbar)            445210 non-null float64
VPdef (mbar)            445210 non-null float64
sh (g/kg)               445210 non-null float64
H2OC (mmol/mol)         445210 non-null float64
rho (g/m**3)            445210 non-null float64
wv (m/s)                445210 non-null float64
max. wv (m/s)           445210 non-null float64
wd (deg)                445210 non-null float64
rain (mm)               445210 non-null float64
raining (s)             445210 non-null float64
SWDR (W/m˛)             445210 non-null float64
PAR (ľmol/m˛/s)         44

In [9]:
# Show the column labels of the combined frame.
weather_df.columns

Index(['Date Time', 'p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)',
       'rh (%)', 'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)', 'sh (g/kg)',
       'H2OC (mmol/mol)', 'rho (g/m**3)', 'wv (m/s)', 'max. wv (m/s)',
       'wd (deg)', 'rain (mm)', 'raining (s)', 'SWDR (W/m˛)',
       'PAR (ľmol/m˛/s)', 'max. PAR (ľmol/m˛/s)', 'Tlog (degC)', 'CO2 (ppm)'],
      dtype='object')

In [10]:
# Put the timestamp column first, followed by the remaining columns in sorted
# order, parse the timestamps, and sort the rows from oldest to newest.
# The column order is derived from the frame itself instead of a hard-coded
# list, so this cell does not depend on the exact spelling of the unit
# suffixes in the CSV headers.
time_col = 'Date Time'
ordered_cols = [time_col] + sorted(c for c in weather_df.columns if c != time_col)

# NOTE(review): pd.to_datetime is left to infer the timestamp format. If the
# raw strings are day-first (e.g. dd.mm.yyyy), ambiguous dates would be read
# month-first -- confirm against the raw files and pass dayfirst=True or an
# explicit format= if so.
# Building the result in a single chain with .assign avoids assigning into a
# column subset, which can raise SettingWithCopyWarning.
weather_df = (
    weather_df[ordered_cols]
    .assign(**{time_col: lambda frame: pd.to_datetime(frame[time_col])})
    .sort_values(time_col)
    .reset_index(drop=True)
)

In [11]:
# Preview only the first rows; the full frame has several hundred thousand rows.
weather_df.head(10)

Unnamed: 0,Date Time,CO2 (ppm),H2OC (mmol/mol),PAR (ľmol/m˛/s),SWDR (W/m˛),T (degC),Tdew (degC),Tlog (degC),Tpot (K),VPact (mbar),...,max. PAR (ľmol/m˛/s),max. wv (m/s),p (mbar),rain (mm),raining (s),rh (%),rho (g/m**3),sh (g/kg),wd (deg),wv (m/s)
0,2010-01-01 00:10:00,404.6,4.91,0.00,0.00,-2.84,-3.41,8.59,272.89,4.75,...,0.00,2.76,967.56,0.0,0.0,95.80,1244.56,3.06,15.41,1.61
1,2010-01-01 00:20:00,404.1,4.90,0.00,0.00,-2.85,-3.43,8.53,272.88,4.74,...,0.00,3.10,967.45,0.0,0.0,95.70,1244.48,3.05,17.04,2.00
2,2010-01-01 00:30:00,404.3,4.89,0.00,0.00,-2.88,-3.46,8.52,272.85,4.73,...,0.00,3.79,967.45,0.0,0.0,95.80,1244.65,3.05,25.35,2.25
3,2010-01-01 00:40:00,404.0,4.88,0.00,0.00,-2.90,-3.47,8.54,272.84,4.72,...,0.00,3.77,967.34,0.0,270.0,95.80,1244.57,3.04,23.64,2.64
4,2010-01-01 00:50:00,401.9,4.86,0.00,0.00,-2.96,-3.53,8.55,272.78,4.70,...,0.00,4.29,967.29,0.0,310.0,95.80,1244.81,3.03,18.94,2.82
5,2010-01-01 01:00:00,401.6,4.84,0.00,0.00,-3.03,-3.59,8.55,272.72,4.68,...,0.00,4.09,967.18,0.0,600.0,95.90,1245.00,3.02,22.06,2.92
6,2010-01-01 01:10:00,401.8,4.84,0.00,0.00,-3.05,-3.59,8.52,272.70,4.68,...,0.00,4.93,967.14,0.0,60.0,96.00,1245.03,3.02,18.60,2.70
7,2010-01-01 01:20:00,402.3,4.85,0.00,0.00,-3.03,-3.56,8.46,272.73,4.69,...,0.00,3.53,967.10,0.1,0.0,96.10,1244.89,3.02,20.80,2.28
8,2010-01-01 01:30:00,402.8,4.85,0.00,0.00,-3.04,-3.56,8.42,272.72,4.69,...,0.00,4.51,967.09,0.0,250.0,96.20,1244.93,3.02,23.99,2.51
9,2010-01-01 01:40:00,402.5,4.84,0.00,0.00,-3.10,-3.60,8.39,272.66,4.68,...,0.00,3.93,967.13,0.0,40.0,96.30,1245.24,3.01,23.12,2.83


In [11]:
# Save the combined, time-sorted frame as allWeather.csv (path is relative to
# the working directory). to_csv is called with its default index=True, so the
# row index is written as an unnamed first column of the file.
weather_df.to_csv("allWeather.csv")

<!--NAVIGATION-->
##### <[Intro-RNN and LSTM](_01_Intro_RNN_LSTM.ipynb) | [Contents](Index.ipynb) |[LSTM Data Prep and simple model](03_LSTM_Data_Prep.ipynb) >