## Global Historical Climatology Network Dataset
Variables are stored in both rows and columns.
This dataset represents the daily weather records for a weather station (MX17004) in Mexico for five months in 2010.

In [1]:
import pandas as pd
import numpy as np

# 1. Reading Data File

In [3]:
# Load the raw weather CSV; the path is resolved relative to the notebook's working directory.
weather = pd.read_csv(filepath_or_buffer='../weather-raw.csv')
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


# 2. Switching Nulls to 0

In [4]:
# Replace every NaN in the frame with 0 so that later cells can treat 0 as "no measurement".
# NOTE(review): a genuine 0.0 reading becomes indistinguishable from a gap — confirm this is acceptable.
weather = weather.fillna(value=0)
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.8,0.0
1,MX17004,2010,1,tmin,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.5,0.0
2,MX17004,2010,2,tmax,0.0,27.3,24.1,0.0,0.0,0.0,...,0.0,29.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MX17004,2010,2,tmin,0.0,14.4,14.4,0.0,0.0,0.0,...,0.0,10.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MX17004,2010,3,tmax,0.0,0.0,0.0,0.0,32.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 2b. Dropping Columns

In [5]:
# Remove the 'id' and 'year' columns before reshaping.
# `errors='ignore'` keeps this cell re-runnable: `weather` is reassigned here, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
drop_list = ['id','year']
weather = weather.drop(columns=drop_list, errors='ignore')
weather.head()

Unnamed: 0,month,element,d1,d2,d3,d4,d5,d6,d7,d8,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,1,tmax,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.8,0.0
1,1,tmin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.5,0.0
2,2,tmax,0.0,27.3,24.1,0.0,0.0,0.0,0.0,0.0,...,0.0,29.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,tmin,0.0,14.4,14.4,0.0,0.0,0.0,0.0,0.0,...,0.0,10.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,tmax,0.0,0.0,0.0,0.0,32.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Switching Days to Variable

In [6]:
# Unpivot the frame: keep month/element as identifiers and stack every remaining
# column into 'variable' / 'value' pairs (the default output column names).
weather2 = pd.melt(weather, id_vars=['month', 'element'])
weather2.head()

Unnamed: 0,month,element,variable,value
0,1,tmax,d1,0.0
1,1,tmin,d1,0.0
2,2,tmax,d1,0.0
3,2,tmin,d1,0.0
4,3,tmax,d1,0.0


# 4. Switching Element to Columns

In [7]:
# Spread 'element' back out into columns, one row per (month, variable) pair.
# No `values` / `aggfunc` are passed, so pandas' defaults apply and the result
# carries a two-level column index.
weather3 = pd.pivot_table(
    weather2,
    index=['month', 'variable'],
    columns='element',
)
weather3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value
Unnamed: 0_level_1,element,tmax,tmin
month,variable,Unnamed: 2_level_2,Unnamed: 3_level_2
1,d1,0.0,0.0
1,d10,0.0,0.0
1,d11,0.0,0.0
1,d12,0.0,0.0
1,d13,0.0,0.0


# 5. Remove Days without Temperatures

In [8]:
# Keep a row only if at least one of its columns is non-zero; rows that are 0
# everywhere are the placeholders left behind by the earlier fillna(0) step.
weather4 = weather3.loc[weather3.ne(0).any(axis="columns")]
weather4.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value
Unnamed: 0_level_1,element,tmax,tmin
month,variable,Unnamed: 2_level_2,Unnamed: 3_level_2
1,d30,27.8,14.5
2,d11,29.7,13.4
2,d2,27.3,14.4
2,d23,29.9,10.7
2,d3,24.1,14.4


# 6. Rename Columns

In [9]:
# Move the (month, variable) index levels back into ordinary columns, then
# overwrite all labels positionally with flat, readable names.
weather5 = weather4.reset_index()
weather5.columns = ["month", "day", "tmax", "tmin"]
weather5.head()

Unnamed: 0,month,day,tmax,tmin
0,1,d30,27.8,14.5
1,2,d11,29.7,13.4
2,2,d2,27.3,14.4
3,2,d23,29.9,10.7
4,2,d3,24.1,14.4


# 7. Clean 'day' Values

In [10]:
# Strip the leading "d" prefix from the day labels (e.g. "d30" -> "30").
# The vectorised .str accessor replaces the per-element lambda and passes
# missing values through instead of raising; the values remain strings.
weather5["day"] = weather5["day"].str.lstrip("d")
weather5.head()

Unnamed: 0,month,day,tmax,tmin
0,1,30,27.8,14.5
1,2,11,29.7,13.4
2,2,2,27.3,14.4
3,2,23,29.9,10.7
4,2,3,24.1,14.4
