## Global Historical Climatology Network Dataset
Variables are stored in both rows and columns
This dataset represents the daily weather records for a weather station (MX17004) in Mexico for five months in 2010.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('weather-raw.csv')
df

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,
5,MX17004,2010,3,tmin,,,,,14.2,,...,,,,,,,,,,
6,MX17004,2010,4,tmax,,,,,,,...,,,,,,36.3,,,,
7,MX17004,2010,4,tmin,,,,,,,...,,,,,,16.7,,,,
8,MX17004,2010,5,tmax,,,,,,,...,,,,,,33.2,,,,
9,MX17004,2010,5,tmin,,,,,,,...,,,,,,18.2,,,,


In [3]:
# Inspeccionamos las columnas

df.columns

Index(['id', 'year', 'month', 'element', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6',
       'd7', 'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16',
       'd17', 'd18', 'd19', 'd20', 'd21', 'd22', 'd23', 'd24', 'd25', 'd26',
       'd27', 'd28', 'd29', 'd30', 'd31'],
      dtype='object')

In [4]:
help(df.melt)

Help on method melt in module pandas.core.frame:

melt(id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None, ignore_index=True) -> 'DataFrame' method of pandas.core.frame.DataFrame instance
    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
    
    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.
    
    .. versionadded:: 0.20.0
    
    Parameters
    ----------
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'vari

In [5]:
# Hacemos melt para introducir los días como una columnas más

dflargo = pd.melt(df, id_vars = ['id', 'year', 'month', 'element'], var_name = 'day', value_name = 'temperature')
dflargo

Unnamed: 0,id,year,month,element,day,temperature
0,MX17004,2010,1,tmax,d1,
1,MX17004,2010,1,tmin,d1,
2,MX17004,2010,2,tmax,d1,
3,MX17004,2010,2,tmin,d1,
4,MX17004,2010,3,tmax,d1,
...,...,...,...,...,...,...
677,MX17004,2010,10,tmin,d31,
678,MX17004,2010,11,tmax,d31,
679,MX17004,2010,11,tmin,d31,
680,MX17004,2010,12,tmax,d31,


In [6]:
# Comprobamos que la operación ha salido bien ya que las 22 filas del df originas multiplicadas por 31 días = 682

# 682 es el número de filas de nuestro nuevo dflargo

22*31

682

In [7]:
# Identificamos que la columna id no aporta valor ya que es siempre el mismo

dflargo['id'].value_counts()

MX17004    682
Name: id, dtype: int64

In [8]:
# Eliminamos la columna id

dflargo.drop('id', axis=1, inplace=True)

In [9]:
dflargo

Unnamed: 0,year,month,element,day,temperature
0,2010,1,tmax,d1,
1,2010,1,tmin,d1,
2,2010,2,tmax,d1,
3,2010,2,tmin,d1,
4,2010,3,tmax,d1,
...,...,...,...,...,...
677,2010,10,tmin,d31,
678,2010,11,tmax,d31,
679,2010,11,tmin,d31,
680,2010,12,tmax,d31,


In [10]:
# Cambiamos el orden de las columnas para que quede más ordenado: año, mes, día

dflargo = dflargo[['year', 'month', 'day', 'element', 'temperature']]

In [11]:
dflargo

Unnamed: 0,year,month,day,element,temperature
0,2010,1,d1,tmax,
1,2010,1,d1,tmin,
2,2010,2,d1,tmax,
3,2010,2,d1,tmin,
4,2010,3,d1,tmax,
...,...,...,...,...,...
677,2010,10,d31,tmin,
678,2010,11,d31,tmax,
679,2010,11,d31,tmin,
680,2010,12,d31,tmax,


In [12]:
# Eliminamos la d de la columna 'day' y aplicamos astype para que sea un int

dflargo['day'] = dflargo['day'].str.replace('d','').astype('int')

In [13]:
dflargo

Unnamed: 0,year,month,day,element,temperature
0,2010,1,1,tmax,
1,2010,1,1,tmin,
2,2010,2,1,tmax,
3,2010,2,1,tmin,
4,2010,3,1,tmax,
...,...,...,...,...,...
677,2010,10,31,tmin,
678,2010,11,31,tmax,
679,2010,11,31,tmin,
680,2010,12,31,tmax,


In [14]:
# Ahora ya lo tenemos con los tipos de datos deseados

dflargo.dtypes

year             int64
month            int64
day              int64
element         object
temperature    float64
dtype: object

In [15]:
# Aplicamos pivot_table para pasar t_max y t_min a columnas y lo asignamos al nuefo dataframe weather

weather = pd.pivot_table(dflargo, values = 'temperature', index = ['year', 'month', 'day'], columns = 'element')
weather.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,element,tmax,tmin
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,1,30,27.8,14.5
2010,2,2,27.3,14.4
2010,2,3,24.1,14.4
2010,2,11,29.7,13.4
2010,2,23,29.9,10.7
2010,3,5,32.1,14.2
2010,3,10,34.5,16.8
2010,3,16,31.1,17.6
2010,4,27,36.3,16.7
2010,5,27,33.2,18.2


In [16]:
# Podríamos haberlo hecho todo en una línea desde dflargo

#wetest = dflargo.pivot_table(values = 'temperature', index = ['year', 'month', 'day'], columns = 'element').reset_index().assign(#ver resto de función)

In [17]:
help(df.sort_values)

Help on method sort_values in module pandas.core.frame:

sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key: Union[Callable[[ForwardRef('Series')], Union[ForwardRef('Series'), ~AnyArrayLike]], NoneType] = None) method of pandas.core.frame.DataFrame instance
    Sort by the values along either axis.
    
    Parameters
    ----------
            by : str or list of str
                Name or list of names to sort by.
    
                - if `axis` is 0 or `'index'` then `by` may contain index
                  levels and/or column labels.
                - if `axis` is 1 or `'columns'` then `by` may contain column
                  levels and/or index labels.
    
                .. versionchanged:: 0.23.0
    
                   Allow specifying index or column level names.
    axis : {0 or 'index', 1 or 'columns'}, default 0
         Axis to be sorted.
    ascending : bool or list of bool, default True
         Sort 

In [18]:
# Ordenamos los valores por año, mes y día en ese orden

weather.sort_values(['year', 'month', 'day'], inplace=True)

In [19]:
weather.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,element,tmax,tmin
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,1,30,27.8,14.5
2010,2,2,27.3,14.4
2010,2,3,24.1,14.4
2010,2,11,29.7,13.4
2010,2,23,29.9,10.7
2010,3,5,32.1,14.2
2010,3,10,34.5,16.8
2010,3,16,31.1,17.6
2010,4,27,36.3,16.7
2010,5,27,33.2,18.2


In [20]:
# Añadimos columna delta para ver la diferencia de temperatura

weather['delta'] = weather['tmax'] - weather['tmin']

In [21]:
weather.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,element,tmax,tmin,delta
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010,1,30,27.8,14.5,13.3
2010,2,2,27.3,14.4,12.9
2010,2,3,24.1,14.4,9.7
2010,2,11,29.7,13.4,16.3
2010,2,23,29.9,10.7,19.2
2010,3,5,32.1,14.2,17.9
2010,3,10,34.5,16.8,17.7
2010,3,16,31.1,17.6,13.5
2010,4,27,36.3,16.7,19.6
2010,5,27,33.2,18.2,15.0


In [22]:
# Le quitamos el índice 'element'

weather.reset_index(inplace=True)

In [23]:
weather.head()

element,year,month,day,tmax,tmin,delta
0,2010,1,30,27.8,14.5,13.3
1,2010,2,2,27.3,14.4,12.9
2,2010,2,3,24.1,14.4,9.7
3,2010,2,11,29.7,13.4,16.3
4,2010,2,23,29.9,10.7,19.2


In [24]:
# Aplicar timeseries y resample con la media mensual (ver 5.2.1 MorePandas clase sábado 13/03)
import datetime as dt

In [25]:
weather.apply(lambda row: dt.date(row['year'], row['month'], row['day']), axis=1)

# No sabemos por qué da error, ya que no hay valores nulos y mis columnas son todas int

TypeError: integer argument expected, got float

In [26]:
weather.isna().sum()

element
year     0
month    0
day      0
tmax     0
tmin     0
delta    0
dtype: int64

In [27]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    33 non-null     int64  
 1   month   33 non-null     int64  
 2   day     33 non-null     int64  
 3   tmax    33 non-null     float64
 4   tmin    33 non-null     float64
 5   delta   33 non-null     float64
dtypes: float64(3), int64(3)
memory usage: 1.7 KB
