In [189]:
import pandas as pd
import numpy as np

In [190]:
#Estos datos corresonden a la concentración promedio diaria de monoxido de carbono 
#de 4 puntos en la ciudad de Londres en el periodo 01-ene-2000 a 30-sep-2019
#Los 4 sitios son:
# BX1 Bexley - Slade Green
# CT6 City of London - Walbrook Wharf
# LH0 Hillingdon - Harlington
# KC1 Kensington and Chelsea - North Ken

data = pd.read_csv('LaqnData.csv')
data.head(2)

Unnamed: 0,Site,Species,ReadingDateTime,Value,Units,Provisional or Ratified
0,BX1,CO,01/01/2000 00:00,0.5,mg m-3,R
1,BX1,CO,02/01/2000 00:00,0.4,mg m-3,R


In [191]:
#Hay que usar al menos 8 métodos de limpieza y/o manipulación de datos

#Método 1. Contar los registros por cada región de Londres
data['Site'].value_counts()

CT6    7212
BX1    7212
LH0    7212
KC1    7212
Name: Site, dtype: int64

In [192]:
#Método 2. Ver si hay valores nulos en cada columna
MuestraNulos = data[(data['Value'].isnull() == True)]
MuestraNulos
#Valores nulos
#Site no tiene valores nulos
#Species no tiene valores nulos
#ReadingDateTime no tiene valores nulos
#Value sí tiene valores nulos ##
#Units no tiene valores nulos
#Provisional or Ratified no tiene valores nulos

Unnamed: 0,Site,Species,ReadingDateTime,Value,Units,Provisional or Ratified
12,BX1,CO,13/01/2000 00:00,,mg m-3,R
39,BX1,CO,09/02/2000 00:00,,mg m-3,R
40,BX1,CO,10/02/2000 00:00,,mg m-3,R
41,BX1,CO,11/02/2000 00:00,,mg m-3,R
48,BX1,CO,18/02/2000 00:00,,mg m-3,R
49,BX1,CO,19/02/2000 00:00,,mg m-3,R
50,BX1,CO,20/02/2000 00:00,,mg m-3,R
51,BX1,CO,21/02/2000 00:00,,mg m-3,R
52,BX1,CO,22/02/2000 00:00,,mg m-3,R
53,BX1,CO,23/02/2000 00:00,,mg m-3,R


In [193]:
#Método 3. Cambiemos el nombre de la columna
#Provisional or Ratified a P or R
data = data.rename(columns={'Provisional or Ratified':'P or R'})
data.head(2)

Unnamed: 0,Site,Species,ReadingDateTime,Value,Units,P or R
0,BX1,CO,01/01/2000 00:00,0.5,mg m-3,R
1,BX1,CO,02/01/2000 00:00,0.4,mg m-3,R


In [194]:
#Verificamos los valores extremos de Value y el type de los datos
data['Value'].min() #0.0
data['Value'].max() #2.6
data.dtypes

Site                object
Species             object
ReadingDateTime     object
Value              float64
Units               object
P or R              object
dtype: object

In [195]:
#Método 4. Creamos df separados para cada sitio
Bexley = data[(data['Site'] == 'BX1')]
City = data[(data['Site'] == 'CT6')]
Hillington = data[(data['Site'] == 'LH0')]
Kensington = data[(data['Site'] == 'KC1')]
Bexley.head(2)

Unnamed: 0,Site,Species,ReadingDateTime,Value,Units,P or R
0,BX1,CO,01/01/2000 00:00,0.5,mg m-3,R
1,BX1,CO,02/01/2000 00:00,0.4,mg m-3,R


In [196]:
#Método 5
#Quitamos la columna Site, Species y Units para cada df
#axis=1 significa elegir las columnas, axis=0 quitaría renglones
columnasBorrar = ['Site','Species','Units']
Bexley = Bexley.drop(columnasBorrar, axis=1)
City = City.drop(columnasBorrar, axis=1)
Hillington = Hillington.drop(columnasBorrar, axis=1)
Kensington = Kensington.drop(columnasBorrar, axis=1)
Hillington.head(2)


Unnamed: 0,ReadingDateTime,Value,P or R
14424,01/01/2000 00:00,,R
14425,02/01/2000 00:00,,R


In [197]:
#Verifiquemos cuantos registros de los 7212 de cada df son nulos
Bexley['Value'].value_counts().sum() #4514
City['Value'].value_counts().sum() #959
Hillington['Value'].value_counts().sum() #1473
Kensington['Value'].value_counts().sum() #6997

6997

In [198]:
#Método 6. Quitar los datos que son NaN de cada df
#Esto porque significa que no hubo medición
#No se puede sustituir por 0
Hillington = Hillington.dropna()
Bexley = Bexley.dropna()
City = City.dropna()
Kensington = Kensington.dropna()

In [199]:
#Método 7. Quitamos los datos no ratificados, i.e. los
#que valen P en la columna 'P or R'

Bexley = Bexley[(Bexley['P or R'] == 'R')]
City = City[(City['P or R'] == 'R')]
Hillington = Hillington[(Hillington['P or R'] == 'R')]
Kensington = Kensington[(Kensington['P or R'] == 'R')]

In [200]:
#Método 8. Quitamos la columna 'R or P'
Bexley = Bexley.drop('P or R', axis=1)
City = City.drop('P or R', axis=1)
Hillington = Hillington.drop('P or R', axis=1)
Kensington = Kensington.drop('P or R', axis=1)

In [201]:
#Método 9. Obtenemos las estadísticas
Bex = Bexley.describe()
Cit = City.describe()
Hil = Hillington.describe()
Ken = Kensington.describe()

#Le cambiamos el nombre a la columna 'Value'
Bex = Bex.rename(columns={'Value':'Bexley CO mg/m3'})
Cit = Cit.rename(columns={'Value':'City CO mg/m3'})
Hil = Hil.rename(columns={'Value':'Hillington CO mg/m3'})
Ken = Ken.rename(columns={'Value':'Kensington CO mg/m3'})


In [202]:
#Método 10. Obtenemos un nuevo df combinando 
#las estadísticas de los df de cada lugar
#agregando columnas de otro df al de estadisticas
estadisticas = Bex
estadisticas['City CO mg/m3'] = Cit['City CO mg/m3']
estadisticas['Hillington CO mg/m3'] = Hil['Hillington CO mg/m3']
estadisticas['Kensington CO mg/m3'] = Ken['Kensington CO mg/m3']
estadisticas

Unnamed: 0,Bexley CO mg/m3,City CO mg/m3,Hillington CO mg/m3,Kensington CO mg/m3
count,4487.0,959.0,1473.0,6753.0
mean,0.29035,0.675495,0.340665,0.310395
std,0.182035,0.382081,0.220939,0.187372
min,0.1,0.0,0.1,0.0
25%,0.2,0.4,0.2,0.2
50%,0.2,0.6,0.3,0.3
75%,0.3,0.9,0.4,0.4
max,2.3,2.0,2.1,2.6


In [203]:
#Método 11. Guardamos este df en un .csv con su índice
#para no restar información
estadisticas.to_csv('estadisticas.csv', sep=',')

In [204]:
#Método 12. Filtremos las fechas cuando el promedio de CO
#fue mayor a una std del promedio de los valores (20% de los casos)
Bexley_filtro1 = Bexley[(Bexley['Value'] > 0.48)]
Bexley_filtro1.head(2)

Unnamed: 0,ReadingDateTime,Value
0,01/01/2000 00:00,0.5
2,03/01/2000 00:00,0.5


In [205]:
#Renombremos columna ReadingDateTime
Bexley_filtro1 = Bexley_filtro1.rename(columns={'ReadingDateTime':'Date'})
Bexley_filtro1.head(2)

Unnamed: 0,Date,Value
0,01/01/2000 00:00,0.5
2,03/01/2000 00:00,0.5


In [208]:
City.head(2)

Unnamed: 0,ReadingDateTime,Value
9918,30/05/2007 00:00,0.3
9919,31/05/2007 00:00,0.3


In [216]:
#Pasemos el ReadingDateTime de object a fecha real de python
#revisar https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior  para los formatos
City['ReadingDateTime'] = pd.to_datetime(City['ReadingDateTime'], format='%d/%m/%Y %H:%M')
City.head(2)


Unnamed: 0,ReadingDateTime,Value
9918,2007-05-30,0.3
9919,2007-05-31,0.3


In [214]:
#Vamos a agrupar por mes y año la suma de concentraciones de CO
# dt es datatype
# strftime Convert to Index using specified date_format
#revisar https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.strftime.html
#La segunda parte dice sobre quien sumar o ejecutar
#.sort_values() los ordena de menor a mayor
City.groupby(City['ReadingDateTime'].dt.strftime('%Y/%B'))['Value'].sum().sort_values()

ReadingDateTime
2007/May           0.6
2010/April         3.2
2010/February      4.5
2007/June          6.8
2007/July          8.3
2007/August        9.0
2009/September     9.3
2007/September     9.9
2007/November     11.8
2008/March        13.2
2008/January      14.6
2007/December     15.3
2008/June         16.4
2008/April        17.6
2010/March        17.8
2009/July         18.8
2009/May          19.8
2009/November     20.0
2008/July         20.1
2009/February     20.2
2009/August       21.2
2009/October      21.6
2008/May          23.4
2009/April        23.7
2008/December     24.0
2008/October      24.5
2008/November     25.0
2008/February     25.0
2008/August       25.2
2008/September    25.3
2009/March        25.6
2009/January      29.1
2009/June         29.5
2009/December     30.1
2010/January      37.4
Name: Value, dtype: float64

In [217]:
#Esto saca el promedio por mes por año de concentración de CO
City.groupby(City['ReadingDateTime'].dt.strftime('%Y/%B'))['Value'].mean().sort_values()

ReadingDateTime
2007/June         0.226667
2007/July         0.267742
2007/August       0.290323
2007/May          0.300000
2007/September    0.353571
2009/September    0.387500
2008/March        0.425806
2010/April        0.457143
2008/January      0.486667
2008/June         0.546667
2007/November     0.561905
2010/March        0.574194
2008/April        0.586667
2007/December     0.588462
2009/July         0.606452
2009/May          0.638710
2010/February     0.642857
2009/November     0.666667
2008/July         0.670000
2009/August       0.683871
2009/October      0.696774
2009/February     0.721429
2008/May          0.754839
2008/October      0.790323
2008/August       0.812903
2008/December     0.827586
2008/November     0.833333
2008/September    0.843333
2008/February     0.862069
2009/April        0.911538
2009/March        0.914286
2009/January      0.938710
2009/December     0.970968
2009/June         0.983333
2010/January      1.206452
Name: Value, dtype: float64