In [160]:
import pandas as pd
import numpy as np

In [161]:
# Location of the raw sensor export, relative to the notebook's working directory.
csv_path = './content/gams_indoor.csv'

# Read the whole file with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,ts,co2,humidity,pm10,pm25,temperature,voc
0,2016-11-21 00:47:03,708.0,72.09,10.2,9.0,20.83,0.062
1,2016-11-21 00:48:03,694.0,70.95,10.9,10.1,21.01,0.062
2,2016-11-21 00:49:03,693.0,69.12,10.2,9.9,21.2,0.062
3,2016-11-21 00:50:03,692.0,68.83,9.6,9.6,21.37,0.062
4,2016-11-21 00:51:03,690.0,68.6,9.4,8.4,21.49,0.062


## Modifying data of various columns
### We want to bring each column's range closer to the targets below by limiting its spread and keeping the values around the desired means
* ppm25: < 0.56 
* ppm10: < 1.76 
* temperature: 19 - 26 °C (ISO5)  
* humidity: 45 - 65 % (ISO5) 
* co2: 300 – 380 ppm

In [162]:
# Summary statistics of the frame before any rescaling; kept in a variable so
# the baseline stays available while the columns are overwritten further down.
raw_summary = df.describe()
raw_summary

Unnamed: 0,co2,humidity,pm10,pm25,temperature,voc
count,135099.0,135099.0,135099.0,135099.0,135099.0,135099.0
mean,688.833011,37.879496,17.553535,15.801651,22.939613,0.12105
std,385.845573,5.284216,12.603744,11.709474,2.051068,0.089947
min,369.0,21.97,0.5,0.5,17.71,0.062
25%,429.0,34.49,8.4,7.3,21.41,0.064
50%,483.0,37.64,14.0,12.3,22.86,0.076
75%,852.0,41.29,23.4,21.0,24.65,0.149
max,2626.0,72.09,142.6,85.2,27.96,2.0


## Adjusting parameters to fit our needs
* ### pm10

In [163]:
# Compress pm10 onto a log scale: log10(pm10) + 0.5.
# The ufunc is applied to the whole column at once instead of calling a Python
# lambda for every row through Series.apply.
# NOTE: the column is overwritten with a function of itself, so running this
# cell a second time applies the transform again; re-run from the load cell.
# NOTE(review): only raw values up to 10 ** 1.26 (about 18.2) end up below the
# 1.76 target printed here — confirm that the overshoot is acceptable.
df['pm10'] = np.log10(df['pm10']) + 0.5
print("ppm10: < 1.76")
df.describe()

ppm10: < 1.76


Unnamed: 0,co2,humidity,pm10,pm25,temperature,voc
count,135099.0,135099.0,135099.0,135099.0,135099.0,135099.0
mean,688.833011,37.879496,1.63862,15.801651,22.939613,0.12105
std,385.845573,5.284216,0.311378,11.709474,2.051068,0.089947
min,369.0,21.97,0.19897,0.5,17.71,0.062
25%,429.0,34.49,1.424279,7.3,21.41,0.064
50%,483.0,37.64,1.646128,12.3,22.86,0.076
75%,852.0,41.29,1.869216,21.0,24.65,0.149
max,2626.0,72.09,2.65412,85.2,27.96,2.0


* ### pm25

In [164]:
# Map pm25 to |ln(pm25 / 10)|: the distance (in natural-log units) from a raw
# reading of 10, which itself maps to 0.
# Computed on the whole column at once instead of a per-row Python lambda.
# NOTE: the column is overwritten with a function of itself, so running this
# cell a second time applies the transform again; re-run from the load cell.
# NOTE(review): only raw values between roughly 5.7 and 17.5 end up below the
# 0.56 target printed here — confirm that the overshoot is acceptable.
df['pm25'] = np.absolute(np.log(df['pm25'] / 10))
print("ppm25: < 0.56")
df.describe()

ppm25: < 0.56


Unnamed: 0,co2,humidity,pm10,pm25,temperature,voc
count,135099.0,135099.0,135099.0,135099.0,135099.0,135099.0
mean,688.833011,37.879496,1.63862,0.627392,22.939613,0.12105
std,385.845573,5.284216,0.311378,0.447574,2.051068,0.089947
min,369.0,21.97,0.19897,0.0,17.71,0.062
25%,429.0,34.49,1.424279,0.248461,21.41,0.064
50%,483.0,37.64,1.646128,0.548121,22.86,0.076
75%,852.0,41.29,1.869216,0.941609,24.65,0.149
max,2626.0,72.09,2.65412,2.995732,27.96,2.0


* ### humidity

In [165]:
# Pull humidity towards the target band: readings of 55 or more are reduced by
# one twelfth, readings below 55 are increased by one third.
# np.where evaluates both branches on the whole column and picks per row,
# replacing the per-row Python lambda with the same two expressions.
# NOTE: the column is overwritten with a function of itself, so running this
# cell a second time applies the transform again; re-run from the load cell.
# NOTE(review): the mapping jumps at 55 (54.9 -> 73.2 but 55 -> 50.4), so
# results can land above the 65 % target — confirm this is intended.
humidity = df['humidity']
df['humidity'] = np.where(
    humidity >= 55,
    humidity - humidity / 12,
    humidity + humidity / 3,
)
print("humidité: 45 - 65 %")
df.describe()

humidité: 45 - 65 %


Unnamed: 0,co2,humidity,pm10,pm25,temperature,voc
count,135099.0,135099.0,135099.0,135099.0,135099.0,135099.0
mean,688.833011,50.373556,1.63862,0.627392,22.939613,0.12105
std,385.845573,6.759053,0.311378,0.447574,2.051068,0.089947
min,369.0,29.293333,0.19897,0.0,17.71,0.062
25%,429.0,45.986667,1.424279,0.248461,21.41,0.064
50%,483.0,50.186667,1.646128,0.548121,22.86,0.076
75%,852.0,54.933333,1.869216,0.941609,24.65,0.149
max,2626.0,73.253333,2.65412,2.995732,27.96,2.0


* ### co2

In [166]:
# Shrink co2 linearly: co2 / 6 + 240.
# Plain column arithmetic replaces the per-row Python lambda; the formula is
# unchanged.
# NOTE: the column is overwritten with a function of itself, so running this
# cell a second time applies the transform again; re-run from the load cell.
# NOTE(review): only raw values up to 840 stay at or below the 380 upper
# target printed here — confirm that the overshoot is acceptable.
df['co2'] = df['co2'] / 6 + 240
print("co2: 300 - 380")
df.describe()

co2: 300 - 380


Unnamed: 0,co2,humidity,pm10,pm25,temperature,voc
count,135099.0,135099.0,135099.0,135099.0,135099.0,135099.0
mean,354.805502,50.373556,1.63862,0.627392,22.939613,0.12105
std,64.307596,6.759053,0.311378,0.447574,2.051068,0.089947
min,301.5,29.293333,0.19897,0.0,17.71,0.062
25%,311.5,45.986667,1.424279,0.248461,21.41,0.064
50%,320.5,50.186667,1.646128,0.548121,22.86,0.076
75%,382.0,54.933333,1.869216,0.941609,24.65,0.149
max,677.666667,73.253333,2.65412,2.995732,27.96,2.0


## Save the preprocessed data to a CSV file

In [167]:
# Write the rescaled frame to disk with pandas' default CSV settings.
# NOTE(review): the defaults also write the row index as a first, unnamed
# column; pass index=False if nothing downstream relies on that column.
df.to_csv(path_or_buf='./content/preprocessed_dataset.csv')