In [1]:
from datetime import datetime
from time import time
import pandas as pd
import os, re
import numpy as np

In [2]:
present = time()

In [3]:
fn_format = '%Y-%m-%d %H-%M-%S'

In [4]:
pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}')

In [5]:
files = os.listdir('./data')
files_str = ' '.join(files)
files_list = re.findall(pattern, files_str)

In [6]:
files_list[:10]

['2022-08-31 22-59-59',
 '2022-09-01 00-01-35',
 '2022-09-01 01-03-11',
 '2022-09-01 02-04-47',
 '2022-09-01 03-06-23',
 '2022-09-01 04-07-59',
 '2022-09-01 05-09-35',
 '2022-09-01 06-11-11',
 '2022-09-01 07-12-47',
 '2022-09-01 08-14-23']

In [7]:
files_timestamp = []

In [8]:
for each in files_list:
    files_timestamp.append(datetime.strptime(each, fn_format).timestamp())

In [9]:
files_timestamp[:10]

[1662011999.0,
 1662015695.0,
 1662019391.0,
 1662023087.0,
 1662026783.0,
 1662030479.0,
 1662034175.0,
 1662037871.0,
 1662041567.0,
 1662045263.0]

In [10]:
df_dict = {'timestamp': files_timestamp, 'filename': [i+'.txt' for i in files_list]}

In [11]:
df = pd.DataFrame(data=df_dict, index=pd.to_datetime(df_dict['timestamp'], unit='s'))
df.index.name = 'datetime'

In [12]:
df

Unnamed: 0_level_0,timestamp,filename
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-01 05:59:59,1.662012e+09,2022-08-31 22-59-59.txt
2022-09-01 07:01:35,1.662016e+09,2022-09-01 00-01-35.txt
2022-09-01 08:03:11,1.662019e+09,2022-09-01 01-03-11.txt
2022-09-01 09:04:47,1.662023e+09,2022-09-01 02-04-47.txt
2022-09-01 10:06:23,1.662027e+09,2022-09-01 03-06-23.txt
...,...,...
2022-10-07 11:37:56,1.665143e+09,2022-10-07 04-37-56.txt
2022-10-07 12:39:31,1.665146e+09,2022-10-07 05-39-31.txt
2022-10-07 13:41:07,1.665150e+09,2022-10-07 06-41-07.txt
2022-10-07 14:42:42,1.665154e+09,2022-10-07 07-42-42.txt


In [13]:
df = pd.read_csv('data/2022-10-07 01-33-09.txt', delimiter=' ', header=0, index_col=False, na_values=None)
df = df.set_index(pd.to_datetime(df['timestamp'], unit='s'))
df.index.name = 'datetime'

In [14]:
df = df.drop(['timestamp'], axis=1)

In [15]:
df.iloc[368]

temperature          20.29
humidity           47.5514
pressure           988.684
pm10_standard         None
pm25_standard         None
pm100_standard        None
pm10_env              None
pm25_env              None
pm100_env             None
particles_03um        None
particles_05um        None
particles_10um        None
particles_25um        None
particles_50um        None
particles_100um       None
Name: 2022-10-07 09:04:38.300965786, dtype: object

In [16]:
df.dtypes

temperature        float64
humidity           float64
pressure           float64
pm10_standard       object
pm25_standard       object
pm100_standard      object
pm10_env            object
pm25_env            object
pm100_env           object
particles_03um      object
particles_05um      object
particles_10um      object
particles_25um      object
particles_50um      object
particles_100um     object
dtype: object

In [17]:
df_np = np.genfromtxt('data/2022-10-07 01-33-09.txt', delimiter=' ', skip_header=1)

In [18]:
df_np.shape

(720, 16)

In [19]:
df_np[368]

array([1.66513348e+09, 2.02900391e+01, 4.75513994e+01, 9.88684286e+02,
                  nan,            nan,            nan,            nan,
                  nan,            nan,            nan,            nan,
                  nan,            nan,            nan,            nan])

In [20]:
with open('data/2022-10-07 01-33-09.txt', 'r') as file:
    header = file.readline()

In [21]:
header = header.split(' ')

In [22]:
header[-1] = header[-1][:-1]

In [23]:
header

['timestamp',
 'temperature',
 'humidity',
 'pressure',
 'pm10_standard',
 'pm25_standard',
 'pm100_standard',
 'pm10_env',
 'pm25_env',
 'pm100_env',
 'particles_03um',
 'particles_05um',
 'particles_10um',
 'particles_25um',
 'particles_50um',
 'particles_100um']

In [24]:
df = pd.DataFrame(data=df_np, columns=header)

In [25]:
df.dtypes

timestamp          float64
temperature        float64
humidity           float64
pressure           float64
pm10_standard      float64
pm25_standard      float64
pm100_standard     float64
pm10_env           float64
pm25_env           float64
pm100_env          float64
particles_03um     float64
particles_05um     float64
particles_10um     float64
particles_25um     float64
particles_50um     float64
particles_100um    float64
dtype: object

In [26]:
df.index.name = 'datetime'

In [27]:
df.set_index('timestamp')

Unnamed: 0_level_0,temperature,humidity,pressure,pm10_standard,pm25_standard,pm100_standard,pm10_env,pm25_env,pm100_env,particles_03um,particles_05um,particles_10um,particles_25um,particles_50um,particles_100um
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1.665132e+09,20.528906,47.365668,988.590465,5.0,9.0,9.0,5.0,9.0,9.0,1038.0,321.0,39.0,4.0,0.0,0.0
1.665132e+09,20.523828,47.371653,988.575016,5.0,7.0,9.0,5.0,7.0,9.0,1092.0,314.0,33.0,4.0,2.0,0.0
1.665132e+09,20.513672,47.330184,988.595814,5.0,7.0,9.0,5.0,7.0,9.0,1092.0,314.0,33.0,4.0,2.0,0.0
1.665132e+09,20.508594,47.353699,988.582349,5.0,7.0,9.0,5.0,7.0,9.0,1092.0,314.0,33.0,4.0,2.0,0.0
1.665132e+09,20.508594,47.235419,988.580965,5.0,8.0,9.0,5.0,8.0,9.0,1104.0,321.0,33.0,4.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1.665135e+09,20.645898,46.645576,988.622277,2.0,6.0,6.0,2.0,6.0,6.0,705.0,197.0,48.0,2.0,0.0,0.0
1.665135e+09,20.640820,46.757869,988.611388,2.0,6.0,6.0,2.0,6.0,6.0,705.0,197.0,48.0,2.0,0.0,0.0
1.665135e+09,20.650977,46.592324,988.599010,2.0,6.0,6.0,2.0,6.0,6.0,705.0,197.0,48.0,2.0,0.0,0.0
1.665135e+09,20.640820,46.663265,988.614453,2.0,5.0,5.0,2.0,5.0,5.0,678.0,185.0,46.0,2.0,0.0,0.0


In [28]:
df

Unnamed: 0_level_0,timestamp,temperature,humidity,pressure,pm10_standard,pm25_standard,pm100_standard,pm10_env,pm25_env,pm100_env,particles_03um,particles_05um,particles_10um,particles_25um,particles_50um,particles_100um
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1.665132e+09,20.528906,47.365668,988.590465,5.0,9.0,9.0,5.0,9.0,9.0,1038.0,321.0,39.0,4.0,0.0,0.0
1,1.665132e+09,20.523828,47.371653,988.575016,5.0,7.0,9.0,5.0,7.0,9.0,1092.0,314.0,33.0,4.0,2.0,0.0
2,1.665132e+09,20.513672,47.330184,988.595814,5.0,7.0,9.0,5.0,7.0,9.0,1092.0,314.0,33.0,4.0,2.0,0.0
3,1.665132e+09,20.508594,47.353699,988.582349,5.0,7.0,9.0,5.0,7.0,9.0,1092.0,314.0,33.0,4.0,2.0,0.0
4,1.665132e+09,20.508594,47.235419,988.580965,5.0,8.0,9.0,5.0,8.0,9.0,1104.0,321.0,33.0,4.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,1.665135e+09,20.645898,46.645576,988.622277,2.0,6.0,6.0,2.0,6.0,6.0,705.0,197.0,48.0,2.0,0.0,0.0
716,1.665135e+09,20.640820,46.757869,988.611388,2.0,6.0,6.0,2.0,6.0,6.0,705.0,197.0,48.0,2.0,0.0,0.0
717,1.665135e+09,20.650977,46.592324,988.599010,2.0,6.0,6.0,2.0,6.0,6.0,705.0,197.0,48.0,2.0,0.0,0.0
718,1.665135e+09,20.640820,46.663265,988.614453,2.0,5.0,5.0,2.0,5.0,5.0,678.0,185.0,46.0,2.0,0.0,0.0


In [29]:
df = df.dropna()

In [30]:
df = df.astype({'pm10_standard': 'int', 'pm25_standard': 'int', 'pm100_standard': 'int', 'pm10_env': 'int', 'pm25_env': 'int',
           'pm100_env': 'int', 'particles_03um': 'int', 'particles_05um': 'int', 'particles_10um': 'int',
           'particles_25um': 'int', 'particles_50um': 'int', 'particles_100um': 'int'})

In [31]:
df

Unnamed: 0_level_0,timestamp,temperature,humidity,pressure,pm10_standard,pm25_standard,pm100_standard,pm10_env,pm25_env,pm100_env,particles_03um,particles_05um,particles_10um,particles_25um,particles_50um,particles_100um
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1.665132e+09,20.528906,47.365668,988.590465,5,9,9,5,9,9,1038,321,39,4,0,0
1,1.665132e+09,20.523828,47.371653,988.575016,5,7,9,5,7,9,1092,314,33,4,2,0
2,1.665132e+09,20.513672,47.330184,988.595814,5,7,9,5,7,9,1092,314,33,4,2,0
3,1.665132e+09,20.508594,47.353699,988.582349,5,7,9,5,7,9,1092,314,33,4,2,0
4,1.665132e+09,20.508594,47.235419,988.580965,5,8,9,5,8,9,1104,321,33,4,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,1.665135e+09,20.645898,46.645576,988.622277,2,6,6,2,6,6,705,197,48,2,0,0
716,1.665135e+09,20.640820,46.757869,988.611388,2,6,6,2,6,6,705,197,48,2,0,0
717,1.665135e+09,20.650977,46.592324,988.599010,2,6,6,2,6,6,705,197,48,2,0,0
718,1.665135e+09,20.640820,46.663265,988.614453,2,5,5,2,5,5,678,185,46,2,0,0


In [32]:
df.dtypes

timestamp          float64
temperature        float64
humidity           float64
pressure           float64
pm10_standard        int32
pm25_standard        int32
pm100_standard       int32
pm10_env             int32
pm25_env             int32
pm100_env            int32
particles_03um       int32
particles_05um       int32
particles_10um       int32
particles_25um       int32
particles_50um       int32
particles_100um      int32
dtype: object

In [33]:
df[df['timestamp'] < 1665131625]

Unnamed: 0_level_0,timestamp,temperature,humidity,pressure,pm10_standard,pm25_standard,pm100_standard,pm10_env,pm25_env,pm100_env,particles_03um,particles_05um,particles_10um,particles_25um,particles_50um,particles_100um
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1665132000.0,20.528906,47.365668,988.590465,5,9,9,5,9,9,1038,321,39,4,0,0
1,1665132000.0,20.523828,47.371653,988.575016,5,7,9,5,7,9,1092,314,33,4,2,0
2,1665132000.0,20.513672,47.330184,988.595814,5,7,9,5,7,9,1092,314,33,4,2,0
3,1665132000.0,20.508594,47.353699,988.582349,5,7,9,5,7,9,1092,314,33,4,2,0
4,1665132000.0,20.508594,47.235419,988.580965,5,8,9,5,8,9,1104,321,33,4,2,0
5,1665132000.0,20.508594,47.365527,988.549284,5,8,9,5,8,9,1104,321,33,4,2,0
6,1665132000.0,20.51875,47.270904,988.580965,5,8,9,5,8,9,1104,321,33,4,2,0


In [34]:
df.__sizeof__()

63184

In [35]:
df

Unnamed: 0_level_0,timestamp,temperature,humidity,pressure,pm10_standard,pm25_standard,pm100_standard,pm10_env,pm25_env,pm100_env,particles_03um,particles_05um,particles_10um,particles_25um,particles_50um,particles_100um
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1.665132e+09,20.528906,47.365668,988.590465,5,9,9,5,9,9,1038,321,39,4,0,0
1,1.665132e+09,20.523828,47.371653,988.575016,5,7,9,5,7,9,1092,314,33,4,2,0
2,1.665132e+09,20.513672,47.330184,988.595814,5,7,9,5,7,9,1092,314,33,4,2,0
3,1.665132e+09,20.508594,47.353699,988.582349,5,7,9,5,7,9,1092,314,33,4,2,0
4,1.665132e+09,20.508594,47.235419,988.580965,5,8,9,5,8,9,1104,321,33,4,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,1.665135e+09,20.645898,46.645576,988.622277,2,6,6,2,6,6,705,197,48,2,0,0
716,1.665135e+09,20.640820,46.757869,988.611388,2,6,6,2,6,6,705,197,48,2,0,0
717,1.665135e+09,20.650977,46.592324,988.599010,2,6,6,2,6,6,705,197,48,2,0,0
718,1.665135e+09,20.640820,46.663265,988.614453,2,5,5,2,5,5,678,185,46,2,0,0
