### Import the necessary libraries

In [111]:
import glob
import pandas as pd

### Get the data

In [124]:
path =r'data/luftdaten/5331' 
bf_5331 = pd.concat([pd.read_csv(f, sep=';') for f in glob.glob(path + "/*.csv")],
                      ignore_index=True)

### Start to explore the data

In [125]:
bf_5331.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125131 entries, 0 to 125130
Data columns (total 12 columns):
sensor_id      125131 non-null int64
sensor_type    125131 non-null object
location       125131 non-null int64
lat            125131 non-null float64
lon            125131 non-null float64
timestamp      125131 non-null object
P1             125131 non-null float64
durP1          0 non-null float64
ratioP1        0 non-null float64
P2             125131 non-null float64
durP2          0 non-null float64
ratioP2        0 non-null float64
dtypes: float64(8), int64(2), object(2)
memory usage: 11.5+ MB


### Look at the first rows

In [126]:
bf_5331.head()

Unnamed: 0,sensor_id,sensor_type,location,lat,lon,timestamp,P1,durP1,ratioP1,P2,durP2,ratioP2
0,5331,SDS011,3121,57.138,-2.077,2017-11-26T00:00:21,8.43,,,2.87,,
1,5331,SDS011,3121,57.138,-2.077,2017-11-26T00:02:48,5.5,,,1.8,,
2,5331,SDS011,3121,57.138,-2.077,2017-11-26T00:05:16,5.87,,,2.3,,
3,5331,SDS011,3121,57.138,-2.077,2017-11-26T00:07:43,9.7,,,2.4,,
4,5331,SDS011,3121,57.138,-2.077,2017-11-26T00:10:11,13.33,,,1.73,,


### What is the structure like? 

In [127]:
# this provides a rough equivalent of the R str() function:

def rstr(df): return df.shape, df.apply(lambda x: [x.unique()])

print(rstr(bf_5331))

((125131, 12), sensor_id                                               [[5331]]
sensor_type                                           [[SDS011]]
location                                          [[3121, 2688]]
lat                               [[57.138000000000005, 57.128]]
lon                                            [[-2.077, -2.13]]
timestamp      [[2017-11-26T00:00:21, 2017-11-26T00:02:48, 20...
P1             [[8.43, 5.5, 5.87, 9.7, 13.33, 4.53, 4.5, 6.8,...
durP1                                                    [[nan]]
ratioP1                                                  [[nan]]
P2             [[2.87, 1.8, 2.3, 2.4, 1.73, 1.27, 1.63, 1.9, ...
durP2                                                    [[nan]]
ratioP2                                                  [[nan]]
dtype: object)


## Drop columns with no values

In [128]:
bf_5331_nona = bf_5331.dropna(axis=1, how='all')

In [129]:
print(rstr(bf_5331_nona))

((125131, 8), sensor_id                                               [[5331]]
sensor_type                                           [[SDS011]]
location                                          [[3121, 2688]]
lat                               [[57.138000000000005, 57.128]]
lon                                            [[-2.077, -2.13]]
timestamp      [[2017-11-26T00:00:21, 2017-11-26T00:02:48, 20...
P1             [[8.43, 5.5, 5.87, 9.7, 13.33, 4.53, 4.5, 6.8,...
P2             [[2.87, 1.8, 2.3, 2.4, 1.73, 1.27, 1.63, 1.9, ...
dtype: object)


<span style="color:blue">Note that, in the cell above, 3 values change (location, lat, long). Was the sensor moved?</span> 

Assuming for now that the sensor is stationary, we drop the columns which should not vary from reading to reading

In [131]:
bf_5331_nona_trim = bf_5331_nona.drop(['sensor_id','location','sensor_type','lat','lon'],axis=1)

### Run a command to emulate str() in R. 

In [132]:
print(rstr(bf_5331_nona_trim))

((125131, 3), timestamp    [[2017-11-26T00:00:21, 2017-11-26T00:02:48, 20...
P1           [[8.43, 5.5, 5.87, 9.7, 13.33, 4.53, 4.5, 6.8,...
P2           [[2.87, 1.8, 2.3, 2.4, 1.73, 1.27, 1.63, 1.9, ...
dtype: object)


In [135]:
max_5331_p1 = bf_5331_nona_trim[bf_5331_nona_trim['P1']==bf_5331_nona_trim['P1'].max()]
min_5331_p1 = bf_5331_nona_trim[bf_5331_nona_trim['P1']==bf_5331_nona_trim['P1'].min()]

max_5331_p2 = bf_5331_nona_trim[bf_5331_nona_trim['P2']==bf_5331_nona_trim['P2'].max()]
min_5331_p2 = bf_5331_nona_trim[bf_5331_nona_trim['P2']==bf_5331_nona_trim['P2'].min()]

In [142]:
print ("Max P1")
print (max_5331_p1)

print ("\nMin P1")
print (min_5331_p1)

print ("\nMax P2")
print (max_5331_p2)

print ("\nMin P2")
print (min_5331_p2)

Max P1
                 timestamp      P1      P2
90942  2018-02-05T15:29:20  1999.9  691.83

Min P1
                  timestamp   P1    P2
57750   2017-12-25T06:48:26  0.5  0.50
79320   2018-04-23T14:04:33  0.5  0.50
79322   2018-04-23T14:09:29  0.5  0.50
86514   2018-04-24T00:08:11  0.5  0.50
86521   2018-04-24T00:25:23  0.5  0.50
86523   2018-04-24T00:30:19  0.5  0.50
95978   2018-04-22T15:48:19  0.5  0.50
96004   2018-04-22T16:52:33  0.5  0.50
96018   2018-04-22T17:27:05  0.5  0.50
96031   2018-04-22T17:59:05  0.5  0.50
98525   2018-01-29T02:32:51  0.5  0.50
98544   2018-01-29T03:19:35  0.5  0.50
98553   2018-01-29T03:41:44  0.5  0.50
98555   2018-01-29T03:46:39  0.5  0.50
98557   2018-01-29T03:51:34  0.5  0.50
98567   2018-01-29T04:16:10  0.5  0.50
98572   2018-01-29T04:28:27  0.5  0.50
107208  2017-12-23T02:03:53  0.5  0.50
107211  2017-12-23T02:11:16  0.5  0.50
107213  2017-12-23T02:16:11  0.5  0.50
107215  2017-12-23T02:21:06  0.5  0.50
107220  2017-12-23T02:33:24  0.5  0.50
10