In [67]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats


In [25]:
DATA_PATH = 'data/'

### Get all data files

In [269]:
data_files = []
for f in os.listdir(DATA_PATH):
        if not f.startswith('.'):
                data_files.append(f)


### Add a new column called device name (should be the name of the txt file), combine all dataframes

In [27]:
data_array = []
for d in data_files:
        device_name = d.split('.')[0]
        print(device_name)
        data_df = pd.read_csv(DATA_PATH + d, skiprows=1)
        data_df.columns = data_df.columns.str.replace(' ','')
        data_df['Device'] = device_name
        data_array.append(data_df)
all_device_data = pd.concat(data_array)

# add datetime
all_device_data['Datetime'] = pd.to_datetime(all_device_data['Date'] + ' ' + all_device_data['Time'])

Device7
Device12
Device13
Device9


In [28]:
to_numeric_cols = all_device_data.columns.drop(['Time', 'Date', 'Battery', 'Fix', 'Longitude','Latitude', 'Temp(C)', 'RH(%)', 'P(hPa)', 'Alti(m)', 'Device', 'Datetime'])
all_device_data[to_numeric_cols] = all_device_data[to_numeric_cols].apply(pd.to_numeric, errors='coerce', downcast='float').astype(float)

In [29]:
dp3 = all_device_data[['Datetime', 'Date', 'Time', 'Dp>0.3','Device']]

In [48]:
rounded = pd.DataFrame(dp3['Datetime'].dt.round('5s'))
dp3['Datetime_round'] = rounded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp3['Datetime_round'] = rounded


In [50]:
dp3_rounded = dp3.groupby(['Datetime_round','Device'])['Dp>0.3'].mean().reset_index()

## Helper Function

In [265]:
def plot_line_chart(df, x, y, title):
    plot = px.line(df,
                          x=x,
                          y=y,
                          color='Device',
                          title=title,
                          labels={'Datetime_round': 'Time'})
    return plot

In [254]:
def drop_numerical_outliers(df, z_thresh=4):
    constrains = df.select_dtypes(include=['float64']) \
        .apply(lambda x: np.abs(stats.zscore(x)) < z_thresh).all(axis=1)
    idx = df.index[constrains==False].tolist()
    new_df = df.drop(idx)
    return new_df

## Get data betweem a specific time 'YYYY-MM-DD HH:MM:SS'

In [18]:
START_TIME = '2022-04-22 16:00:00'
END_TIME = '2022-04-22 16:15:00'

In [210]:
dp3_in_time_range = dp3_rounded[(dp3_rounded['Datetime_round'] >= START_TIME) & (dp3_rounded['Datetime_round'] <= END_TIME)]

In [211]:
#dp3_in_time_range.to_csv("output.csv", index=False)

In [266]:
lineplt_by_time = plot_line_chart(df=dp3_in_time_range, x='Datetime_round', y='Dp>0.3', title='Dp>0.3 Collected by Device')
lineplt_by_time.show()

In [267]:
Z_thresh = 4
dp3_removed_extreme = drop_numerical_outliers(dp3_in_time_range.copy(), Z_thresh)

In [268]:
removed_extreme_plot = plot_line_chart(dp3_removed_extreme, 'Datetime_round', 'Dp>0.3', 'Dp>0.3 Collected by Device with Extreme Removed')
removed_extreme_plot.show()