In [None]:
# default_exp load

# Load data from the longwall

> Methods for loading data

In [None]:
#hide
from nbdev.showdoc import *
from fastcore import test

In [None]:
#export
import pandas as pd
import numpy as np

## Read data

We will take one day of data from the shearer. The data is hosted at https://aida.ii.uam.es/2018-01-15.csv

In [None]:
!wget -O /home/jovyan/data/input_data.csv https://aida.ii.uam.es/2018-01-15.csv

--2020-02-25 17:46:37--  https://aida.ii.uam.es/2018-01-15.csv
Resolving aida.ii.uam.es (aida.ii.uam.es)... 150.244.57.52
Connecting to aida.ii.uam.es (aida.ii.uam.es)|150.244.57.52|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42457900 (40M) [text/csv]
Saving to: ‘/home/jovyan/data/input_data.csv’


2020-02-25 17:46:38 (112 MB/s) - ‘/home/jovyan/data/input_data.csv’ saved [42457900/42457900]



In [None]:
# Parse the downloaded CSV: fields are ';'-separated and the first two
# lines of the file are skipped before the header row.
data = pd.read_csv(
    '/home/jovyan/data/input_data.csv',
    skiprows=2,
    sep=';',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
data.head()

Unnamed: 0,description,Prąd silnika organu lewego,Prąd silnika organu prawego,Prąd lewego silnika posuwu,Prąd prawego silnika posuwu,Temperatura łożysko – silnik organu prawego,Temperatura uzwojenie – silnik organu prawego,Temperatura łożysko – silnik organu lewego,Temperatura uzwojenie – silnik organu lewego,Temperatura łożysko – silnik ciągnika lewego,...,Doziemienie.8,Napęd górny PZS bieg wolny,Załączenie.9,Prąd.9,Zabezpieczenie.9,Przeciążenie.9,Zwarcie.9,Temperatura.9,Doziemienie.9,Unnamed: 176
0,2018-01-15 00:00:00,0.0,0.0,0.0,0.0,28.0,36.0,24.0,30.0,26.0,...,,5.0,,,,,,,,
1,2018-01-15 00:00:01,0.0,0.0,0.0,0.0,28.0,36.0,24.0,30.0,26.0,...,,5.0,,,,,,,,
2,2018-01-15 00:00:02,0.0,0.0,0.0,0.0,28.0,36.0,24.0,30.0,26.0,...,,5.0,,,,,,,,
3,2018-01-15 00:00:03,0.0,0.0,0.0,0.0,28.0,36.0,24.0,30.0,26.0,...,,5.0,,,,,,,,
4,2018-01-15 00:00:04,0.0,0.0,0.0,0.0,28.0,36.0,24.0,30.0,26.0,...,,5.0,,,,,,,,


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86400 entries, 0 to 86399
Columns: 177 entries, description to Unnamed: 176
dtypes: float64(122), int64(1), object(54)
memory usage: 116.7+ MB


The timestamp is given in the column `description`

In [None]:
# Parse `description` into a datetime column named `timestamp` (appended as
# the last column) and discard the original text column.
data = (
    data
    .assign(timestamp=pd.to_datetime(data['description']))
    .drop(columns='description')
)

In [None]:
# Split the frame by dtype: object columns are cast to bool and placed first,
# every other column follows unchanged.
df2 = data.select_dtypes(include='object').astype('bool')
df1 = data.select_dtypes(exclude='object')
data = pd.concat([df2.reset_index(drop=True), df1], axis='columns')

For the dimensionality reduction we might be interested only in the numeric columns

In [None]:
# Keep only the float and datetime columns; columns of any other dtype
# (e.g. the boolean and integer ones) are left out.
data_numeric = data.select_dtypes(include=['float', 'datetime'])

As detailed in the TimeCluster paper, the data will be normalized into the range $[0, 1]$. Also, columns that contain only NaN values will be removed.

In [None]:
# Min-max scale every float column into [0, 1].
# `data_numeric` was obtained by selecting columns from `data`, so take an
# explicit copy before assigning into it; this avoids pandas'
# SettingWithCopyWarning. Columns whose min equals max turn into NaN (0/0).
tmp = data_numeric.select_dtypes(include='float')
data_numeric = data_numeric.copy()
data_numeric[tmp.columns] = (tmp - tmp.min()) / (tmp.max() - tmp.min())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [None]:
# First remove the columns that hold no value at all, then replace the
# NaN entries that remain with 0.
data_numeric = data_numeric.dropna(axis='columns', how='all')
data_numeric = data_numeric.fillna(0)

Finally, we define a function that gathers all these operations

In [None]:
# export
def fpreprocess_numeric_vars(data, cname_ts='description', normalize=True, nan_replacement=0):
    """Preprocess a dataframe `data` containing the monitoring data from a mining longwall.

    Each column is expected to hold the values of one variable as a time series
    whose time index is stored in the column named `cname_ts`. That column is
    parsed with `pd.to_datetime` and returned under the name 'timestamp'.
    Only the float columns and the timestamp are kept; columns of any other
    dtype (object, integer, ...) are removed.

    Parameters
    ----------
    data : pd.DataFrame
        Raw monitoring data. It is NOT modified; a new dataframe is returned.
    cname_ts : str
        Name of the column holding the timestamps (may already be 'timestamp').
    normalize : bool
        If True, every float column is min-max scaled into [0, 1]. A column
        whose min equals max has zero range, becomes all-NaN (0/0) and is
        therefore dropped together with the other all-NaN columns.
    nan_replacement : scalar
        Value used to replace the NaN entries that remain after the all-NaN
        columns have been removed.

    Returns
    -------
    pd.DataFrame
        Float columns (in their original order) followed by 'timestamp',
        with the same row index as `data`.
    """
    timestamps = pd.to_datetime(data[cname_ts])
    # Drop first and add afterwards: the caller's frame is left untouched and
    # the parsed column survives even when `cname_ts` is already 'timestamp'.
    prepared = data.drop(columns=[cname_ts]).assign(timestamp=timestamps)
    # Select straight from `prepared` (no index reset / concat) so rows stay
    # aligned for any input index; `.copy()` makes the assignment below safe.
    data_numeric = prepared.select_dtypes(include=['float', 'datetime']).copy()
    float_cols = data_numeric.select_dtypes(include='float').columns
    if normalize and len(float_cols) > 0:
        values = data_numeric[float_cols]
        data_numeric[float_cols] = (values - values.min()) / (values.max() - values.min())
    return data_numeric.dropna(axis=1, how='all').fillna(nan_replacement)

## Read multiple monitoring files

Since the mining monitoring data is given as a set of CSV files, one per day, it is useful to have a function to load multiple files in order to analyse data from multiple days.

In [None]:
# export
def fread_and_concat(paths, **read_args):
    """Read every CSV in `paths` and stack the results into one dataframe.

    Each element of `paths` is passed to `pd.read_csv` together with
    `read_args`. All dataframes must have the same columns. The rows are
    concatenated in the order of `paths` and the index is renumbered.
    """
    frames = []
    for path in paths:
        frames.append(pd.read_csv(path, **read_args))
    return pd.concat(frames, ignore_index=True)

In [None]:
# Read two consecutive days, once file by file and once through the helper,
# so the two results can be compared afterwards.
paths = [
    '/data/PACMEL-2019/343_HMB/2018-01-14.csv',
    '/data/PACMEL-2019/343_HMB/2018-01-15.csv',
]
df1, df2 = (pd.read_csv(path, sep=';', skiprows=2) for path in paths)
df = fread_and_concat(paths, sep=';', skiprows=2)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# `test.equals` / `test.all_equal` hand back a boolean (see the `True` output)
# instead of raising, so the first comparison was silently discarded.
# Plain asserts make the cell fail when the concatenation is wrong.
assert df1.shape[0] + df2.shape[0] == df.shape[0], "rows of the daily files must add up"
assert df1.shape[1] == df2.shape[1] == df.shape[1], "all frames must have the same number of columns"

True

In [None]:
# export
def fread_mining_monitoring_files(paths):
    """Read monitoring files from the PACMEL mining use case.

    The files listed in `paths` are read through `fread_and_concat` with ';'
    as the field separator and the first two lines of each file skipped.
    """
    csv_options = {'sep': ';', 'skiprows': 2}
    return fread_and_concat(paths, **csv_options)

In [None]:
# Load the same two days through the PACMEL-specific reader.
paths = [
    '/data/PACMEL-2019/343_HMB/2018-01-14.csv',
    '/data/PACMEL-2019/343_HMB/2018-01-15.csv',
]
df = fread_mining_monitoring_files(paths=paths)

  


In [None]:
# Assert (rather than merely display) that the reader returns a dataframe,
# using the public `pd.DataFrame` name instead of the internal module path.
assert isinstance(df, pd.DataFrame)

True

## Export notebook

In [None]:
# hide
# Import only the name that is used instead of star-importing the module.
from nbdev.export import notebook2script
notebook2script()

Converted 00_load.ipynb.
Converted 01_DCAE.ipynb.
Converted 02_dimensionality_reduction.ipynb.
Converted index.ipynb.
