## Py-imp is the section of the project that is implemented using Python
- This notebook is the data description section
- Data was collected once every minute

In [1]:
# Warning options (before all imports)
import warnings
warnings.filterwarnings('ignore')
# warnings.simplefilter(action='ignore', category=UserWarning)
# warnings.simplefilter(action='ignore', category=FutureWarning)
# Simplified traceback when an exception occurs (switch to `%xmode Verbose` for full detail)
%xmode Plain

Exception reporting mode: Plain


### Install needed packages


In [2]:
# Install a conda package in the current Jupyter kernel
import sys
# !mamba install -c conda-forge --yes --prefix {sys.prefix} <pkg>
# !conda install -c conda-forge --yes --prefix {sys.prefix} <pkg>

# Install a pip package in the current Jupyter kernel
# import sys
# !{sys.executable} -m pip install <pkg>

In [3]:
# 'sklearn for Time series data'
# !mamba install -c conda-forge --yes --prefix {sys.prefix} sktime-all-extras
# 'for Time series data'
# !mamba install -c conda-forge --yes --prefix {sys.prefix} feature_engine
# !mamba install -c conda-forge --yes --prefix {sys.prefix} mlxtend
# !mamba install -c conda-forge --yes --prefix {sys.prefix} statsmodels
# !mamba install -c conda-forge --yes --prefix {sys.prefix} ptitprince
# for tf.keras.utils.plot_model
# !mamba install -c conda-forge --yes --prefix {sys.prefix} graphviz python-graphviz

# # upgrade pip
# !{sys.executable} -m pip install --upgrade pip

# # Debug pip
# # Uninstall SetupTools:
# !{sys.executable} -m pip uninstall pip setuptools
# # Reinstall Setuptools:
# !{sys.executable} -m pip install --upgrade setuptools

# feature engineering Time series data
# !{sys.executable} -m pip install tsfresh
# pretty Matplotlib plots
# !{sys.executable} -m pip install git+https://github.com/iammelvink/LovelyPlots
# better progress bar
# !{sys.executable} -m pip install rich
# better 'describe'
# !{sys.executable} -m pip install skimpy
# For size encoded heatmaps
# !{sys.executable} -m pip install heatmapz
# XGBoost
# !{sys.executable} -m pip install xgboost
# !{sys.executable} -m pip install lightgbm
# !{sys.executable} -m pip install catboost
# !{sys.executable} -m pip install fastprogress
# !{sys.executable} -m pip install matplotx[all]
# !{sys.executable} -m pip install matplotx
# !{sys.executable} -m pip install lux
# !jupyter nbextension install --py luxwidget
# !jupyter nbextension enable --py luxwidget

# !{sys.executable} -m pip install sweetviz
# !{sys.executable} -m pip install yellowbrick
# !{sys.executable} -m pip install autoviz
# !{sys.executable} -m pip install pandas-profiling
# !{sys.executable} -m pip install dtale
# !{sys.executable} -m pip install sketch
# !{sys.executable} -m pip install watermark
# !{sys.executable} -m pip install snoop
# for tf.keras.utils.plot_model
# !{sys.executable} -m pip install pydot
# !{sys.executable} -m pip install waterfallcharts

# !{sys.executable} -m pip install matplotlib pandas==1.5.3 --force-reinstall

In [4]:
# %pip install <pkg> --user

### Importing libraries


In [5]:
import snoop

snoop.install()
%load_ext snoop
# eg. put this before debug code %%snoop

from skimpy import skim  # better describe
from rich.progress import track
import os, random
import timeit

import math
from scipy.stats import chi2_contingency

# Dataset reducer
# Source: https://www.kaggle.com/code/joseguzman/reducing/notebook
from reducing import PandaReducer

# Local ChatGPT-ish
import sketch
os.environ["SKETCH_MAX_COLUMNS"] = "300"
os.environ["SKETCH_ROW_OVERRIDE_LIMIT"] = "5"

# Run locally
# os.environ["LAMBDAPROMPT_BACKEND"] = "StarCoder"
# os.environ["SKETCH_USE_REMOTE_LAMBDAPROMPT"] = "False"
# os.environ["HF_ACCESS_TOKEN"] = "your_hugging_face_token"

# EDA
import numpy as np
import pandas as pd
import sweetviz as sv
# from autoviz import FixDQ # auto clean dataset
from autoviz.AutoViz_Class import AutoViz_Class
from pandas_profiling import ProfileReport
import dtale
# import lux
# lux.config.default_display = "lux" # to display LUX by default
# lux.config.sampling = False # to turn off sampling
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import cycler
from matplotlib import rcParams
# matplotlib extensions
import matplotx
# For better distribution plots
# import ptitprince as pt
# For size encoded heatmaps
from heatmap import *
# import waterfall_chart as wfc

# Pre-processing
from sklearn.preprocessing import *
from feature_engine.creation import CyclicalFeatures
from feature_engine.datetime import DatetimeFeatures
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures
from feature_engine.timeseries.forecasting import *
from mlxtend.evaluate.time_series import *
import datetime
from datetime import datetime as dt
from dateutil.relativedelta import *
from sklearn.compose import *
from sklearn.preprocessing import *
from sklearn.impute import *
from sklearn.pipeline import *

# Modeling
import tensorflow as tf
from tensorflow import keras
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import *
from sklearn.model_selection import *
from sktime.utils.plotting import plot_series
from sklearn.utils import *
from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn.metrics import *
from sklearn import metrics
from sklearn.linear_model import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.svm import *
from xgboost import *
from lightgbm import *
from catboost import *
from sklearn.utils.class_weight import *
from sklearn.exceptions import NotFittedError
import pickle

# Widen numpy's printed lines so arrays wrap less often
np.set_printoptions(linewidth=140)

# Pandas options
# pd.options.display.precision = 3
pd.options.display.max_rows = 250
pd.options.display.max_columns = 7  # wider frames are shown with an elided "..." column
pd.options.mode.chained_assignment=None  # turns off pandas' chained-assignment warning

pd.set_option('display.width', 140)

# Display Pipelines and models
from sklearn import set_config
set_config(display='diagram')

plt.style.reload_library()
# plt.style.use(matplotx.styles.dufte)
# plt.style.use('ipynb') # prettier plots
# NOTE(review): 'ipynb' and 'colors10-markers' are presumably the LovelyPlots
# styles listed in the install cell - confirm they are installed.
plt.style.use(['ipynb', 'colors10-markers'])
# Always put after custom styles
rcParams['axes.spines.top'] = False # remove top border of plot
rcParams['axes.spines.bottom'] = True # keep bottom border of plot
rcParams['axes.spines.left'] = True # keep left border of plot
rcParams['axes.spines.right'] = True # keep right border of plot
# rcParams['axes.grid' ] = True
rcParams["figure.figsize"] = plt.rcParamsDefault["figure.figsize"] # reset figsize

# To display charts in Jupyter
%matplotlib inline

# Matplotlib options
rcParams['figure.dpi'] = 600
# rcParams['figure.figsize'] = 8, 6
rcParams['legend.fontsize'] = "large"
rcParams['xtick.major.size'] = 4
rcParams['xtick.minor.size'] = 1
rcParams["figure.autolayout"] = True

# For reproducibility
# rng = np.random.RandomState(0)
# NOTE: despite its name, `rng` is a plain integer seed, not a generator object.
rng = 777
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(rng)
random.seed(rng)
np.random.seed(rng)
tf.random.set_seed(rng)

# Set the seed using keras.utils.set_random_seed. This will set:
# 1) `numpy` seed
# 2) `tensorflow` random seed
# 3) `python` random seed
# (this repeats the three individual seed calls above with the same value)
tf.keras.utils.set_random_seed(rng)

# This will make TensorFlow ops as deterministic as possible, but it will
# affect the overall performance, so it's not enabled by default.
# `enable_op_determinism()` is introduced in TensorFlow 2.9.
# tf.config.experimental.enable_op_determinism()
# The environment variables below are used instead of the call above.
os.environ["TF_DETERMINISTIC_OPS"] = "True"
os.environ["TF_DISABLE_SEGMENT_REDUCTION_OP_DETERMINISM_EXCEPTIONS"] = "True"

Imported AutoViz_Class version: 0.1.24. Call using:
    AV = AutoViz_Class()
    AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=0, lowess=False,
               chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30, save_plot_dir=None)
Note: verbose=0 or 1 generates charts and displays them in your local Jupyter notebook.
      verbose=2 does not display plots but saves them in AutoViz_Plots folder in local machine.
Note: chart_format='bokeh' generates and displays charts in your local Jupyter notebook.
      chart_format='server' generates and displays charts in the browser - one tab for each chart.


In [6]:
%load_ext watermark
# %watermark
%watermark --iversions

dtale     : 3.5.0
xgboost   : 2.0.0
sketch    : 0.4.2
matplotx  : 0.3.10
sklearn   : 1.3.1
sweetviz  : 2.2.1
scipy     : 1.10.1
pandas    : 1.5.3
numpy     : 1.23.5
sys       : 3.8.17 | packaged by conda-forge | (default, Jun 16 2023, 07:01:59) [MSC v.1929 64 bit (AMD64)]
matplotlib: 3.7.1
tensorflow: 2.10.1
keras     : 2.10.0
seaborn   : 0.11.0



In [7]:
# Helper function for autoviz
def autoviz_custom(dfte, save_plot_dir):
    """
    Run AutoViz on an in-memory DataFrame with a fixed set of options.

    dfte: DataFrame to analyse (passed to AutoViz as `dfte`)
    save_plot_dir: Directory handed to AutoViz as `save_plot_dir`

    Returns whatever `AutoViz_Class().AutoViz(...)` returns.
    """
    options = dict(
        filename="",  # no file on disk: the data comes from `dfte`
        sep=",",
        depVar="",
        dfte=dfte,
        header=0,  # 0 = first row is header & None = no header
        verbose=2,  # per the AutoViz banner: save plots instead of displaying them
        lowess=False,
        chart_format="png",
        max_rows_analyzed=5000000,
        max_cols_analyzed=50000,
        save_plot_dir=save_plot_dir,
    )
    return AutoViz_Class().AutoViz(**options)

In [8]:
# Helper functions to export lux plots to matplotlib code
def export_vis(df):
    """
    Print the Matplotlib code of every visualisation exported from `df`.

    df: object exposing an `exported` collection whose items provide
        `to_matplotlib()` (presumably a lux-enabled DataFrame - confirm)
    """
    for exported_vis in df.exported:
        # translate the Vis object to Matplotlib code and echo it
        print(exported_vis.to_matplotlib())

In [9]:
# Helper functions for getting descriptive statistics
def plot_boxh_groupby(dataset, feature_name, by):
    """
    Horizontal box plot of one feature, split into groups.

    dataset: DataFrame holding the data
    feature_name: Name of the feature to be plotted
    by: Name of the feature based on which groups are created
    """
    box_options = dict(column=feature_name, by=by, vert=False, figsize=(10, 6))
    dataset.boxplot(**box_options)

    plt.title(f'Distribution of {feature_name} by {by}')
    plt.show()


def plot_hist(dataset, feature_name, kind='hist', bins=100, log=True):
    """
    Plot the histogram of a single feature.

    dataset: DataFrame holding the data
    feature_name: Name of the feature to be plotted
    kind: Accepted but not used - a histogram is always drawn
    bins: Number of histogram bins
    log: If truthy, plot log1p of the values instead of the raw values
    """
    if log:
        values = dataset[feature_name].apply(np.log1p)
        title = f'Distribution of log1p[{feature_name}]'
    else:
        values = dataset[feature_name]
        title = f'Distribution of {feature_name}'

    values.plot(kind='hist', bins=bins, figsize=(15, 5), title=title)
    plt.show()


def plot_ts(series, figsize=(20, 6), title=None, xlabel="", ylabel=""):
    """
    Plot time series data.

    series: Series object to be plotted; it should have date or time as index
    figsize: Figure size forwarded to the plot call
    title: Plot title
    xlabel: Label of the x-axis
    ylabel: Label of the y-axis
    """
    series.plot(figsize=figsize, title=title)

    # auto-format the date ticks of the current figure to keep them legible
    plt.gcf().autofmt_xdate()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def plot_barh(dataset, feature_name, normalize=True,
              kind='barh', figsize=(15, 5), sort_index=False, title=None):
    """
    Plot the value counts of a particular feature as a bar chart.

    dataset: DataFrame holding the data
    feature_name: Name of the feature whose values are counted
    normalize: Forwarded to `value_counts`
    kind: Type of the plot
    figsize: Figure size forwarded to the plot call
    sort_index: If truthy the bars are ordered by the index of the counts
                (the category labels), otherwise by the counts themselves
    title: Plot title

    Missing values are counted as their own category (`dropna=False`).
    """
    counts = dataset[feature_name].value_counts(
        normalize=normalize, dropna=False)
    # Only the ordering differs between the two modes, so the plot call is shared
    ordered = counts.sort_index() if sort_index else counts.sort_values()
    ordered.plot(kind=kind, figsize=figsize, grid=True, title=title)
    plt.legend()
    plt.show()


def plot_boxh(dataset, feature_name, kind='box', log=True):
    """
    Horizontal box plot of a single feature.

    dataset: DataFrame holding the data
    feature_name: Name of the feature to be plotted
    kind: Accepted but not used - a box plot is always drawn
    log: If truthy, plot log1p of the values instead of the raw values
    """
    if log:
        values = dataset[feature_name].apply(np.log1p)
        title = f'Distribution of log1p[{feature_name}]'
    else:
        values = dataset[feature_name]
        title = f'Distribution of {feature_name}'

    values.plot(kind='box', vert=False, figsize=(10, 6), title=title)
    plt.show()


def plot_scatter(dataset, feature_x, feature_y, figsize=(10, 10),
                 title=None, xlabel=None, ylabel=None):
    """
    Plot scatter of two features.

    dataset: DataFrame holding the data
    feature_x: Name of the feature on the x-axis
    feature_y: Name of the feature on the y-axis
    figsize: Figure size forwarded to the plot call
    title: Plot title
    xlabel: Label of the x-axis
    ylabel: Label of the y-axis
    """
    # Honour the caller's `figsize`; it used to be ignored in favour of a
    # hard-coded (8, 6).
    dataset.plot.scatter(feature_x, feature_y,
                         figsize=figsize, title=title,
                         legend=None)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def lower_col_name(dataset):
    """
    Lower-case all column names of `dataset` in place.

    dataset: DataFrame whose column labels are strings

    Returns the first three rows of the renamed frame as a preview.
    """
    dataset.columns = [label.lower() for label in dataset.columns]
    preview = dataset.head(3)
    return preview

In [10]:
# Helper functions for categorical type casting
def summarize_categoricals(df, show_levels=False):
    """
        Display uniqueness in each column

        df: DataFrame to summarise
        show_levels: If truthy, also include the 'Levels' column holding the
                     array of unique values of each column

        Returns a DataFrame indexed by the column names of `df` with the
        columns 'No. of Levels' and 'No. of Missing Values' ('Levels' is
        prepended when `show_levels` is set).
    """
    data = []
    for c in df.columns:
        # Compute the unique values once and reuse them for the count
        levels = df[c].unique()
        data.append([levels, len(levels), df[c].isnull().sum()])
    df_temp = pd.DataFrame(data, index=df.columns,
                           columns=['Levels', 'No. of Levels',
                                    'No. of Missing Values'])
    first_column = 0 if show_levels else 1
    return df_temp.iloc[:, first_column:]


def return_categoricals(df, threshold=5):
    """
        Returns a list of columns that have less than or equal to
        `threshold` number of unique categorical levels

        df: DataFrame to inspect
        threshold: Maximum number of unique values for a column to be returned
    """
    # A comprehension tests the condition itself. The former
    # `filter(lambda c: c if ... else None, ...)` tested the truthiness of the
    # column label and therefore dropped labels such as 0 or "".
    return [c for c in df.columns if len(df[c].unique()) <= threshold]


def to_categorical(columns, df):
    """
        Cast every column named in `columns` to the categorical datatype.

        columns: Iterable of column names to convert
        df: DataFrame that is modified in place

        Returns the same (mutated) DataFrame.
    """
    for name in columns:
        as_category = df[name].astype('category')
        df[name] = as_category
    return df

In [11]:
# allow memory growth of GPU
# Memory growth is switched on for every GPU TensorFlow lists; with no GPU the
# list is empty and the loop does nothing.
# NOTE(review): presumably this stops TensorFlow from reserving all GPU memory
# up front - confirm against the TensorFlow docs.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

<a id="import-and-clean-data"></a>
## 1.   Import and Clean Data

In [12]:
# Folder holding the raw sensor data, relative to this notebook.
# NOTE: the value already ends with "/", so the f-string below produces a path
# containing "//".
data_path = "../../data/d_pump/"

# Read the sensor readings and parse the `timestamp` column as datetimes
water_pump = pd.read_csv(f"{data_path}/sensor.csv", parse_dates=["timestamp"])

# Rename timestamp column to datetime
water_pump.rename(columns={'timestamp': 'datetime'}, inplace=True)

# Rename machine_status column to pump_status
water_pump.rename(columns={'machine_status': 'pump_status'}, inplace=True)

# Convert pump_status to lowercase
water_pump.loc[:,'pump_status'] = water_pump.loc[:,'pump_status'].str.lower()

# Sort every table by datetime (in place, with a fresh 0..n-1 index) for better
# readability. The datetime parsing itself already happened in `read_csv` above.
tables = [water_pump]
for dataset in tables:
    dataset.sort_values(["datetime"],
    inplace=True, ignore_index=True)

<a id="missing-values"></a>
### 1.1.   Missing values and type casting to categoricals/numeric

In [13]:
# Map a display name to every dataset that should be checked
datasets = {'Water pump': water_pump}

# For each dataset, print the per-column number and percentage of missing values
for name, df in datasets.items():
    print(f"Missing values in {name} dataset:")
    total_entries = df.shape[0]
    missing_count = df.isna().sum()
    missing_percentage = (missing_count / total_entries) * 100
    missing_info = pd.DataFrame({
        'Missing Count': missing_count,
        'Missing Percentage': missing_percentage,
    })
    print(missing_info)
    print("=" * 50)  # Separator between datasets

Missing values in Water pump dataset:
             Missing Count  Missing Percentage
Unnamed: 0               0            0.000000
datetime                 0            0.000000
sensor_00            10208            4.633261
sensor_01              369            0.167484
sensor_02               19            0.008624
sensor_03               19            0.008624
sensor_04               19            0.008624
sensor_05               19            0.008624
sensor_06             4798            2.177741
sensor_07             5451            2.474129
sensor_08             5107            2.317992
sensor_09             4595            2.085603
sensor_10               19            0.008624
sensor_11               19            0.008624
sensor_12               19            0.008624
sensor_13               19            0.008624
sensor_14               21            0.009532
sensor_15           220320          100.000000
sensor_16               31            0.014070
sensor_17             

- Drop the 'Unnamed: 0' column because it is just an index column, which is not useful for the analysis.

In [14]:
# Also drop the 'Unnamed: 0' column because it is just an index column which is not useful for the analysis.
# Dropping by name with errors='ignore' keeps this cell idempotent: a positional
# drop of `water_pump.columns[0]` would remove `datetime` if the cell ran twice.
water_pump.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [15]:
# Run the dataset reducer (local `reducing` module) over the frame and rebind
# `water_pump` to its result; the reducer's own report is printed below the cell.
water_pump = PandaReducer().reduce(water_pump)

Dataset reduced 3.47% : 90.77 to 87.62 MB in 2.59 seconds


In [16]:
# Heading for the per-column uniqueness table rendered as the cell output
print("Summary for DataFrame: Water pump")
summarize_categoricals(water_pump, show_levels=True)

Summary for DataFrame: Water pump


Unnamed: 0,Levels,No. of Levels,No. of Missing Values
datetime,"[2018-04-01T00:00:00.000000000, 2018-04-01T00:...",220320,0
sensor_00,"[2.465394, 2.444734, 2.460474, 2.445718, 2.453...",1254,10208
sensor_01,"[47.09201, 47.35243, 47.13541, 47.04861, 47.17...",832,369
sensor_02,"[53.2118, 53.1684, 53.168399810790994, 53.125,...",832,19
sensor_03,"[46.31076, 46.39757, 46.3975677490234, 46.3107...",589,19
sensor_04,"[634.375, 638.8889, 628.125, 636.4583, 637.615...",7845,19
sensor_05,"[76.45975, 73.54598, 76.98898, 76.588969999999...",190752,19
sensor_06,"[13.41146, 13.32465, 13.317420000000002, 13.35...",813,4798
sensor_07,"[16.13136, 16.037329999999997, 16.24711, 16.21...",532,5451
sensor_08,"[15.56713, 15.617770000000002, 15.69734, 15.89...",627,5107


In [17]:
# Columns with at most 5 distinct values are treated as categorical
to_cast = return_categoricals(water_pump, threshold=5)
# `to_categorical` converts those columns in place and returns the same frame
water_pump = to_categorical(to_cast, water_pump)
# Show the resulting dtypes and non-null counts
water_pump.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220320 entries, 0 to 220319
Data columns (total 53 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   datetime     220320 non-null  datetime64[ns]
 1   sensor_00    210112 non-null  float64       
 2   sensor_01    219951 non-null  float64       
 3   sensor_02    220301 non-null  float64       
 4   sensor_03    220301 non-null  float64       
 5   sensor_04    220301 non-null  float64       
 6   sensor_05    220301 non-null  float64       
 7   sensor_06    215522 non-null  float64       
 8   sensor_07    214869 non-null  float64       
 9   sensor_08    215213 non-null  float64       
 10  sensor_09    215725 non-null  float64       
 11  sensor_10    220301 non-null  float64       
 12  sensor_11    220301 non-null  float64       
 13  sensor_12    220301 non-null  float64       
 14  sensor_13    220301 non-null  float64       
 15  sensor_14    220299 non-null  floa

In [18]:
# View head of the dataset
water_pump.head(5)

Unnamed: 0,datetime,sensor_00,sensor_01,...,sensor_50,sensor_51,pump_status
0,2018-04-01 00:00:00,2.465394,47.09201,...,243.0556,201.3889,normal
1,2018-04-01 00:01:00,2.465394,47.09201,...,243.0556,201.3889,normal
2,2018-04-01 00:02:00,2.444734,47.35243,...,241.3194,203.7037,normal
3,2018-04-01 00:03:00,2.460474,47.09201,...,240.4514,203.125,normal
4,2018-04-01 00:04:00,2.445718,47.13541,...,242.1875,201.3889,normal


In [19]:
# water_pump.head(5).to_clipboard(sep=',', index=True)

In [20]:
# View tail of the dataset
water_pump.tail(5)

Unnamed: 0,datetime,sensor_00,sensor_01,...,sensor_50,sensor_51,pump_status
220315,2018-08-31 23:55:00,2.40735,47.69965,...,,231.1921,normal
220316,2018-08-31 23:56:00,2.400463,47.69965,...,,231.1921,normal
220317,2018-08-31 23:57:00,2.396528,47.69965,...,,232.0602,normal
220318,2018-08-31 23:58:00,2.406366,47.69965,...,,234.0856,normal
220319,2018-08-31 23:59:00,2.396528,47.69965,...,,234.0856,normal


In [21]:
# water_pump.tail(5).to_clipboard(sep=',', index=True)