In [None]:
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import plotly.express as px
from sklearn.decomposition import PCA
from datetime import datetime, timezone, timedelta

In [None]:
# Directory (relative to the working directory) that holds the CSV files:
# the notebook reads `<machine>_norm.csv` / `<machine>.csv` from it and
# writes `all_norm.csv` back into it.
DATA_PATH = 'csv'
# Seed passed as `random_state` to KMeans and to the per-machine t-SNE runs.
# NOTE(review): the combined "All machines" cells hard-code 1997 instead.
RANDOM_STATE = 123

In [None]:
def clusters(
    df: pd.DataFrame,
    n_clusters: int
):
    """Assign a k-means cluster label to every row of ``df``.

    Every column except ``time`` is used as a feature. The input frame is
    left untouched.

    Args:
        df: Frame with a ``time`` column plus the feature columns.
        n_clusters: Number of clusters requested from KMeans.

    Returns:
        A new frame with two columns: ``time`` (copied from ``df``) and
        ``cluster`` (the fitted KMeans label of each row).
    """
    features = df.drop(columns=['time'])

    model = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
    model.fit(features)

    # `assign` returns a fresh frame, so the caller's data is never mutated.
    return df[['time']].assign(cluster=model.labels_)

def plot_clusters_tsne(
    df: pd.DataFrame,
    n_clusters: int,
    method: str = 'tsne'
):
    """Cluster ``df`` with k-means and show a 2-D scatter of the clusters.

    All columns except ``time`` are used as features, both for clustering
    and for the 2-D projection. The caller's frame is not modified.

    Args:
        df: Frame with a ``time`` column plus the feature columns.
        n_clusters: Number of k-means clusters.
        method: Projection used for the scatter plot, ``'tsne'`` or ``'pca'``.

    Raises:
        ValueError: If ``method`` is neither ``'tsne'`` nor ``'pca'``.
    """
    # Validate up front so a typo fails immediately instead of after the
    # clustering, with an unrelated error about missing x/y columns.
    if method not in ('tsne', 'pca'):
        raise ValueError(
            f"Unsupported method {method!r}; expected 'tsne' or 'pca'."
        )

    X = df.drop(columns=['time'])

    # `clusters` returns a new frame (time + cluster), so rebinding `df`
    # here does not touch the caller's data.
    df = clusters(df, n_clusters)

    if method == 'tsne':
        embedding = TSNE(
            n_components=2,
            random_state=RANDOM_STATE
        ).fit_transform(X)
    else:
        # Seeded as well so the randomized SVD solver (when selected) is repeatable.
        embedding = PCA(
            n_components=2,
            random_state=RANDOM_STATE
        ).fit_transform(X.values)

    # Assign the raw arrays positionally: wrapping them in a DataFrame would
    # align on a fresh RangeIndex and misplace points for non-default indexes.
    df['x'] = embedding[:, 0]
    df['y'] = embedding[:, 1]

    fig = px.scatter(df, x='x', y='y', color='cluster', hover_name='time')
    fig.show()

def make_plot(
    machine:str,
    n_clusters: int = 3,
    method: str = 'tsne'
):
    """Load one machine's normalised CSV and plot its k-means clusters.

    Args:
        machine: Machine identifier; the file read is
            ``<DATA_PATH>/<machine>_norm.csv``.
        n_clusters: Number of k-means clusters (default 3).
        method: 2-D projection handed to ``plot_clusters_tsne``.
    """
    csv_path = os.path.join(DATA_PATH, machine+'_norm.csv')
    machine_data = pd.read_csv(csv_path)
    plot_clusters_tsne(machine_data, n_clusters, method)

## TSNE

In [None]:
# t-SNE projection of WOS___174L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___174L', method='tsne')

In [None]:
# t-SNE projection of WOS___175L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___175L', method='tsne')

In [None]:
# t-SNE projection of WOS___176L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___176L', method='tsne')

In [None]:
# t-SNE projection of WOS___177L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___177L', method='tsne')

In [None]:
# t-SNE projection of WOS___179L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___179L', method='tsne')

## PCA

In [None]:
# PCA projection of WOS___174L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___174L', method='pca')

In [None]:
# PCA projection of WOS___175L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___175L', method='pca')

In [None]:
# PCA projection of WOS___176L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___176L', method='pca')

In [None]:
# PCA projection of WOS___177L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___177L', method='pca')

In [None]:
# PCA projection of WOS___179L, coloured by k-means cluster (default cluster count).
make_plot(machine='WOS___179L', method='pca')

## Failure shifts

In [None]:
# machine = 'WOS___177L'
# start_date = datetime.strptime('2021-02-19T18:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
# end_date = datetime.strptime('2021-02-20T04:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)

In [None]:
# data = pd.read_csv(os.path.join(DATA_PATH, machine+'_norm.csv'), parse_dates=['time'])

In [None]:
# selected_times = data[(data['time'] > start_date) & (data['time'] < end_date)]['time']

In [27]:
# Failure-shift view for one machine: project all rows to 2-D and colour the
# rows that fall inside the failure window differently from the rest.
machine = 'WOS___177L'
start_date = datetime.strptime('2021-02-19T18:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
end_date = datetime.strptime('2021-02-20T04:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)

data = pd.read_csv(os.path.join(DATA_PATH, machine+'_norm.csv'), parse_dates=['time'])

# Rows strictly inside the failure window (both bounds are exclusive).
in_failure_shift = (data['time'] > start_date) & (data['time'] < end_date)
selected_times = data.loc[in_failure_shift, 'time']

df = data.copy()
method = 'tsne'

# Label directly from the boolean mask instead of testing every row against a
# freshly built list of selected timestamps (which was quadratic in row count).
df['color'] = in_failure_shift.map({True: 'Failure shift', False: 'Normal shift'})

# NOTE(review): this is the only per-machine cell that also drops 'ENGHOURS'
# from the features — confirm this is intentional for this machine.
X = df.drop(columns=['time', 'color', 'ENGHOURS'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()

In [34]:
# Failure-shift view for one machine: project all rows to 2-D and colour the
# rows that fall inside the failure window differently from the rest.
machine = 'WOS___176L'
# Alternative window, currently disabled:
# start_date = datetime.strptime('2021-01-27T17:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
# end_date = datetime.strptime('2021-01-28T05:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)

# NOTE(review): the active window below differs from the WOS___176L entry in
# `failure_shifts` (which uses the disabled dates above) — confirm which one
# is the real failure shift.
start_date = datetime.strptime('2021-01-13T06:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
end_date = datetime.strptime('2021-01-13T17:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)

data = pd.read_csv(os.path.join(DATA_PATH, machine+'_norm.csv'), parse_dates=['time'])

# Rows strictly inside the failure window (both bounds are exclusive).
in_failure_shift = (data['time'] > start_date) & (data['time'] < end_date)
selected_times = data.loc[in_failure_shift, 'time']

df = data.copy()
method = 'tsne'

# Label directly from the boolean mask instead of testing every row against a
# freshly built list of selected timestamps (which was quadratic in row count).
df['color'] = in_failure_shift.map({True: 'Failure shift', False: 'Normal shift'})

X = df.drop(columns=['time', 'color'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()

In [38]:
# Failure-shift view for one machine: project all rows to 2-D and colour the
# rows that fall inside the failure window differently from the rest.
machine = 'WOS___175L'
start_date = datetime.strptime('2021-02-11T07:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
end_date = datetime.strptime('2021-02-11T13:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)

data = pd.read_csv(os.path.join(DATA_PATH, machine+'_norm.csv'), parse_dates=['time'])

# Rows strictly inside the failure window (both bounds are exclusive).
in_failure_shift = (data['time'] > start_date) & (data['time'] < end_date)
selected_times = data.loc[in_failure_shift, 'time']

df = data.copy()
method = 'tsne'

# Label directly from the boolean mask instead of testing every row against a
# freshly built list of selected timestamps (which was quadratic in row count).
df['color'] = in_failure_shift.map({True: 'Failure shift', False: 'Normal shift'})

X = df.drop(columns=['time', 'color'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()

In [39]:
# Failure-shift view for one machine: project all rows to 2-D and colour the
# rows that fall inside the failure window differently from the rest.
machine = 'WOS___174L'
start_date = datetime.strptime('2020-10-01T23:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
end_date = datetime.strptime('2020-10-02T09:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)

data = pd.read_csv(os.path.join(DATA_PATH, machine+'_norm.csv'), parse_dates=['time'])

# Rows strictly inside the failure window (both bounds are exclusive).
in_failure_shift = (data['time'] > start_date) & (data['time'] < end_date)
selected_times = data.loc[in_failure_shift, 'time']

df = data.copy()
method = 'tsne'

# Label directly from the boolean mask instead of testing every row against a
# freshly built list of selected timestamps (which was quadratic in row count).
df['color'] = in_failure_shift.map({True: 'Failure shift', False: 'Normal shift'})

X = df.drop(columns=['time', 'color'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()

In [40]:
# Failure-shift view for one machine: project all rows to 2-D and colour the
# rows that fall inside the failure window differently from the rest.
machine = 'WOS___179L'
start_date = datetime.strptime('2021-04-30T23:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
end_date = datetime.strptime('2021-05-01T11:00:00Z', '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)

data = pd.read_csv(os.path.join(DATA_PATH, machine+'_norm.csv'), parse_dates=['time'])

# Rows strictly inside the failure window (both bounds are exclusive).
in_failure_shift = (data['time'] > start_date) & (data['time'] < end_date)
selected_times = data.loc[in_failure_shift, 'time']

df = data.copy()
method = 'tsne'

# Label directly from the boolean mask instead of testing every row against a
# freshly built list of selected timestamps (which was quadratic in row count).
df['color'] = in_failure_shift.map({True: 'Failure shift', False: 'Normal shift'})

X = df.drop(columns=['time', 'color'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=RANDOM_STATE
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()

## All machines

In [44]:
def _parse_utc(stamp):
    """Turn a ``YYYY-MM-DDTHH:MM:SSZ`` string into a timezone-aware UTC datetime."""
    return datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)


# Known failure shift per machine, keyed by machine name.
#   start / end : UTC bounds of the failure window
#   c           : per-machine label used as the colour value in the
#                 combined look-back plots
failure_shifts = {
    name: {'start': _parse_utc(start), 'end': _parse_utc(end), 'c': label}
    for name, start, end, label in (
        ('WOS___179L', '2021-04-30T23:00:00Z', '2021-05-01T11:00:00Z', 1),
        ('WOS___177L', '2021-02-19T18:00:00Z', '2021-02-20T04:00:00Z', 2),
        ('WOS___176L', '2021-01-27T17:00:00Z', '2021-01-28T05:00:00Z', 3),
        ('WOS___175L', '2021-02-11T07:00:00Z', '2021-02-11T13:00:00Z', 4),
        ('WOS___174L', '2020-10-01T23:00:00Z', '2020-10-02T09:00:00Z', 5),
    )
}

In [45]:
from sklearn import preprocessing

# Build one min-max-scaled frame covering every machine and save it as
# `all_norm.csv`. Inputs are the raw `<machine>.csv` files in DATA_PATH;
# anything ending in `_norm.csv` (including the file written below) is skipped.
df_list = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined frame (and everything computed from it) stable across runs.
for f in sorted(os.listdir(DATA_PATH)):
    if not f.endswith('.csv') or f.endswith('_norm.csv'):
        continue

    # The file name without its extension is used as the machine identifier.
    machine_name = os.path.splitext(f)[0]
    machine_df = pd.read_csv(os.path.join(DATA_PATH, f), parse_dates=['time'])
    machine_df['machine'] = machine_name

    df_list.append(machine_df)

df = pd.concat(df_list).reset_index(drop=True)

# Everything except the two identifying columns is treated as a feature.
feature_cols = df.columns.drop(['time', 'machine'])

# The scaler is fitted on all machines together, so each feature is scaled
# with a single min/max shared by every machine.
data_to_scale = df[feature_cols].values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
data_scaled = min_max_scaler.fit_transform(data_to_scale)

df_scaled = pd.DataFrame(data_scaled, columns=feature_cols)
df_scaled['time'] = df['time']
df_scaled['machine'] = df['machine']

# Put the identifying columns first, then drop the HYDDRV feature.
df_scaled = df_scaled[['time', 'machine', *feature_cols]].drop(columns='HYDDRV')

file_name = 'all_norm.csv'
df_scaled.to_csv(
    os.path.join(DATA_PATH, file_name),
    index=False
)

In [47]:
# Combined view of all machines: rows inside their machine's failure window
# are coloured 'Failure shift', everything else 'Normal shift'.
# Relies on `df_scaled` and `failure_shifts` defined in earlier cells.
df = df_scaled.copy()
method = 'tsne'

# One vectorised mask per machine instead of a row-wise apply with dictionary
# lookups. Machines without an entry in `failure_shifts` stay 'Normal shift'.
in_failure_shift = pd.Series(False, index=df.index)
for machine_name, shift in failure_shifts.items():
    in_failure_shift |= (
        (df['machine'] == machine_name)
        & (df['time'] > shift['start'])
        & (df['time'] < shift['end'])
    )

df['color'] = in_failure_shift.map({True: 'Failure shift', False: 'Normal shift'})

X = df.drop(columns=['time', 'machine', 'color'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=1997
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=1997
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()

In [None]:
# Combined view of all machines with a look-back: rows from `lookback` before
# the failure start up to the failure end get the machine's label `c`; all
# other rows get '0'. Relies on `df_scaled` and `failure_shifts` from earlier cells.
df = df_scaled.copy()
method = 'tsne'
lookback = timedelta(days=1)

# Every label is a string so the colour column has a single type
# (previously it mixed integer labels with the string '0').
# Machines without an entry in `failure_shifts` keep the '0' label.
df['color'] = '0'
for machine_name, shift in failure_shifts.items():
    in_window = (
        (df['machine'] == machine_name)
        & (df['time'] > shift['start'] - lookback)
        & (df['time'] < shift['end'])
    )
    df.loc[in_window, 'color'] = str(shift['c'])

X = df.drop(columns=['time', 'machine', 'color'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=1997
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=1997
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()

In [None]:
# Combined view of all machines with a look-back: rows from `lookback` before
# the failure start up to the failure end get the machine's label `c`; all
# other rows get '0'. Relies on `df_scaled` and `failure_shifts` from earlier cells.
df = df_scaled.copy()
method = 'tsne'
lookback = timedelta(days=2)

# Every label is a string so the colour column has a single type
# (previously it mixed integer labels with the string '0').
# Machines without an entry in `failure_shifts` keep the '0' label.
df['color'] = '0'
for machine_name, shift in failure_shifts.items():
    in_window = (
        (df['machine'] == machine_name)
        & (df['time'] > shift['start'] - lookback)
        & (df['time'] < shift['end'])
    )
    df.loc[in_window, 'color'] = str(shift['c'])

X = df.drop(columns=['time', 'machine', 'color'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=1997
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=1997
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()

In [None]:
# Combined view of all machines with a look-back: rows from `lookback` before
# the failure start up to the failure end get the machine's label `c`; all
# other rows get '0'. Relies on `df_scaled` and `failure_shifts` from earlier cells.
df = df_scaled.copy()
method = 'tsne'
lookback = timedelta(days=3)

# Every label is a string so the colour column has a single type
# (previously it mixed integer labels with the string '0').
# Machines without an entry in `failure_shifts` keep the '0' label.
df['color'] = '0'
for machine_name, shift in failure_shifts.items():
    in_window = (
        (df['machine'] == machine_name)
        & (df['time'] > shift['start'] - lookback)
        & (df['time'] < shift['end'])
    )
    df.loc[in_window, 'color'] = str(shift['c'])

X = df.drop(columns=['time', 'machine', 'color'])

if method == 'tsne':
    embedding = TSNE(
        n_components=2,
        random_state=1997
    ).fit_transform(X)
elif method == 'pca':
    embedding = PCA(
        n_components=2,
        random_state=1997
    ).fit_transform(X.values)
else:
    # Fail loudly rather than reaching px.scatter without x/y columns.
    raise ValueError(f"Unsupported method {method!r}; expected 'tsne' or 'pca'.")

df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

fig = px.scatter(df, x='x', y='y', color='color', hover_name='time')
fig.show()