In [1]:
import pandas as pd
import pickle
import random
import json
from openai_utils import (
    billing_calculator,
    refine_message,
    print_distribution,
    num_assistant_tokens_from_messages,
    num_tokens_from_messages,
    validate_formatting
)

In [2]:
# Relative paths of the two Feather files read by the loading cell below:
# one with the feature table, one with the label table.
feature_path = "./../dataset/iot_pmfp_data.feather"
label_path = "./../dataset/iot_pmfp_labels.feather"

In [29]:
# Load the feature and label tables into DataFrames; the merge in the next
# cell joins them on ['datetime', 'machineID'].
feature = pd.read_feather(feature_path)
label = pd.read_feather(label_path)

In [4]:
# Left-join the labels onto the features, one row per (datetime, machineID).
temp_feature = feature.merge(label, how='left', on=['datetime', 'machineID'])

# The two columns below are added to `label` after the merge has already been
# taken, so they exist on `label` only and never reach `temp_feature`.
component_cols = ['failure_comp1', 'failure_comp2', 'failure_comp3', 'failure_comp4']
label['failure_comp'] = label[component_cols].any(axis=1)

# 'Abnormal' marks rows where the per-component flags disagree with 'failure'.
mismatch = label['failure_comp'] != label['failure']
label['result'] = mismatch.map({True: 'Abnormal', False: 'Normal'})

In [5]:
# Create a 'time_to_fail' column: for every row, the number of rows until the
# next row of the same machine that is flagged as a failure (0 on that row).
temp_feature['time_to_fail'] = None  # Initialize the column

tempdf = list()
for machine_id in range(1, 101):
    # Work on an explicit copy: assigning into a filtered slice of
    # temp_feature is what triggered pandas' SettingWithCopyWarning.
    machine_df = (
        temp_feature[temp_feature['machineID'] == machine_id]
        .copy()
        .reset_index(drop=True)
    )

    # Walk the rows from last to first, counting rows since the most recently
    # seen failure. Rows after the machine's last failure have no failure
    # ahead of them, so they simply count up from the end of the data.
    rows_until_failure = []
    counter = 0
    for failed in reversed(machine_df['failure'].tolist()):
        # `== False` is kept on purpose: any value that does not compare equal
        # to False (True, or a missing label left by the left merge) is
        # treated as a failure row and resets the counter.
        if failed == False:
            counter += 1
        else:
            counter = 0
        rows_until_failure.append(counter)

    # The list was built back-to-front; flip it to match the row order.
    machine_df['time_to_fail'] = rows_until_failure[::-1]
    tempdf.append(machine_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_to_fail'] = list(reversed(rul))


In [6]:
# Stack the per-machine frames with one concat call. Concatenating inside a
# loop re-copies the accumulated frame on every iteration (quadratic cost).
# Row labels are deliberately left as they are, so every machine keeps the
# 0..n-1 index it was given when its frame was built.
telemetry = pd.concat(tempdf, axis=0)

# Remove the feature-side 'anomaly' column (suffixed '_x' by the merge), the
# per-component failure flags and the individual error columns.
telemetry = telemetry.drop(
    columns=[
        'anomaly_x',
        'failure_comp1', 'failure_comp2', 'failure_comp3', 'failure_comp4',
        'error1', 'error2', 'error3', 'error4', 'error5',
    ]
)

In [8]:
# Preview the first 100 rows of the combined telemetry frame.
telemetry.head(100)

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,model,age,maint_comp1,maint_comp2,maint_comp3,maint_comp4,failure,maint,error,anomaly_y,time_to_fail
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,model3,18,0,0,0,0,False,False,False,False,96
1,2015-01-01 07:00:00,1,162.879223,402.747490,95.460525,43.413973,model3,18,0,0,0,0,False,False,False,False,95
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,model3,18,0,0,0,0,False,False,False,False,94
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,model3,18,0,0,0,0,False,False,False,False,93
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,model3,18,0,0,0,0,False,False,False,False,92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2015-01-05 05:00:00,1,177.510419,469.787301,100.498426,59.577251,model3,18,0,0,0,0,False,False,False,False,1
96,2015-01-05 06:00:00,1,179.303153,499.777962,111.833028,52.383097,model3,18,1,0,0,1,True,True,False,True,0
97,2015-01-05 07:00:00,1,155.511452,498.398435,103.068134,33.270415,model3,18,0,0,0,0,False,False,False,False,1439
98,2015-01-05 08:00:00,1,172.439821,392.124959,108.135159,39.477497,model3,18,0,0,0,0,False,False,False,False,1438


In [10]:
# Build one daily summary frame per machine: mean sensor readings per calendar
# date, plus a 0/1 failure flag and the hour of day of the failure row.
pds = list()
for machine_id in range(1, 101):
    df = telemetry[telemetry["machineID"] == machine_id].copy()
    df["date"] = df["datetime"].dt.date

    # Rows with time_to_fail == 0 are the failure rows. They are selected as
    # rows (not re-fetched by index label), so this does not depend on the
    # frame's index labels matching row positions.
    failure_rows = df.loc[df["time_to_fail"] == 0]

    mean_feature = pd.pivot_table(
        df,
        values=['volt', 'rotate', 'pressure', 'vibration', 'age', 'machineID'],
        index=["date"],
        aggfunc={
            'volt': "mean",
            'rotate': "mean",
            'pressure': "mean",
            "vibration": "mean",
            'age': 'first',
            'machineID': 'first',
        },
    ).reset_index()

    mean_feature["failure"] = 0
    mean_feature["failure_time"] = 0
    for fail_datetime, fail_date in zip(failure_rows["datetime"], failure_rows["date"]):
        on_fail_date = mean_feature["date"] == fail_date
        mean_feature.loc[on_fail_date, "failure"] = 1
        # Read the hour straight from the timestamp. The previous
        # strftime('%H').replace("0", "") stripped every zero digit, turning
        # hour 10 into 1 and 20 into 2, and raising ValueError for hour 00.
        mean_feature.loc[on_fail_date, "failure_time"] = fail_datetime.hour
    pds.append(mean_feature)


In [11]:
def create_finetune_data_v1(pds):
    """Turn per-machine daily frames into (features, label) pairs, split three ways.

    Each frame in ``pds`` is walked in row order and must provide the columns
    ``volt``, ``rotate``, ``pressure``, ``vibration``, ``age``, ``failure`` and
    ``failure_time``. For every row except the last one:

    * if the *next* row has ``failure == 1`` the row becomes a positive
      example labelled ``"1,<failure_time of the next row>"``;
    * otherwise, if the row itself has ``failure == 1`` it is skipped;
    * otherwise it becomes a negative example labelled ``"0,0"``, but only
      while the number of negatives kept so far is below the budget (the
      budget starts at 1 and grows by 2 for every positive found).

    The feature string is the comma-joined ``str()`` of the five sensor/age
    values of the row. Rows are addressed by position, so the frames may carry
    any index.

    Args:
        pds: iterable of pandas DataFrames, one per machine.

    Returns:
        ``(train_data, val_data, test_data)``: three lists with one entry per
        frame. Each entry is a list of ``(features, label)`` string tuples
        holding the first 80 %, the next 10 % and the remainder of that
        machine's examples, in row order.

    Side effects:
        Prints the total number of positive examples found.
    """
    feature_columns = ("volt", "rotate", "pressure", "vibration", "age")

    def format_features(day_row):
        # Same text for positive and negative examples: "volt,rotate,...".
        return ",".join(str(day_row[name]) for name in feature_columns)

    train_data = list()
    val_data = list()
    test_data = list()
    total_failure = 0
    # The loop variable is not called `pd`, so it cannot be mistaken for (or
    # shadow) the pandas module alias used everywhere else in the notebook.
    for machine_df in pds:
        machine_data = list()
        failure = 1       # budget for negative examples, +2 per positive
        non_failure = 0   # negative examples kept so far
        last_position = len(machine_df) - 1
        for position, (_, row) in enumerate(machine_df.iterrows()):
            if position == last_position:
                # Skip the last row since it has no following day to predict.
                continue
            next_row = machine_df.iloc[position + 1]
            if next_row["failure"] == 1:  # Next day is a failure day
                train_label = "1," + str(next_row["failure_time"])  # Label is error,ttf
                failure += 2
                total_failure += 1
                machine_data.append((format_features(row), train_label))
            elif row["failure"] == 1:
                # The failure day itself is not used as an example.
                continue
            elif non_failure < failure:
                non_failure += 1
                machine_data.append((format_features(row), "0,0"))

        # Split in row order: 80 % train, 10 % validation, rest test.
        train_end = int(0.8 * len(machine_data))
        val_end = int(0.9 * len(machine_data))
        train_data.append(machine_data[:train_end])
        val_data.append(machine_data[train_end:val_end])
        test_data.append(machine_data[val_end:])
    print("Total failure of system ", total_failure)
    return train_data, val_data, test_data
    
# Build the per-machine train / validation / test example lists from the
# daily summary frames collected in `pds`.
train,val,test = create_finetune_data_v1(pds)

Total failure of system  718


In [12]:
# Inspect the training pairs produced from the first frame in `pds`.
train[0]

[('167.57653306422822,440.5153281100609,98.52234494954433,40.049622600969,18',
  '0,0'),
 ('174.79242790764866,448.7432005535771,101.45226610981973,52.19026764133094,18',
  '1,6'),
 ('171.02503280606336,454.614347533371,102.37766530242241,41.50692990087219,18',
  '0,0'),
 ('174.13940979461287,444.3377719206731,96.67484166557482,41.702770858686335,18',
  '0,0'),
 ('188.8601607667583,439.1189540791834,99.30271500764299,39.363431597480336,18',
  '1,6'),
 ('165.0574377173926,440.97004332027376,102.05995611478106,41.03359753186023,18',
  '0,0'),
 ('166.0182681049503,446.3205863396833,99.3176460161741,40.91129491979947,18',
  '0,0'),
 ('172.75438825921916,372.41727184436036,102.72268459417678,39.96045755377374,18',
  '1,6'),
 ('169.37171871790227,454.0032631783519,98.57399630051646,39.68227199243509,18',
  '0,0'),
 ('172.18666258283133,452.50108736528733,100.20698593229156,38.54170620116928,18',
  '0,0'),
 ('169.4686216398003,457.36454829764074,101.31461572225065,48.8138967300599,18',
  '1,6

In [13]:
# Earlier, longer draft of the system prompt. It is overwritten by the very
# next assignment, so it never reaches the records built from `system_prompt`.
system_prompt = """I will provide a data, the first line is header, that is the (volt,rotate,pressure,vibration,age) of a machine
the second line is the data. You must return follow predictions in format (number,number): Does the machine need to be maintance for the next 24h. 0 or 1. If 1, what is the time?"""
# Prompt actually in effect: this is the value `system_prompt` holds from here on.
system_prompt= """Giving input volt,rotate,pressure,vibration,age answer with number,number, first number is 0/1, second is regression in 0-23 range"""
# Record template containing only the system message.
# NOTE(review): `base_record` is not referenced by any cell visible here - confirm whether it is still needed.
base_record = {"messages": [{"role": "system", "content": system_prompt}]}

In [14]:
def build_chat_records(machines, prompt):
    """Flatten per-machine (features, label) pairs into chat-format records.

    Args:
        machines: iterable of per-machine lists; every item is indexable with
            the user text at position 0 and the assistant text at position 1.
        prompt: text used as the system message of every record.

    Returns:
        A flat list of ``{"messages": [system, user, assistant]}`` dicts, in
        the order the pairs appear in ``machines``.
    """
    return [
        {
            "messages": [
                {"role": "system", "content": prompt},
                {"role": "user", "content": pair[0]},
                {"role": "assistant", "content": pair[1]},
            ]
        }
        for machine in machines
        for pair in machine
    ]


# One helper call per split replaces three copies of the same nested loop.
result_train = build_chat_records(train, system_prompt)
result_val = build_chat_records(val, system_prompt)
result_test = build_chat_records(test, system_prompt)

In [15]:
def export_split(records, path):
    """Refine one split, print its billing statistics and write it as JSON Lines.

    Args:
        records: chat-format records handed to ``refine_message``.
        path: destination file; it is overwritten, one JSON document per line.

    Returns:
        The ``(c, r)`` pair exactly as returned by ``refine_message``; ``r`` is
        the sequence of records that was written to ``path``.
    """
    c, r = refine_message(records)
    billing_calculator(r, c)
    with open(path, "w") as f:
        for record in r:
            f.write(json.dumps(record) + "\n")
    return c, r


export_split(result_train, "train_v1.jsonl")
export_split(result_val, "val_v1.jsonl")
# The test split is processed last and kept in `c` / `r`: the evaluation cell
# further down reads `r` and expects it to be the refined test set.
c, r = export_split(result_test, "test_v1.jsonl")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 82, 84
mean / median: 83.88, 84.0
p5 / p95: 83.0, 84.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning
Dataset has ~146790 tokens that will be charged for during training
By default, you'll train for 2 epochs on this dataset
By default, you'll be charged for ~293580 tokens
Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 82, 84
mean / median: 83.89090909090909, 84.0
p5 / p95: 83.0, 84.0

#### Distri

In [16]:
# Show the first refined record. `r` was last assigned from
# refine_message(result_test), so this is a test-set record.
r[0]

{'messages': [{'role': 'system',
   'content': 'Giving input volt,rotate,pressure,vibration,age answer with number,number, first number is 0/1, second is regression in 0-23 range'},
  {'role': 'user',
   'content': '171.9678148606189,439.9010247946871,101.06991433551951,50.54947026536121,18'},
  {'role': 'assistant', 'content': '1,6'}]}

In [23]:
# Current r is the test set
import os

from sklearn.metrics import accuracy_score
import numpy as np
import openai

# Read the API key from the environment instead of committing it to the
# notebook. The key that was previously hard-coded in this cell has been
# exposed and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

FINETUNED_MODEL = "ft:gpt-3.5-turbo-0613:ai4s:pdm:7yL7mvqc"
# Recorded when the model's answer is not "<int>,<int>"; it never equals a
# ground-truth value used here, so such an answer is scored as wrong.
UNPARSEABLE = -1

b_gt = list()  # ground truth, first number of the label (0/1)
m_gt = list()  # ground truth, second number of the label

b_pred = list()  # predicted first number
m_pred = list()  # predicted second number
for i, data in enumerate(r):
    print("Process record ", i, " on total of ", len(r))
    # Send only the system and user messages; the third message is the answer.
    messages = data["messages"][0:2]
    print(num_tokens_from_messages(messages))

    gt = data["messages"][2]["content"]
    splitter = gt.split(",")
    binary_gt = int(splitter[0])
    multi_gt = int(splitter[1])

    # Legacy (pre-1.0) openai client call, kept as in the original cell.
    completion = openai.ChatCompletion.create(
        model=FINETUNED_MODEL,
        messages=messages,
    )

    resp = completion.choices[0].message
    try:
        pred = resp["content"].split(",")
        binary_pred = int(pred[0])
        multi_pred = int(pred[1])
    except (ValueError, IndexError):
        # Malformed model output must not abort the whole evaluation run.
        binary_pred = UNPARSEABLE
        multi_pred = UNPARSEABLE

    # Append all four values together, after the request has succeeded, so the
    # ground-truth and prediction lists always have the same length.
    b_gt.append(binary_gt)
    m_gt.append(multi_gt)
    b_pred.append(binary_pred)
    m_pred.append(multi_pred)

    



Process record  0  on total of  273
77
Process record  1  on total of  273
77
Process record  2  on total of  273
76
Process record  3  on total of  273
77
Process record  4  on total of  273
77
Process record  5  on total of  273
77
Process record  6  on total of  273
75
Process record  7  on total of  273
77
Process record  8  on total of  273
77
Process record  9  on total of  273
76
Process record  10  on total of  273
77
Process record  11  on total of  273
77
Process record  12  on total of  273
77
Process record  13  on total of  273
77
Process record  14  on total of  273
77
Process record  15  on total of  273
76
Process record  16  on total of  273
77
Process record  17  on total of  273
76
Process record  18  on total of  273
77
Process record  19  on total of  273
77
Process record  20  on total of  273
77
Process record  21  on total of  273
77
Process record  22  on total of  273
77
Process record  23  on total of  273
77
Process record  24  on total of  273
77
Process re

In [24]:
# Share of evaluated records whose predicted 0/1 flag equals the ground truth.
binary_accuracy = accuracy_score(b_gt, b_pred)
print(binary_accuracy)

0.9157509157509157


In [25]:
# Share of evaluated records whose predicted second number matches the ground
# truth exactly.
multi_accuracy = accuracy_score(m_gt, m_pred)
print(multi_accuracy)

0.9157509157509157
