In [None]:
import pandas as pd
import pickle
import random
import json
from openai_utils import (
    billing_calculator,
    refine_message,
    print_distribution,
    num_assistant_tokens_from_messages,
    num_tokens_from_messages,
    validate_formatting
)

In [4]:
feature_path = "./../dataset/iot_pmfp_data.feather"
label_path = "./../dataset/iot_pmfp_labels.feather"

In [5]:
feature = pd.read_feather(feature_path)
label = pd.read_feather(label_path)

In [6]:
temp_feature = feature.merge(label, on=['datetime', 'machineID'], how='left')
label['failure_comp'] = label[['failure_comp1', 'failure_comp2', 'failure_comp3', 'failure_comp4']].any(axis=1)
label['result'] = label.apply(lambda row: 'Abnormal' if row['failure_comp'] != row['failure'] else 'Normal', axis=1)

In [9]:
# Got from kaggle notebook
temp_feature['time_to_fail'] = None
tempdf = list()
for machine_id in range(1,101):
    df = temp_feature[temp_feature['machineID'] == machine_id]
    df.reset_index(drop=True, inplace=True)
    rul = []
    cont = len(df['failure']) - 1
    diff = 0
    while cont >= df.index.min():
        if df['failure'][cont] == False:
            diff = diff + 1
            rul.append(diff)
        else:
            rul.append(0)
            diff = 0
        diff = rul[-1]
        cont = cont - 1
    df['time_to_fail'] = list(reversed(rul))
    tempdf.append(df)

print(len(tempdf))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_to_fail'] = list(reversed(rul))


100


In [10]:
telemetry = pd.DataFrame()
for df in tempdf:
    telemetry = pd.concat([telemetry, df], axis=0)
    
# Drop unused columns
telemetry.drop(['anomaly_x', 'failure_comp1', 'failure_comp2', 'failure_comp3','failure_comp4'], axis=1, inplace=True)
telemetry.drop(['error1', 'error2', 'error3', 'error4','error5'], axis=1, inplace=True)

In [12]:
def create_finetune_data(data_frame):
    # Loop through machine id for easier processing
    train_data = list()
    for machine_id in data_frame['machineID'].unique():
        print("PROCESSING FOR MACHINE ID = ", machine_id)
        df = data_frame[data_frame['machineID'] == machine_id]
        # Process by time_to_fail index
        ttf_index = df.loc[df["time_to_fail"] == 0].index
        prev_idx = 0

        for i in range(0, len(ttf_index)):
            # [TBD] Need to troubleshoot the boundary problem, in this case i will miss the last record data
            create_negative = True
            short_length = True
            if ttf_index[i]-24 < 0 or ttf_index[i]-24 < prev_idx:
                negative_range = None
                create_negative = False
                short_length = False
                positive_range = [prev_idx, ttf_index[i]]
                
            else:
                negative_range = [prev_idx,ttf_index[i] - 24]
                positive_range = [ttf_index[i] - 24, ttf_index[i]]

            # Pick some single record first
            p = 0
            while p < 10:
                if create_negative == True:
                    # Negative data
                    index = random.randint(negative_range[0],negative_range[1])
                    data = df.iloc[index]
                    train_record = ",".join([str(data["volt"]),str(data["rotate"]),str(data["pressure"]),str(data["vibration"]),str(data["age"])])
                    train_label = 0
                    ttf_label = ttf_index[i] - index

                    line = train_record + "," + str(train_label) + "," + str(ttf_label) + "\n"
                    train_data.append((train_record,train_label, ttf_label, index)) # Only use for debugging

                # Positive data
                index = random.randint(positive_range[0],positive_range[1])
                data = df.iloc[index]
                train_record = ",".join([str(data["volt"]),str(data["rotate"]),str(data["pressure"]),str(data["vibration"]),str(data["age"])])
                train_label = 1
                ttf_label = ttf_index[i] - index

                line = train_record + "," + str(train_label) + "," + str(ttf_label) + "\n"                
                train_data.append((train_record, train_label, ttf_label, index))
                
                p += 1

            # Pick a list of record
            p = 0
            while p < 5:
                if create_negative == True:
                    # Negative data
                    index = random.randint(negative_range[0],negative_range[1])
                    list_length = random.randint(1,24)
                    ttf_label = ttf_index[i] - index

                    # print("Create negative list, start from ", index, " with length = ", list_length, ttf_label > 24)
                    train_record = ""
                    for sub_index in range(0, list_length):
                        data = df.iloc[index+sub_index]
                        train_record += ",".join([str(data["volt"]),str(data["rotate"]),str(data["pressure"]),str(data["vibration"]),str(data["age"])])+"\n"
                        train_label = 0

                    line = train_record + "," + str(train_label) + "," + str(ttf_label) + "\n"                   
                    train_data.append((train_record,train_label, ttf_label, index)) # Index Only use for debugging

                # Positive data, random an integer, and a list length, pick list from that integer to the positive[1]
                index = random.randint(positive_range[0],positive_range[1])

                if short_length == False:
                    list_length = random.randint(1,24)
                else:
                    list_length = random.randint(0,ttf_index[i])
                ttf_label = ttf_index[i] - index
                # print("Create positive list, start from ", index - list_length, " with start_index = ", index, "ttf label = ", ttf_label, ttf_label <= 24)
                train_record = ""
                for sub_index in range(list_length, 0, -1):
                    data = df.iloc[index-sub_index]
                    train_record += ",".join([str(data["volt"]),str(data["rotate"]),str(data["pressure"]),str(data["vibration"]),str(data["age"])]) + "\n"
                    train_label = 1

                line = train_record + "," + str(train_label) + "," + str(ttf_label) + "\n"
                train_data.append((train_record, train_label, ttf_label, index))
                
                p += 1
            prev_idx = ttf_index[i]
    return train_data

In [13]:
train_data = create_finetune_data(telemetry)

PROCESSING FOR MACHINE ID =  1
PROCESSING FOR MACHINE ID =  2
PROCESSING FOR MACHINE ID =  3
PROCESSING FOR MACHINE ID =  4
PROCESSING FOR MACHINE ID =  5
PROCESSING FOR MACHINE ID =  6
PROCESSING FOR MACHINE ID =  7
PROCESSING FOR MACHINE ID =  8
PROCESSING FOR MACHINE ID =  9
PROCESSING FOR MACHINE ID =  10
PROCESSING FOR MACHINE ID =  11
PROCESSING FOR MACHINE ID =  12
PROCESSING FOR MACHINE ID =  13
PROCESSING FOR MACHINE ID =  14
PROCESSING FOR MACHINE ID =  15
PROCESSING FOR MACHINE ID =  16
PROCESSING FOR MACHINE ID =  17
PROCESSING FOR MACHINE ID =  18
PROCESSING FOR MACHINE ID =  19
PROCESSING FOR MACHINE ID =  20
PROCESSING FOR MACHINE ID =  21
PROCESSING FOR MACHINE ID =  22
PROCESSING FOR MACHINE ID =  23
PROCESSING FOR MACHINE ID =  24
PROCESSING FOR MACHINE ID =  25
PROCESSING FOR MACHINE ID =  26
PROCESSING FOR MACHINE ID =  27
PROCESSING FOR MACHINE ID =  28
PROCESSING FOR MACHINE ID =  29
PROCESSING FOR MACHINE ID =  30
PROCESSING FOR MACHINE ID =  31
PROCESSING FOR MA

In [14]:
# Dump and load data and recheck for further usage
with open("feature_list_v1.pkl","wb") as f:
    pickle.dump(train_data,f)
with open("feature_list_v1.pkl","rb") as f:
    raw_data = pickle.load(f)
train_data == raw_data

True

In [16]:
system_prompt = """You are an assistant that only speaks JSON. Do not write normal text. I will provide a timeseries data, the first line is header. You must return follow predictions:
Does the machine need to be maintance for the next 24h. Yes or No,
If yes, what is the time?"""

base_record = {"messages": [{"role": "system", "content": system_prompt}]}
headers = "volt,rotate,pressure,vibration,age"+"\n"

def create_finetune_data(train_data):
    result = list()
    for i in train_data:
        record = {"messages": [{"role": "system", "content": system_prompt}]}
        user_message = {
            "role": "user",
            "content": headers + i[0]
        }

        is_maintenance = i[1]
        if is_maintenance == 0:
            bot_content = {
                "need_maintenance": i[1],
            }
        else:
            bot_content = {
                "need_maintenance": i[1],
                "predict_ttf": i[3]
            }gpt_data[0]
        bot_content = json.dumps(bot_content)
        bot_message = {
            "role": "assistant",
            "content": bot_content
        }

        record["messages"].append(user_message)
        record["messages"].append(bot_message)

        result.append(record)
    return result

In [19]:
gpt_data = create_finetune_data(train_data)

In [21]:
validate_formatting(gpt_data)

No errors found


In [22]:
random.shuffle(gpt_data)
len(gpt_data)

21300

In [24]:
l = len(gpt_data)
train_length = int(0.05*l)
train_set = gpt_data[:train_length]
test_set = gpt_data[train_length:]

print(len(train_set))
print(len(test_set))

1065
20235


In [25]:
c,r = refine_message(train_set)
billing_calculator(r,c)

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 124, 3275
mean / median: 216.4742857142857, 133.0
p5 / p95: 125.0, 442.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 8, 17
mean / median: 12.36244131455399, 8.0
p5 / p95: 8.0, 17.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning
Dataset has ~189415 tokens that will be charged for during training
By default, you'll train for 2 epochs on this dataset
By default, you'll be charged for ~378830 tokens
