In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("numpy", np.__version__)
print("panda", pd.__version__)
print("tensorflow", tf.__version__)

2023-10-14 13:53:49.026059: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


numpy 1.24.4
panda 2.0.3
tensorflow 2.11.1


In [2]:
# Load your historical Warranty Database
data = pd.read_csv('challenge2_data.csv')

In [18]:
# Assuming your CSV has columns named 'claim' and 'fm'
claims = data['claim'].values
failure_modes = data['fm'].values

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(claims)
total_words = len(tokenizer.word_index) + 1
print(tokenizer.word_index)



In [19]:
input_sequences = []

for line in claims[0:100]:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
        
max_sequence_length = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')


In [20]:
# Define and compile the model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [22]:
# Create input and output data
X = input_sequences[:, :-1] # take tokens up to but not including the last
y = input_sequences[:, -1]  # take the last token
y = y.reshape(-1, 1)

print(X)
print(y)

[[  0   0   0 ...   0   0 984]
 [  0   0   0 ...   0 984 185]
 [  0   0   0 ... 984 185   2]
 ...
 [  0   0   0 ...   3 365 175]
 [  0   0   0 ... 365 175  24]
 [  0   0   0 ... 175  24 809]]
[[185]
 [  2]
 [  6]
 ...
 [ 24]
 [809]
 [273]]


In [23]:
# Train the model
model.fit(X, y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x157ad87c0>

In [15]:
seed_text = ' '  # Start with an empty string
for _ in range(15):
    sequence = tokenizer.texts_to_sequences([seed_text])[0]
    print(sequence)
    sequence = pad_sequences([sequence], maxlen=max_sequence_length-1, padding='post')
    prediction = model.predict(sequence, verbose=0)
    print(prediction)
    predicted_word_index = np.argmax(prediction, axis=-1)[0]
    print(predicted_word_index)
    predicted_word = tokenizer.index_word[predicted_word_index]
    if predicted_word == '':  # Break if the prediction is empty
        break
    seed_text += ' ' + predicted_word

print(seed_text.strip())    

[]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1, 1]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1, 1, 1]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1, 1, 1, 1]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1, 1, 1, 1, 1]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1, 1, 1, 1, 1, 1]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1, 1, 1, 1, 1, 1, 1]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1, 1, 1, 1, 1, 1, 1, 1]
[[1.4064699e-09 1.7381109e-01 1.2103095e-01 ... 9.9839848e-10
  8.4114488e-10 1.0005374e-09]]
1
[1, 1, 1, 1, 1, 1

In [None]:
synthetic_text.append(seed_text.strip())
synthetic_classes.append(class_name)

# Create a DataFrame for synthetic data
#synthetic_data = pd.DataFrame({'claim': synthetic_text, 'fm': synthetic_classes})

# Save the synthetic data to a CSV file
#synthetic_data.to_csv('synthetic_data.csv', index=False)