In [None]:


import json
import re

# Load the dataset from the local JSON file
file_path = "/content/dataset.json"
with open(file_path, "r") as file:
    data = json.load(file)

# Define a function to clean and format text
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, punctuation, and extra spaces
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Clean and format external status descriptions and internal status labels
for item in data:
    item["externalStatus"] = clean_text(item["externalStatus"])
    item["internalStatus"] = clean_text(item["internalStatus"])

# Print the preprocessed dataset
print(json.dumps(data, indent=2))


[
  {
    "externalStatus": "port out",
    "internalStatus": "port out"
  },
  {
    "externalStatus": "terminal in",
    "internalStatus": "inbound terminal"
  },
  {
    "externalStatus": "port in",
    "internalStatus": "port in"
  },
  {
    "externalStatus": "vessel departure from first pol vessel name tian fu he",
    "internalStatus": "departure"
  },
  {
    "externalStatus": "vessel arrival at final pod vessel name tian fu he",
    "internalStatus": "arrival"
  },
  {
    "externalStatus": "departcu",
    "internalStatus": "departure"
  },
  {
    "externalStatus": "gate in",
    "internalStatus": "gate in"
  },
  {
    "externalStatus": "load on maersk sembawang 237e",
    "internalStatus": "loaded on vessel"
  },
  {
    "externalStatus": "discharge",
    "internalStatus": "departure"
  },
  {
    "externalStatus": "gate out",
    "internalStatus": "gate out"
  },
  {
    "externalStatus": "gate in",
    "internalStatus": "gate in"
  },
  {
    "externalStatus": "load on ms

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Convert preprocessed data to numpy arrays
X = np.array([item['externalStatus'] for item in data])
y = np.array([item['internalStatus'] for item in data])

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Tokenize input text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

In [None]:
# Pad sequences to ensure uniform length
max_sequence_length = max(len(seq) for seq in X_sequences)
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length, padding='post')

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Define model architecture
embedding_dim = 100
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(units=128),
    Dense(units=len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.22112798690795898
Test Accuracy: 0.9306122660636902


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert externalStatus descriptions into numerical vectors using one-hot encoding
vectorizer = CountVectorizer()
X_train_encoded = vectorizer.fit_transform(X_train)
X_test_encoded = vectorizer.transform(X_test)

# Now train the model
model = SVC()
model.fit(X_train_encoded, y_train_encoded)

# Predict labels for the test set
y_pred = model.predict(X_test_encoded)

# Evaluate the model
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='macro')
recall = recall_score(y_test_encoded, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


Accuracy: 0.9959183673469387
Precision: 0.9987654320987654
Recall: 0.9944444444444444


In [None]:
import joblib

# Save the trained model
joblib.dump(model, "trained_model.pkl")

# Save the trained CountVectorizer
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']

In [None]:
# Load the trained model
model = joblib.load("trained_model.pkl")

# Load the trained CountVectorizer
vectorizer = joblib.load("vectorizer.pkl")

# Now you can use both model and vectorizer for prediction in your API


In [5]:
import json
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.svm import SVC
import joblib

# Load the dataset from the local JSON file
file_path = "/content/dataset.json"
with open(file_path, "r") as file:
    data = json.load(file)

# Define a function to clean and format text
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, punctuation, and extra spaces
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Clean and format external status descriptions and internal status labels
for item in data:
    item["externalStatus"] = clean_text(item["externalStatus"])
    item["internalStatus"] = clean_text(item["internalStatus"])

# Convert preprocessed data to numpy arrays
X = np.array([item['externalStatus'] for item in data])
y = np.array([item['internalStatus'] for item in data])

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Tokenize input text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform length
max_sequence_length = max(len(seq) for seq in X_train_sequences + X_test_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Define LSTM model architecture
embedding_dim = 100
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(units=128),
    Dense(units=len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

# Save the trained LSTM model
model.save("trained_model.h5")

# Save the label encoder
joblib.dump(label_encoder, "label_encoder.pkl")

# Convert externalStatus descriptions into numerical vectors using CountVectorizer
vectorizer = CountVectorizer()
X_train_encoded = vectorizer.fit_transform(X_train)
X_test_encoded = vectorizer.transform(X_test)

# Train the SVM model
svm_model = SVC()
svm_model.fit(X_train_encoded, y_train)

# Predict labels for the test set using SVM model
y_pred_svm = svm_model.predict(X_test_encoded)

# Evaluate the SVM model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='macro')
recall_svm = recall_score(y_test, y_pred_svm, average='macro')

print("SVM Model Accuracy:", accuracy_svm)
print("SVM Model Precision:", precision_svm)
print("SVM Model Recall:", recall_svm)

# Save the trained SVM model
joblib.dump(svm_model, "svm_model.pkl")

# Save the trained CountVectorizer
joblib.dump(vectorizer, "vectorizer.pkl")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.42470023036003113
Test Accuracy: 0.8571428656578064
SVM Model Accuracy: 0.9959183673469387
SVM Model Precision: 0.9987654320987654
SVM Model Recall: 0.9944444444444444


  saving_api.save_model(


['vectorizer.pkl']

In [2]:
import joblib
from tensorflow.keras.models import load_model

# Load the trained LSTM model
lstm_model = load_model("trained_model.h5")

# Load the trained SVM model
svm_model = joblib.load("svm_model.pkl")

# Load the trained CountVectorizer
vectorizer = joblib.load("vectorizer.pkl")

# Define a function to preprocess input text
def preprocess_text(text):
    # Clean and format text
    cleaned_text = clean_text(text)
    # Tokenize text
    sequences = tokenizer.texts_to_sequences([cleaned_text])
    # Pad sequence
    padded_sequence = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
    return padded_sequence

# Test input text
test_text = "Load on MAERSK SEMBAWANG / 237E"

# Preprocess input text for LSTM model
preprocessed_text_lstm = preprocess_text(test_text)

# Preprocess input text for SVM model
preprocessed_text_svm = vectorizer.transform([test_text])

# Predictions using LSTM model
lstm_prediction = lstm_model.predict(preprocessed_text_lstm)
lstm_predicted_label = label_encoder.inverse_transform([np.argmax(lstm_prediction)])

# Predictions using SVM model
svm_prediction = svm_model.predict(preprocessed_text_svm)
svm_predicted_label = label_encoder.inverse_transform([svm_prediction])

print("LSTM Model Prediction:", lstm_predicted_label[0])
print("SVM Model Prediction:", svm_predicted_label[0])


LSTM Model Prediction: loaded on vessel
SVM Model Prediction: loaded on vessel


  y = column_or_1d(y, warn=True)


In [3]:
!pip install fastapi uvicorn


Collecting fastapi
  Downloading fastapi-0.110.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting starlette<0.38.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.37.2-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, uvicorn, starlette, fastapi
Successfully installed fastapi-0.110.1 h11-0.14.0 starlette-0.37.2 uvicorn-0.29.0


In [6]:
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
from tensorflow.keras.models import load_model
import numpy as np

# Load the trained LSTM model
lstm_model = load_model("trained_model.h5")
# Load the trained CountVectorizer
vectorizer = joblib.load("vectorizer.pkl")
# Load label encoder
label_encoder = joblib.load("label_encoder.pkl")

# Define input data model
class InputData(BaseModel):
    external_status: str

# Initialize FastAPI app
app = FastAPI()

# Define API endpoint
@app.post("/predict")
def predict_internal_status(data: InputData):
    # Preprocess input text
    preprocessed_text = preprocess_text(data.external_status)
    # Make prediction using LSTM model
    prediction = lstm_model.predict(preprocessed_text)
    # Get predicted label
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
    return {"predicted_internal_status": predicted_label[0]}

# Define function to preprocess input text
def preprocess_text(text):
    # Clean and format text
    cleaned_text = clean_text(text)
    # Tokenize text
    sequences = tokenizer.texts_to_sequences([cleaned_text])
    # Pad sequence
    padded_sequence = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
    return padded_sequence


In [8]:
import requests

# Define the API endpoint URL
endpoint_url = "http://localhost:8000/predict"

# Define sample input data
input_data = {"external_status": "Sample external status description"}

# Send POST request to the API endpoint
response = requests.post(endpoint_url, json=input_data)

# Print the response
print(response.json())



ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x79871dde2ec0>: Failed to establish a new connection: [Errno 111] Connection refused'))