In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the status-mapping dataset from the JSON file next to the notebook
df = pd.read_json('dataset.json')

In [4]:
# Preview the loaded frame (pandas truncates the middle rows in the rendered output)
df

Unnamed: 0,externalStatus,internalStatus
0,PORT OUT,Port Out
1,TERMINAL IN,Inbound Terminal
2,PORT IN,Port In
3,Vessel departure from first POL (Vessel name :...,Departure
4,Vessel arrival at final POD (Vessel name : TIA...,Arrival
...,...,...
1217,Import Loaded on Rail,Loaded on Vessel
1218,Full Transshipment Loaded,Loaded on Vessel
1219,Full Transshipment Loaded,Loaded on Vessel
1220,Export Loaded on Vessel,Loaded on Vessel


In [5]:
# Summarise both columns: row count, number of distinct values, most frequent value and its frequency
df.describe()

Unnamed: 0,externalStatus,internalStatus
count,1222,1222
unique,108,15
top,Gate out,Loaded on Vessel
freq,144,331


In [6]:
# (rows, columns) of the frame
df.shape

(1222, 2)

In [7]:
# Storage dtype of each column
df.dtypes

externalStatus    object
internalStatus    object
dtype: object

In [8]:
# One-shot report: index range, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1222 entries, 0 to 1221
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   externalStatus  1222 non-null   object
 1   internalStatus  1222 non-null   object
dtypes: object(2)
memory usage: 28.6+ KB


In [9]:
# Count missing values per column
df.isnull().sum()

externalStatus    0
internalStatus    0
dtype: int64

In [10]:
# Function to clean and format text
def clean_text(text):
    """Return ``text`` without its vessel-name annotation.

    Everything from the first ``'(Vessel name :'`` marker onwards is dropped
    and the remaining text is stripped of surrounding whitespace. Text that
    does not contain the marker is only stripped.
    """
    prefix, _marker, _rest = text.partition('(Vessel name :')
    return prefix.strip()

# Normalise every externalStatus value with clean_text (element-wise)
df['externalStatus'] = df['externalStatus'].map(clean_text)

# Show the frame after preprocessing
print(df)

                       externalStatus            internalStatus
0                            PORT OUT                  Port Out
1                         TERMINAL IN          Inbound Terminal
2                             PORT IN                   Port In
3     Vessel departure from first POL                 Departure
4         Vessel arrival at final POD                   Arrival
...                               ...                       ...
1217            Import Loaded on Rail          Loaded on Vessel
1218        Full Transshipment Loaded          Loaded on Vessel
1219        Full Transshipment Loaded          Loaded on Vessel
1220          Export Loaded on Vessel          Loaded on Vessel
1221                 Empty to Shipper  Empty Container Released

[1222 rows x 2 columns]


In [11]:
# Re-display the frame after cleaning; the vessel-name suffixes no longer appear in externalStatus
df

Unnamed: 0,externalStatus,internalStatus
0,PORT OUT,Port Out
1,TERMINAL IN,Inbound Terminal
2,PORT IN,Port In
3,Vessel departure from first POL,Departure
4,Vessel arrival at final POD,Arrival
...,...,...
1217,Import Loaded on Rail,Loaded on Vessel
1218,Full Transshipment Loaded,Loaded on Vessel
1219,Full Transshipment Loaded,Loaded on Vessel
1220,Export Loaded on Vessel,Loaded on Vessel


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Encode the internal status labels as integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(df['internalStatus'])
df['internalStatus_encoded'] = label_encoder.transform(df['internalStatus'])

In [14]:
# Turn each external status description into a fixed-length sequence of word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['externalStatus'])
X = pad_sequences(
    tokenizer.texts_to_sequences(df['externalStatus']),
    maxlen=10,       # every sequence ends up exactly 10 ids long
    padding='post',  # shorter sequences get zeros appended after the real tokens
)


In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['internalStatus_encoded'],
    test_size=0.2,
    random_state=42,
)

In [17]:
# Define the model architecture
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [18]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 80ms/step - accuracy: 0.2415 - loss: -5.8674 - val_accuracy: 0.2347 - val_loss: -71.5647
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.2588 - loss: -117.1908 - val_accuracy: 0.2347 - val_loss: -249.2324
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.2515 - loss: -311.6208 - val_accuracy: 0.2347 - val_loss: -474.3298
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.2256 - loss: -574.8356 - val_accuracy: 0.2347 - val_loss: -770.5912
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.2352 - loss: -899.1128 - val_accuracy: 0.2347 - val_loss: -1140.3286
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.2296 - loss: -1376.5824 - val_accuracy: 0.2347 - val_loss: -1586.65

<keras.src.callbacks.history.History at 0x1cdfe2f7350>

In [21]:
# Evaluate the model
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()  # Convert probabilities to binary predictions
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 148ms/step
Test Accuracy: 0.24489795918367346
