In [1]:
import numpy as np
import tensorflow as tf
import os
import xml.etree.ElementTree as ET

In [129]:
image='data/images'
Annotation='data/Annotations'
classes={'okay':0,"hello":1,'thankyou':2}

In [130]:
import cv2

In [132]:
def parse_voc_annotations(Annotation,image,classes):
    img_path=[]
    class_label=[]
    bbox=[]
    for xml_file in sorted(os.listdir(Annotation)):
        if not xml_file.endswith('.xml'):
            continue
            # print(xml_file)
        tree=ET.parse(os.path.join(Annotation,xml_file))
        print(tree)
        root=tree.getroot()
        image_file_name=root.find('filename').text
        path=os.path.join(image,image_file_name)
        size=root.find('size')

        image_width=int(size.find('width').text)
        image_height=int(size.find('height').text)

        objects=root.find('object')
        if object is not None:
            class_name=objects.find('name').text
            if class_name not in classes:
                continue
            class_id=classes[class_name]

            bnbox=objects.find('bndbox')
            xmin=float(bnbox.find('xmin').text)/image_width
            ymin=float(bnbox.find('ymin').text)/image_height
            xmax=float(bnbox.find('xmax').text)/image_width
            ymax=float(bnbox.find('ymax').text)/image_height

            img_path.append(path)
            bbox.append([xmin,ymin,xmax,ymax])
            class_label.append(class_id)
    return img_path,bbox,class_label


In [None]:
img_path,bnbox_data,class_labels=parse_voc_annotations(Annotation=Annotation,image=image,classes=classes)
img_path=tf.constant(img_path)
bnbox_data=tf.constant(bnbox_data,dtype=tf.float32)
class_labels=tf.constant(class_labels,dtype=tf.int32)

In [100]:
def load_and_preprocess_image(path, bbox, label):
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image, channels=3)
    image.set_shape([None, None, 3])
    image = tf.image.resize(image, [128, 128])
    label_encode = tf.one_hot(label, depth=len(classes))
    image = image / 255.0
    return image, {"class_output": label_encode, "box_output": bbox}

dataset= tf.data.Dataset.from_tensor_slices((img_path, bnbox_data, class_labels))
dataset = dataset.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)


dataset = dataset.shuffle(buffer_size=71).batch(10).prefetch(tf.data.AUTOTUNE)


DATASET_SIZE = len(img_path)
train_size = int(0.8 * DATASET_SIZE)
train_ds = dataset.take(train_size)
val_ds = dataset.take(train_size)

In [101]:
len(val_ds)

7

In [None]:
print(f"img_path length: {len(img_path)}")
print(f"bnbox_data length: {len(bnbox_data)}")
print(f"class_labels length: {len(class_labels)}")


In [134]:
base_model=tf.keras.applications.MobileNetV2(
    input_shape=(128,128,3),
    include_top=False,
    weights='imagenet'
)
base_model.trainable=True

In [135]:
input=tf.keras.layers.Input(shape=(128,128,3))

x=tf.keras.layers.RandomZoom(0.1)(input)
x=tf.keras.layers.RandomBrightness(0.2)(x)
x=tf.keras.layers.RandomFlip('vertical and horizontal')(x)
x=tf.keras.layers.Rescaling(1./255)(x)
x=base_model(input)

x=tf.keras.layers.GlobalMaxPooling2D()(x)


class_output=tf.keras.layers.Dense(30,activation='relu')(x)
class_output=tf.keras.layers.Dense(3,activation='softmax',name='class_output')(class_output)

box_output=tf.keras.layers.Dense(30,activation='relu')(x)
box_output=tf.keras.layers.Dense(4,activation='sigmoid',name='box_output')(box_output)

detecting_signs_model=tf.keras.models.Model(inputs=input,outputs=[class_output,box_output])

In [136]:
detecting_signs_model.compile(
    optimizer='adam',
    loss={
        
        'class_output':'categorical_crossentropy',
        'box_output':'mse'
    },
    metrics={
        'class_output':'accuracy',
        'box_output':'mse'
    }
)

In [None]:
detecting_signs_model.fit(
    train_ds,
    epochs=10,
    validation_data=val_ds,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_class_output_accuracy',patience=3,restore_best_weights=True,mode='max'),
        tf.keras.callbacks.ModelCheckpoint('sign_language.keras',monitor='val_class_output_loss',save_best_only=True,mode='min',verbose=1)
    ]
    
)

In [None]:
image=tf.keras.utils.load_img('data/images/0f3ccdf7-IMG-20250715-WA0040.jpg',target_size=(128,128))
img_array=tf.keras.utils.img_to_array(image)
norm=img_array/.255
img_batch=np.expand_dims(norm,axis=0)
too=inference.predict(img_batch)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step


In [19]:
classes=['okay',"hello",'thankyou']

In [None]:
predicted_classes=[np.argmax(s) for s in too]
predicted=predicted_classes[0]
classes[predicted]

In [10]:
inference=tf.keras.models.load_model('sign_language.keras')

In [None]:
import numpy as np

In [None]:
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise Exception('could not open the camera')

while True:
    ret, frame = cap.read()
    if not ret:
        break

    original = frame.copy() 

    frame = cv2.resize(frame, (128, 128))       
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  
    input_data = frame_rgb / 255.0               
    input_data = input_data.reshape(1, 128, 128, 3) 

    predictions = inference.predict(input_data) 
    Prediction= [np.argmax(p) for p in predictions]
    s=Prediction[0]
    print(classes[s])


    cv2.putText(original, f'Prediction: {classes[s]}',
                
                (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    
    try:
        x, y, w = map(int, predictions[0].flatten())
    
        scale_x = original.shape[1] / 128
        scale_y = original.shape[0] / 128
        x = int(x * scale_x)
        y = int(y * scale_y)
        w = int(w * ((scale_x + scale_y) / 2))  
        cv2.circle(original, (x, y), w, (0, 255, 0), 2)
    except Exception as e:
        print("Prediction output error:", e)

    cv2.imshow('Real-Time Inference', original)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


In [25]:
cap.release()