In [5]:
import datetime
from ultralytics import YOLO
import cv2

# Load the pre-trained YOLOv8n model
model = YOLO(r"models\yolov8n_sign_language.pt")

id = 0
video_cap = cv2.VideoCapture(id)

def draw_detections(frame, last_detections):
    # Define some constants
    CONFIDENCE_THRESHOLD = 0.8
    COLOR = (153, 255, 204)
    if last_detections is not None:
        for data in last_detections.boxes.data.tolist():
            # Extract the confidence (i.e., probability) associated with the detection
            confidence = data[4]

            # Filter out weak detections by ensuring the 
            # confidence is greater than the minimum confidence
            if float(confidence) < CONFIDENCE_THRESHOLD:
                continue

            # If the confidence is greater than the minimum confidence,
            # draw the bounding box on the frame
            xmin, ymin, xmax, ymax = int(data[0]), int(data[1]), int(data[2]), int(data[3])
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), COLOR, 2)

            # Get the class ID and draw it on the frame
            class_id = data[5]
            text = f"{class_id}, {confidence:.2f}"
            cv2.putText(frame, text, (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLOR, 2)

    return frame

prev_time = datetime.datetime.now()

while True:
    ret, frame = video_cap.read()
    if not ret:
        break
    
    frame = cv2.flip(frame, 1)
    detections = model(frame)[0]
    frame = draw_detections(frame, detections)

    # Calculate FPS
    curr_time = datetime.datetime.now()
    delta_time = curr_time - prev_time
    fps = 1 / delta_time.total_seconds()
    prev_time = curr_time

    # Display FPS on the frame
    fps_text = f"FPS: {fps:.2f}"
    cv2.putText(frame, fps_text, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    cv2.imshow("Image", frame)
    if cv2.waitKey(1) == ord('q'):
        break

video_cap.release()
cv2.destroyAllWindows()



0: 480x640 (no detections), 6.2ms
Speed: 2.1ms preprocess, 6.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 O, 5.4ms
Speed: 2.5ms preprocess, 5.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 1.1ms preprocess, 6.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 7.2ms
Speed: 1.0ms preprocess, 7.2ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 7.7ms
Speed: 0.0ms preprocess, 7.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.2ms
Speed: 1.3ms preprocess, 5.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.1ms
Speed: 1.0ms preprocess, 5.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 C, 6.2ms
Speed: 1.0ms preprocess, 6.2ms inference, 1.1ms postprocess per image at

In [16]:
import numpy as np

# Giả sử đây là mảng numpy của bạn
arr = np.array([1, 2, 3, 2, 4, 5, 2, 6, 2, 7])

# Sử dụng np.unique để lấy các phần tử duy nhất và số lần xuất hiện của chúng
values, counts = np.unique(arr, return_counts=True)

# Tìm chỉ số của phần tử có số lần xuất hiện nhiều nhất
index_of_most_common = np.argmax(counts)

# Phần tử xuất hiện nhiều nhất
most_common_element = values[index_of_most_common]

print(f'Phần tử xuất hiện nhiều nhất là {most_common_element} với {counts[index_of_most_common]} lần xuất hiện.')

Phần tử xuất hiện nhiều nhất là 2 với 4 lần xuất hiện.


In [46]:
sign_arr = np.empty(0)

sign_arr = np.append(sign_arr, 1)
sign_arr = np.append(sign_arr, 1)
sign_arr

array([          1,           1])

In [47]:
def numpy_array_to_string(arr):
    return ' '.join(map(str, arr))

In [49]:
arr = numpy_array_to_string(sign_arr)
arr

'1.0 1.0'

In [4]:
import cv2
import mediapipe as mp
import numpy as np
import datetime
from ultralytics import YOLO
import collections
from PIL import ImageFont, ImageDraw, Image

# Load the pre-trained YOLOv8n model
model = YOLO(r"models\yolov8n_sign_language.pt")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

# Initialize MediaPipe drawing utils for drawing hands on the image
mp_drawing = mp.solutions.drawing_utils

def draw_detections(frame, last_detections):
    CONFIDENCE_THRESHOLD = 0.7
    COLOR = (153, 255, 204)
    result = 99
    if last_detections is not None:
        for data in last_detections.boxes.data.tolist():
            confidence = data[4]
            if float(confidence) < CONFIDENCE_THRESHOLD:
                continue
            xmin, ymin, xmax, ymax = int(data[0]), int(data[1]), int(data[2]), int(data[3])
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), COLOR, 2)
            class_id = data[5]
            result = class_id
            text = f"{class_id}, {confidence:.2f}"
            cv2.putText(frame, text, (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLOR, 2)
    return frame, result

def numpy_array_to_string(arr):
    return ' '.join(map(str, arr))

def put_vietnamese_text(img, text, position, font_path, font_size, color):
    # Chuyển đổi hình ảnh từ OpenCV sang PIL
    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_img)
    font = ImageFont.truetype(font_path, font_size)
    draw.text(position, text, font=font, fill=color)

    # Chuyển đổi hình ảnh từ PIL sang OpenCV
    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    return img

label_map = {0: "Toi",
             1: "",
             2: "com",
             3: "",
             4: "",
             5: "",
             6: "",
             7: "",
             8: "",
             9: "",
             10: "ban",
             11: "",
             12: "",
             13: "",
             14: "an",
             15: "",
             16: "",
             17: "",
             18: "",
             19: "",
             20: "di",
             21: "choi",
             22: "chao",
             23: "",
             24: "",
             25: "",
             26: "",
             27: "",
             28: "",
             29: "",
             30: "",
             }

prev_time = datetime.datetime.now()

# Create a VideoCapture object to capture the video from your webcam
id = 0
cap = cv2.VideoCapture(id)

sign_arr = np.empty(0)
pred_count = 0
result_arr = []
max_empty_hand_frame = 30
empty_hand_frame = 0
current_result = 88
font_path = "Disney.ttf"
result = 0

# Define a deque to store the last N results for smoothing
N = 10  # Size of the sliding window
result_buffer = collections.deque(maxlen=N)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        continue

    # Convert the frame color from BGR to RGB
    frame = cv2.flip(frame, 1)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Process the frame to detect hands
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        empty_hand_frame = 0
        for hand_landmarks in results.multi_hand_landmarks:

            h, w, _ = frame.shape
            x_min = w
            x_max = 0
            y_min = h
            y_max = 0
            for lm in hand_landmarks.landmark:
                x, y = int(lm.x * w), int(lm.y * h)
                if x < x_min:
                    x_min = x
                if x > x_max:
                    x_max = x
                if y < y_min:
                    y_min = y
                if y > y_max:
                    y_max = y

            expand_width = int((x_max - x_min) * 0.8)
            expand_height = int((y_max - y_min) * 0.8)
            x_min = max(0, x_min - expand_width)
            x_max = min(w, x_max + expand_width)
            y_min = max(0, y_min - expand_height)
            y_max = min(h, y_max + expand_height)

            cropped_image = frame[y_min:y_max, x_min:x_max]
            detections = model(cropped_image)[0]
            cropped_image, result = draw_detections(cropped_image, detections)

            if result != 99:
                result_buffer.append(result)

            cv2.imshow("Image", cropped_image)
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    else:
        empty_hand_frame += 1

    if len(result_buffer) > 0:
        # Use the most common element in the buffer as the stable result
        result = max(set(result_buffer), key=result_buffer.count)
        if result != current_result:
            sign_arr = np.append(sign_arr, label_map[result])
            current_result = result

    if empty_hand_frame == max_empty_hand_frame:
        if result == 0:
            current_result = 88
        else:
            current_result = result
        sign_arr = np.empty(0)
    
    curr_time = datetime.datetime.now()
    delta_time = curr_time - prev_time
    fps = 1 / delta_time.total_seconds()
    prev_time = curr_time

    # Display FPS on the frame
    FPS_COLOR = (153, 255, 204)
    fps_text = f"FPS: {fps:.2f}"
    cv2.putText(frame, fps_text, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, FPS_COLOR, 2)

    # Create a separate window for displaying the results
    result_window = np.zeros((80, 1000, 3), dtype=np.uint8)
    show_result = numpy_array_to_string(sign_arr)
    cv2.putText(result_window, str(show_result), (10, 50), cv2.FONT_HERSHEY_COMPLEX, 1.5, (255, 255, 255), 2)

    cv2.imshow('Hand Detection', frame)
    cv2.imshow("Result window", result_window)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the capture and destroy all OpenCV windows
cap.release()
cv2.destroyAllWindows()



0: 640x640 (no detections), 5.0ms
Speed: 2.0ms preprocess, 5.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 E, 6.0ms
Speed: 2.0ms preprocess, 6.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 E, 5.0ms
Speed: 2.0ms preprocess, 5.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 E, 4.0ms
Speed: 3.0ms preprocess, 4.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 E, 4.0ms
Speed: 3.0ms preprocess, 4.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 5.0ms
Speed: 3.0ms preprocess, 5.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 E, 4.0ms
Speed: 3.0ms preprocess, 4.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 E, 4.0ms
Speed: 3.0ms preprocess, 4.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 E, 5.0ms
S

In [None]:
import socket
import cv2
import numpy as np
import struct



while True:
    while len(data) < payload_size:
        data += client_socket.recv(4096)

    packed_msg_size = data[:payload_size]
    data = data[payload_size:]
    msg_size = struct.unpack("<L", packed_msg_size)[0]

    while len(data) < msg_size:
        data += client_socket.recv(4096)

    frame_data = data[:msg_size]
    data = data[msg_size:]

    # Giải mã hình ảnh
    frame = cv2.imdecode(np.frombuffer(frame_data, dtype=np.uint8), cv2.IMREAD_COLOR)
    frame = cv2.flip(frame, 1)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Hiển thị hình ảnh
    cv2.imshow('Video', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

client_socket.close()
cv2.destroyAllWindows()


In [3]:
import socket
import struct
import cv2
import mediapipe as mp
import numpy as np
import datetime
from ultralytics import YOLO
import collections
from PIL import ImageFont, ImageDraw, Image

# Load the pre-trained YOLOv8n model
model = YOLO(r"runs\detect\train9\weights\best.pt")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

# Thiết lập socket client
client_socket = socket.socket()
client_socket.connect(('172.20.10.12', 8000))

# Nhận dữ liệu hình ảnh từ server
data = b""
payload_size = struct.calcsize("<L")

# Initialize MediaPipe drawing utils for drawing hands on the image
mp_drawing = mp.solutions.drawing_utils

def draw_detections(frame, last_detections):
    CONFIDENCE_THRESHOLD = 0.7
    COLOR = (153, 255, 204)
    result = 99
    if last_detections is not None:
        for data in last_detections.boxes.data.tolist():
            confidence = data[4]
            if float(confidence) < CONFIDENCE_THRESHOLD:
                continue
            xmin, ymin, xmax, ymax = int(data[0]), int(data[1]), int(data[2]), int(data[3])
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), COLOR, 2)
            class_id = data[5]
            result = class_id
            text = f"{class_id}, {confidence:.2f}"
            cv2.putText(frame, text, (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLOR, 2)
    return frame, result

def numpy_array_to_string(arr):
    return ' '.join(map(str, arr))

def put_vietnamese_text(img, text, position, font_path, font_size, color):
    # Chuyển đổi hình ảnh từ OpenCV sang PIL
    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_img)
    font = ImageFont.truetype(font_path, font_size)
    draw.text(position, text, font=font, fill=color)

    # Chuyển đổi hình ảnh từ PIL sang OpenCV
    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    return img

label_map = {0: "Toi",
             1: "",
             2: "com",
             3: "",
             4: "",
             5: "",
             6: "",
             7: "",
             8: "",
             9: "",
             10: "ban",
             11: "",
             12: "",
             13: "",
             14: "an",
             15: "",
             16: "",
             17: "",
             18: "",
             19: "",
             20: "di",
             21: "choi",
             22: "chao",
             23: "",
             24: "",
             25: "",
             26: "",
             27: "",
             28: "",
             29: "",
             30: "",
             }

prev_time = datetime.datetime.now()

sign_arr = np.empty(0)
pred_count = 0
result_arr = []
max_empty_hand_frame = 30
empty_hand_frame = 0
current_result = 88
font_path = "Disney.ttf"
result = 0

# Define a deque to store the last N results for smoothing
N = 10  # Size of the sliding window
result_buffer = collections.deque(maxlen=N)

while True:
    while len(data) < payload_size:
        data += client_socket.recv(4096)

    packed_msg_size = data[:payload_size]
    data = data[payload_size:]
    msg_size = struct.unpack("<L", packed_msg_size)[0]

    while len(data) < msg_size:
        data += client_socket.recv(4096)

    frame_data = data[:msg_size]
    data = data[msg_size:]

    # Giải mã hình ảnh
    frame = cv2.imdecode(np.frombuffer(frame_data, dtype=np.uint8), cv2.IMREAD_COLOR)

    # Convert the frame color from BGR to RGB
    frame = cv2.flip(frame, 1)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Process the frame to detect hands
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        empty_hand_frame = 0
        for hand_landmarks in results.multi_hand_landmarks:

            h, w, _ = frame.shape
            x_min = w
            x_max = 0
            y_min = h
            y_max = 0
            for lm in hand_landmarks.landmark:
                x, y = int(lm.x * w), int(lm.y * h)
                if x < x_min:
                    x_min = x
                if x > x_max:
                    x_max = x
                if y < y_min:
                    y_min = y
                if y > y_max:
                    y_max = y

            expand_width = int((x_max - x_min) * 0.8)
            expand_height = int((y_max - y_min) * 0.8)
            x_min = max(0, x_min - expand_width)
            x_max = min(w, x_max + expand_width)
            y_min = max(0, y_min - expand_height)
            y_max = min(h, y_max + expand_height)

            cropped_image = frame[y_min:y_max, x_min:x_max]
            # cropped_image = frame
            detections = model(cropped_image)[0]
            cropped_image, result = draw_detections(cropped_image, detections)

            if result != 99:
                result_buffer.append(result)

            cv2.imshow("Image", cropped_image)
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    else:
        empty_hand_frame += 1

    if len(result_buffer) > 0:
        # Use the most common element in the buffer as the stable result
        result = max(set(result_buffer), key=result_buffer.count)
        if result != current_result:
            sign_arr = np.append(sign_arr, label_map[result])
            current_result = result

    if empty_hand_frame == max_empty_hand_frame:
        if result == 0:
            current_result = 88
        else:
            current_result = result
        sign_arr = np.empty(0)
    
    curr_time = datetime.datetime.now()
    delta_time = curr_time - prev_time
    fps = 1 / delta_time.total_seconds()
    prev_time = curr_time

    # Display FPS on the frame
    FPS_COLOR = (153, 255, 204)
    fps_text = f"FPS: {fps:.2f}"
    cv2.putText(frame, fps_text, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, FPS_COLOR, 2)

    # Create a separate window for displaying the results
    result_window = np.zeros((80, 1000, 3), dtype=np.uint8)
    show_result = numpy_array_to_string(sign_arr)
    cv2.putText(result_window, str(show_result), (10, 50), cv2.FONT_HERSHEY_COMPLEX, 1.5, (255, 255, 255), 2)

    cv2.imshow('Hand Detection', frame)
    cv2.imshow("Result window", result_window)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the capture and destroy all OpenCV windows
client_socket.close()
cap.release()
cv2.destroyAllWindows()



0: 480x640 1 A, 17.0ms
Speed: 2.0ms preprocess, 17.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 A, 15.0ms
Speed: 2.0ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 A, 16.0ms
Speed: 1.0ms preprocess, 16.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 A, 15.8ms
Speed: 2.0ms preprocess, 15.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 A, 15.0ms
Speed: 2.0ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 A, 15.0ms
Speed: 1.0ms preprocess, 15.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 A, 15.2ms
Speed: 1.5ms preprocess, 15.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 A, 15.0ms
Speed: 1.0ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 A, 15.4ms
Speed: 1

NameError: name 'cap' is not defined

In [4]:
import socket
import struct
import cv2
import mediapipe as mp
import numpy as np
import datetime
from ultralytics import YOLO
import collections
from PIL import ImageFont, ImageDraw, Image

# Load the pre-trained YOLOv8n model
model = YOLO(r"runs\detect\train9\weights\best.pt")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

# Thiết lập socket client
client_socket = socket.socket()
client_socket.connect(('172.20.10.12', 8000))

# Nhận dữ liệu hình ảnh từ server
data = b""
payload_size = struct.calcsize("<L")

# Initialize MediaPipe drawing utils for drawing hands on the image
mp_drawing = mp.solutions.drawing_utils

def draw_detections(frame, last_detections):
    CONFIDENCE_THRESHOLD = 0.7
    COLOR = (153, 255, 204)
    result = 99
    if last_detections is not None:
        for data in last_detections.boxes.data.tolist():
            confidence = data[4]
            if float(confidence) < CONFIDENCE_THRESHOLD:
                continue
            xmin, ymin, xmax, ymax = int(data[0]), int(data[1]), int(data[2]), int(data[3])
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), COLOR, 2)
            class_id = data[5]
            result = class_id
            text = f"{class_id}, {confidence:.2f}"
            cv2.putText(frame, text, (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLOR, 2)
    return frame, result

def numpy_array_to_string(arr):
    return ' '.join(map(str, arr))

def put_vietnamese_text(img, text, position, font_path, font_size, color):
    # Chuyển đổi hình ảnh từ OpenCV sang PIL
    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_img)
    font = ImageFont.truetype(font_path, font_size)
    draw.text(position, text, font=font, fill=color)

    # Chuyển đổi hình ảnh từ PIL sang OpenCV
    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    return img

label_map = {0: "Toi",
             1: "",
             2: "com",
             3: "",
             4: "",
             5: "",
             6: "",
             7: "",
             8: "",
             9: "",
             10: "ban",
             11: "",
             12: "",
             13: "",
             14: "an",
             15: "",
             16: "",
             17: "",
             18: "",
             19: "",
             20: "di",
             21: "choi",
             22: "chao",
             23: "",
             24: "",
             25: "",
             26: "",
             27: "",
             28: "",
             29: "",
             30: "",
             }

prev_time = datetime.datetime.now()

sign_arr = np.empty(0)
pred_count = 0
result_arr = []
max_empty_hand_frame = 30
empty_hand_frame = 0
current_result = 88
font_path = "Disney.ttf"
result = 0

# Define a deque to store the last N results for smoothing
N = 10  # Size of the sliding window
result_buffer = collections.deque(maxlen=N)

while True:
    while len(data) < payload_size:
        data += client_socket.recv(4096)

    packed_msg_size = data[:payload_size]
    data = data[payload_size:]
    msg_size = struct.unpack("<L", packed_msg_size)[0]

    while len(data) < msg_size:
        data += client_socket.recv(4096)

    frame_data = data[:msg_size]
    data = data[msg_size:]

    # Giải mã hình ảnh
    frame = cv2.imdecode(np.frombuffer(frame_data, dtype=np.uint8), cv2.IMREAD_COLOR)

    # Convert the frame color from BGR to RGB
    frame = cv2.flip(frame, 1)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)


    detections = model(frame)[0]
    frame, result = draw_detections(frame, detections)

    if result != 99:
        result_buffer.append(result)
        empty_hand_frame = 0
    
    if not result:
        empty_hand_frame += 1


    if len(result_buffer) > 0:
        # Use the most common element in the buffer as the stable result
        result = max(set(result_buffer), key=result_buffer.count)
        if result != current_result:
            sign_arr = np.append(sign_arr, label_map[result])
            current_result = result

    if empty_hand_frame == max_empty_hand_frame:
        if result == 0:
            current_result = 88
        else:
            current_result = result
        sign_arr = np.empty(0)
    
    curr_time = datetime.datetime.now()
    delta_time = curr_time - prev_time
    fps = 1 / delta_time.total_seconds()
    prev_time = curr_time

    # Display FPS on the frame
    FPS_COLOR = (153, 255, 204)
    fps_text = f"FPS: {fps:.2f}"
    cv2.putText(frame, fps_text, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, FPS_COLOR, 2)

    # Create a separate window for displaying the results
    result_window = np.zeros((80, 1000, 3), dtype=np.uint8)
    show_result = numpy_array_to_string(sign_arr)
    cv2.putText(result_window, str(show_result), (10, 50), cv2.FONT_HERSHEY_COMPLEX, 1.5, (255, 255, 255), 2)

    cv2.imshow('Hand Detection', frame)
    cv2.imshow("Result window", result_window)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the capture and destroy all OpenCV windows
client_socket.close()
cap.release()
cv2.destroyAllWindows()


AttributeError: module 'google.protobuf.internal.api_implementation' has no attribute '_c_module'

In [3]:
import socket
import struct
import cv2
import mediapipe as mp
import numpy as np
import datetime
from ultralytics import YOLO
import collections
from PIL import ImageFont, ImageDraw, Image

# Load the pre-trained YOLOv8n model
model = YOLO(r"runs\detect\train9\weights\best.pt")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

# Thiết lập socket client
client_socket = socket.socket()
client_socket.connect(('172.20.10.12', 8000))

# Nhận dữ liệu hình ảnh từ server
data = b""
payload_size = struct.calcsize("<L")

# Initialize MediaPipe drawing utils for drawing hands on the image
mp_drawing = mp.solutions.drawing_utils

def draw_detections(frame, last_detections):
    CONFIDENCE_THRESHOLD = 0.6
    COLOR = (153, 255, 204)
    result = 99
    status = False
    if last_detections is not None:
        for data in last_detections.boxes.data.tolist():
            confidence = data[4]
            if float(confidence) < CONFIDENCE_THRESHOLD:
                continue
            status = True
            xmin, ymin, xmax, ymax = int(data[0]), int(data[1]), int(data[2]), int(data[3])
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), COLOR, 2)
            class_id = data[5]
            result = class_id
            text = f"{class_id}, {confidence:.2f}"
            cv2.putText(frame, text, (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLOR, 2)
    return frame, result, status

def numpy_array_to_string(arr):
    return ' '.join(map(str, arr))

def put_vietnamese_text(img, text, position, font_path, font_size, color):
    # Chuyển đổi hình ảnh từ OpenCV sang PIL
    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_img)
    font = ImageFont.truetype(font_path, font_size)
    draw.text(position, text, font=font, fill=color)

    # Chuyển đổi hình ảnh từ PIL sang OpenCV
    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    return img

label_map = {0: "Toi",
             1: "",
             2: "com",
             3: "",
             4: "",
             5: "",
             6: "",
             7: "",
             8: "",
             9: "",
             10: "ban",
             11: "",
             12: "",
             13: "",
             14: "an",
             15: "",
             16: "",
             17: "",
             18: "",
             19: "",
             20: "di",
             21: "choi",
             22: "chao",
             23: "",
             24: "",
             25: "",
             26: "",
             27: "",
             28: "",
             29: "",
             30: "",
             }

prev_time = datetime.datetime.now()

sign_arr = np.empty(0)
pred_count = 0
result_arr = []
max_empty_hand_frame = 20
empty_hand_frame = 0
current_result = 88
font_path = "Disney.ttf"
result = 0

# Define a deque to store the last N results for smoothing
N = 10  # Size of the sliding window
result_buffer = collections.deque(maxlen=N)

while True:
    while len(data) < payload_size:
        data += client_socket.recv(4096)

    packed_msg_size = data[:payload_size]
    data = data[payload_size:]
    msg_size = struct.unpack("<L", packed_msg_size)[0]

    while len(data) < msg_size:
        data += client_socket.recv(4096)

    frame_data = data[:msg_size]
    data = data[msg_size:]

    # Giải mã hình ảnh
    frame = cv2.imdecode(np.frombuffer(frame_data, dtype=np.uint8), cv2.IMREAD_COLOR)

    #rotate frame
    frame = cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE)

    # Convert the frame color from BGR to RGB
    frame = cv2.flip(frame, 1)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    detections = model(frame)[0]
    frame, result, status = draw_detections(frame, detections)

    if result != 99:
        result_buffer.append(result)
        empty_hand_frame = 0
    
    if not status:
        empty_hand_frame += 1

    if len(result_buffer) > 0:
        # Use the most common element in the buffer as the stable result
        result = max(set(result_buffer), key=result_buffer.count)
        if result != current_result:
            sign_arr = np.append(sign_arr, label_map[result])
            current_result = result

    if empty_hand_frame == max_empty_hand_frame:
        if result == 0:
            current_result = 88
        else:
            current_result = result
        sign_arr = np.empty(0)
    
    curr_time = datetime.datetime.now()
    delta_time = curr_time - prev_time
    fps = 1 / delta_time.total_seconds()
    prev_time = curr_time

    # Display FPS on the frame
    FPS_COLOR = (153, 255, 204)
    fps_text = f"FPS: {fps:.2f}"
    cv2.putText(frame, fps_text, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, FPS_COLOR, 2)

    # Create a separate window for displaying the results
    result_window = np.zeros((80, 1000, 3), dtype=np.uint8)
    show_result = numpy_array_to_string(sign_arr)
    cv2.putText(result_window, str(show_result), (10, 50), cv2.FONT_HERSHEY_COMPLEX, 1.5, (255, 255, 255), 2)

    cv2.imshow('Hand Detection', frame)
    cv2.imshow("Result window", result_window)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the socket and destroy all OpenCV windows
client_socket.close()
cv2.destroyAllWindows()



0: 640x480 (no detections), 15.0ms
Speed: 1.0ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 H, 15.9ms
Speed: 1.0ms preprocess, 15.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 H, 15.0ms
Speed: 1.0ms preprocess, 15.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 Q, 14.1ms
Speed: 1.0ms preprocess, 14.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 Q, 13.4ms
Speed: 1.0ms preprocess, 13.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 Q, 14.0ms
Speed: 1.0ms preprocess, 14.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 H, 14.0ms
Speed: 1.0ms preprocess, 14.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 14.5ms
Speed: 1.0ms preprocess, 14.5ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x