# The Full Pipeline of Image 2


### In this pipeline we will use the code from both task a and task b

__Tasks:__
- a) Extract and print the 'title of graph', draw bounding boxes around the text detected in the images, and save the resulting images with bounding boxes. Save images as '{image_name}_bboxes.png'
- b) Make use of computer vision skills, get individual bars and their values (Hint : contour detection, edge detection etc). Write the individual results into ‘{image_name}.csv’

In [12]:
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pytesseract
import csv
import pandas as pd
import easyocr

# Tell pytesseract where the Tesseract executable is (hard-coded Windows install path; adjust on other machines)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [13]:
# %pip install easyocr
# %pip install pytesseract
# %pip install opencv-python

### Get the Bar data

In [14]:
# Load the chart image. cv2.imread returns None instead of raising when the
# file cannot be read, so check explicitly to get a clear error message.
image_path = os.path.join('dataset_part2', 'image_2.png')
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
output = image.copy()

# Convert to HSV to isolate colored regions better
hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

# Mask that accepts every hue but requires high saturation and value,
# which excludes the light-yellow background
lower = np.array([0, 100, 100])
upper = np.array([179, 255, 255])
mask = cv2.inRange(hsv, lower, upper)

# Morphological closing to remove small gaps/noise in the mask
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

# Outer contours of the masked regions
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Keep bounding boxes that pass a basic bar-like size/position filter and
# mark the bottom edge of each one on the output copy
bars = []
for cnt in contours:
    x, y, w, h = cv2.boundingRect(cnt)
    if h > 50 and w > 20 and y > 50:  # basic bar-like filtering
        bars.append((x, y, w, h))
        cv2.line(output, (x, y + h), (x + w, y + h), (0, 255, 255), 2)

# The following cells require cropped_image and crop_info, so stop here with
# a clear message rather than letting them fail later with a NameError.
if not bars:
    raise ValueError("No bar-like coloured regions were found in the image.")

# Crop the (annotated) image to the bounding box with the largest area
largest = max(bars, key=lambda b: b[2] * b[3])
x, y, w, h = largest
cropped_image = output[y:y + h, x:x + w]

# Save crop offset/size so crop-relative coordinates can be mapped back later
crop_info = {
    "x": x,
    "y": y,
    "w": w,
    "h": h
}

### Now we will use the cropped image to detect the 4 bars and get the bboxes of them and then save those values for later.

In [15]:
# Use k-means clustering to quantise the cropped image into a few colours so
# the bars can be isolated more reliably.
Z = cropped_image.reshape((-1, 3))
Z = np.float32(Z)

# KMEANS_RANDOM_CENTERS draws its initial centres from OpenCV's global RNG.
# Seed it so that re-running the notebook produces the same segmentation.
cv2.setRNGSeed(42)

# Define criteria, number of clusters (K) and apply kmeans()
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
K = 4  # number of color clusters
_, labels, centers = cv2.kmeans(Z, K, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

# Replace every pixel by its cluster centre and restore the crop's shape
centers = np.uint8(centers)
segmented_data = centers[labels.flatten()]
segmented_image = segmented_data.reshape((cropped_image.shape))

# Grayscale + inverted threshold: pixels with gray level <= 200 become foreground
gray_segmented = cv2.cvtColor(segmented_image, cv2.COLOR_BGR2GRAY)
_, thresh_segmented = cv2.threshold(gray_segmented, 200, 255, cv2.THRESH_BINARY_INV)

# Find contours again, this time on the thresholded segmentation
contours_final, _ = cv2.findContours(thresh_segmented, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Keep and draw only tall, bar-like bounding boxes (coordinates are crop-relative)
final_boxed_image = cropped_image.copy()
detected_bars = []
for cnt in contours_final:
    x, y, w, h = cv2.boundingRect(cnt)
    aspect_ratio = h / float(w)
    if w > 10 and h > 100 and aspect_ratio > 1.5:
        detected_bars.append((x, y, w, h))
        cv2.rectangle(final_boxed_image, (x, y), (x + w, y + h), (0, 255, 0), 2)

# RGB copy for display
final_boxed_rgb = cv2.cvtColor(final_boxed_image, cv2.COLOR_BGR2RGB)

# Shift each box by the crop offset to get coordinates in the original image
bars_in_original_image = []
for (x, y, w, h) in detected_bars:
    original_x = crop_info['x'] + x
    original_y = crop_info['y'] + y
    bars_in_original_image.append((original_x, original_y, w, h))

### Now check all the detected bars

In [16]:
# Convert every detected bar's (x, y, w, h) tuple into a dict.
# Note: all detected bars are kept; nothing is excluded at this step.
detected_bars_data = [
    {
        'x': x,
        'y': y,
        'width': w,
        'height': h
    }
    for (x, y, w, h) in bars_in_original_image
]

# Sort detected bars data by x-axis (left to right)
detected_bars_data_sorted = sorted(detected_bars_data, key=lambda bar: bar['x'])

### Extract all the values

In [17]:
# Reference line: the y-coordinate of the top edge of the left-most bar
first_bar = detected_bars_data_sorted[0]
line_y = first_bar['y']

# Best numeric label found so far near that line, and its vertical distance
first_value = None
min_distance = float('inf')

# Read image
image = cv2.imread(image_path)

# Initialize EasyOCR
reader = easyocr.Reader(['en'])

# Run OCR on the whole image; each result unpacks as (bbox, text, confidence)
results = reader.readtext(image)

for (bbox, text, confidence) in results:
    if confidence > 0.5:  # You can tweak this
        top_left = tuple(map(int, bbox[0]))
        bottom_right = tuple(map(int, bbox[2]))

        # Get the center Y of the text box
        text_center_y = (top_left[1] + bottom_right[1]) // 2

        # Accept text whose centre lies above the line and within 30 pixels of it.
        # Only the vertical position is checked; the horizontal position is not.
        distance = abs(text_center_y - line_y)
        if text_center_y < line_y and distance < 30:
            try:
                value = float(text)
                if distance < min_distance:
                    first_value = value
                    min_distance = distance
            except ValueError:
                continue  # Not a number, skip



if first_value is None:
    print("Could not detect a numerical value above the first bar.")
else:
    print(f"Detected first bar value: {first_value}")

    # Value per pixel of bar height, derived from the first bar's detected value
    first_bar_height = first_bar['height']
    scale_factor = first_value / first_bar_height

    # Scale every bar's pixel height to a value, rounded to one decimal
    bar_values = []
    for bar in detected_bars_data_sorted:
        value = round(bar['height'] * scale_factor, 1)
        bar_values.append(value)

    print("Calculated bar values:", bar_values)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Detected first bar value: 8.0
Calculated bar values: [8.0, 17.9, 10.0, 14.0]


### Detect the text under the bars


In [18]:
# Point pytesseract at the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Maps each bar's (x, y, width, height) tuple to the text read beneath it
bar_text_mapping = {}

for bar in detected_bars_data_sorted:
    bar_box = (bar['x'], bar['y'], bar['width'], bar['height'])
    x, y, width, height = bar_box

    # Horizontal extent: the bar widened by a margin on both sides,
    # clamped to the image borders
    margin = 50
    roi_x_start = max(0, x - margin)
    roi_x_end = min(image.shape[1], x + width + margin)

    # Vertical extent: a 50-pixel-high strip starting 10 pixels below the bar
    roi_y_start = y + height + 10
    roi_y_end = roi_y_start + 50

    roi = image[roi_y_start:roi_y_end, roi_x_start:roi_x_end]

    # OCR the strip and store the stripped text under the bar's box
    detected_text = pytesseract.image_to_string(roi, config='--psm 6').strip()
    bar_text_mapping[bar_box] = detected_text

# Titles in insertion order of the mapping
bar_titles = list(bar_text_mapping.values())
print("Bar titles list:", bar_titles)

Bar titles list: ['Reading', 'Playing', 'Baking', 'Washing hands']


### We now have all the bar data. Next we gather the x-axis title, the y-axis title and, of course, the title of the graph

### The title

In [19]:
# Path to image
image_path = 'dataset_part2/image_2.png'
output_dir = 'output_bboxes'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Read the image; cv2.imread returns None when the file cannot be read
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Crop off the bottom 80 rows to exclude useless parts
image = image[:-80, :]

# OCR text detection
results = reader.readtext(image)

# Pass 1: draw a box around every accepted detection and remember the
# y-coordinate of its top edge together with its text.
accepted = []
for (bbox, text, confidence) in results:
    if confidence > 0.1:
        top_left = tuple(map(int, bbox[0]))
        bottom_right = tuple(map(int, bbox[2]))

        cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

        accepted.append((top_left[1], text))

# Pass 2: the title is every accepted text within 10 pixels of the topmost
# one. Finding the minimum first makes the result independent of the order
# in which the OCR engine returns its detections.
topmost_y = min((top for top, _ in accepted), default=float('inf'))
title_text = [text for top, text in accepted if top <= topmost_y + 10]

print(f"The extracted title is: {title_text}")

# Save output image
output_path = os.path.join(output_dir, "image_2_bboxes_easyocr.png")
cv2.imwrite(output_path, image)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


The extracted title is: ['Activities at home']


True

### The x-axis and y-axis

In [20]:
# Walk the OCR results again: box every accepted detection on the image and
# collect the accepted texts.
words = []
for detection in results:
    bbox, text, confidence = detection
    if not confidence > 0.1:
        continue

    top_left = tuple(int(v) for v in bbox[0])
    bottom_right = tuple(int(v) for v in bbox[2])
    cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

    words.append(text)

# The x-axis label is taken from the last OCR result (no confidence filter here)
last_result = results[-1]
x_axis_text = last_result[1]
print(f"The x-axis text is: {x_axis_text}")

The x-axis text is: Type of activity


In [21]:
# Extract the y-axis text
image_path = 'dataset_part2/image_2.png'

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Read the image; cv2.imread returns None when the file cannot be read
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Rotate the image by 90 degrees clockwise before running OCR for the y-axis text
rotated_image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)

# OCR text detection on the rotated image
results_rotated = reader.readtext(rotated_image)
if not results_rotated:
    raise ValueError("OCR found no text in the rotated image; cannot extract the y-axis label.")

# Box every accepted detection on the rotated image and keep its text.
# Stored under its own name so y_axis_text is only ever a string.
y_axis_candidates = []
for (bbox, text, confidence) in results_rotated:
    if confidence > 0.1:
        top_left = tuple(map(int, bbox[0]))
        bottom_right = tuple(map(int, bbox[2]))

        cv2.rectangle(rotated_image, top_left, bottom_right, (0, 255, 0), 2)

        y_axis_candidates.append(text)

# The y-axis label is taken from the first OCR result of the rotated image
y_axis_text = results_rotated[0][1]
print(f"The y-axis text is: {y_axis_text}")

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


The y-axis text is: Number of children


#

### Save everything

In [22]:
# The CSV is named after the extracted title parts joined with underscores
output_csv_path = '_'.join(title_text) + '.csv'

# One row per bar: the label found under the bar and its calculated value,
# keyed by the axis titles
rows = []
for bar, value in zip(detected_bars_data_sorted, bar_values):
    bar_key = (bar['x'], bar['y'], bar['width'], bar['height'])
    rows.append({x_axis_text: bar_text_mapping[bar_key], y_axis_text: value})

# Write header and rows to the CSV file
fieldnames = [x_axis_text, y_axis_text]
with open(output_csv_path, mode='w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print(f"CSV file '{output_csv_path}' created successfully.")

# Show the same rows as a DataFrame
df = pd.DataFrame(rows)
print(df)

CSV file 'Activities at home.csv' created successfully.
  Type of activity  Number of children
0          Reading                 8.0
1          Playing                17.9
2           Baking                10.0
3    Washing hands                14.0
