# The Full Pipeline of Image 1


### This pipeline combines the code from tasks (a) and (b)

In [10]:
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pytesseract
import csv
import pandas as pd
import easyocr
from nltk.corpus import wordnet as wn
from collections import Counter

# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to override it on machines where
# Tesseract lives elsewhere; without it the original Windows path is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

In [11]:
# %pip install easyocr
# %pip install pytesseract
# %pip install opencv-python

### Get the Bar data

In [12]:
# Detect candidate bars: colour-quantise the middle band of the chart with
# K-means, then take the bounding box of every sufficiently large connected
# region of each colour cluster.

# Number of pixel rows ignored at the top and bottom of the image; only the
# band in between is clustered.
ROI_TOP_PX = 200
ROI_BOTTOM_PX = 50

image_path = os.path.join('dataset_part2', 'image_1.png')
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports an unreadable/missing file by returning None
    raise FileNotFoundError(f"Could not read image: {image_path}")
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
output_image = image.copy()  # copy used for drawing the detected boxes

# Keep only the middle section of the image
height = image_rgb.shape[0]
roi_rows = slice(ROI_TOP_PX, height - ROI_BOTTOM_PX)
roi = image_rgb[roi_rows, :, :]

# Flatten ROI to an (n_pixels, 3) array for clustering
pixel_data = roi.reshape((-1, 3))

# K-means clustering on ROI (fixed random_state for reproducible labels)
num_clusters = 15
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
labels = kmeans.fit_predict(pixel_data)
segmented_roi = kmeans.cluster_centers_.astype("uint8")[labels]
segmented_roi = segmented_roi.reshape(roi.shape)

# Full-size segmented image: black outside the ROI, cluster colours inside
segmented_img = np.zeros_like(image_rgb)
segmented_img[roi_rows] = segmented_roi

# Bounding boxes (x, y, w, h) of every region accepted as a bar
detected_bars = []

# 3x3 rectangular kernel for the morphological close; built once because it
# does not depend on the cluster being processed
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

for i in range(num_clusters):
    # Binary mask of the pixels assigned to cluster i, placed back at its
    # original position inside a full-size mask
    cluster_mask = (labels == i).reshape(roi.shape[:2]).astype("uint8") * 255
    full_mask = np.zeros(image_rgb.shape[:2], dtype="uint8")
    full_mask[roi_rows] = cluster_mask

    # Optional cleanup: close small holes in the mask
    full_mask = cv2.morphologyEx(full_mask, cv2.MORPH_CLOSE, kernel)

    # Find outer contours and keep the ones large enough to be a bar
    contours, _ = cv2.findContours(full_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        area = cv2.contourArea(cnt)

        if area > 200 and w > 10 and h > 20:
            detected_bars.append((x, y, w, h))
            cv2.rectangle(output_image, (x, y), (x + w, y + h), (0, 255, 0), 2)



In [13]:
# Exclude the biggest detected box and order the remaining bars left to right.
# (Presumably the largest box is the plot background rather than a real bar -
# confirm against the drawn output image.)
if not detected_bars:
    raise ValueError("No bars were detected; cannot build the bar table.")

# NOTE: despite its name, max_area holds the (x, y, w, h) tuple of the box
# with the largest w * h, not the area itself. The name is kept unchanged.
max_area = max(detected_bars, key=lambda bar: bar[2] * bar[3])

# Every box equal to the largest one is dropped; the rest become dict records
detected_bars_data = [
    {'x': x, 'y': y, 'width': w, 'height': h}
    for (x, y, w, h) in detected_bars
    if (x, y, w, h) != max_area
]

# Sort detected bars data by x-axis
detected_bars_data_sorted = sorted(detected_bars_data, key=lambda bar: bar['x'])

In [14]:
# Read the numeric label printed above each bar with Tesseract.

# Load the image again for OCR
image_path = os.path.join('dataset_part2', 'image_1.png')
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports an unreadable/missing file by returning None
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply slight thresholding to clean background
_, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY)

# Tesseract options: single text line (--psm 7), digits and '.' only
config = "--psm 7 -c tessedit_char_whitelist=0123456789."

bar_value_mapping = {}  # (x, y, w, h) -> recognised text
bar_values = []         # one entry per bar, same order as detected_bars_data_sorted

for bar_data in detected_bars_data_sorted:
    x, y, w, h = bar_data['x'], bar_data['y'], bar_data['width'], bar_data['height']

    # ROI above the bar: from 60 px to 5 px above its top edge. The end is
    # clamped so it can never become negative (which would slice from the
    # bottom of the image) or fall before the start.
    roi_y_start = max(0, y - 60)
    roi_y_end = max(roi_y_start, y - 5)
    roi = thresh[roi_y_start:roi_y_end, x:x + w]

    if roi.size == 0:
        # Nothing above the bar to read; cv2.resize would fail on an empty ROI
        detected_text = ""
    else:
        # Enlarge ROI to help OCR
        roi = cv2.resize(roi, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)

        # Run OCR just on digits
        detected_text = pytesseract.image_to_string(roi, config=config).strip()

    # Add a decimal point if exactly two digits are detected in a row
    if len(detected_text) == 2 and detected_text.isdigit():
        detected_text = f"{detected_text[0]}.{detected_text[1]}"

    bar_value_mapping[(x, y, w, h)] = detected_text
    # Collected per bar (not from the dict) so that two bars with an identical
    # bounding box cannot collapse into one entry and shift later values.
    bar_values.append(detected_text)

# Output
print("Bar values list:", bar_values)

Bar values list: ['4.3', '2.5', '3.5', '4.5', '2.4', '4.4', '1.8', '2.8', '2', '2', '3', '5']


In [15]:
# Read the category label printed below each bar with Tesseract.

# Dictionary to store bar coordinates and detected text
bar_text_mapping = {}
# One label per bar, in the same order as detected_bars_data_sorted
bar_titles = []

for bar in detected_bars_data_sorted:
    # Local names bar_width / bar_height avoid overwriting the notebook-level
    # `height` variable, which holds the image height.
    x, y, bar_width, bar_height = bar['x'], bar['y'], bar['width'], bar['height']

    # Define ROI below the bar (adjust the height as needed)
    roi_y_start = y + bar_height + 10  # Start 10 pixels below the bar
    roi_y_end = roi_y_start + 50  # Define height of the ROI
    roi = image[roi_y_start:roi_y_end, x:x + bar_width]

    if roi.size == 0:
        # The ROI lies outside the image, so there is nothing to recognise
        detected_text = ""
    else:
        # Perform OCR on the ROI (--psm 6: a single uniform block of text)
        detected_text = pytesseract.image_to_string(roi, config='--psm 6').strip()

    # Map the detected text to the bar
    bar_text_mapping[(x, y, bar_width, bar_height)] = detected_text
    # Collected per bar (not from the dict) so that two bars with an identical
    # bounding box cannot collapse into one entry and shift later labels.
    bar_titles.append(detected_text)

# Print the labels in bar order
print("Bar title list:", bar_titles)

Bar title list: ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']


### Get the title and column names

### Title First

In [16]:
# Extract the chart title: the title is taken to be every confidently
# recognised text box whose top edge lies within 10 px of the topmost box.

# Path to image
image_path = 'dataset_part2/image_1.png'
output_dir = 'output_bboxes'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Read the image
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports an unreadable/missing file by returning None
    raise FileNotFoundError(f"Could not read image: {image_path}")

# OCR text detection
results = reader.readtext(image)

# First pass: draw every confident detection and remember (top y, text)
confident_boxes = []
for (bbox, text, confidence) in results:
    if confidence > 0.1:
        # bbox is a list of 4 points [(x0, y0), (x1, y1), (x2, y2), (x3, y3)]
        top_left = tuple(map(int, bbox[0]))
        bottom_right = tuple(map(int, bbox[2]))

        cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)
        confident_boxes.append((top_left[1], text))

# Second pass: compare against the final topmost y. Comparing against a
# running minimum inside the first loop would keep boxes that were accepted
# before a higher box was seen, making the result depend on detection order.
topmost_y = min((box_y for box_y, _text in confident_boxes), default=float('inf'))
title_text = [text for box_y, text in confident_boxes if box_y <= topmost_y + 10]

print(f"The extracted title is: {title_text}")

# Save output image
output_path = os.path.join(output_dir, "image_1_bboxes_easyocr.png")
cv2.imwrite(output_path, image)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


The extracted title is: ['Bar Graph PPT']


True

### Generate my own category with NLTK

In [17]:
def get_common_categories(word_list):
    """Rank the WordNet hypernym lemma names shared by a list of words.

    For every word, each of its synsets is looked up, and the lemma names of
    all direct hypernyms of those synsets are tallied.

    Args:
        word_list: iterable of words to look up in WordNet.

    Returns:
        A list of (lemma_name, count) tuples ordered from most to least
        frequent; empty when no hypernym is found for any word.
    """
    hypernym_tally = Counter(
        lemma.name()
        for word in word_list
        for synset in wn.synsets(word)
        for hypernym in synset.hypernyms()
        for lemma in hypernym.lemmas()
    )
    return hypernym_tally.most_common()

# Ranked (hypernym lemma name, count) pairs for the OCR'd bar labels
category = get_common_categories(bar_titles)

if category:
    # Top most common hypernym with modifications:
    # underscores become spaces and an 's' is appended to make it plural
    category_x = category[0][0].replace('_', ' ') + 's'
else:
    # No hypernym found (e.g. the OCR labels are not WordNet words):
    # fall back to a generic column name instead of failing with IndexError
    category_x = 'Category'
print("Final category:", category_x)

Final category: Gregorian calendar months


### Save everything

In [18]:
# Write the (label, value) table to a CSV named after the extracted title.

# The title comes from OCR, so replace characters that are path separators or
# are not allowed in file names, and fall back to a fixed name if it is empty.
invalid_filename_chars = '<>:"/\\|?*'
safe_title = ''.join(
    '_' if (ch in invalid_filename_chars or ord(ch) < 32) else ch
    for ch in '_'.join(title_text)
).strip()

# Define the output CSV file path
output_csv_path = f"{safe_title or 'chart_data'}.csv"

# Combine bar text and values into rows (one row per bar, left to right)
rows = [
    {
        category_x: bar_text_mapping[(bar['x'], bar['y'], bar['width'], bar['height'])],
        'Value': value,
    }
    for bar, value in zip(detected_bars_data_sorted, bar_values)
]

# Write to CSV; explicit UTF-8 so non-ASCII OCR output does not depend on the
# platform's default encoding
with open(output_csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = [category_x, 'Value']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(rows)

print(f"CSV file '{output_csv_path}' created successfully.")

# Print final values in a df
df = pd.DataFrame(rows)
print(df)

CSV file 'Bar Graph PPT.csv' created successfully.
   Gregorian calendar months Value
0                    January   4.3
1                   February   2.5
2                      March   3.5
3                      April   4.5
4                        May   2.4
5                       June   4.4
6                       July   1.8
7                     August   2.8
8                  September     2
9                    October     2
10                  November     3
11                  December     5
