# Building a pandas DataFrame from image metrics and OCR content



In [10]:
from paddleocr import PaddleOCR 
import numpy as np
import cv2 as cv
import pandas as pd
import glob
from typing import Tuple
import matplotlib.pyplot as plt
import os


# Configuration dictionary handed to array_class.
par_dict_config = {
    # factor applied to both image dimensions before analysis
    'scale_of_the_image': 0.5,
    # Laplacian-variance value above which an image is labelled 'sharp'
    'thresh': 100,
}


class array_class:
    """Compute quality metrics and OCR text for grey scale images.

    The instance owns a CPU-only PaddleOCR engine and a configuration
    dictionary that is read with two keys:

    * ``'scale_of_the_image'`` -- factor applied to both image dimensions
      by :meth:`resize_img`.
    * ``'thresh'`` -- Laplacian-variance threshold used by
      :meth:`blur_coeff`; values strictly above it are labelled ``'sharp'``.
    """

    def __init__(self, config):
        """Create the OCR engine and keep a reference to ``config``."""
        self.ocr = PaddleOCR(use_gpu=False)
        self._config = config

    @staticmethod
    def bright_coeff(img: np.ndarray) -> float:
        """Return the brightness of ``img``: the mean of all its values."""
        return np.mean(img)

    def blur_coeff(self, img: np.ndarray) -> Tuple[int, str]:
        """Measure the sharpness of ``img``.

        Returns:
            A tuple ``(variance, label)`` where ``variance`` is the variance
            of the Laplacian of ``img`` truncated to ``int`` and ``label`` is
            ``'sharp'`` when ``variance`` is strictly greater than
            ``config['thresh']``, otherwise ``'blurry'``.
        """
        variance = int(cv.Laplacian(img, cv.CV_16S).var())
        label = 'sharp' if variance > self._config['thresh'] else 'blurry'
        return variance, label

    @staticmethod
    def img_area(img: np.ndarray) -> int:
        """Return the pixel area of ``img`` (rows multiplied by columns)."""
        rows, cols = img.shape[0], img.shape[1]
        return rows * cols

    @staticmethod
    def size_diff(img_1_area: int, img_2_area: int) -> int:
        """Return the absolute difference between two image areas.

        The arguments are areas (as produced by :meth:`img_area`), not the
        images themselves.
        """
        return int(abs(img_1_area - img_2_area))

    def resize_img(self, image_to_scale: np.ndarray) -> np.ndarray:
        """Return ``image_to_scale`` resized by ``config['scale_of_the_image']``.

        Both dimensions are multiplied by the factor and truncated to ``int``.
        Each dimension is kept at a minimum of one pixel so that very small
        images (or very small factors) never ask OpenCV for an empty target.
        """
        scale = self._config['scale_of_the_image']
        new_width = max(1, int(image_to_scale.shape[1] * scale))
        new_height = max(1, int(image_to_scale.shape[0] * scale))
        # cv.resize takes the target size as (width, height).
        return cv.resize(
            image_to_scale, (new_width, new_height), interpolation=cv.INTER_AREA)

    def text_from_img(self, img: np.ndarray) -> str:
        """Run OCR on ``img`` and return the text of the last recognised line.

        Only the last entry of the OCR result is returned, not the whole
        content of the image. An empty string is returned when the OCR engine
        reports no text at all.
        """
        ocr_result = self.ocr.ocr(img.astype("float32"))
        # NOTE(review): "nothing recognised" appears to be reported either as
        # None or as an empty container, depending on the PaddleOCR version --
        # confirm against the installed release. Both are handled here so the
        # indexing below cannot fail on a text-free image.
        if not ocr_result:
            return ""
        last_group = ocr_result[-1]
        if not last_group:
            return ""
        # Same element the unguarded chain [-1][-1][-1][0] selects: first item
        # of the last field of the last recognised line.
        return last_group[-1][-1][0]

    @staticmethod
    def string_length(string: str) -> int:
        """Return the number of characters in ``string``."""
        return len(string)

    def run_methods_above(self, arr: np.ndarray, name_of_image: str) -> np.ndarray:
        """Collect every metric for one image into a single row.

        Args:
            arr: the image as loaded from disk.
            name_of_image: label stored in the first field of the row.

        Returns:
            A 1-D object array holding, in order: image name, blurriness
            value, brightness, blurriness label, area of the original image,
            area difference between original and scaled image, OCR text and
            OCR text length. Blurriness, brightness and OCR are computed on
            the scaled image.
        """
        img_scaled = self.resize_img(arr)
        # One call yields both the value and its label; calling blur_coeff
        # twice would repeat the Laplacian computation for no benefit.
        image_blurriness, blurriness_categ = self.blur_coeff(img_scaled)
        image_brightness = self.bright_coeff(img_scaled)
        image_area = self.img_area(arr)
        scaled_image_area = self.img_area(img_scaled)
        size_diff = self.size_diff(image_area, scaled_image_area)
        image_content = self.text_from_img(img_scaled)
        image_content_size = self.string_length(image_content)
        # dtype=object keeps each field's own Python/NumPy type; without it
        # NumPy coerces the mixed values to strings.
        return np.array(
            [
                name_of_image,
                image_blurriness,
                image_brightness,
                blurriness_categ,
                image_area,
                size_diff,
                image_content,
                image_content_size,
            ],
            dtype=object,
        )
    

# Glob pattern selecting the PNG files to analyse. os.path.join picks the
# separator of the running OS; on Windows the result is the same string as
# r"images_folder\*.png".
images_folder = os.path.join("images_folder", "*.png")

# Build the analyser once and reuse it for every file: its constructor creates
# a PaddleOCR engine, which there is no reason to repeat per image.
analyser = array_class(par_dict_config)

rows_array = []
# glob returns paths in arbitrary order; sorting keeps the row order stable.
for image_file in sorted(glob.glob(images_folder)):
    image_name = os.path.basename(image_file)
    img = cv.imread(image_file, 0)  # flag 0: load as single-channel grey scale
    if img is None:
        # cv.imread signals an unreadable/undecodable file by returning None
        # instead of raising; skip it rather than fail later on `.shape`.
        print(f"skipping unreadable image: {image_file}")
        continue
    rows_array.append(analyser.run_methods_above(img, image_name))

# Create the pandas DataFrame containing the images information.
df = pd.DataFrame(
    rows_array,
    columns=[
        'image_name',
        'image_blurriness',
        'image_brightness',
        'blurr_categ',
        'area',
        'size_diff',
        'image_content',
        'image_content_size',
    ],
)
df  # display dataframe


[2023/04/18 12:16:35] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\YaniDiGiovanni/.paddleocr/whl\\det\\ch\\ch_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\YaniDiGiovanni/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_

[2023/04/18 12:16:44] ppocr DEBUG: dt_boxes num : 1, elapse : 0.1145637035369873
[2023/04/18 12:16:45] ppocr DEBUG: rec_res num  : 1, elapse : 1.6292338371276855


Unnamed: 0,image_name,image_blurriness,image_brightness,blurr_categ,area,size_diff,image_content,image_content_size
0,pic_1.png,3400,164.98223055052986,sharp,62409,46933,TEOR,4
1,pic_2.png,1844,133.15043542905693,sharp,75895,57063,POTTER,6
2,pic_3.png,25,22.480778041567746,blurry,193011,144993,FLOW,4
