-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocr.py
67 lines (63 loc) · 3.24 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import cv2, os, pytesseract
import pandas as pd
import corrections, logo_matcher
from config import imgdir, config, savedir, image_counter
from regions import *
from icecream import ic
from regions import replies, comments
import pandas as pd
def process_image(img_name, region_info, correction_function=None):
    """OCR one rectangular region of an image and return the extracted fields.

    The image is loaded in grayscale from ``imgdir``, cropped to the rectangle
    given by ``region_info``, and passed to Tesseract. The recognised text is
    then post-processed according to ``correction_function``. The cropped
    region is also written to ``savedir`` as a PNG.

    Args:
        img_name: File name of the image inside ``imgdir``. A name ending in
            ``'r.png'`` is treated as a "Replies" image, anything else as
            "Comments".
        region_info: Indexable region descriptor. ``region_info[0]`` and
            ``region_info[1]`` are the start and end corners of the crop
            (index 0 selects the column, index 1 the row), and
            ``region_info[3]`` is the label used as a result key and in the
            saved file name.
        correction_function: Optional callable applied to the OCR text. The
            handling is selected by the callable's ``__name__``:
            ``'corrected_source'`` also runs logo matching on the region,
            ``'corrected_date_comments'`` is expected to return a
            ``(date, completed_items)`` pair, and any other callable's result
            is stored under ``region_info[3]``. When ``None``,
            ``corrections.corrected_replies`` is used.

    Returns:
        A dict with the extracted field(s) plus ``'Image Name'`` and
        ``'Image Type'``.

    Raises:
        FileNotFoundError: If the image cannot be read from ``imgdir``.
    """
    img_path = os.path.join(imgdir, img_name)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with the path instead of a TypeError on the crop below.
        raise FileNotFoundError(f"Could not read image: {img_path}")

    start, end = region_info[0], region_info[1]
    # NumPy image indexing is [rows, columns], i.e. [y-range, x-range].
    region = img[start[1]:end[1], start[0]:end[0]]
    text = pytesseract.image_to_string(region, config=config)

    label = region_info[3]
    is_reply = img_name.endswith('r.png')

    # The name ``temp_dict`` is kept because ic() echoes the expression text.
    temp_dict = {}
    if correction_function is None:
        value = corrections.corrected_replies(text)
        temp_dict['Replies' if is_reply else 'Comments'] = value
    else:
        # getattr keeps callables without __name__ (e.g. functools.partial)
        # working; they fall through to the generic branch.
        func_name = getattr(correction_function, '__name__', '')
        if func_name == 'corrected_source':
            source_name = corrections.corrected_source(text)
            logo_name = logo_matcher.logo_match(region)
            platform_name = os.path.splitext(logo_name)[0]
            temp_dict['Source'] = platform_name + " " + source_name
        elif func_name == 'corrected_date_comments':
            date, completed_items = correction_function(text)
            temp_dict['Date'] = date
            temp_dict['Completed Items'] = completed_items
        else:
            temp_dict[label] = correction_function(text)

    temp_dict['Image Name'] = img_name
    temp_dict['Image Type'] = 'Replies' if is_reply else 'Comments'
    ic(temp_dict)

    # Save the region image with a name based on the image name, date,
    # section name, and value of the section.
    # NOTE(review): the name embeds OCR output verbatim; if that text can
    # contain path separators the write may fail - confirm against real data.
    region_name = f"{img_name.split('.')[0]}_{temp_dict.get('Date', '')}_{label}_{temp_dict.get(label, '')}.png"
    cv2.imwrite(os.path.join(savedir, region_name), region)
    return temp_dict
def ocr():
    """Run OCR over the screenshots in ``imgdir`` and write ``input.csv``.

    Files ending in ``'r.png'`` are processed with the four ``replies``
    regions, files ending in ``'c.png'`` with the two ``comments`` regions.
    Any other file is skipped. Each processed image contributes one row; the
    rows are indexed by their ``'Date'`` column and saved to ``input.csv`` in
    the current working directory.

    If no matching image is found, nothing is written.

    Raises:
        KeyError: If none of the collected rows has a ``'Date'`` field.
    """
    rows = []
    for img_name in os.listdir(imgdir):
        result_dict = {}  # Fresh dictionary for each image
        if img_name.endswith('r.png'):
            result_dict.update(process_image(img_name, replies[0], corrections.corrected_date_of_replies))
            result_dict.update(process_image(img_name, replies[1], corrections.corrected_source))
            result_dict.update(process_image(img_name, replies[2]))  # No correction function for replies
            result_dict.update(process_image(img_name, replies[3], corrections.corrected_time))
        elif img_name.endswith('c.png'):
            result_dict.update(process_image(img_name, comments[0], corrections.corrected_date_comments))
            result_dict.update(process_image(img_name, comments[1], corrections.corrected_source))
        else:
            # Neither a replies nor a comments screenshot: skip it so it does
            # not add an all-empty row to the output.
            continue
        rows.append(result_dict)

    if not rows:
        # Nothing was processed; there is no 'Date' column to index on.
        return

    # Build the frame once instead of concatenating inside the loop, which
    # re-copies all earlier rows on every iteration.
    df = pd.DataFrame(rows)
    # Set 'Date' as the index and save the DataFrame to a CSV file
    df.set_index('Date', inplace=True)
    df.to_csv('input.csv')
# Run the OCR pipeline only when this file is executed as a script.
if __name__ == "__main__":
    ocr()