In [1]:
# pip install ipykernel

In [2]:
# pip install easyocr torch

In [3]:
# pip install pandas

In [None]:
# pip install requests

In [None]:
# pip install pyspellchecker


In [2]:
import numpy as np
import pandas as pd
import os
from typing import List 
import easyocr
import re
import requests
import shutil
from spellchecker import SpellChecker

## Download images

In [None]:
def download_image(url, save_folder):
    """Download the file at ``url`` into ``save_folder`` under the current directory.

    The file name is the last path segment of the URL. A status message is
    printed for both the success and the non-200 case; nothing is returned.

    Args:
        url: Address of the image to fetch.
        save_folder: Destination folder, relative to ``os.getcwd()``. It is
            created when it does not exist yet.

    Raises:
        requests.exceptions.RequestException: If the request fails at the
            network level or exceeds the timeout.
    """
    # stream=True defers downloading the body; the ``with`` block releases the
    # connection on every exit path. The timeout keeps a stalled server from
    # hanging the notebook indefinitely.
    with requests.get(url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            print(f"Failed to retrieve image. HTTP Status code: {response.status_code}")
            return

        # Drop any query string / fragment so it never ends up in the file name
        filename = url.split('?', 1)[0].split('#', 1)[0].split('/')[-1]

        # Construct the full path (relative to the current directory) and make
        # sure the folder exists before opening the file for writing
        target_dir = os.path.join(os.getcwd(), save_folder)
        os.makedirs(target_dir, exist_ok=True)
        save_path = os.path.join(target_dir, filename)

        # Ask urllib3 to undo gzip/deflate transfer encoding; otherwise the raw
        # stream would be written to disk still compressed
        response.raw.decode_content = True
        with open(save_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)

    print(f"Image successfully downloaded: {save_path}")

# URL of the image to be downloaded
# NOTE(review): this fetches E19-03.jpg, while the OCR cells further down read
# E19-01.jpg and E19-02.jpg — confirm how those two files are obtained.
image_url = 'https://www.stats.gov.cn/sj/ndsj/2023/html/E19-03.jpg'
# Folder where the image will be saved (relative path)
save_folder = 'Real_estate'

# Download the image
download_image(image_url, save_folder)

## OCR

In [3]:
# Shared EasyOCR reader used by ocr_scan(): English language list, GPU disabled.
reader = easyocr.Reader(['en'], gpu=False)

Using CPU. Note: This module is much faster with a GPU.


In [4]:
def ocr_scan(image_path: str) -> str:
    """Run the shared ``reader`` on an image and return its text as one string.

    Args:
        image_path: Location of the image; it is converted with ``str()``
            before being handed to ``reader.readtext``.

    Returns:
        The second field of every ``readtext`` result entry, joined with
        single spaces in the order the entries were returned.
    """
    detections = reader.readtext(str(image_path))
    # Index 1 of each entry is the text that gets concatenated
    # (presumably the recognised string in EasyOCR's default output format).
    texts = [detection[1] for detection in detections]
    return " ".join(texts)

In [26]:
# Main indicator of real estate.
# os.path.join uses the platform's separator instead of a hard-coded backslash,
# so the path also resolves outside Windows and no longer relies on the
# unrecognised "\E" escape sequence.
# NOTE(review): the folder is "Real estate" (with a space) here but
# "Real_estate" in the download and 2nd-test cells — confirm which one is intended.
img_path = os.path.join("Real estate", "E19-01.jpg")
real_estate_indicator = ocr_scan(img_path)

In [8]:
# Display the raw OCR text of table 19-1.
real_estate_indicator

'19-1 Main Indicators of Enterprises for Real Estate Development Item 2019 2020 2021 2022 Number ofEnterprises (unit) 99544 103262 105434 102852 Domestic Invested 95691 99150 101374 99054 State Otned Enterprises 671 1133 1209 1387 Collective-owned Enterprises 230 227 208 185 Enterprises with Investment from Hong Kong; Macao and Taiwan 2664 2759 2703 2550 Foreign Invested 1189 1353 1357 1248 Arerage Number Employed ersons (10 000 persons) 293.74 290.13 280.16 244.67 Domestic Invested 278.41 274.65 265.76 232.80 State Otned Enterprises 2.51 3.65 3.54 3.74 Collective-owned Enterprises 0.50 0.41 Enterprises with Investment from Hong Kong; Macao and Taiwan 10.37 10.19 9.24 Foreign Invested 4.95 5.29 5.15 Area of Land Purchased in the Year (10 000 sq.m) 25822.3 25536.3 21589_ 10041.7 ntesmment Completed in the Year (100 million yuan) 127746.8 136438.2 142247.7 128074.6 Residential Buildings 93572.4 100488.2 106855_ 96735 Actual Funds in Place in the Year (100 million yuan) 178608.6 193114.9 

### Spellchecker

In [36]:
# from spellchecker import SpellChecker

# Spot-check the spell checker on a phrase taken from the OCR output.
spell_checker = SpellChecker()
sentence = "State Otned Enterprises"
words = sentence.split()

# Print the suggested correction for each token, one per line.
for word in words:
    print(spell_checker.correction(word))


State
owned
Enterprises


## Save into txt file

### Remove the title of the table

In [25]:
# Work on the OCR text of table 19-1.
# NOTE: ex_paragraph and sentence_to_remove are reassigned by the "2nd test"
# cell further down, so the result depends on cell execution order.
ex_paragraph = real_estate_indicator

# Sentence to be removed
sentence_to_remove = "19-1 Main Indicators of Enterprises for Real Estate Development Item 2019 2020 2021 2022"

def remove_sentence(paragraph, sentence):
    """Return ``paragraph`` with every literal occurrence of ``sentence`` removed.

    Each occurrence, together with the whitespace around it, is replaced by a
    single space. Afterwards runs of two or more whitespace characters are
    collapsed to one space and the result is stripped.

    Args:
        paragraph: Text to clean.
        sentence: Exact text to remove. When empty (or ``None``) nothing is
            removed and only the whitespace clean-up is applied.

    Returns:
        The cleaned paragraph.
    """
    # Guard: with an empty sentence the pattern degenerates to r'\s*\s*', which
    # matches at every position and would insert a space between all characters.
    if sentence:
        # Escape special characters so the sentence is matched literally
        escaped_sentence = re.escape(sentence)
        # Remove the sentence and any whitespace directly around it
        paragraph = re.sub(r'\s*{}\s*'.format(escaped_sentence), ' ', paragraph)
    # Clean up any extra spaces created by the removal
    return re.sub(r'\s{2,}', ' ', paragraph).strip()

# Remove the specified sentence from the paragraph
# NOTE: updated_paragraph is assigned again by the "2nd test" cell further
# down; the "Save into txt file" cell uses whichever assignment ran last.
updated_paragraph = remove_sentence(ex_paragraph, sentence_to_remove)

# Display the updated paragraph
print(updated_paragraph)

Number ofEnterprises (unit) 99544 103262 105434 102852 Domestic Invested 95691 99150 101374 99054 State Otned Enterprises 671 1133 1209 1387 Collective-owned Enterprises 230 227 208 185 Enterprises with Investment from Hong Kong; Macao and Taiwan 2664 2759 2703 2550 Foreign Invested 1189 1353 1357 1248 Arerage Number Employed ersons (10 000 persons) 293.74 290.13 280.16 244.67 Domestic Invested 278.41 274.65 265.76 232.80 State Otned Enterprises 2.51 3.65 3.54 3.74 Collective-owned Enterprises 0.50 0.41 Enterprises with Investment from Hong Kong; Macao and Taiwan 10.37 10.19 9.24 Foreign Invested 4.95 5.29 5.15 Area of Land Purchased in the Year (10 000 sq.m) 25822.3 25536.3 21589_ 10041.7 ntesmment Completed in the Year (100 million yuan) 127746.8 136438.2 142247.7 128074.6 Residential Buildings 93572.4 100488.2 106855_ 96735 Actual Funds in Place in the Year (100 million yuan) 178608.6 193114.9 201132.2 148357.3 Domestic Loans 25228 26675.9 23295 17359 Foreign Investment 175.7 192.0 

### Save into txt file

In [28]:
# import re

def separate_paragraph(paragraph):
    """Split flattened OCR table text into one "label numbers" string per row.

    A row is a text label followed by 2 to 4 numeric cells. Labels may contain
    hyphens, semicolons and dots ("Collective-owned", "Hong Kong;", "sq.m") and
    parenthesised units that themselves hold digits ("(10 000 persons)"); the
    previous character class cut such labels short.

    Args:
        paragraph: OCR text in which all table rows sit on a single line.

    Returns:
        A list of strings of the form "<label> <n1> <n2> ...", in the order
        the rows appear. Text that is not followed by at least two numbers is
        skipped. An empty paragraph yields an empty list.
    """
    # One label token is either
    #   * a short parenthesised group, which may contain digits and spaces
    #     (capped at 40 characters so a stray "(" cannot swallow later rows), or
    #   * a word that starts with a letter, underscore or bracket and runs up
    #     to the next whitespace or digit.
    label_token = r'(?:\([^()]{0,40}\)[^\s\d]*|[A-Za-z_()][^\s\d]*)'
    # 2 to 4 numeric cells: integer or decimal, each optionally followed by the
    # "_" that shows up in the OCR output (e.g. "21589_").
    number_run = r'(?:\s+\d+(?:\.\d+)?_?){2,4}'
    pattern = re.compile(r'({0}(?:\s+{0})*)({1})'.format(label_token, number_run))

    lines = []
    for label, numbers in pattern.findall(paragraph):
        # Normalise the spacing between the numeric cells to single spaces
        lines.append("{} {}".format(label.strip(), " ".join(numbers.split())))

    return lines

# Example input paragraph
paragraph = updated_paragraph

# Split the paragraph into one line per table row
lines = separate_paragraph(paragraph)

# os.path.join uses the platform's separator instead of hard-coded backslashes,
# so the path also works outside Windows and no longer relies on the
# unrecognised "\R" / "\E" escape sequences.
output_e19_01 = os.path.join("Txt_files", "Real_estate", "E19_01.txt")

# Make sure the output folder exists; open() does not create it
os.makedirs(os.path.dirname(output_e19_01), exist_ok=True)

with open(output_e19_01, 'w') as file:
    for line in lines:
        file.write(line + '\n')

# for line in lines:
#     print(line)


### Read from txt file to create pandas data frame

In [None]:
def separate_lines(lines):
    """Parse "label numbers" lines into rows of ``[label, n1, n2, n3, n4]``.

    A line must start with a text label followed by 2 to 4 numeric cells.
    Labels may contain hyphens, semicolons and dots and parenthesised units
    that hold digits ("(10 000 sq.m)"); the previous character class rejected
    such lines and they were silently dropped.

    Args:
        lines: Iterable of text lines, one table row per line.

    Returns:
        A list of five-element lists. The numeric cells stay strings exactly
        as read (including a trailing "_" if present); rows with fewer than
        four cells are padded with ``None``. Lines that do not match are
        skipped.
    """
    # One label token: a short parenthesised group (may contain digits and
    # spaces, capped at 40 characters) or a word that starts with a letter,
    # underscore or bracket and runs up to the next whitespace or digit.
    label_token = r'(?:\([^()]{0,40}\)[^\s\d]*|[A-Za-z_()][^\s\d]*)'
    # 2 to 4 numeric cells, each optionally followed by "_"
    number_run = r'(?:\s+\d+(?:\.\d+)?_?){2,4}'
    pattern = re.compile(r'\s*({0}(?:\s+{0})*)({1})'.format(label_token, number_run))

    data = []
    for line in lines:
        # match() anchors at the start of the line
        match = pattern.match(line)
        if match is None:
            continue

        words = match.group(1).strip()
        numbers = match.group(2).split()

        # Ensure exactly 4 value columns: fill missing numbers with None
        numbers += [None] * (4 - len(numbers))

        data.append([words] + numbers)

    return data

# Input text file and output CSV file.
# NOTE(review): the previous cell writes its rows to the path stored in
# output_e19_01, not to 'output.txt' — confirm this is the intended input.
input_txt = 'output.txt'
output_csv = 'output.csv'

# Read the file, dropping blank lines and surrounding whitespace
with open(input_txt, 'r') as file:
    lines = [stripped for stripped in (raw.strip() for raw in file) if stripped]

# Parse every line into [label, four values] and build the table
data = separate_lines(lines)
df = pd.DataFrame(data, columns=['Item', 'Year 1', 'Year 2', 'Year 3', 'Year 4'])

# Persist the table without the index column
df.to_csv(output_csv, index=False)

print(f"The DataFrame has been saved to {output_csv}")

# Display the DataFrame
print(df)

## 2nd test (I'm testing this part on a different image; it can be ignored)

In [6]:
# Number of enterprises for real estate development (7 cols).
# os.path.join uses the platform's separator instead of a hard-coded backslash,
# so the path also resolves outside Windows and no longer relies on the
# unrecognised "\E" escape sequence.
img_path = os.path.join("Real_estate", "E19-02.jpg")
real_estate_enterprises = ocr_scan(img_path)

In [7]:
# Display the raw OCR text of table 19-2.
real_estate_enterprises

'19-2 Number of Enterprises for Real Estate Development (unit) Number of Domestic Enterprises Foreign Year Enterprises Invested State-owned Collective- with Investnent Invested Region Enterprises Enterprises owned from Hong Kong; Enterprises Enterprises Macao and Taiwan 998 24378 19960 7958 4538 3214 1204 2000 27303 23277 6641 3492 2899 1127 2005 56290 50957 4145 1796 3443 1890 2006 58710 53268 3797 1586 3519 1923 2007 62518 56965 3617 1430 3524 2029 2008 87562 81282 3941 1520 3916 2364 2009 80407 74674 3835 1361 3633 2100 2010 85218 79489 3685 1220 3677 2052 2011 88419 83011 3427 1023 3565 1843 2012 89859 84695 3354 904 3451 1713 2013 91444 86379 1739 570 3391 1674 201- 94197 89218 1476 457 3414 1565 2015 93426 88773 1329 409 3235 1418 2016 94948 90408 1093 364 3232 1308 2017 95897 91608 943 319 3066 1223 2018 97937 94063 767 280 2719 1155 2019 99544 95691 671 230 2664 1189 2020 103262 99150 1133 227 2759 1353 2021 105434 101374 1209 208 2703 1357 2022 102852 99054 1387 185 2550 1248 

In [10]:
# Work on the OCR text of table 19-2.
# NOTE: this reassigns ex_paragraph and sentence_to_remove, which the table
# 19-1 cells above also set.
ex_paragraph = real_estate_enterprises

# Sentence to be removed
sentence_to_remove = "19-2 Number of Enterprises for Real Estate Development (unit) Number of Domestic Enterprises Foreign Year Enterprises Invested State-owned Collective- with Investnent Invested Region Enterprises Enterprises owned from Hong Kong; Enterprises Enterprises Macao and Taiwan"

# NOTE(review): remove_sentence is also defined in the earlier
# "Remove the title of the table" cell; consider keeping a single definition.
def remove_sentence(paragraph, sentence):
    """Return ``paragraph`` with every literal occurrence of ``sentence`` removed.

    Each occurrence, together with the whitespace around it, is replaced by a
    single space. Afterwards runs of two or more whitespace characters are
    collapsed to one space and the result is stripped.

    Args:
        paragraph: Text to clean.
        sentence: Exact text to remove. When empty (or ``None``) nothing is
            removed and only the whitespace clean-up is applied.

    Returns:
        The cleaned paragraph.
    """
    # Guard: with an empty sentence the pattern degenerates to r'\s*\s*', which
    # matches at every position and would insert a space between all characters.
    if sentence:
        # Escape special characters so the sentence is matched literally
        escaped_sentence = re.escape(sentence)
        # Remove the sentence and any whitespace directly around it
        paragraph = re.sub(r'\s*{}\s*'.format(escaped_sentence), ' ', paragraph)
    # Clean up any extra spaces created by the removal
    return re.sub(r'\s{2,}', ' ', paragraph).strip()

# Remove the specified sentence from the paragraph
# NOTE: this overwrites the updated_paragraph produced for table 19-1 above.
updated_paragraph = remove_sentence(ex_paragraph, sentence_to_remove)

# Display the updated paragraph
print(updated_paragraph)

998 24378 19960 7958 4538 3214 1204 2000 27303 23277 6641 3492 2899 1127 2005 56290 50957 4145 1796 3443 1890 2006 58710 53268 3797 1586 3519 1923 2007 62518 56965 3617 1430 3524 2029 2008 87562 81282 3941 1520 3916 2364 2009 80407 74674 3835 1361 3633 2100 2010 85218 79489 3685 1220 3677 2052 2011 88419 83011 3427 1023 3565 1843 2012 89859 84695 3354 904 3451 1713 2013 91444 86379 1739 570 3391 1674 201- 94197 89218 1476 457 3414 1565 2015 93426 88773 1329 409 3235 1418 2016 94948 90408 1093 364 3232 1308 2017 95897 91608 943 319 3066 1223 2018 97937 94063 767 280 2719 1155 2019 99544 95691 671 230 2664 1189 2020 103262 99150 1133 227 2759 1353 2021 105434 101374 1209 208 2703 1357 2022 102852 99054 1387 185 2550 1248 Beijing 1182 1109 Tianjin 1127 1064 Hebei 4036 4002 Shanxi 2768 2756 Inner Mongolia 1711 1710 Liaoning 2757 2558 149 Jilin 1444 1432 Heilongjiang 1359 1344 Shanghai 2660 2297 128 267 Jiangsu 7146 5574 188 426 146 Zhejiang 6599 5341 130 128 Anhui 3962 902 Fujian 3426 3162

In [None]:
# import re
# import pandas as pd
# from spellchecker import SpellChecker

def correct_spelling(line, spell_checker):
    """Spell-correct the whitespace-separated words of ``line``.

    Args:
        line: Text to correct.
        spell_checker: Object supporting ``word in spell_checker`` and
            ``spell_checker.correction(word)``.

    Returns:
        The words joined with single spaces. A word is kept unchanged when
        the checker already knows it, when it contains a digit (table values
        such as "1359" or "21589_" must not be rewritten), or when the
        checker has no suggestion for it.
    """
    corrected_words = []
    for word in line.split():
        if any(ch.isdigit() for ch in word) or word in spell_checker:
            corrected_words.append(word)
            continue
        suggestion = spell_checker.correction(word)
        # correction() may return None when it finds no candidate; joining a
        # None would raise TypeError, so fall back to the original word
        corrected_words.append(suggestion if suggestion else word)
    return ' '.join(corrected_words)

# NOTE(review): this two-argument separate_lines shadows the one-argument
# version defined in the earlier "Read from txt file" cell once this cell runs.
def separate_lines(lines, spell_checker):
    """Spell-correct each line, then parse it into ``[label, n1, n2, n3, n4]``.

    A corrected line must start with a text label followed by 3 to 4 numeric
    cells. Labels may contain hyphens, semicolons and dots and parenthesised
    units that hold digits; the previous character class rejected such lines
    and they were silently dropped.

    Args:
        lines: Iterable of text lines, one table row per line.
        spell_checker: Passed through to ``correct_spelling``.

    Returns:
        A list of five-element lists. The numeric cells stay strings; rows
        with three cells are padded with ``None``. Lines that do not match
        are skipped.
    """
    # One label token: a short parenthesised group (may contain digits and
    # spaces, capped at 40 characters) or a word that starts with a letter,
    # underscore or bracket and runs up to the next whitespace or digit.
    label_token = r'(?:\([^()]{0,40}\)[^\s\d]*|[A-Za-z_()][^\s\d]*)'
    # 3 to 4 numeric cells, each optionally followed by "_"
    number_run = r'(?:\s+\d+(?:\.\d+)?_?){3,4}'
    pattern = re.compile(r'\s*({0}(?:\s+{0})*)({1})'.format(label_token, number_run))

    data = []
    for line in lines:
        # Correct the spelling of the line before matching it
        corrected_line = correct_spelling(line, spell_checker)

        # match() anchors at the start of the corrected line
        match = pattern.match(corrected_line)
        if match is None:
            continue

        words = match.group(1).strip()
        numbers = match.group(2).split()

        # Ensure exactly 4 value columns: fill missing numbers with None
        numbers += [None] * (4 - len(numbers))

        data.append([words] + numbers)

    return data

# Specify the input text file name.
# os.path.join uses the platform's separator instead of hard-coded backslashes,
# so the path also works outside Windows and no longer relies on the
# unrecognised "\R" / "\E" escape sequences.
input_txt = os.path.join('Txt_files', 'Real_estate', 'E19_02.txt')

# Create a spell checker instance
spell_checker = SpellChecker()

# Read lines from the text file
with open(input_txt, 'r') as file:
    lines = file.readlines()

# Strip newline characters from each line and drop blank lines
lines = [line.strip() for line in lines if line.strip()]

# Spell-correct and parse the lines
data = separate_lines(lines, spell_checker)

# Create a DataFrame with the specified columns
df = pd.DataFrame(data, columns=['Item', 'Year 1', 'Year 2', 'Year 3', 'Year 4'])

# Specify the output CSV file name
# NOTE(review): the earlier "Read from txt file" cell writes to the same
# 'output.csv', so whichever cell runs last overwrites the other's result.
output_csv = 'output.csv'

# Write the DataFrame to the CSV file
df.to_csv(output_csv, index=False)

print(f"The DataFrame has been saved to {output_csv}")

# Display the DataFrame
print(df)
