In [1]:
import os
import pandas as pd
import json
import re
from pathlib import Path

pd.set_option('max_colwidth', 400)

In [6]:
df_reg = pd.read_csv('csv/regulations.csv')
# df_reg.head()

In [7]:
df = pd.read_csv('csv/data_file.csv')
# df.head()
# print(len(df))

In [8]:
df2 = pd.read_csv('csv/data_detail.csv')
# df2.head()

In [9]:
duplicate_titles = df2[df2.duplicated(subset='title', keep=False)]
# duplicate_titles

In [10]:
df3 = df.merge(df2, how='left', on='title')

In [11]:
df3['type_of_regulation'].value_counts()

type_of_regulation
Bank Indonesia Circular Letters                391
Bank Indonesia Regulation                      339
Member Of The Board Of Governors Regulation     27
Name: count, dtype: int64

In [166]:
# df3.loc[df3['type_of_regulation'] == 'Bank Indonesia Circular Letters'].tail()

In [57]:
# df3.loc[df3['file_name'] == "Appendix"]

# Standardizing File Names

In [12]:
# Function to convert type_of_regulation to its abbreviation
def format_type_of_regulation(regulation_type):
    mapping = {
        "Bank Indonesia Regulation": "pbi",
        "Member Of The Board Of Governors Regulation": "padg",
        "Bank Indonesia Circular Letters": "sebi"
    }
    return mapping.get(regulation_type, "unknown")

# Function to extract the number from title and type_of_regulation
def format_number(title, type_of_regulation):
    # Check for pattern a/b/c/d (three slashes)
    match = re.search(r'\d+/\d+/\w+/\d+', title)
    if match:
        return format_title(match.group(0))
    
    # Check for pattern x/y/z (two slashes)
    match = re.search(r'\d+/\d+/\w+', title)
    if match:
        return format_title(match.group(0))

    # Combined regex pattern for all types
    match = re.search(r'(NUMBER[:.]?|NO\.?|NOMOR|NR\.?) ?(\d+(?:/\d+)*(/\w+)*/\d+|\d+)', title, re.IGNORECASE)
    
    if match:
        number = match.group(2)
        return format_title(number)
    
    # Special case for 'Bank Indonesia Regulation' with additional year handling
    if type_of_regulation == 'Bank Indonesia Regulation':
        match_number = re.search(r'NUMBER (\d+(?:/\d+)*(/\w+)*/\d+|\d+) (?:OF )?(\d+)?', title, re.IGNORECASE)
        if match_number:
            if match_number.group(3):
                number = match_number.group(1) + match_number.group(3)
            else:
                number = match_number.group(1)
            return format_title(number)
    
    return "unknown"

# Function to format date
def format_date(date_str):
    months = {
        "January": "01", "February": "02", "March": "03",
        "April": "04", "May": "05", "June": "06",
        "July": "07", "August": "08", "September": "09",
        "October": "10", "November": "11", "December": "12"
    }
    day, month, year = date_str.split()
    return f"{int(day):02d}{months[month]}{year}"

# Function to format title
def format_title(title):
    formatted_title = title.lower().replace(' ', '_').replace('/', '_').replace('-', '_')
    formatted_title = re.sub(r'[^a-z0-9_]', '', formatted_title)
    return formatted_title[:250]

# Function to generate the standardized file name
def generate_standardized_file_name(row):
    regulation_type = format_type_of_regulation(row['type_of_regulation'])
    number = format_number(row['title'], row['type_of_regulation'])
    date = format_date(row['date'])
    title = format_title(row['title'])
    extension = row['file_link'].split('.')[-1]
    return f"{regulation_type}-{number}-{date}-{title}.{extension}"

df3['standardized_file_name'] = df3.apply(generate_standardized_file_name, axis=1)

In [13]:
unknown_rows = df3[df3['standardized_file_name'].str.contains("unknown", na=False)]
unknown_rows.loc[unknown_rows['type_of_regulation'] == 'Member Of The Board Of Governors Regulation']['title']

Series([], Name: title, dtype: object)

In [18]:
df3_dict = df3.to_dict('records')
# with open('files_metadata.json', 'w') as file:
#     json.dump(df3_dict, file)
# df3.to_csv('csv/data_final.csv', index=False)

In [19]:
df3

Unnamed: 0,title,file_name,file_link,date,type_of_regulation,sector,standardized_file_name
0,BANK INDONESIA REGULATION NUMBER 12 OF 2023 ON ISSUANCE OF MONEY MARKET INSTRUMENTS AND MONEY MARKET TRANSACTIONS,Bank Indonesia Regulation Number 12 of 2023.pdf,https://www.bi.go.id/en/publikasi/peraturan/Documents/PBI_122023_EN.pdf,16 November 2023,Bank Indonesia Regulation,Monetary,pbi-12-16112023-bank_indonesia_regulation_number_12_of_2023_on_issuance_of_money_market_instruments_and_money_market_transactions.pdf
1,BANK INDONESIA REGULATION NUMBER 6 OF 2023 ON MONEY MARKET AND FOREIGN EXCHANGE MARKET,Bank Indonesia Regulation Number 6 of 2023.pdf,https://www.bi.go.id/en/publikasi/peraturan/Documents/PBI_062023_EN.pdf,27 June 2023,Bank Indonesia Regulation,Monetary,pbi-6-27062023-bank_indonesia_regulation_number_6_of_2023_on_money_market_and_foreign_exchange_market.pdf
2,BANK INDONESIA REGULATION NUMBER 4 OF 2023 ON SHORT-TERM LIQUIDITY ASSISTANCE FOR CONVENTIONAL COMMERCIAL BANKS,Bank Indonesia Regulation Number 4 of 2023.pdf,https://www.bi.go.id/en/publikasi/peraturan/Documents/PBI_042023_EN.pdf,27 June 2023,Bank Indonesia Regulation,Macroprudential,pbi-4-27062023-bank_indonesia_regulation_number_4_of_2023_on_short_term_liquidity_assistance_for_conventional_commercial_banks.pdf
3,BANK INDONESIA REGULATION NUMBER 24/2/PBI/2022 ON TRANSACTIONS BETWEEN BANKS AND BANK INDONESIA TO SUPPORT LOCAL CURRENCY SETTLEMENT,Bank Indonesia Regulation Number 24/2/PBI/2022.pdf,https://www.bi.go.id/en/publikasi/peraturan/Documents/PBI_240222_EN.pdf,28 January 2023,Bank Indonesia Regulation,Monetary,pbi-24_2_pbi_2022-28012023-bank_indonesia_regulation_number_24_2_pbi_2022_on_transactions_between_banks_and_bank_indonesia_to_support_local_currency_settlement.pdf
4,BANK INDONESIA REGULATION NUMBER 24/20/PBI/2022 ON SHARIA HEDGE SWAP TRANSACTIONS TO BANK INDONESIA,Bank Indonesia Regulation Number 24/20/PBI/2022.pdf,https://www.bi.go.id/en/publikasi/peraturan/Documents/PBI_242022_EN.pdf,30 December 2022,Bank Indonesia Regulation,Monetary,pbi-24_20_pbi_2022-30122022-bank_indonesia_regulation_number_24_20_pbi_2022_on_sharia_hedge_swap_transactions_to_bank_indonesia.pdf
...,...,...,...,...,...,...,...
752,CIRCULAR LETTER NO.6/3/DPM REQUIREMENTS AND PROCEDURE FOR APPOINTMENT OF SUB-REGISTRIES FOR SECURITIES ADMINISTRATION,Requirements and Procedure for Appointment of Sub-Registries for Securities Administration,https://www.bi.go.id/en/publikasi/peraturan/Documents/909648987c3a420eab3b63cacf7a6866SE_63DPM_engl.pdf,16 February 2004,Bank Indonesia Circular Letters,Monetary,sebi-6_3_dpm-16022004-circular_letter_no6_3_dpm_requirements_and_procedure_for_appointment_of_sub_registries_for_securities_administration.pdf
753,CIRCULAR LETTER NR. 6/2/DPM USER FEES FOR THE BANK INDONESIA - SCRIPLESS SECURITIES SETTLEMENT SYSTEM AND APPENDIX,User Fees for the Bank Indonesia - Scripless Securities Settlement System,https://www.bi.go.id/en/publikasi/peraturan/Documents/2e82d5a22d044aa2b05a68a523430ee5se_62DPM_eng.pdf,16 February 2004,Bank Indonesia Regulation,Monetary,pbi-6_2_dpm-16022004-circular_letter_nr_6_2_dpm_user_fees_for_the_bank_indonesia___scripless_securities_settlement_system_and_appendix.pdf
754,CIRCULAR LETTER NR. 6/2/DPM USER FEES FOR THE BANK INDONESIA - SCRIPLESS SECURITIES SETTLEMENT SYSTEM AND APPENDIX,Appendix,https://www.bi.go.id/en/publikasi/peraturan/Documents/03b449c5450c4cecb7e9f7054007be7aApdx_se_62DPM_eng.pdf,16 February 2004,Bank Indonesia Regulation,Monetary,pbi-6_2_dpm-16022004-circular_letter_nr_6_2_dpm_user_fees_for_the_bank_indonesia___scripless_securities_settlement_system_and_appendix.pdf
755,"BANK INDONESIA REGULATION NR. 6/3/PBI/2004 ISSUANCE, SALE AND PURCHASE, AND ADMINISTRATION OF GOVERNMENT SECURITIES","Issuance, Sale And Purchase, And Administration Of Government Securities",https://www.bi.go.id/en/publikasi/peraturan/Documents/1146549b80614fb484861f5277018de4PBI6304eng.pdf,16 February 2004,Bank Indonesia Regulation,Monetary,pbi-6_3_pbi_2004-16022004-bank_indonesia_regulation_nr_6_3_pbi_2004_issuance_sale_and_purchase_and_administration_of_government_securities.pdf


# OS Stuff

In [10]:
_, _, files = next(os.walk("files/"))
file_count = len(files)
print(file_count)

757


In [11]:
_, _, files = next(os.walk("extracted_files/"))
file_count = len(files)
print(file_count)

238


In [20]:
def get_folder_size(folder):
    return ByteSize(sum(file.stat().st_size for file in Path(folder).rglob('*')))


class ByteSize(int):

    _KB = 1024
    _suffixes = 'B', 'KB', 'MB', 'GB', 'PB'

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls, *args, **kwargs)

    def __init__(self, *args, **kwargs):
        self.bytes = self.B = int(self)
        self.kilobytes = self.KB = self / self._KB**1
        self.megabytes = self.MB = self / self._KB**2
        self.gigabytes = self.GB = self / self._KB**3
        self.petabytes = self.PB = self / self._KB**4
        *suffixes, last = self._suffixes
        suffix = next((
            suffix
            for suffix in suffixes
            if 1 < getattr(self, suffix) < self._KB
        ), last)
        self.readable = suffix, getattr(self, suffix)

        super().__init__()

    def __str__(self):
        return self.__format__('.2f')

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, super().__repr__())

    def __format__(self, format_spec):
        suffix, val = self.readable
        return '{val:{fmt}} {suf}'.format(val=val, fmt=format_spec, suf=suffix)

    def __sub__(self, other):
        return self.__class__(super().__sub__(other))

    def __add__(self, other):
        return self.__class__(super().__add__(other))
    
    def __mul__(self, other):
        return self.__class__(super().__mul__(other))

    def __rsub__(self, other):
        return self.__class__(super().__sub__(other))

    def __radd__(self, other):
        return self.__class__(super().__add__(other))
    
    def __rmul__(self, other):
        return self.__class__(super().__rmul__(other))   
    
files_size = get_folder_size("files")
extracted_files_size = get_folder_size("extracted_files")
print(files_size + extracted_files_size)

182.27 MB
