In [5]:
import re
import pandas as pd
import os
from datetime import datetime
import yfinance as yfi
from tqdm import tqdm
import pickle
# Function to extract date and name from the filename
def extract_date_and_name(filename):
    # match = re.match(r'(\d{8})_([A-Za-z]+)', filename)
    match = re.match(r'(\d{8})(.*?)(?=\d|$)', filename)
    if match:
        date, name = match.groups()
        modified_name = re.sub(r'\s+', '', name)  # Remove all spaces
        modified_name = re.sub(r'\.', '_', modified_name) 
        return date, name
    return None, None

directory = '../../data_txt/pdftotext_upright_pages/'

In [21]:
date_pattern = re.compile(r'\b(0?[1-9]|1[0-2])/(0?[1-9]|[12][0-9]|3[01])/([0-9]{4})\b')
transaction_pattern = re.compile(r'(?:^|[\t ])(s|P|S|p|E)[\t ]+(?:\d+|\(partial\))')
amount_range_pattern = re.compile(r'1 -')
amount_number_pattern = re.compile(r'\$(\d+\.?\d*) -')
additional_line_amount_pattern = re.compile(r'\$(\.\d+|\d+\.\d+|\d+)')
token_pattern = re.compile(r'\(([A-Za-z]+)\)')
additional_line_pattern = re.compile(r'\$')

# checked_tickers = set()
with open('checked_tickers.pkl', 'rb') as file:
    checked_tickers = pickle.load(file)
    
with open('valid_tickers.pkl', 'rb') as file:
    valid_tickers = pickle.load(file)

def is_normal_pattern(line):
    statement = date_pattern.search(line) and transaction_pattern.search(line) \
        and amount_range_pattern.search(line)
    return statement

def is_additional_pattern(line):
    statement_additional_line = date_pattern.search(line) and transaction_pattern.search(line) \
        and (not amount_range_pattern.search(line)) and additional_line_pattern.search(line)
    return statement_additional_line

def accept_line(line):
    return is_normal_pattern(line) or is_additional_pattern(line)
    # return is_normal_pattern(line)

def find_token(line):
    token = token_pattern.search(line)
    if (not token is None):
        if (is_ticker_valid(token.group(1))):
            return token.group(1)
    return None

def is_ticker_valid(ticker):
    if ticker in checked_tickers:
        return ticker in valid_tickers
    else:
        checked_tickers.add(ticker)
        res = not (yfi.Ticker(ticker).history(period='max').size == 0)
        if res:
            valid_tickers.add(ticker)
        return res

def find_token_by_index(lines, line_ind, index):
    next_line_ind = line_ind + index
    if next_line_ind < len(lines):
        token = find_token(lines[next_line_ind])
        if (not token is None):
            return token
    return None

amounts_set = set()
transaction_type_set = set()
token_set = set()
count = 0
normal_count = 0
additional_line_count = 0
token_count = 0
rows = []

for filename in tqdm(os.listdir(directory)):
    # Check if the file has a .md extension
    if filename.endswith('.txt'):
        date, name = extract_date_and_name(filename)

        # Read the file
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            file_content=file.read()

        # Split the file into lines
        lines = file_content.strip().split('\n')

        # Iterate over the lines
        for line_ind, line in enumerate(lines):
            # Filter out lines that contain the pattern
            if accept_line(line):
                count += 1
                # Recover transaction date
                dates = date_pattern.findall(line)
                if len(dates) != 2:
                    transaction_date = dates[0]
                else:
                    transaction_date = dates[1]
                transaction_date = transaction_date[2] + transaction_date[0] \
                    + transaction_date[1]

                # Recover amount
                if (is_normal_pattern(line)):
                    amount = amount_number_pattern.search(line.replace(',', '')).group(1)
                    amounts_set.add(amount)
                    normal_count += 1
                else:
                    try:
                        amount = additional_line_amount_pattern.search(line.replace(',', '')).group(1)
                        additional_line_count += 1
                    except:
                        amount = 'N/A'
                        print(line)
                # Recover transaction type
                transaction_type = transaction_pattern.search(line).group(1)
                if (transaction_type =='s'):
                    transaction_type = 'S'
                elif (transaction_type =='p'):
                    transaction_type = 'P'
                transaction_type_set.add(transaction_type)

                # Recover residual of the line:
                name_residual = line[:transaction_pattern.search(line).start()].strip()
                
                # Recover token
                token = find_token(name_residual)
                current_index = 1
                transaction_name = name_residual
                N_lines = len(lines)
                if (token is None):
                    # Iterate over lines until we don't find ticker, or hit bounding 
                    # criteria
                    continue_search = True
                    while((not accept_line(lines[line_ind + current_index])) \
                          and (current_index < min(10, N_lines - line_ind)) and continue_search):
                        token = find_token_by_index(lines, line_ind, current_index)
                        if (not token is None):
                            continue_search = False
                        current_index += 1
                if (not token is None):
                    token_count += 1
                    for i in range(current_index):
                        transaction_name += ' ' + lines[line_ind + i + 1]
                else:
                    token = 'N/A'
                    
                # Save the transaction
                rows.append({
                    'Stock_name': transaction_name,
                    'Ticker': token,
                    'Type': transaction_type,
                    'Date': transaction_date,
                    'Min_amount': amount,
                    'Trader_name': name,
                    'Document_date': date
                })

df = pd.DataFrame(rows)
print(normal_count)
print(additional_line_count)
print(normal_count + additional_line_count)
print(count)

# Save checked tickers
with open('valid_tickers.pkl', 'wb') as f:
    pickle.dump(valid_tickers, f)
with open('checked_tickers.pkl', 'wb') as f:
    pickle.dump(checked_tickers, f)

100%|██████████| 4882/4882 [00:06<00:00, 766.27it/s]

43824
489
44313
44313





In [3]:
df[df['Ticker'] == 'N/A']

Unnamed: 0,Stock_name,Ticker,Type,Date,Min_amount,Trader_name,Document_date
3,JT DE WITT MICH PUB SCHS,,P,2017051,15001,DebbieDingell,20170504
7,JT Avaya Holdings Corp. (AVYA) [ST],,P,20201201,1001,SusieLee,20201208
8,JT Ball Corporation (BLL) [ST],,S,20201101,1001,SusieLee,20201208
16,JT Clarivate Plc Ordinary Shares,,P,20201201,1001,SusieLee,20201208
22,JT Duke Realty Corporation (DRE),,P,20201101,1001,SusieLee,20201208
...,...,...,...,...,...,...,...
43819,JT auSTIN TX WTR & WST 4% DuE,,S,20150611,250001,SuzanKDelBene,20150712
43820,JT gEORgIa ST 5% DuE 02/01/21,,P,20150610,500001,SuzanKDelBene,20150712
43821,JT JP MORgaN CHaSE 5.55% DuE,,P,2015068,15001,SuzanKDelBene,20150712
43822,JT MORgaN STaNlEY 4.75% DuE,,P,20150630,15001,SuzanKDelBene,20150712


In [35]:
df.head(60)

Unnamed: 0,Stock_name,Ticker,Type,Date,Min_amount,Trader_name,Document_date
0,"Home Depot, Inc. (HD) [ST] ...",HD,P,20210405,1001,LloydDoggett,20210407
1,International Business Machines ...,IBM,P,20210405,1001,LloydDoggett,20210407
2,"PPg Industries, Inc. (PPg) [ST] ...",PPg,P,20210405,1001,LloydDoggett,20210407
3,JT DE WITT MICH PUB SCHS,,P,2017051,15001,DebbieDingell,20170504
4,JT AECOM (ACM) [ST] ...,ACM,S,20201201,1001,SusieLee,20201208
5,JT Aptiv PLC Ordinary Shares (APTV) ...,APTV,P,20201101,1001,SusieLee,20201208
6,JT Aptiv PLC Ordinary Shares (APTV) ...,APTV,P,20201101,1001,SusieLee,20201208
7,JT Avaya Holdings Corp. (AVYA) [ST],,P,20201201,1001,SusieLee,20201208
8,JT Ball Corporation (BLL) [ST],,S,20201101,1001,SusieLee,20201208
9,"JT Bio-Rad Laboratories, Inc. Class A ...",BIO,S,20201101,1001,SusieLee,20201208
