## Imports

In [120]:
import pandas as pd
import numpy as np
import tabula
import configparser
import os


In [121]:
cwd = os.getcwd()

In [122]:
pd.set_option('display.max_rows', None)

## Config

In [123]:
#Creating configuration
config_path = os.path.join(cwd, '../docs/config.ini')
config = configparser.ConfigParser()
config.read(config_path)

['c:\\Users\\tasco\\OneDrive\\Python\\Projects\\financialstatements\\financialstatements\\notebooks\\../docs/config.ini']

## Reading Data

### Paystubs

In [124]:
PAYSTUB_DIRECTORY = os.path.join(cwd, config.get("data_inputs_directory", "PAYSTUB_DIRECTORY"))

In [125]:
paystubs_files = os.listdir(PAYSTUB_DIRECTORY)

In [126]:
paystub_paths = []
for file in paystubs_files:
    path = os.path.join(PAYSTUB_DIRECTORY, file)
    paystub_paths.append(path)

In [127]:
dfs = []
for path in paystub_paths:
    tables = tabula.read_pdf(path, pages = 'all', area = [396, 36, 756, 612], guess = False)
    df = pd.concat(tables).reset_index(drop=True)
    df['Date'] = path[-14:-4]
    dfs.append(df)


In [128]:
df = pd.concat(dfs).reset_index(drop=True)

### Table Data

In [129]:
COA_DATA = config.get('table_files', 'COA_DATA')

In [130]:
coa_purch_df = pd.read_excel(COA_DATA, sheet_name='coa_paystub_link_table')

## ETL

In [131]:
def find_deductions(df):
    df = df[['Unnamed: 4', 'Unnamed: 5', 'Date']]
    df = df.rename(columns = {'Unnamed: 4': 'Item', 'Unnamed: 5':'Amount'})
    df = df[1:]
    df = df.dropna().reset_index(drop = True)
    df = df.query("Item != 'Total' and Item != 'DEDUCTIONS' and Item != 'CURRENT'")
    return df

In [132]:
def find_earnings(df):
    df = df[['Unnamed: 0', 'Unnamed: 3', 'Date']]
    df = df.rename(columns = {'Unnamed: 0': 'Item', 'Unnamed: 3':'Amount'})
    df = df[1:]
    df = df.dropna().reset_index(drop = True)
    df = df.query("Item != 'Total' and Item != 'TAX' and Item != 'EARNINGS'")
    return df

In [133]:
deductions_df = find_deductions(df)

In [134]:
earnings_df = find_earnings(df)

In [135]:
df = pd.concat([deductions_df, earnings_df]).reset_index(drop=True)

In [136]:
df = pd.merge(df, 
              coa_purch_df,
                on = 'Item',
                how = 'left')

In [137]:
df = df.rename(columns = {'Item': 'Description'})

In [138]:
df = df[['Date', 'GL_Code', 'Account', 'Description', 'Amount', 'Category', 'Account_Type']]

In [139]:
df['Date'] = pd.to_datetime(df['Date'])
df['Amount'] = df['Amount'].str.replace('$', '')

  df['Amount'] = df['Amount'].str.replace('$', '')


In [140]:
df = df.sort_values(by = 'Date').reset_index(drop = True)

In [142]:
df['DEBIT'] = np.where(df['Account_Type'] == 'Asset', df['Amount'], np.nan)
df['DEBIT'] = np.where(df['Account_Type'] == 'Deduction', df['Amount'], np.nan)
df['CREDIT'] = np.where(df['Account_Type'] == 'Revenue', df['Amount'],np.nan)

In [143]:
df

Unnamed: 0,Date,GL_Code,Account,Description,Amount,Category,Account_Type,DEBIT,CREDIT
0,2023-01-05,500101.0,Medical Insurance,INS MED U *,35.0,Insurance,Deduction,35.0,
1,2023-01-05,500204.0,State Income Tax,ARKANSAS,102.0,Taxes,Deduction,102.0,
2,2023-01-05,500201.0,Federal Income Tax,FEDERAL TAX,287.65,Taxes,Deduction,287.65,
3,2023-01-05,400103.0,Walmart Stock Match,CO STK,10.5,Income,Revenue,,10.5
4,2023-01-05,400101.0,Regular Earnings - Walmart Salary,REGULAR,2746.16,Income,Revenue,,2746.16
5,2023-01-05,500203.0,Medicare Tax,MEDICARE,38.58,Taxes,Deduction,38.58,
6,2023-01-05,500202.0,Social Security Tax,SOCIAL SECURITY,164.96,Taxes,Deduction,164.96,
7,2023-01-05,100305.0,Walmart Roth 401k - Merrill,ROTH 401K,274.62,Long Term Investments,Asset,,
8,2023-01-05,500102.0,Dental Insurance,INS DEN U *,8.3,Insurance,Deduction,8.3,
9,2023-01-05,100202.0,Associate Stock Purchase Plan - Computer Share,STOCK PURCH,70.0,Short Term Investments,Asset,,
