## Imports

In [201]:
import pandas as pd
import tabula
import configparser
import os


In [202]:
# Working directory of the kernel; the relative config and data paths below are joined onto it.
cwd = os.getcwd()

## Config

In [203]:
# Load the project configuration from config.ini, located relative to the
# current working directory.
config_path = os.path.join(cwd, '../docs/config.ini')
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed, so an empty result means the config was not found.
# Fail here rather than with a NoSectionError in a later cell.
loaded_config_files = config.read(config_path)
if not loaded_config_files:
    raise FileNotFoundError(f"Configuration file not found: {config_path}")
# Last expression: display which file(s) were loaded.
loaded_config_files

['c:\\Users\\tasco\\OneDrive\\Python\\Projects\\financialstatements\\financialstatements\\notebooks\\../docs/config.ini']

## Reading Data

In [204]:
# Look up the paystub folder configured under [data_inputs_directory] and
# resolve it against the working directory.
paystub_subdir = config.get("data_inputs_directory", "PAYSTUB_DIRECTORY")
PAYSTUB_DIRECTORY = os.path.join(cwd, paystub_subdir)

In [205]:
# os.listdir() returns every directory entry in arbitrary order. Keep only the
# PDF files (the only thing tabula can read) and sort them so that re-running
# the notebook processes the paystubs in the same order every time.
paystubs_files = sorted(
    name for name in os.listdir(PAYSTUB_DIRECTORY)
    if name.lower().endswith('.pdf')
)

In [206]:
# Full path for every entry listed from the paystub directory.
paystub_paths = [
    os.path.join(PAYSTUB_DIRECTORY, file_name) for file_name in paystubs_files
]

In [207]:
# Page region handed to tabula.read_pdf via `area`
# (top, left, bottom, right according to the tabula-py documentation).
PAYSTUB_TABLE_AREA = [396, 36, 756, 612]

# One DataFrame per paystub PDF, each tagged with the file it came from.
dfs = []
for path in paystub_paths:
    # guess=False: do not let tabula guess the table region; use the area above.
    tables = tabula.read_pdf(path, pages='all', area=PAYSTUB_TABLE_AREA, guess=False)
    # read_pdf returns one table per match; merge them into a single frame.
    df = pd.concat(tables).reset_index(drop=True)
    # Tag rows with the source file name. basename() works for names of any
    # length, unlike slicing a fixed number of characters off the end of the path.
    df['File'] = os.path.basename(path)
    dfs.append(df)


In [208]:
# Stack the per-file frames into one frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

## ETL

In [209]:
def find_deductions(df):
    """Pull the deduction rows out of the combined paystub table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'Unnamed: 4', 'Unnamed: 5' and 'File'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Account', 'Amount' and 'File'. The first row of the input and
        any row with a missing value are dropped, as are rows whose Account is
        'Total', 'DEDUCTIONS' or 'CURRENT'. The index is renumbered before that
        last filter, so the result's index may contain gaps.
    """
    source_columns = ['Unnamed: 4', 'Unnamed: 5', 'File']
    new_labels = {'Unnamed: 4': 'Account', 'Unnamed: 5': 'Amount'}

    line_items = df[source_columns].rename(columns=new_labels)
    # Skip the first row, then discard incomplete rows and renumber.
    populated = line_items[1:].dropna().reset_index(drop=True)
    # Remove header and total rows that are not real accounts.
    return populated.query(
        "Account != 'Total' and Account != 'DEDUCTIONS' and Account != 'CURRENT'"
    )

In [210]:
def find_earnings(df):
    """Pull the rows of the left-hand paystub columns out of the combined table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'Unnamed: 0', 'Unnamed: 3' and 'File'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Account', 'Amount' and 'File'. The first row of the input and
        any row with a missing value are dropped, as are rows whose Account is
        'Total', 'TAX' or 'EARNINGS'. The index is renumbered before that last
        filter, so the result's index may contain gaps.

    Notes
    -----
    Only the section-header rows are filtered out, so individual tax rows that
    share these columns (e.g. 'FEDERAL TAX') remain in the result.
    """
    source_columns = ['Unnamed: 0', 'Unnamed: 3', 'File']
    new_labels = {'Unnamed: 0': 'Account', 'Unnamed: 3': 'Amount'}

    line_items = df[source_columns].rename(columns=new_labels)
    # Skip the first row, then discard incomplete rows and renumber.
    populated = line_items[1:].dropna().reset_index(drop=True)
    # Remove header and total rows that are not real accounts.
    return populated.query(
        "Account != 'Total' and Account != 'TAX' and Account != 'EARNINGS'"
    )

In [211]:
# Deduction rows (Account / Amount / File) taken from the combined paystub table.
deductions_df = find_deductions(df)

In [212]:
# Rows from the left-hand paystub columns (Account / Amount / File).
earnings_df = find_earnings(df)
# Last expression: display the frame as the cell output.
earnings_df

Unnamed: 0,Account,Amount,File
0,REGULAR,"$2,746.16",01_paystub_jan.pdf
1,CO STK,$10.50,01_paystub_jan.pdf
4,FEDERAL TAX,$287.29,01_paystub_jan.pdf
5,SOCIAL SECURITY,$164.85,01_paystub_jan.pdf
6,MEDICARE,$38.55,01_paystub_jan.pdf
7,ARKANSAS,$101.80,01_paystub_jan.pdf
10,REGULAR,"$2,746.16",01_paystub_jan.pdf
11,CO STK,$10.50,01_paystub_jan.pdf
14,FEDERAL TAX,$287.65,01_paystub_jan.pdf
15,SOCIAL SECURITY,$164.96,01_paystub_jan.pdf


In [213]:
# Combine deductions and earnings into one long table with a fresh 0..n-1 index.
df = pd.concat([deductions_df, earnings_df], ignore_index=True)

In [214]:
# File names look like '01_paystub_jan.pdf': the three characters just before
# the '.pdf' extension are the month abbreviation ('jan').
# A slice whose start comes after its stop (such as [-5:-7]) is always empty,
# so the bounds must run from -7 up to -4.
df['Month'] = df['File'].str[-7:-4]