In [None]:
import pandas as pd
from scipy import stats

df = pd.read_csv('../data/stmt.csv')


def _to_number(column):
    """Return `column` as a numeric Series, stripping ',' thousands separators.

    A column that is already numeric is returned unchanged: it has no `.str`
    accessor, so running the text cleanup on it would raise AttributeError.
    Text values that are still not numeric after the commas are removed make
    pd.to_numeric raise (its default errors='raise' behaviour).
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return pd.to_numeric(column.str.replace(',', '', regex=False))


# Convert to numeric
df['Amount'] = _to_number(df['Amount'])
df['Running Bal.'] = _to_number(df['Running Bal.'])

# Convert to datetime; values that cannot be parsed become NaT (errors='coerce')
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

print(df.head())

print(df.dtypes)


In [None]:
# Calculate descriptive statistics for the 'Amount' column
amount_col = df['Amount']
print(amount_col.describe(include='all'))

# Skewness and kurtosis, both computed on the non-missing amounts only
observed_amounts = amount_col.dropna()
print(stats.skew(observed_amounts))
# fisher=False selects Pearson's definition; use fisher=True for excess kurtosis
print(stats.kurtosis(observed_amounts, fisher=False))


In [None]:
# Downsample dataset to Monthly Cashflow
# Set date as index so the frame can be resampled by calendar period.
# The guard keeps this cell re-runnable: once 'Date' has become the index it is
# no longer a column, and calling set_index('Date') a second time raises KeyError.
if 'Date' in df.columns:
    df = df.set_index('Date')

# Resample by month ('ME' = month-end bins) and sum the cashflow
monthly_cashflow = df['Amount'].resample('ME').sum()
print('\n////////////////////\nMonthly Cashflow Sum (12-month)\n////////////////////')
print(monthly_cashflow)

# Same aggregation with weekly bins
weekly_cashflow = df['Amount'].resample('W').sum()
print('\n////////////////////\nWeekly Cashflow Sum (12-month)\n////////////////////')
print(weekly_cashflow)

# Mean of the running balance within each month-end bin
avg_monthly_bal = df['Running Bal.'].resample('ME').mean()
print('\n////////////////////\nMonthly Average Balance (12-month)\n////////////////////')
print(avg_monthly_bal)

In [None]:
# Line chart of the weekly cashflow totals
import matplotlib.pyplot as plt

# Series.plot returns the Axes it drew on; decorate that Axes directly
ax = weekly_cashflow.plot(figsize=(10,5))
ax.set_title('Time Series Plot of Weekly Cashflow')
ax.set_xlabel('Date')
ax.set_ylabel('Value ($)')
ax.grid(True)
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Plot cashflow and running balance as two lines against the frame's index
# (the x-axis is labelled 'Date', so df is expected to be indexed by date here)
df[['Amount', 'Running Bal.']].plot(figsize=(10,5))
plt.title('Time Series Plot of Cashflow and Running Balance')
plt.xlabel('Date')
# Axis label previously read 'Vale ($)'; corrected to match the weekly cashflow plot
plt.ylabel('Value ($)')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Probability Density Function (PDF) from daily cashflow data

# 1. Kernel Density Estimation (KDE)
import seaborn as sns

# Create the figure and its single Axes up front, then draw the filled KDE on it
fig, ax = plt.subplots(figsize=(10,5))
sns.kdeplot(df['Amount'], fill=True, ax=ax)
ax.set_title('Estimated PDF of Daily Cashflow')
ax.set_xlabel('Cashflow ($)')
ax.set_ylabel('Density')
ax.grid(True)
plt.show()

In [None]:
# 2. Fit a distribution (Normal)
import numpy as np
from scipy.stats import norm

# Drop missing amounts once and use the same series for the fit, the x-range and
# the histogram. norm.fit rejects non-finite data and the histogram cannot derive
# a bin range from NaN, so passing the raw column fails as soon as one amount is
# missing. The skewness/kurtosis calculation drops NaN the same way.
amounts = df['Amount'].dropna()

# Fit normal distribution (maximum-likelihood estimates of mean and std)
mu, std = norm.fit(amounts)

# Evaluate the fitted PDF on 100 evenly spaced points spanning the observed range
x = np.linspace(amounts.min(), amounts.max(), 100)
pdf = norm.pdf(x, mu, std)

# Plot histogram and fitted PDF; density=True puts the histogram on the PDF's scale
plt.figure(figsize=(10, 5))
plt.hist(amounts, bins=20, density=True, alpha=0.5, label='Histogram')
plt.plot(x, pdf, 'r-', label='Fitted Normal PDF')
plt.title(f'Fitted Normal Distribution: mu={mu:.2f}, sigma={std:.2f}')
plt.xlabel('Cashflow')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

