In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from utils import get_stmt_data

# Load the statement data through the project helper imported above.
df = get_stmt_data('../data/stmt.csv')

# Index by date so the frame can be resampled over time.
# NOTE(review): resample() needs a datetime-like index; assumes get_stmt_data
# already parses 'Date' as datetimes -- confirm.
df.set_index('Date', inplace=True)

# 'ME' is the month-end frequency, so each value is the total of 'Amount'
# for one calendar month (months without rows sum to 0).
monthly_cashflow = df['Amount'].resample('ME').sum()
print('\n////////////////////\nMonthly Cashflow Sum (12-month)\n////////////////////')
print(monthly_cashflow)

# Plot monthly cashflow (pyplot is already imported at the top of this cell)
monthly_cashflow.plot(figsize=(10,5))
plt.title('Time Series Plot of Monthly Cashflow')
plt.xlabel('Date')
plt.ylabel('Value ($)')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Fit Distribution
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

# Fit a normal distribution; for `norm`, fit() returns (loc, scale),
# i.e. the estimated mean and standard deviation.
mu, std = stats.norm.fit(monthly_cashflow)

# Fit gamma (parameters are stored but not used further in this cell)
params_gamma = stats.gamma.fit(monthly_cashflow)

# Fit lognormal (parameters are stored but not used further in this cell)
params_lognorm = stats.lognorm.fit(monthly_cashflow)

# Fit exponential (parameters are stored but not used further in this cell)
params_exp = stats.expon.fit(monthly_cashflow)

# Evaluate the fitted normal PDF on an even grid spanning the observed range.
x = np.linspace(monthly_cashflow.min(), monthly_cashflow.max(), 100)
pdf = stats.norm.pdf(x, mu, std)

# density=True normalises the histogram so it shares a scale with the PDF curve.
plt.figure(figsize=(10,5))
plt.hist(monthly_cashflow, bins=15, density=True, alpha=0.5, label='Monthly Cashflow histogram (Normal)')
plt.plot(x, pdf, 'r-', label=f'Normal Fit\nmu={mu:.2f}, sigma={std:.2f}')
plt.title('Fitted Normal Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Kernel Density Estimation (KDE)
import seaborn as sns

# Draw a filled kernel density estimate of the monthly totals on an
# explicitly created axes, then label it through the axes object.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(monthly_cashflow, fill=True, ax=ax)
ax.set_title('Estimated PDF of Monthly Cashflow (KDE)')
ax.set_xlabel('Cashflow ($)')
ax.set_ylabel('Density')
ax.grid(True)
plt.show()

In [None]:
# Compare multiple distributions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

# Candidate families to fit against the monthly cashflow series.
distributions = [stats.norm, stats.expon, stats.gamma, stats.lognorm]
results = {}

for dist in distributions:
    # Fit the family's parameters, then score the fit by the summed log-density
    # of the observations under the fitted parameters.
    params = dist.fit(monthly_cashflow)
    log_likelihood = np.sum(dist.logpdf(monthly_cashflow, *params))
    results[dist.name] = {"params": params, "log_likelihood": log_likelihood}

# Sort by best (highest log-likelihood)
# NOTE(review): raw log-likelihood does not penalise the number of fitted
# parameters; consider AIC/BIC if the families should be compared fairly.
results = dict(
    sorted(results.items(), key=lambda item: item[1]['log_likelihood'], reverse=True)
)

# Use a distinct loop variable so the `results` dict is not rebound to one of
# its own entries and remains usable after this cell has run.
for name, fit_result in results.items():
    print(f"{name}: log-likelihood = {fit_result['log_likelihood']:.2f}")

In [None]:
# Perform a Normality Test
from scipy.stats import shapiro, normaltest

# Shapiro-Wilk test of the null hypothesis that the sample is drawn from a
# normal distribution; keep both the statistic and the p-value.
shapiro_result = shapiro(monthly_cashflow)
stat = shapiro_result.statistic
p = shapiro_result.pvalue
# A p-value below 0.05 rejects normality at the 5% significance level.
print("Shapiro-Wilk Test  p-value:", p)

In [None]:
# Probabilities (integration) from CDF

from scipy.stats import norm
# Interval whose probability mass is evaluated.
lowerbound = -1000
upperbound = -100

# Fit the normal parameters to the data and freeze a distribution object
# carrying them.
mu, std = norm.fit(monthly_cashflow)
dist = norm(loc=mu, scale=std)

# P(lowerbound <= X <= upperbound) = F(upperbound) - F(lowerbound)
cdf_at_upper = dist.cdf(upperbound)
cdf_at_lower = dist.cdf(lowerbound)
prob = cdf_at_upper - cdf_at_lower
print(f"P({lowerbound} <= X <= {upperbound}) = {prob:.4f}")



In [None]:
# Probabilities from Fitted PDF (e.g. KDE)

import numpy as np
from scipy.stats import gaussian_kde
from scipy.integrate import quad

# Fit a Gaussian kernel density estimate to the monthly totals.
kde = gaussian_kde(monthly_cashflow)

# Probability mass of the KDE between the bounds. `integrate_box_1d` computes
# the integral of the 1-D estimate directly, so no numerical quadrature (and no
# integrand returning a 1-element array) is needed.
# `lowerbound` / `upperbound` are not assigned in this cell; they must already
# exist in the kernel namespace.
prob = kde.integrate_box_1d(lowerbound, upperbound)
print(f"P({lowerbound} <= X <= {upperbound}) = {prob:.4f}")

In [None]:
# Numerical approximation: trapezoidal integration of sampled PDF values

import numpy as np

# Sample the fitted normal PDF on an even grid that spans exactly the interval
# of interest. Building the grid on [lowerbound, upperbound] (rather than on the
# observed data range and masking) keeps the interval endpoints on the grid and
# does not drop any part of the interval lying outside the observed values.
# `norm`, `mu`, `std`, `lowerbound` and `upperbound` are not assigned in this
# cell; they must already exist in the kernel namespace.
x = np.linspace(lowerbound, upperbound, 1000)
pdf_values = norm.pdf(x, loc=mu, scale=std)

# Estimate P(lowerbound <= X <= upperbound) with the trapezoidal rule.
prob = np.trapezoid(pdf_values, x)
print(f"P({lowerbound} <= X <= {upperbound}) = {prob:.4f}")

In [None]:
# Descriptive statistics of the monthly totals.
# `include` is ignored by Series.describe, so it is omitted here.
# NOTE(review): assumes monthly_cashflow is a pandas Series -- confirm.
summary = monthly_cashflow.describe()
print(summary)