In [3]:
pip install statsmodels

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [5]:
pip install pmdarima

Defaulting to user installation because normal site-packages is not writeable
Collecting pmdarima
  Downloading pmdarima-2.0.4-cp312-cp312-win_amd64.whl.metadata (8.0 kB)
Collecting Cython!=0.29.18,!=0.29.31,>=0.29 (from pmdarima)
  Using cached Cython-3.0.11-cp312-cp312-win_amd64.whl.metadata (3.2 kB)
Downloading pmdarima-2.0.4-cp312-cp312-win_amd64.whl (625 kB)
   ---------------------------------------- 0.0/625.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/625.1 kB ? eta -:--:--
   ---------------------------------------- 625.1/625.1 kB 3.3 MB/s eta 0:00:00
Using cached Cython-3.0.11-cp312-cp312-win_amd64.whl (2.8 MB)
Installing collected packages: Cython, pmdarima
Successfully installed Cython-3.0.11 pmdarima-2.0.4
Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
import pickle

# Load your own CSV file (replace 'path_to_your_csv_file.csv' with the actual path)
file_path = 'Financial_Data.csv'
df_new_data = pd.read_csv(file_path)

# Step 1: Data Cleaning
df_new_data['Date'] = pd.to_datetime(df_new_data['Date'], errors='coerce')  # Convert to datetime
df_new_data.dropna(subset=['Date', 'Amount'], inplace=True)  # Remove rows with missing date or amount
df_new_data.set_index('Date', inplace=True)

# Handle outliers using IQR (remove outliers)
Q1 = df_new_data['Amount'].quantile(0.25)
Q3 = df_new_data['Amount'].quantile(0.75)
IQR = Q3 - Q1
df_new_data = df_new_data[(df_new_data['Amount'] >= (Q1 - 1.5 * IQR)) & 
                          (df_new_data['Amount'] <= (Q3 + 1.5 * IQR))]

# Resample daily expenses
daily_expenses = df_new_data.resample('D')['Amount'].sum().fillna(0)

# Convert to DataFrame for modeling
daily_expenses = pd.DataFrame(daily_expenses)

# Feature Engineering: Add lag features, day of the week, and month
daily_expenses['lag_1'] = daily_expenses['Amount'].shift(1)
daily_expenses['lag_7'] = daily_expenses['Amount'].shift(7)
daily_expenses['day_of_week'] = daily_expenses.index.dayofweek
daily_expenses['month'] = daily_expenses.index.month

# Drop NaN rows caused by shifting
daily_expenses.dropna(inplace=True)

# Split features (X) and target variable (y)
X = daily_expenses[['lag_1', 'lag_7', 'day_of_week', 'month']]
y = daily_expenses['Amount']

# Step 2: Apply Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100)
}

# Polynomial Regression (with degree 2)
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)
poly_model = LinearRegression()

# Random Forest Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
random_forest = RandomForestRegressor()
grid_search_rf = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=3)
grid_search_rf.fit(X_train, y_train)

# Best Random Forest model from GridSearch
best_rf_model = grid_search_rf.best_estimator_

# Track the best model
best_model = None
best_r2 = float('-inf')
best_model_name = ''

# Evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    rmse = sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    
    print(f"{model_name} - RMSE: {rmse}, R-squared: {r2}")
    
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_model_name = model_name

# Evaluate Polynomial Regression separately
poly_model.fit(X_poly_train, y_train)
poly_predictions = poly_model.predict(X_poly_test)
rmse_poly = sqrt(mean_squared_error(y_test, poly_predictions))
r2_poly = r2_score(y_test, poly_predictions)

print(f"Polynomial Regression (Degree 2) - RMSE: {rmse_poly}, R-squared: {r2_poly}")

if r2_poly > best_r2:
    best_r2 = r2_poly
    best_model = poly_model
    best_model_name = 'Polynomial Regression (Degree 2)'

# Evaluate tuned Random Forest model separately
best_rf_model.fit(X_train, y_train)
rf_predictions = best_rf_model.predict(X_test)
rmse_rf = sqrt(mean_squared_error(y_test, rf_predictions))
r2_rf = r2_score(y_test, rf_predictions)

print(f"Random Forest (Tuned) - RMSE: {rmse_rf}, R-squared: {r2_rf}")

if r2_rf > best_r2:
    best_r2 = r2_rf
    best_model = best_rf_model
    best_model_name = 'Random Forest (Tuned)'

# Print the best model
print(f"\nBest Model: {best_model_name} with R-squared: {best_r2}")

# Save the best model
with open(f'best_{best_model_name.lower().replace(" ", "_")}_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print(f"Best model saved as best_{best_model_name.lower().replace(" ", "_")}_model.pkl")


Linear Regression - RMSE: 637.7482215339697, R-squared: -0.13921491833800803
Ridge Regression - RMSE: 637.4302565712177, R-squared: -0.13807923463892013
Gradient Boosting Regressor - RMSE: 679.8570657547323, R-squared: -0.29462020951352086
Polynomial Regression (Degree 2) - RMSE: 685.4455746885633, R-squared: -0.31599156315519994
Random Forest (Tuned) - RMSE: 632.5350584906619, R-squared: -0.1206664078143933

Best Model: Random Forest (Tuned) with R-squared: -0.1206664078143933
Best model saved as best_random_forest_(tuned)_model.pkl
