In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, accuracy_score

# Time series forecasting (optional)
from prophet import Prophet

# Clustering (optional)
from sklearn.cluster import KMeans

# For warnings
# NOTE: the blanket "ignore" filter below silences every Python warning for the rest of
# the session, which includes deprecation notices emitted by the libraries used here.
import warnings
warnings.filterwarnings("ignore")


In [None]:
# 2. Load and Merge the Datasets
# Read the dispensing and weather CSV files from the working directory.
dispense_csv = 'Dispensing_travad.csv'
weather_csv = 'final_cleaned_data_sand_ocean_weather (1).csv'
df_dispense = pd.read_csv(dispense_csv)
df_weather = pd.read_csv(weather_csv)


In [None]:
# Preview the first rows of the dispensing data as loaded
df_dispense.head()

In [None]:
# Parse each frame's date column into datetimes; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
for frame, date_col in ((df_dispense, 'Date'), (df_weather, 'utc_date')):
    frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')

In [None]:
# Add a shared 'date_str' merge key (YYYY-MM-DD) to both frames, derived from
# their datetime columns.
for frame, date_col in ((df_dispense, 'Date'), (df_weather, 'utc_date')):
    frame['date_str'] = frame[date_col].dt.strftime('%Y-%m-%d')

In [None]:
# Inner-join dispensing and weather rows that share the same 'date_str' key
merged_df = df_dispense.merge(df_weather, how='inner', on='date_str')
print("Merged DataFrame shape:", merged_df.shape)

# Preview the first rows of the joined frame
merged_df.head()

In [None]:
# 3. Data Cleaning & Feature Engineering
# Force the two key measurement columns to numeric dtype; entries that cannot be
# converted become NaN (errors='coerce').
for numeric_col in ('Volume', 'air_temp_c'):
    merged_df[numeric_col] = pd.to_numeric(merged_df[numeric_col], errors='coerce')

In [None]:
# Standardize Volume and air temperature into new '<column>_std' columns.
# One scaler object is refit for each column in turn, so after this cell it holds
# the statistics of the last column fitted (air_temp_c).
scaler = StandardScaler()
std_column_names = {'Volume': 'Volume_std', 'air_temp_c': 'air_temp_c_std'}
for raw_col, std_col in std_column_names.items():
    merged_df[std_col] = scaler.fit_transform(merged_df[[raw_col]])

In [None]:
# The ratio divides by the standardized temperature, so rows where air_temp_c_std is
# exactly 0 are removed to avoid division by zero. Rows where it is NaN pass this
# comparison (NaN != 0 evaluates to True) and are kept.
# .copy() makes the filtered frame an independent object, so columns assigned to
# merged_df afterwards are not written onto a slice of the pre-filter frame.
merged_df = merged_df[merged_df['air_temp_c_std'] != 0].copy()

In [None]:
# Volume-to-temperature ratio as a percentage: standardized volume divided by
# standardized air temperature, times 100.
std_volume = merged_df['Volume_std']
std_air_temp = merged_df['air_temp_c_std']
merged_df['volume_to_temp_ratio'] = std_volume.div(std_air_temp).mul(100)

In [None]:
# Preview the first rows of merged_df, now including the standardized columns and the ratio
merged_df.head()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the ratio column
ratio_summary = merged_df['volume_to_temp_ratio'].describe()
print("Volume-to-Temp Ratio Statistics:")
print(ratio_summary)

In [None]:
# 4. Create Three-Class Ratio Labels for Classification
# A NaN ratio cannot be ranked against the thresholds: both `<=` comparisons are False
# for NaN, so such rows used to fall through to 'High'. Remove them before labelling so
# every remaining row receives a meaningful class.
merged_df = merged_df.dropna(subset=['volume_to_temp_ratio']).copy()

# Class boundaries: the 0.33 and 0.66 quantiles of volume_to_temp_ratio
low_threshold = merged_df['volume_to_temp_ratio'].quantile(0.33)
high_threshold = merged_df['volume_to_temp_ratio'].quantile(0.66)

def assign_three_classes(x):
    """Label a ratio value as 'Low', 'Medium' or 'High'.

    Uses the module-level ``low_threshold`` and ``high_threshold``.

    Args:
        x: A single volume_to_temp_ratio value.

    Returns:
        'Low' when x <= low_threshold, 'Medium' when x <= high_threshold,
        'High' otherwise. A missing value (NaN/None) returns NaN so it is never
        counted as 'High'.
    """
    if pd.isna(x):
        return np.nan
    if x <= low_threshold:
        return 'Low'
    elif x <= high_threshold:
        return 'Medium'
    else:
        return 'High'

# Create the three-class column
merged_df['ratio_class_3'] = merged_df['volume_to_temp_ratio'].apply(assign_three_classes)

# Check the distribution of the three classes
print("Three-Class Ratio Distribution:")
print(merged_df['ratio_class_3'].value_counts())


In [None]:
# Preview the first rows of merged_df, now including the ratio_class_3 label
merged_df.head()

In [None]:
# prompt: download merged_df.nunique()

# Write the number of distinct values per column to a CSV file in the working directory
unique_counts = merged_df.nunique()
unique_counts.to_csv('nunique_values.csv')


In [None]:
# Number of distinct values in each column of merged_df
merged_df.nunique()

In [None]:
# Rows labelled 'Low'
low_df = merged_df.loc[merged_df['ratio_class_3'] == 'Low']

# Descriptive statistics of the ratio within the Low class
low_ratio = low_df['volume_to_temp_ratio']
print("Summary statistics for Low class:")
print(low_ratio.describe())

# Histogram (10 bins, KDE overlay) of the ratio for the Low class
plt.figure(figsize=(8, 6))
low_ax = sns.histplot(low_ratio, bins=10, kde=True, color='orange')
low_ax.set_title('Volume-to-Temp Ratio Distribution for Low Class')
low_ax.set_xlabel('Volume-to-Temp Ratio (%)')
low_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Rows labelled 'Medium'
medium_df = merged_df.loc[merged_df['ratio_class_3'] == 'Medium']

# Descriptive statistics of the ratio within the Medium class
medium_ratio = medium_df['volume_to_temp_ratio']
print("Summary statistics for Medium class:")
print(medium_ratio.describe())

# Histogram (30 bins, KDE overlay) of the ratio for the Medium class
plt.figure(figsize=(8,6))
medium_ax = sns.histplot(medium_ratio, bins=30, kde=True, color='orange')
medium_ax.set_title('Volume-to-Temp Ratio Distribution for Medium Class')
medium_ax.set_xlabel('Volume-to-Temp Ratio (%)')
medium_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Filter the DataFrame for rows classified as 'High'.
# The subset gets its own name (high_df); it was previously stored in low_df, which
# overwrote the 'Low' subset with High-class rows.
high_df = merged_df[merged_df['ratio_class_3'] == 'High']

# Display summary statistics for the High class
print("Summary statistics for High class:")
print(high_df['volume_to_temp_ratio'].describe())

# Visualize the distribution of volume_to_temp_ratio for the High class
plt.figure(figsize=(8, 6))
sns.histplot(high_df['volume_to_temp_ratio'], bins=10, kde=True, color='orange')
plt.title('Volume-to-Temp Ratio Distribution for High Class')
plt.xlabel('Volume-to-Temp Ratio (%)')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Tally how many rows fall into each ratio class (Low, Medium, High)
class_counts = merged_df['ratio_class_3'].value_counts()

# Header line followed by the counts
print("Count of values for each ratio class:", class_counts, sep="\n")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Row counts per class, arranged Low -> Medium -> High; a class with no rows shows as 0
ordered_classes = ['Low', 'Medium', 'High']
class_counts = merged_df['ratio_class_3'].value_counts()
ordered_counts = class_counts.reindex(ordered_classes).fillna(0)

# Bar chart of the ordered counts
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=ordered_counts.index, y=ordered_counts.values, palette='Set2')
ax.set_title("Number of Observations by Ratio Class")
ax.set_xlabel("Ratio Class")
ax.set_ylabel("Count")

# Write each count just above its bar (offset by 1% of the bar height)
for bar_pos, bar_height in enumerate(ordered_counts.values):
    ax.text(bar_pos, bar_height + bar_height*0.01, f"{int(bar_height)}",
            ha='center', va='bottom', fontsize=12)

plt.show()


In [None]:
import matplotlib.pyplot as plt

categories = ['Low', 'Medium', 'High']

# Smallest and largest ratio observed within each class
class_ranges = {
    label: merged_df.loc[merged_df['ratio_class_3'] == label, 'volume_to_temp_ratio'].agg(['min', 'max'])
    for label in categories
}
low_stats, med_stats, high_stats = (class_ranges[label] for label in categories)
mins = [class_ranges[label]['min'] for label in categories]
maxs = [class_ranges[label]['max'] for label in categories]

# One horizontal bar per class spanning its min..max, with labelled end markers
plt.figure(figsize=(10, 6))
for row, (range_min, range_max) in enumerate(zip(mins, maxs)):
    plt.hlines(y=row, xmin=range_min, xmax=range_max, color='blue', linewidth=5)
    plt.plot(range_min, row, 'go', markersize=10)  # green dot marks the minimum
    plt.plot(range_max, row, 'ro', markersize=10)  # red dot marks the maximum
    # Exact endpoint values, written slightly below the bar
    plt.text(range_min, row - 0.1, f"Min: {range_min:.2f}", color='green', fontsize=10, ha='left')
    plt.text(range_max, row - 0.1, f"Max: {range_max:.2f}", color='red', fontsize=10, ha='right')

# Class names on the y axis, ratio on the x axis
plt.yticks(range(len(categories)), categories)
plt.xlabel("Volume-to-Temp Ratio (%)")
plt.title("Range of Volume-to-Temp Ratio by Classification")
plt.grid(True, axis='x', linestyle='--', alpha=0.5)
plt.show()


In [None]:
# List the column names currently present in merged_df
merged_df.columns

In [None]:
# Correlation Heatmap
# Pairwise correlations between the selected weather columns and the ratio
heatmap_columns = [
    'air_temp_c', 'humidity_percent', 'dewpoint_temp_c',
    'precipitation_mm', 'volume_to_temp_ratio', 'wind_direction_deg',
]
corr_matrix = merged_df[heatmap_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of Key Features and Target')
plt.show()

In [None]:
# ================================
# Classification Task: Three-Class Ratio Prediction
# ================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Assuming merged_df is already defined and contains the relevant columns.
# Prepare the features and map the class labels to numeric values.
features_class = ['air_temp_c_std', 'humidity_percent', 'dewpoint_temp_c']
merged_df['ratio_class_3_numeric'] = merged_df['ratio_class_3'].map({'Low': 0, 'Medium': 1, 'High': 2})

X_class = merged_df[features_class]
y_class = merged_df['ratio_class_3_numeric']

print("Classification Features:", features_class)

# Split the data (80% training, 20% testing)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)

# Standardize the classification features; the scaler is fitted on the training
# split only and then applied to the test split.
scaler_class = StandardScaler()
X_train_class_scaled = scaler_class.fit_transform(X_train_class)
X_test_class_scaled = scaler_class.transform(X_test_class)

# ----- Model 1: Logistic Regression -----
# `multi_class` is not passed: scikit-learn deprecated the parameter in 1.5, and with the
# lbfgs solver and a three-class target the default already fits the multinomial model.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
log_reg.fit(X_train_class_scaled, y_train_class)
y_pred_log = log_reg.predict(X_test_class_scaled)
print("Logistic Regression Classification Report:")
print(classification_report(y_test_class, y_pred_log))
print("Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_log))

# ----- Model 2: Random Forest Classifier -----
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_class_scaled, y_train_class)
y_pred_rf = rf_clf.predict(X_test_class_scaled)
print("\nRandom Forest Classifier Classification Report:")
print(classification_report(y_test_class, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_rf))

# ----- Alternative Model 3: Decision Tree Classifier -----
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train_class_scaled, y_train_class)
y_pred_dt = dt_clf.predict(X_test_class_scaled)
print("\nDecision Tree Classifier Classification Report:")
print(classification_report(y_test_class, y_pred_dt))
print("Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_dt))


In [None]:
# ----- Performance Summary -----
def _macro_metrics(accuracy, report):
    """Bundle an accuracy value with the macro-averaged scores of a report dict."""
    macro_avg = report['macro avg']
    return {
        'Accuracy': accuracy,
        'Macro Precision': macro_avg['precision'],
        'Macro Recall': macro_avg['recall'],
        'Macro F1': macro_avg['f1-score']
    }

performance_metrics = {}

# Logistic Regression
acc_log = accuracy_score(y_test_class, y_pred_log)
report_log = classification_report(y_test_class, y_pred_log, output_dict=True)
performance_metrics['Logistic Regression'] = _macro_metrics(acc_log, report_log)

# Random Forest
acc_rf = accuracy_score(y_test_class, y_pred_rf)
report_rf = classification_report(y_test_class, y_pred_rf, output_dict=True)
performance_metrics['Random Forest'] = _macro_metrics(acc_rf, report_rf)

# Decision Tree
acc_dt = accuracy_score(y_test_class, y_pred_dt)
report_dt = classification_report(y_test_class, y_pred_dt, output_dict=True)
performance_metrics['Decision Tree'] = _macro_metrics(acc_dt, report_dt)

# One row per model, one column per metric
performance_df = pd.DataFrame(performance_metrics).T
print("\nSummary of Classification Model Performance:")
print(performance_df)

In [None]:
# ================================
# 6. Prepare Data for Classification and Regression Separately
# ================================
from sklearn.model_selection import train_test_split

# Both tasks take only the raw weather measurements as inputs: the classifier does not
# see the continuous ratio, and the regressor does not see the ratio class.
features_class = ['air_temp_c', 'humidity_percent', 'dewpoint_temp_c']
features_reg = ['air_temp_c', 'humidity_percent', 'dewpoint_temp_c']

# Classification target: three-class label encoded as Low=0, Medium=1, High=2
class_codes = {'Low': 0, 'Medium': 1, 'High': 2}
merged_df['ratio_class_3_numeric'] = merged_df['ratio_class_3'].map(class_codes)
X_class, y_class = merged_df[features_class], merged_df['ratio_class_3_numeric']

# Regression target: the continuous volume-to-temp ratio
X_reg, y_reg = merged_df[features_reg], merged_df['volume_to_temp_ratio']

# 80% train / 20% test for both tasks, using the same seed
split_options = dict(test_size=0.2, random_state=42)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, **split_options)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, **split_options)

print("Classification Features:", features_class)
print("Regression Features:", features_reg)



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Standardize the classification features; the scaler is fitted on the training
# split only and then applied to the test split.
scaler_class = StandardScaler()
X_train_class_scaled = scaler_class.fit_transform(X_train_class)
X_test_class_scaled = scaler_class.transform(X_test_class)

# Train a Logistic Regression classifier.
# `multi_class` is not passed: scikit-learn deprecated the parameter in 1.5, and with the
# lbfgs solver and a three-class target the default already fits the multinomial model.
clf = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
clf.fit(X_train_class_scaled, y_train_class)

# Make predictions on the test set
y_pred_class = clf.predict(X_test_class_scaled)

# Evaluate the classification model
print("Classification Report:")
print(classification_report(y_test_class, y_pred_class))
print("Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))


In [None]:
# ================================
# 7. Standardize Features for Both Tasks
# ================================
# Each scaler is fitted on its training split only and then applied to both splits.
# Note that the split variables are overwritten with the scaled arrays.

# Classification features
scaler_class = StandardScaler().fit(X_train_class)
X_train_class, X_test_class = (
    scaler_class.transform(X_train_class),
    scaler_class.transform(X_test_class),
)

# Regression features
scaler_reg = StandardScaler().fit(X_train_reg)
X_train_reg, X_test_reg = (
    scaler_reg.transform(X_train_reg),
    scaler_reg.transform(X_test_reg),
)


In [None]:
# ================================
# 8. Classification: Three-Class Ratio Prediction
# ================================
from sklearn.linear_model import LogisticRegression

# Initialize and train a Logistic Regression model for multi-class classification.
# `multi_class` is not passed: scikit-learn deprecated the parameter in 1.5, and with the
# lbfgs solver and a three-class target the default already fits the multinomial model.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
log_reg.fit(X_train_class, y_train_class)

# Predict on the test split and report per-class metrics plus the confusion matrix
y_pred_class = log_reg.predict(X_test_class)
print("Classification Report for Three-Class Ratio:")
print(classification_report(y_test_class, y_pred_class))
print("Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimators to tune, keyed by display name
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Booster': GradientBoostingRegressor(random_state=42),
    'decision Tree': DecisionTreeRegressor(random_state=42)
}

# Candidate hyperparameter values for each estimator (same keys as `models`)
param_dist = {
    'Random Forest': {
        'n_estimators': [100, 150, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2, 4]
    },
    'Gradient Booster': {
        'n_estimators': [100, 150],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5, 7]
    },
    'decision Tree': {
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2, 4]
    }
}

# Search settings shared by all models: up to 20 sampled combinations, 3-fold CV,
# all CPU cores, scored by negative mean squared error.
search_settings = dict(
    n_iter=20,
    cv=3,
    n_jobs=-1,
    random_state=42,
    scoring='neg_mean_squared_error',
    verbose=1,
)

# Tune each model on the regression training split and keep the refit best estimator
best_models = {}
for model_name, model in models.items():
    print(f"Tuning {model_name} using RandomizedSearchCV...")
    randomized_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist[model_name],
        **search_settings
    )
    randomized_search.fit(X_train_reg, y_train_reg)
    best_models[model_name] = randomized_search.best_estimator_

# Score every tuned model on the held-out test split
regression_results = {}
for model_name, model in best_models.items():
    y_pred = model.predict(X_test_reg)
    mse = mean_squared_error(y_test_reg, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_reg, y_pred)
    regression_results[model_name] = {'RMSE': rmse, 'R2': r2}
    print(f"Tuned {model_name}: RMSE = {rmse:.4f}, R² = {r2:.4f}")

# One row per model, columns RMSE and R2
results_df = pd.DataFrame(regression_results).T
print("\n Tuning Results Summary:")
print(results_df)

# Bar chart of R² per model; the dashed line marks R² = 0
plt.figure(figsize=(8, 5))
sns.barplot(data=results_df.reset_index(), x='index', y='R2', palette='coolwarm')
plt.axhline(0, linestyle='--', color='gray')
plt.title('Tuned R² Scores of Regression Models')
plt.ylabel('R² Score')
plt.xlabel('Model')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


Tuning Random Forest using RandomizedSearchCV...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Inputs are the raw weather measurements; the target is the 'Average Flow Rate' column
features_reg = ['air_temp_c', 'humidity_percent', 'dewpoint_temp_c']
target_reg = 'Average Flow Rate'
X_reg = merged_df[features_reg]
y_reg = merged_df[target_reg]

# Keep only the rows whose target is present
has_target = y_reg.notna()
X_reg, y_reg = X_reg[has_target], y_reg[has_target]

# 80% train / 20% test
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_reg = scaler.fit_transform(X_train_reg)
X_test_reg = scaler.transform(X_test_reg)

# Regressors with fixed hyperparameters, keyed by display name
regression_models = {
    'Random Forest': RandomForestRegressor(n_estimators=150, max_depth=10, random_state=42),
    'Gradient Booster': GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=4, random_state=42),
    'decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
}

# Fit each model and record its test-set RMSE and R²
regression_results = {}
for name, model in regression_models.items():
    model.fit(X_train_reg, y_train_reg)
    y_pred = model.predict(X_test_reg)
    mse = mean_squared_error(y_test_reg, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_reg, y_pred)
    regression_results[name] = {'RMSE': rmse, 'R2': r2}
    print(f"{name}: RMSE = {rmse:.4f}, R² = {r2:.4f}")

# One row per model, columns RMSE and R2
results_df = pd.DataFrame(regression_results).T
print("\n Regression Model Performance Summary:")
print(results_df)

# Bar chart of R² per model; the dashed line marks R² = 0
plt.figure(figsize=(8, 5))
sns.barplot(data=results_df.reset_index(), x='index', y='R2', palette='coolwarm')
plt.axhline(0, linestyle='--', color='gray')
plt.title('R² Scores of Regression Models')
plt.ylabel('R² Score')
plt.xlabel('Model')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import numpy as np

# Pipeline: polynomial feature expansion -> standardization -> XGBoost regressor
pipeline_steps = [
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror', random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

# Grid of pipeline parameters to try; keys are '<step>__<parameter>'
param_grid = {
    'poly__degree': [2],  # only degree 2 is searched; add 3 here to try cubic terms
    'xgb__n_estimators': [100, 150, 200],
    'xgb__max_depth': [3, 5, 7],
    'xgb__learning_rate': [0.05, 0.1, 0.2],
}

# Exhaustive search with 3-fold cross-validation, selecting on R²
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train_reg, y_train_reg)

print("Best parameters from GridSearchCV:", grid_search.best_params_)

# Score the refit best pipeline on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_reg, y_pred)

print(f"XGBoost with Polynomial Features: RMSE = {rmse:.4f}, R2 = {r2:.4f}")



In [None]:
# ================================
# 10. Visualize Regression Model Performance
# ================================
# One bar chart per metric in results_df: (column, palette, title, y-axis label)
metric_charts = (
    ('RMSE', 'Blues_d', 'Regression Models - RMSE Comparison', 'RMSE'),
    ('R2', 'Greens_d', 'Regression Models - R² Comparison', 'R² Score'),
)

for metric_col, bar_palette, chart_title, y_label in metric_charts:
    plt.figure(figsize=(8, 6))
    sns.barplot(x=results_df.index, y=results_df[metric_col], palette=bar_palette)
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xlabel('Model')
    plt.xticks(rotation=45)
    plt.show()


In [None]:

# 12.Time Series Forecasting with Prophet

# The forecast target is volume_to_temp_ratio; rebuild it from the standardized
# columns if merged_df does not have it.
if 'volume_to_temp_ratio' not in merged_df.columns:
    merged_df['volume_to_temp_ratio'] = (merged_df['Volume_std'] / merged_df['air_temp_c_std']) * 100

# Prophet expects a frame with columns 'ds' (datetime) and 'y' (value).
# Rows whose date string cannot be parsed are dropped.
prophet_df = (
    merged_df[['date_str', 'volume_to_temp_ratio']]
    .rename(columns={'date_str': 'ds', 'volume_to_temp_ratio': 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds'], errors='coerce'))
    .dropna(subset=['ds'])
)

prophet_model = Prophet()
prophet_model.fit(prophet_df)

# Extend the history by 30 periods and predict over the whole range
future = prophet_model.make_future_dataframe(periods=30)
forecast = prophet_model.predict(future)

fig = prophet_model.plot(forecast)
plt.title("Prophet Forecast - Volume-to-Temp Ratio")
plt.show()

In [None]:
# Sanity check that 'Date' exists in the dispensing frame and 'Average Flow Rate' in the
# merged frame. Both results are returned as one tuple: as two separate bare expressions
# only the last one is displayed and the first is silently discarded.
('Date' in df_dispense.columns, 'Average Flow Rate' in merged_df.columns)


In [None]:
# ================================
# 14. Client-Requested Fixes and New Feature Engineering
# ================================

import pytz

# Timezone used to interpret the dispensing timestamps.
# NOTE(review): 'MST' looks like a fixed-offset zone without daylight saving - confirm
# whether a DST-observing Mountain zone was intended for this data.
mountain = pytz.timezone('MST')

# Dispensing data: parse 'Date', attach the Mountain zone (ambiguous or nonexistent
# wall-clock times become NaT), then express the timestamps in UTC.
df_dispense['Date'] = pd.to_datetime(df_dispense['Date'], errors='coerce')
localized_dates = df_dispense['Date'].dt.tz_localize(mountain, ambiguous='NaT', nonexistent='NaT')
df_dispense['Date_UTC'] = localized_dates.dt.tz_convert('UTC')

# Merge key taken from the UTC timestamp
df_dispense['date_str'] = df_dispense['Date_UTC'].dt.strftime('%Y-%m-%d')

# Weather data: re-parse utc_date and rebuild the same key
df_weather['utc_date'] = pd.to_datetime(df_weather['utc_date'], errors='coerce')
df_weather['date_str'] = df_weather['utc_date'].dt.strftime('%Y-%m-%d')

# Rebuild merged_df from the two source frames using the UTC-based key.
# This replaces the previous merged_df, including any columns added to it earlier.
merged_df = df_dispense.merge(df_weather, how='inner', on='date_str')
print("✅ Merged shape with UTC fixed:", merged_df.shape)


In [None]:
# Force the flow/duration columns to numeric dtype; unparseable entries become NaN
for numeric_col in ('Average Flow Rate', 'Flow Duration', 'Duration in Seconds'):
    merged_df[numeric_col] = pd.to_numeric(merged_df[numeric_col], errors='coerce')

# Downtime = Duration in Seconds - Flow Duration
merged_df['Downtime'] = merged_df['Duration in Seconds'].sub(merged_df['Flow Duration'])

# Show the first rows of the inputs next to the derived column
downtime_preview = merged_df[['Duration in Seconds', 'Flow Duration', 'Downtime']].head()
print("✅ Sample of Downtime values:\n", downtime_preview)


In [None]:
# (number of missing 'Flow Duration' values, total number of rows)
flow_duration = merged_df['Flow Duration']
flow_duration.isna().sum(), flow_duration.shape[0]


In [None]:
# Mean 'Average Flow Rate' for every (Product, Meter ID) pair
grouped_flow = (
    merged_df.groupby(['Product', 'Meter ID'])['Average Flow Rate']
    .mean()
    .reset_index(name='avg_flow_by_product_meter')
)

# Attach the KPI to every row of the main frame.
# A KPI column left over from an earlier run of this cell is dropped first; otherwise the
# merge would create suffixed duplicates (_x/_y) and the preview below would raise KeyError.
merged_df = (
    merged_df.drop(columns=['avg_flow_by_product_meter'], errors='ignore')
    .merge(grouped_flow, on=['Product', 'Meter ID'], how='left')
)

# Preview result
print("Added avg_flow_by_product_meter:\n", merged_df[['Product', 'Meter ID', 'avg_flow_by_product_meter']].head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (30 bins, KDE overlay) of the per-product/meter average flow rate
flow_kpi = merged_df['avg_flow_by_product_meter']
plt.figure(figsize=(10, 6))
kpi_ax = sns.histplot(flow_kpi, bins=30, kde=True)
kpi_ax.set_title('Distribution of Average Flow Rate by Product and Meter')

plt.show()

In [None]:
# Define classification logic
def classify_flow(value, low_cutoff=20, high_cutoff=50):
    """Bucket a flow value into 'LOW', 'MED', 'HIGH' or 'Unknown'.

    Args:
        value: The flow value to classify; may be missing (NaN/None).
        low_cutoff: Values strictly below this are 'LOW'. Defaults to 20.
        high_cutoff: Values at or above ``low_cutoff`` and strictly below this
            are 'MED'; anything else is 'HIGH'. Defaults to 50.

    Returns:
        'Unknown' for a missing value, otherwise 'LOW', 'MED' or 'HIGH'.
    """
    # Missing values are checked first because comparisons against NaN are always False
    if pd.isna(value):
        return 'Unknown'
    if value < low_cutoff:
        return 'LOW'
    if value < high_cutoff:
        return 'MED'
    return 'HIGH'

# Label every row by bucketing its per-product/meter average flow
flow_labels = merged_df['avg_flow_by_product_meter'].apply(classify_flow)
merged_df['Flow_Category'] = flow_labels

# Report how many rows fall into each category
category_counts = merged_df['Flow_Category'].value_counts()
print("✅ Flow Category Distribution:\n", category_counts)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of rows per Flow_Category value
plt.figure(figsize=(8, 6))
category_ax = sns.countplot(x='Flow_Category', data=merged_df)
category_ax.set_title('Distribution of Flow Categories')
plt.show()

In [None]:
#performance matrix
# TODO: this cell was left unfinished. The stray `impo` token is a bare name that raises
# NameError and stops a Restart-and-Run-All, so it stays commented out until the
# intended code is written.
# impo

In [None]:
# ================================
# 15. Classification Model: Predicting Flow Category (MED vs HIGH)
# ================================
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Use only clean and relevant numeric features.
# NOTE(review): 'Average Flow Rate' is an input here while Flow_Category is derived from
# its per-(Product, Meter ID) mean - confirm this is not unintended target leakage.
features = ['air_temp_c', 'humidity_percent', 'precipitation_mm', 'Average Flow Rate', 'Duration in Seconds']

# Encode MED = 0, HIGH = 1
flow_label_codes = {'MED': 0, 'HIGH': 1}

# Drop rows with missing feature values
clf_data = merged_df[features + ['Flow_Category']].dropna()

# Keep only the two classes being modelled. 'LOW' and 'Unknown' have no code in the
# mapping, so leaving them in would put NaN into the target and make fit() fail.
clf_data = clf_data[clf_data['Flow_Category'].isin(list(flow_label_codes))].copy()
clf_data['Flow_Category_Encoded'] = clf_data['Flow_Category'].map(flow_label_codes)

# Define inputs and target
X = clf_data[features]
y = clf_data['Flow_Category_Encoded']

# Split into training and test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a lightweight Random Forest classifier
clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n✅ Classification Report:\n", classification_report(y_test, y_pred))
print("\n✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
