In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def analyze_asset_lifecycle(df,
                            cost_per_high_usage=15000,
                            replacement_cost=130000,
                            usage_threshold=750000):
    """
    Analyze the projected cost of extending each asset's lifecycle by 1 year.

    The assets are split into two groups with KMeans, next-year usage is
    projected per group, and assets whose projected cumulative usage exceeds
    ``usage_threshold`` are flagged. A cost summary is printed.

    Parameters:
    - df: DataFrame with columns ['asset_id', 'current_usage',
      'usage_last_year', 'age']. The input is not modified.
    - cost_per_high_usage: annual maintenance premium charged for every asset
      projected to exceed ``usage_threshold``
    - replacement_cost: cost of one replacement asset
    - usage_threshold: projected usage above which an asset is flagged as
      high-cost

    Returns:
    - A copy of ``df`` with the added columns 'cluster', 'type',
      'predicted_usage_next_year', 'projected_usage' and 'will_cross_750k'.
      (The last column keeps its historical name; it is computed against
      ``usage_threshold``, whatever its value.)

    Raises:
    - ValueError: if any of the feature columns is missing from ``df``.
    """
    features = ['current_usage', 'usage_last_year', 'age']
    missing = [col for col in features if col not in df.columns]
    if missing:
        raise ValueError(f"df is missing required column(s): {missing}")

    # Step 1: Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Step 2: Clustering to infer Task A vs Task B
    X_scaled = StandardScaler().fit_transform(df[features])
    kmeans = KMeans(n_clusters=2, random_state=42)
    df['cluster'] = kmeans.fit_predict(X_scaled)

    # Assume the majority cluster is Task A
    majority_cluster = df['cluster'].value_counts().idxmax()
    is_task_a = df['cluster'] == majority_cluster
    is_task_b = ~is_task_a
    df['type'] = np.where(is_task_a, 'Task A', 'Task B')

    # Step 3: Predict next-year usage separately per task type
    df['predicted_usage_next_year'] = 0.0

    # Task A: Linear regression
    # NOTE(review): feature and target are both 'usage_last_year', so the
    # fitted model simply reproduces last year's usage. Swap in a real
    # next-year target once such history is available.
    X_taskA = df.loc[is_task_a, ['usage_last_year']]
    y_taskA = df.loc[is_task_a, 'usage_last_year']
    model_taskA = LinearRegression()
    model_taskA.fit(X_taskA, y_taskA)
    df.loc[is_task_a, 'predicted_usage_next_year'] = model_taskA.predict(X_taskA)

    # Task B: Median projection
    median_taskB = df.loc[is_task_b, 'usage_last_year'].median()
    df.loc[is_task_b, 'predicted_usage_next_year'] = median_taskB

    # Step 4: Projected cumulative usage after one more year
    df['projected_usage'] = df['current_usage'] + df['predicted_usage_next_year']
    df['will_cross_750k'] = df['projected_usage'] > usage_threshold

    # Step 5: Cost Summary
    num_risky = int(df['will_cross_750k'].sum())
    extra_maintenance_cost = num_risky * cost_per_high_usage
    replacement_cost_avoided = num_risky * replacement_cost

    print("========== SUMMARY ==========")
    print(f"Task A assets: {int(is_task_a.sum())}")
    print(f"Task B assets: {int(is_task_b.sum())}")
    print(f"Assets predicted to exceed {usage_threshold:,} usage next year: {num_risky}")
    print(f"Estimated maintenance penalty: ${extra_maintenance_cost:,.0f}")
    print(f"Replacement cost avoided by not buying: ${replacement_cost_avoided:,.0f}")
    print(f"Net cost impact (maint - saved): ${extra_maintenance_cost - replacement_cost_avoided:,.0f}")
    print("=============================")

    return df


In [None]:
# Load your data into `df` before running this cell; it must contain the
# columns 'asset_id', 'current_usage', 'usage_last_year' and 'age'.
results_df = analyze_asset_lifecycle(df)
