# Construction Material Prediction - Production Model
## Simplified version for web application integration

In [None]:
# Install required packages
!pip install pandas numpy scikit-learn joblib

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the labelled training set from disk and show a quick summary of it
print("Loading training data...")
train_df = pd.read_csv('data/col_material_key.csv')
print(f"Loaded {train_df.shape[0]} training samples")
print("Columns:", train_df.columns.tolist())
# Last expression of the cell: the notebook renders the first rows
train_df.head()

In [None]:
# Build the cluster id -> material key lookup from the mapping CSV
mapping_df = pd.read_csv('data/final_cluster_to_material_mapping.csv')
material_mapping = {
    cluster: material_key
    for cluster, material_key in zip(mapping_df['cluster'], mapping_df['material_key'])
}
print("Material categories:", set(material_mapping.values()))

In [None]:
# Data preprocessing function
def prepare_features(df):
    """Turn a raw project frame into the numeric feature matrix used for training.

    The input frame is copied, never modified.

    Args:
        df: DataFrame that may contain any of the numeric columns
            (SIZE_BUILDINGSIZE, NUMFLOORS, NUMROOMS, NUMBEDS) and categorical
            columns (PROJECT_TYPE, STATE, CORE_MARKET). Columns that are
            absent are created and filled with 0.

    Returns:
        A ``(features, label_encoders)`` tuple. ``features`` holds the four
        numeric columns, the three integer-encoded categorical columns and
        SIZE_CAT, in that order. ``label_encoders`` maps each categorical
        column that was present in ``df`` to the LabelEncoder fitted on it.
    """
    numeric_columns = ['SIZE_BUILDINGSIZE', 'NUMFLOORS', 'NUMROOMS', 'NUMBEDS']
    categorical_columns = ['PROJECT_TYPE', 'STATE', 'CORE_MARKET']

    frame = df.copy()

    # Numeric inputs: unparseable or missing values become 0
    for name in numeric_columns:
        if name not in frame.columns:
            frame[name] = 0
            continue
        frame[name] = pd.to_numeric(frame[name], errors='coerce').fillna(0)

    # Categorical inputs: missing values become the literal 'Unknown', then
    # every distinct string is mapped to an integer code by a fresh encoder
    label_encoders = {}
    for name in categorical_columns:
        if name not in frame.columns:
            frame[name] = 0
            continue
        frame[name] = frame[name].fillna('Unknown')
        encoder = LabelEncoder()
        frame[name] = encoder.fit_transform(frame[name].astype(str))
        label_encoders[name] = encoder

    # Bucket the building size into five bands. The bins are right-closed, so
    # a size of 0 (or below) lands in no bin and takes the fallback bucket 2.
    size_edges = [0, 5000, 20000, 50000, 100000, float('inf')]
    size_bucket = pd.cut(
        frame['SIZE_BUILDINGSIZE'],
        bins=size_edges,
        labels=[0, 1, 2, 3, 4]
    )
    frame['SIZE_CAT'] = size_bucket.astype(float).fillna(2)

    feature_columns = [*numeric_columns, *categorical_columns, 'SIZE_CAT']
    return frame[feature_columns].fillna(0), label_encoders

# Build the model input matrix (and the fitted encoders) from the training frame
print("Preparing features...")
X, label_encoders = prepare_features(train_df)
print(f"Features shape: {X.shape}")
print("Feature columns:", X.columns.tolist())

In [None]:
# Build the two training targets: shipped quantity (regression) and cluster id
# (classification).

# Rows without a shipped quantity are treated as quantity 1. The regressor is
# trained on log1p(quantity) and predictions are mapped back with expm1.
# NOTE(review): log1p yields NaN for values below -1 -- confirm QtyShipped
# never contains such values (e.g. returns).
y_quantity = train_df['QtyShipped'].fillna(1)
y_quantity_log = np.log1p(y_quantity)

# Rows without a cluster fall back to id 14.
# NOTE(review): presumably 14 is the 'Misc' cluster -- confirm against the
# cluster-to-material mapping file.
y_material = train_df['cluster'].fillna(14)

print(f"Material classes: {sorted(y_material.unique())}")
print(f"Quantity range: {y_quantity.min():.2f} - {y_quantity.max():.2f}")

In [None]:
# Train Material Classifier
print("Training material classifier...")

# Production model: fitted on every available row; this is the object that
# gets saved and used for prediction.
material_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
material_classifier.fit(X, y_material)

# Evaluate classifier
# Scoring a random forest on the rows it was fitted to gives a heavily
# optimistic number, so accuracy is measured with an identically configured
# forest trained on 80% of the rows and scored on the remaining 20%.
X_train_cls, X_val_cls, y_train_cls, y_val_cls = train_test_split(
    X, y_material, test_size=0.2, random_state=42
)
eval_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
eval_classifier.fit(X_train_cls, y_train_cls)

y_pred_material = eval_classifier.predict(X_val_cls)
accuracy = accuracy_score(y_val_cls, y_pred_material)
print(f"Material classifier accuracy: {accuracy:.3f}")

In [None]:
# Train Quantity Regressor
print("Training quantity regressor...")

# Production model: fitted on every available row (log1p-transformed target);
# this is the object that gets saved and used for prediction.
quantity_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
quantity_regressor.fit(X, y_quantity_log)

# Evaluate regressor
# An error computed on the training rows is far too optimistic for a forest,
# so MAE is measured with an identically configured forest trained on 80% of
# the rows and scored on the remaining 20%.
X_train_qty, X_val_qty, y_train_qty, y_val_qty = train_test_split(
    X, y_quantity, test_size=0.2, random_state=42
)
eval_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
eval_regressor.fit(X_train_qty, np.log1p(y_train_qty))

# Predictions come back in log space; undo the transform so the MAE is
# expressed in the original quantity units.
y_pred_quantity_log = eval_regressor.predict(X_val_qty)
y_pred_quantity = np.expm1(y_pred_quantity_log)
mae = mean_absolute_error(y_val_qty, y_pred_quantity)
print(f"Quantity regressor MAE: {mae:.2f}")

In [None]:
# Save models
import os

print("Saving models...")
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs('models/trained_models', exist_ok=True)

# The label encoders are stored next to the models because prediction code
# must encode categorical inputs exactly as they were encoded for training.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
joblib.dump(material_classifier, 'models/trained_models/material_classifier.joblib')
joblib.dump(quantity_regressor, 'models/trained_models/quantity_regressor.joblib')
joblib.dump(label_encoders, 'models/trained_models/label_encoders.joblib')
print("✅ Models saved successfully!")

In [None]:
# Test prediction function
def test_prediction(project_data):
    """Run the trained models on one project and return ranked material suggestions.

    Relies on the module-level ``label_encoders``, ``material_classifier``,
    ``quantity_regressor`` and ``material_mapping`` objects.

    Args:
        project_data: Mapping describing the project. Only the optional
            ``'size'`` entry is read: a label whose first word is Small,
            Medium or Large (any case) selects a building size of 5000,
            25000 or 100000; anything else selects 25000. Every other model
            input uses the fixed defaults below.

    Returns:
        A list (possibly empty) of dicts with keys ``cluster``, ``category``,
        ``probability`` and ``estimated_quantity``, ordered by descending
        probability. At most five entries, each with probability above 0.1.
    """
    # Baseline model inputs.
    # NOTE(review): 'projectType', 'state' and 'city' from project_data are not
    # mapped onto these fields -- confirm whether that is intended.
    model_data = {
        'SIZE_BUILDINGSIZE': 25000,
        'NUMFLOORS': 1,
        'NUMROOMS': 0,
        'NUMBEDS': 0,
        'PROJECT_TYPE': 'Commercial',
        'STATE': 'Maharashtra',
        'CORE_MARKET': 'Enterprise'
    }

    if 'size' in project_data:
        # Match on the leading word only, so the lookup does not depend on how
        # the currency symbol and dash inside the label happen to be encoded.
        size_by_band = {'small': 5000, 'medium': 25000, 'large': 100000}
        band = str(project_data['size']).strip().partition(' ')[0].casefold()
        model_data['SIZE_BUILDINGSIZE'] = size_by_band.get(band, 25000)

    # Prepare features
    input_df = pd.DataFrame([model_data])

    # Encode categoricals with the encoders fitted at training time.
    for col in ['PROJECT_TYPE', 'STATE', 'CORE_MARKET']:
        if col in label_encoders:
            try:
                input_df[col] = label_encoders[col].transform([model_data[col]])
            except ValueError:
                # Value never seen during training.
                # NOTE(review): code 0 is also the code of a real category.
                input_df[col] = 0
        else:
            # No encoder means the column was absent from the training data,
            # where prepare_features filled it with 0; mirror that here rather
            # than passing a raw string to the model.
            input_df[col] = 0

    # Same size bucketing as used for the training features.
    input_df['SIZE_CAT'] = pd.cut(
        input_df['SIZE_BUILDINGSIZE'],
        bins=[0, 5000, 20000, 50000, 100000, float('inf')],
        labels=[0, 1, 2, 3, 4]
    ).astype(float).fillna(2)

    feature_columns = ['SIZE_BUILDINGSIZE', 'NUMFLOORS', 'NUMROOMS', 'NUMBEDS',
                       'PROJECT_TYPE', 'STATE', 'CORE_MARKET', 'SIZE_CAT']
    features = input_df[feature_columns].fillna(0)

    # Predict
    class_probs = material_classifier.predict_proba(features)[0]
    # predict_proba columns follow classifier.classes_, so a column position is
    # not itself a cluster id; translate positions back to the real labels.
    class_labels = material_classifier.classes_
    top_positions = np.argsort(class_probs)[-5:][::-1]  # Top 5, best first

    # The regressor was trained on log1p(quantity); undo the transform.
    quantity_pred = np.expm1(quantity_regressor.predict(features)[0])

    predictions = []
    for position in top_positions:
        prob = class_probs[position]
        if prob <= 0.1:  # Only significant probabilities
            continue
        cluster_id = class_labels[position]
        predictions.append({
            'cluster': cluster_id,
            'category': material_mapping.get(cluster_id, 'Misc'),
            'probability': prob,
            # Heuristic: scale the predicted quantity by probability x 10,
            # with a floor of 1 unit.
            'estimated_quantity': max(1, int(quantity_pred * prob * 10))
        })

    return predictions

# Test with sample project
# The size label and the bullet below previously contained mis-decoded UTF-8
# sequences; they are restored to the intended characters here.
test_project = {
    'projectType': 'Commercial Construction',
    'size': 'Medium (₹1Cr–₹10Cr)',
    'state': 'Maharashtra',
    'city': 'Mumbai'
}

print("Testing prediction...")
test_results = test_prediction(test_project)
print("\nPrediction Results:")
if not test_results:
    # Make an empty result visible instead of printing a bare header.
    print("  (no predictions above the probability threshold)")
for result in test_results:
    print(f"  • {result['category']}: {result['estimated_quantity']} units (confidence: {result['probability']:.2f})")

In [None]:
# Closing summary for the notebook user. The emoji were previously stored as
# mis-decoded UTF-8 byte sequences and printed as garbage characters.
print("\n🎉 Production ML Model Setup Complete!")
print("\n📋 Next Steps:")
print("1. Models are saved in models/trained_models/")
print("2. Your web application can now use these models")
print("3. Run the backend with: python main.py")
print("4. Test the API endpoint: POST /predict")