In [11]:
# Cell 1: Libraries (Leave as is)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gcsfs
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, roc_curve, confusion_matrix)

import shap

print("Libraries imported successfully!")


Libraries imported successfully!


In [12]:
# Cell 2: Data Loading (Leave as is)
# Read the predictive-maintenance CSV from Google Cloud Storage into `df`.
bucket_name = "predictivemaintence"
file_path = "predictive_maintenance.csv"
gcs_path = f"gs://{bucket_name}/{file_path}"

try:
    # `fs` is not passed to read_csv below; it is kept because cells outside
    # this one may rely on the handle.
    fs = gcsfs.GCSFileSystem()
    df = pd.read_csv(gcs_path)
    print("Data loaded successfully!")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: File not found at {gcs_path}")
    # Re-raise so execution stops here. Swallowing the error would leave `df`
    # undefined (or, worse, left over from an earlier run of the kernel) and
    # every following cell would fail or silently use stale data.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    # Same reasoning as above: report the problem, then let it propagate.
    raise

Data loaded successfully!
Dataset shape: (10000, 10)


In [13]:
# Cell 3: Initial Exploration & Basic Preprocessing (Leave as is)
# Quick look at the raw frame: schema, first rows, and null count per column.
print("Original Data Info:")
df.info()
print("\nOriginal Data Head:")
print(df.head())
print("\nMissing values before preprocessing:")
print(df.isna().sum())

# 'Target' is the label. It is dropped from the feature frame together with
# 'UDI', 'Product ID' and 'Failure Type'.
y = df['Target']
features_to_drop = ['UDI', 'Product ID', 'Failure Type', 'Target']
X = df.drop(features_to_drop, axis=1)

# Every remaining column other than 'Type' is handled as numerical.
categorical_features = ['Type']
numerical_features = X.columns.drop(categorical_features).tolist()

print("\nFeatures identified:")
for caption, value in (
    ("Categorical:", categorical_features),
    ("Numerical:", numerical_features),
    ("\nTarget variable shape:", y.shape),
    ("Features shape before encoding:", X.shape),
):
    print(caption, value)

Original Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB

Original Data Head:
   UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0    1     M14860    M                298.1                    308.

In [14]:
# Cell 4: Train/Test Split (Leave as is)
# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Data split into Train and Test sets:")
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

Data split into Train and Test sets:
X_train shape: (8000, 6)
X_test shape: (2000, 6)
y_train shape: (8000,)
y_test shape: (2000,)


In [15]:
# Cell 5: Preprocessing Pipeline (Leave as is)
# Standardize the numerical columns and one-hot encode the categorical ones.
# The transformer is fitted on the training split only and then applied,
# unchanged, to the test split.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

try:
    feature_names_out = preprocessor.get_feature_names_out()
    print("\nProcessed feature names:", feature_names_out.tolist())
except AttributeError:
    # Reached when ColumnTransformer has no get_feature_names_out (scikit-learn
    # before 1.0). Those releases also lack OneHotEncoder.get_feature_names_out,
    # so use the older get_feature_names spelling when the new one is missing;
    # otherwise this fallback would raise the very error it is handling.
    fitted_encoder = preprocessor.named_transformers_['cat']
    encoder_names_fn = getattr(fitted_encoder, 'get_feature_names_out', None)
    if encoder_names_fn is None:
        encoder_names_fn = fitted_encoder.get_feature_names
    feature_names_out = numerical_features + \
                        encoder_names_fn(categorical_features).tolist()
    print("\nProcessed feature names (fallback):", feature_names_out)

# Wrap the transformed arrays in DataFrames that keep the original row index,
# so rows can still be traced back to X_train / X_test.
X_train_processed_df = pd.DataFrame(X_train_processed, columns=feature_names_out, index=X_train.index)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=feature_names_out, index=X_test.index)

print("\nShapes after preprocessing:")
print("X_train_processed_df shape:", X_train_processed_df.shape)
print("X_test_processed_df shape:", X_test_processed_df.shape)
print("\nFirst 5 rows of processed training data:")
print(X_train_processed_df.head())



Processed feature names: ['num__Air temperature [K]', 'num__Process temperature [K]', 'num__Rotational speed [rpm]', 'num__Torque [Nm]', 'num__Tool wear [min]', 'cat__Type_H', 'cat__Type_L', 'cat__Type_M']

Shapes after preprocessing:
X_train_processed_df shape: (8000, 8)
X_test_processed_df shape: (2000, 8)

First 5 rows of processed training data:
      num__Air temperature [K]  num__Process temperature [K]  \
4058                  0.998914                      0.604282   
1221                 -1.505194                     -1.153260   
6895                  0.498092                      1.077466   
9863                 -0.553633                     -0.139294   
8711                 -1.455112                     -1.018064   

      num__Rotational speed [rpm]  num__Torque [Nm]  num__Tool wear [min]  \
4058                    -0.460607          0.718305             -0.843997   
1221                    -0.775574          0.638456              0.382263   
6895                    -1.0076

In [10]:
# Cell 6: Baseline Model (Logistic Regression) (Modified to add model saving)
# Fit a logistic regression on the preprocessed training features, evaluate it
# on the held-out test split, and persist the fitted estimator to disk.
print("\n--- Training Baseline Logistic Regression Model ---")
baseline_model = LogisticRegression(random_state=42, max_iter=1000)
baseline_model.fit(X_train_processed_df, y_train)

# Predictions: hard labels for the threshold-based metrics, and the
# probability of class 1 (column 1 of predict_proba) for AUC-ROC.
y_pred_baseline = baseline_model.predict(X_test_processed_df)
y_pred_proba_baseline = baseline_model.predict_proba(X_test_processed_df)[:, 1]

# Evaluation
# NOTE(review): read accuracy together with recall / F1 and the confusion
# matrix; if failures are rare in y_test, accuracy alone can look strong while
# most failures are missed.
print("\n--- Baseline Model Evaluation ---")
print("Accuracy:", accuracy_score(y_test, y_pred_baseline))
print("Precision:", precision_score(y_test, y_pred_baseline))
print("Recall:", recall_score(y_test, y_pred_baseline))
print("F1-score:", f1_score(y_test, y_pred_baseline))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_proba_baseline))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_baseline))

# Persist the fitted model (joblib is imported with the other libraries in
# the import cell at the top of the notebook).
# NOTE(review): only the estimator is written. It was fitted on the output of
# `preprocessor`, so whoever loads this file must apply the same fitted
# preprocessing to raw inputs before calling predict — confirm how the
# preprocessor reaches the consumer.
joblib.dump(baseline_model, 'predictive_maintenance_model.pkl')
print("\nBaseline model saved to predictive_maintenance_model.pkl")


--- Training Baseline Logistic Regression Model ---

--- Baseline Model Evaluation ---
Accuracy: 0.9675
Precision: 0.6363636363636364
Recall: 0.10294117647058823
F1-score: 0.17721518987341772
AUC-ROC: 0.8993880160759956
Confusion Matrix:
 [[1928    4]
 [  61    7]]

Baseline model saved to predictive_maintenance_model.pkl
