In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_fscore_support, precision_recall_curve
import matplotlib.pyplot as plt 

In [26]:
df = pd.read_csv('predictive_maintenance_dataset.csv')

In [27]:
df.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [28]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 12 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   date     124494 non-null  object
 1   device   124494 non-null  object
 2   failure  124494 non-null  int64 
 3   metric1  124494 non-null  int64 
 4   metric2  124494 non-null  int64 
 5   metric3  124494 non-null  int64 
 6   metric4  124494 non-null  int64 
 7   metric5  124494 non-null  int64 
 8   metric6  124494 non-null  int64 
 9   metric7  124494 non-null  int64 
 10  metric8  124494 non-null  int64 
 11  metric9  124494 non-null  int64 
dtypes: int64(10), object(2)
memory usage: 11.4+ MB


In [29]:
print(df.isnull().sum())


date       0
device     0
failure    0
metric1    0
metric2    0
metric3    0
metric4    0
metric5    0
metric6    0
metric7    0
metric8    0
metric9    0
dtype: int64


In [30]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['device'] = encoder.fit_transform(df['device'])


In [31]:
df['date'] = pd.to_datetime(df['date'])
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df.drop('date', axis=1, inplace=True)


In [32]:
from sklearn.model_selection import train_test_split
X = df.drop('failure', axis=1)
y = df['failure']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)


ModuleNotFoundError: No module named 'imblearn'

In [13]:
# Find duplicate rows in the dataset
duplicates = df[df.duplicated()]
print(duplicates)


             date    device  failure    metric1  metric2  metric3  metric4  \
101335  7/10/2015  S1F0R4Q8        0  192721392        0        0        0   

        metric5  metric6  metric7  metric8  metric9  device_type_S1  \
101335        8   213700        0        0        0            True   

        device_type_W1  device_type_Z1  device_type_S1  device_type_W1  \
101335           False           False            True           False   

        device_type_Z1  
101335           False  


In [14]:
# Count the total number of duplicate rows
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")


Number of duplicate rows: 1


In [15]:
# Drop duplicate rows
cleaned_dataset = df.drop_duplicates()
print(cleaned_dataset)


             date    device  failure    metric1  metric2  metric3  metric4  \
0        1/1/2015  S1F01085        0  215630672       55        0       52   
1        1/1/2015  S1F0166B        0   61370680        0        3        0   
2        1/1/2015  S1F01E6Y        0  173295968        0        0        0   
3        1/1/2015  S1F01JE0        0   79694024        0        0        0   
4        1/1/2015  S1F01R2B        0  135970480        0        0        0   
...           ...       ...      ...        ...      ...      ...      ...   
124489  11/2/2015  Z1F0MA1S        0   18310224        0        0        0   
124490  11/2/2015  Z1F0Q8RT        0  172556680       96      107        4   
124491  11/2/2015  Z1F0QK05        0   19029120     4832        0        0   
124492  11/2/2015  Z1F0QL3N        0  226953408        0        0        0   
124493  11/2/2015  Z1F0QLC1        0   17572840        0        0        0   

        metric5  metric6  metric7  metric8  metric9  device_typ

In [16]:
# Check duplicates based on specific columns
duplicates = df[df.duplicated(subset=['date', 'device'])]
print(duplicates)


             date    device  failure    metric1  metric2  metric3  metric4  \
101335  7/10/2015  S1F0R4Q8        0  192721392        0        0        0   

        metric5  metric6  metric7  metric8  metric9  device_type_S1  \
101335        8   213700        0        0        0            True   

        device_type_W1  device_type_Z1  device_type_S1  device_type_W1  \
101335           False           False            True           False   

        device_type_Z1  
101335           False  


In [17]:
# Drop duplicate rows from the dataset
df = df.drop_duplicates()

# Verify the updated dataset
print(df)


             date    device  failure    metric1  metric2  metric3  metric4  \
0        1/1/2015  S1F01085        0  215630672       55        0       52   
1        1/1/2015  S1F0166B        0   61370680        0        3        0   
2        1/1/2015  S1F01E6Y        0  173295968        0        0        0   
3        1/1/2015  S1F01JE0        0   79694024        0        0        0   
4        1/1/2015  S1F01R2B        0  135970480        0        0        0   
...           ...       ...      ...        ...      ...      ...      ...   
124489  11/2/2015  Z1F0MA1S        0   18310224        0        0        0   
124490  11/2/2015  Z1F0Q8RT        0  172556680       96      107        4   
124491  11/2/2015  Z1F0QK05        0   19029120     4832        0        0   
124492  11/2/2015  Z1F0QL3N        0  226953408        0        0        0   
124493  11/2/2015  Z1F0QLC1        0   17572840        0        0        0   

        metric5  metric6  metric7  metric8  metric9  device_typ

In [18]:
df['device_type'] = df['device'].str[:2]

# One-hot encoding for device type
df = pd.get_dummies(df, columns=['device_type'])

In [19]:
X = df[['metric1', 'metric2', 'metric3', 'metric4', 'metric5', 'metric6', 'metric7', 'metric8', 'metric9'] + list(df.filter(like='device_type').columns)]
y = df['failure']

In [20]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
from collections import Counter
# Verify balance
print("Original dataset shape:", Counter(y))
print("Resampled dataset shape:", Counter(y_train))


Original dataset shape: Counter({0: 124387, 1: 106})
Resampled dataset shape: Counter({0: 99504, 1: 90})


In [23]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}


In [None]:
from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestClassifier(random_state=42)

# Set up Randomized Search
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=10, cv=3, scoring='roc_auc', n_jobs=-1, random_state=42)

# Fit the model
random_search.fit(X_train_balanced, y_train_balanced)

# Best parameters
print(random_search.best_params_)

NameError: name 'RandomizedSearchCV' is not defined

In [11]:
rf_best = random_search.best_estimator_
rf_best.fit(X_train_balanced, y_train_balanced)
y_pred = rf_best.predict(X_test)

NameError: name 'random_search' is not defined

In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report:")
print(classification_report(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_pred)
print("AUC-ROC Score:", auc_score)

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(f"Precision: {precision}, Recall: {recall}, F1-Score: {f1}")


In [None]:

# Feature importance
importances = rf_best.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()
