In [22]:
pip install scikit-learn



In [23]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

In [24]:
import pandas as pd

In [25]:
# Load cannula_data.csv (path is relative to the working directory) into `df`
# and write the frame's text representation to stdout.
df = pd.read_csv(filepath_or_buffer='cannula_data.csv')

print(df)

    No.  Order No.  Article No.   Drawing  Bright Annealing  Sinking  \
0     1   14389927      15323759   360.40            107.40   385.60   
1     2   14389217      15325226   384.70            102.60   387.30   
2     3   14389820      15325218   391.10             99.20   375.60   
3     4   14388649      15325220   383.80            106.60   387.70   
4     5   14388570      15325218   392.60            115.00   396.80   
..  ...        ...           ...      ...               ...      ...   
95   96   14389722      15325221   377.44            177.72   390.23   
96   97   14389945      15325222   384.25            160.63   385.09   
97   98   14389828      15325221   418.41            188.91   413.28   
98   99   14389526      15325222   396.78            196.79   371.83   
99  100   14389578      15325221   347.26            189.11   345.72   

    Electro Fission  Distorted or Not  
0            387.70                 1  
1            390.70                 1  
2            39

In [26]:
# Remove the 'No.' and 'Order No.' columns from `df`.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' makes
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
df = df.drop(columns=['No.', 'Order No.'], errors='ignore')

In [27]:
# Remove the 'Article No. ' column from `df`.
# NOTE: the trailing space in the label is intentional -- it appears to be part
# of the column name as read from the CSV header; do not "tidy" it away.
# Rebinding `df` with errors='ignore' keeps the cell safe to re-run once the
# column has already been dropped (the in-place version raised KeyError).
df = df.drop(columns=['Article No. '], errors='ignore')

In [28]:
# Show the first ten rows of the trimmed frame as the cell's output.
df.head(n=10)

Unnamed: 0,Drawing,Bright Annealing,Sinking,Electro Fission,Distorted or Not
0,360.4,107.4,385.6,387.7,1
1,384.7,102.6,387.3,390.7,1
2,391.1,99.2,375.6,391.0,1
3,383.8,106.6,387.7,394.1,1
4,392.6,115.0,396.8,396.0,1
5,361.3,114.1,383.6,400.0,1
6,395.1,98.9,411.8,402.0,1
7,393.8,117.1,402.4,406.9,1
8,375.0,115.6,403.8,407.1,1
9,387.4,115.2,390.1,407.9,1


In [29]:
# Step 2: Separate features (X) and target (y)
X = df[['Drawing', 'Bright Annealing', 'Sinking', 'Electro Fission']]  # Feature columns
y = df['Distorted or Not']  # Target column (e.g., 1 or 0)

# Step 3: Initialize the model
# A fixed random_state makes the forest (bootstrap samples, feature sub-sampling)
# reproducible, so the scores below are identical on every run; it matches the
# seed used for the splitter and for the other estimators in this notebook.
model = RandomForestClassifier(random_state=42)

# Step 4: Set up K-Fold Cross-Validation
# shuffle=True randomises row order before splitting; random_state pins that shuffle.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Step 5: Perform Cross-Validation
# cross_val_score fits a fresh clone of `model` per fold and returns one accuracy per fold.
scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Step 6: Print the results
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

Cross-validation scores: [1.  1.  1.  1.  1.  1.  0.9 1.  1.  1. ]
Mean accuracy: 0.99


In [30]:
from sklearn.model_selection import StratifiedKFold

# Ten shuffled, seeded folds built by the stratified splitter.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Score the same `model` again, this time on the stratified splits.
stratified_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=skf,
)

print("Stratified cross-validation scores:", stratified_scores)
print("Mean accuracy:", stratified_scores.mean())

Stratified cross-validation scores: [1.  1.  1.  1.  1.  1.  0.9 1.  1.  1. ]
Mean accuracy: 0.99


In [31]:
from sklearn.metrics import accuracy_score

In [32]:
# Seeded random forest; the same object is refit on every training split below.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# One accuracy value per fold, appended in the order kf yields the splits.
fold_accuracies = []

for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, hence .iloc for both features and target.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # fit() returns the estimator itself, so prediction can be chained onto it.
    y_pred = rf_model.fit(X_train, y_train).predict(X_test)

    # Score the held-out rows and record the result.
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    print(f"Fold Accuracy: {accuracy:.2f}")

# Mean of the per-fold accuracies.
average_accuracy = np.mean(fold_accuracies)
print(f"\nAverage Cross-Validation Accuracy: {average_accuracy:.2f}")

Fold Accuracy: 1.00
Fold Accuracy: 1.00
Fold Accuracy: 1.00
Fold Accuracy: 1.00
Fold Accuracy: 1.00
Fold Accuracy: 1.00
Fold Accuracy: 0.90
Fold Accuracy: 1.00
Fold Accuracy: 1.00
Fold Accuracy: 1.00

Average Cross-Validation Accuracy: 0.99


In [33]:
# Collect out-of-fold predictions, one per row of X, in ORIGINAL row order.
# kf was created with shuffle=True, so the test rows of consecutive folds are
# scattered across the dataset. Appending each fold's predictions to a list
# would store them in fold order and pair them with the wrong 'Actual' values
# below; writing them at `test_index` keeps prediction i aligned with row i.
oof_predictions = np.empty(len(y), dtype=y.dtype)

for train_index, test_index in kf.split(X):
    # Positional split of features and target for this fold.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    # Place this fold's predictions at the positions of the rows they belong to.
    oof_predictions[test_index] = y_pred

# Kept as a list, as before, for any later cell that uses `all_predictions`.
all_predictions = list(oof_predictions)

# Convert predictions to a DataFrame (rows now line up with y)
predictions_df = pd.DataFrame({
    'Actual': y,
    'Predicted': all_predictions
})

print(predictions_df.head())

   Actual  Predicted
0       1          1
1       1          1
2       1          1
3       1          1
4       1          1


In [34]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search; seeded so every candidate fit is repeatable.
rf = RandomForestClassifier(random_state=42)

# Candidate values per hyperparameter (3 * 4 * 3 * 3 = 108 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, scored by accuracy with cv=5.
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5)

# Run the search on the full feature matrix and target.
grid_search.fit(X, y)

# Report the winning combination and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.99


In [36]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Fit the grid search model only if it has not been fitted yet.
# The search is expensive (every candidate is fit once per CV fold), and the
# estimator and integer cv are seeded/deterministic, so refitting an already
# fitted search would only repeat the same work.
if not hasattr(grid_search, "best_estimator_"):
    grid_search.fit(X, y)

# Best hyperparameters and the best cross-validation score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Get the detailed results for each parameter combination
results = pd.DataFrame(grid_search.cv_results_)
print("\nDetailed Grid Search Results:\n", results)

# Use the best model (refit on all of X, y by GridSearchCV) to predict and evaluate metrics
best_model = grid_search.best_estimator_

# Predict on the entire dataset.
# NOTE(review): X is the same data the model was trained on, so the metrics
# below are in-sample and optimistic; grid_search.best_score_ is the
# cross-validated estimate. Use a held-out test set for an unbiased figure.
y_pred = best_model.predict(X)

# ROC AUC needs a continuous score, not hard labels: with 0/1 predictions the
# curve collapses to a single threshold. Use the predicted probability of the
# second class in best_model.classes_ (assumes a binary target -- TODO confirm).
y_score = best_model.predict_proba(X)[:, 1]

# Calculate accuracy and other metrics
print("\nAccuracy:", accuracy_score(y, y_pred))
print("\nClassification Report:\n", classification_report(y, y_pred))
print("\nROC AUC Score:", roc_auc_score(y, y_score))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.99

Detailed Grid Search Results:
      mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0         0.479051      0.134425         0.018751        0.010029   
1         0.890840      0.247756         0.021505        0.006089   
2         0.756179      0.151631         0.023623        0.003615   
3         0.246379      0.065486         0.009178        0.002182   
4         0.355813      0.005364         0.013245        0.002955   
..             ...           ...              ...             ...   
103       0.322852      0.019014         0.012747        0.002346   
104       0.465700      0.007366         0.017128        0.004561   
105       0.156396      0.010387         0.007109        0.000628   
106       0.348061      0.064695         0.011949        0.001992   
107       0.731751      0.141762         0.023790        0.005124  