In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split , GridSearchCV, learning_curve
from sklearn.tree import DecisionTreeClassifier
import mlflow
from xgboost import XGBClassifier
from mlflow.tracking import MlflowClient
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score

In [2]:
data = pd.read_csv('../data/iris.csv')
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
data['species'].value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [24]:
encoder = LabelEncoder()

x = data.drop(columns=['species'])
y = data['species']

# encoding the target variable
y_encoded =  encoder.fit_transform(y)

# scaling other variables 
scaler = StandardScaler()
X = scaler.fit_transform(x)

# splitting into train and test data 


X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [27]:
x_df = pd.DataFrame(X, columns=x.columns)
y_df = pd.DataFrame(y_encoded, columns=['species'])

final_df = pd.concat([x_df, y_df], axis=1)
final_df.to_csv("../data/processed/train.csv", index=False)
print("Saved preprocessed Iris dataset to data/processed/train.csv")

Saved preprocessed Iris dataset to data/processed/train.csv


In [6]:
xgb_model = XGBClassifier(
    objective='multi:softmax',  
    num_class=3,              
    eval_metric='mlogloss',    
    random_state=42
)

#Hyperparameter tuning
param_grid = {
    'n_estimators': [1,2,3],
    'max_depth': [1,2,3],
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc_ovr', 
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

#Predictions
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1] 

accuracy = accuracy_score(y_test, y_pred)


Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [7]:
print(accuracy) # my bro is overfitting like hellll

1.0


In [11]:
#DICISION TREE CLASSIFIER
dtc = DecisionTreeClassifier(random_state=42)


param_grid = {
    'max_depth': [1,2] #why complicate things for a baby dataset
}

#hyperparameter tuning
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5, n_jobs=-1, return_train_score=True ,verbose=1)
grid_search.fit(X_train, y_train)

#Getting the best parameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

#getting the best model
best_dtc = grid_search.best_estimator_

#getting Predict
y_train_pred = best_dtc.predict(X_train)
y_test_pred = best_dtc.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

#Confusion-Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", conf_matrix)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Hyperparameters: {'max_depth': 2}
Training Accuracy: 0.9500
Testing Accuracy: 0.9667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]


In [17]:


# Set the tracking URI explicitly
mlflow.set_tracking_uri("http://127.0.0.1:5000")

experiment_name = 'MLOps Experiment'
client = MlflowClient()

# Now try to get or create the experiment
experiment = client.get_experiment_by_name(experiment_name)

if experiment is None:
    experiment_id = client.create_experiment(experiment_name)
    print(f"Created experiment with ID: {experiment_id}")
else:
    experiment_id = experiment.experiment_id
    print(f"Found existing experiment with ID: {experiment_id}")

Created experiment with ID: 732826989895835585


In [18]:
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_metric("accuracy", train_accuracy)
    mlflow.sklearn.log_model(best_dtc, 'Dicision Tree Classifier')
    print(f"✅ Accuracy: {train_accuracy:.4f}")



✅ Accuracy: 0.9500
🏃 View run secretive-steed-416 at: http://127.0.0.1:5000/#/experiments/732826989895835585/runs/adde8bc6313f4d4ea944771095ef287d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/732826989895835585


In [19]:
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(best_model, 'XGBClassifier')
    print(f"✅ Accuracy: {accuracy:.4f}")



✅ Accuracy: 1.0000
🏃 View run trusting-gnu-744 at: http://127.0.0.1:5000/#/experiments/732826989895835585/runs/9712efd09b904c39915d6cc13d1fb4da
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/732826989895835585
