In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
ds=pd.read_csv("modified_dataset.csv")
mean_value=ds['4000000'].mean()
ds['4000000'].fillna(value=mean_value, inplace=True)
ds.isnull().values.any()

x = ds.drop('0.33', axis=1)
y=ds['0.33']

# replace the infinite number with a biggest finite number.
x.replace([np.inf, -np.inf], np.finfo(np.float64).max, inplace=True)
# convert to 0 and 1 all data of the dataset
scaler = MinMaxScaler() 
x_scaled = scaler.fit_transform(x)

#Recursive Feature Elimination (RFE)
rfe = RFE(LogisticRegression())
rfe.n_features_to_select = 25
x_scaled = rfe.fit_transform(x, y)

# All selected feature column is stored.
selected_feature_names = x.columns[rfe.support_] 
x = ds[selected_feature_names]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ds['4000000'].fillna(value=mean_value, inplace=True)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the docu

**Modify After Here**

In [4]:
print(selected_feature_names)

Index(['3.0', '0.0.3', '3.2', '3.3', '3.0.1', '3.4', '3.5', '0.5', '0.0.5',
       '0.0.6', '0.6', '0.7', '666666.6667.1', '0.0.7', '6.3', '0.0.10',
       '12.1', '0.28', '33', '-1', '0.0.11', '0.29', '0.30', '0.0.14', '0.32'],
      dtype='object')


In [5]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

# Define classifiers
classifiers = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier()
}

In [6]:
# Dictionary to store predictions for each classifier
predictions_ds = {}

# Function to evaluate and print metrics
def evaluate_model(name, model, x_test, y_test):
    predictions = model.predict(x_test)
    predictions_ds[name] = predictions

    # Calculate evaluation scores
    accuracy = accuracy_score(y_test, predictions)
    recall = recall_score(y_test, predictions, average='macro')
    precision = precision_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')

    # Print evaluation scores
    print(f"Metrics for {name}")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"Recall   : {recall:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"F1 Score : {f1:.4f}")
    print("------------------------")

# Initial performance before tuning
print("Performance before tuning:")
for name, classifier in classifiers.items():
    classifier.fit(x_train, y_train.ravel())
    evaluate_model(name, classifier, x_test, y_test)

# Hyperparameter tuning for each classifier
param_grids = {
    'Naive Bayes': {'var_smoothing': [1e-09, 1e-08, 1e-07]},
    'Logistic Regression': {'C': [0.1, 1.0, 10.0]},
    'Decision Tree': {'max_depth': [None, 5, 10]}
}

best_classifiers = {}

# Perform GridSearchCV for each classifier
print("\nPerformance after individual model tuning:")
for name, classifier in classifiers.items():
    grid_search = GridSearchCV(classifier, param_grids[name], cv=5)
    grid_search.fit(x_train, y_train)
    best_classifiers[name] = grid_search.best_estimator_

    # Evaluate the best model
    evaluate_model(f"{name} (Tuned)", best_classifiers[name], x_test, y_test)

# Voting Classifier with tuned estimators
eclf1 = VotingClassifier(estimators=[('Gaussian', best_classifiers['Naive Bayes']),
                                     ('Decision Tree', best_classifiers['Decision Tree']),
                                     ('Logistic', best_classifiers['Logistic Regression'])], 
                         voting='hard')
eclf1.fit(x_train, y_train)

# Evaluate the voting classifier with tuned models
print("After hard voting with tuned models.")
evaluate_model("Voting Classifier (Tuned)", eclf1, x_test, y_test)

# Hyperparameter tuning for VotingClassifier
voting_param_grid = {
    'Logistic__C': [0.1, 1.0, 10.0], 
    'DecisionTree__max_depth': [None, 5, 10],
    'Gaussian__var_smoothing': [1e-09, 1e-08, 1e-07],  
    'voting': ['hard', 'soft']
}

grid_search_voting = GridSearchCV(eclf1, voting_param_grid, cv=5)
grid_search_voting.fit(x_train, y_train)

best_voting_params = grid_search_voting.best_params_

# Update models with the best parameters
best_classifiers['Naive Bayes'].set_params(**{'var_smoothing': best_voting_params['Gaussian__var_smoothing']})
best_classifiers['Decision Tree'].set_params(**{'max_depth': best_voting_params['DecisionTree__max_depth']})
best_classifiers['Logistic Regression'].set_params(**{'C': best_voting_params['Logistic__C']})

# Re-create the VotingClassifier with tuned parameters
eclf1 = VotingClassifier(estimators=[('Gaussian', best_classifiers['Naive Bayes']),
                                     ('Decision Tree', best_classifiers['Decision Tree']),
                                     ('Logistic', best_classifiers['Logistic Regression'])], 
                         voting=best_voting_params['voting'])

eclf1.fit(x_train, y_train)

# Evaluate the final voting classifier
print("\nPerformance after hyperparameter tuning for Voting Classifier:")
evaluate_model("Voting Classifier (Final Tuned)", eclf1, x_test, y_test)


Performance before tuning:


  classifier.fit(x_train, y_train.ravel())


Metrics for Naive Bayes
Accuracy : 0.7714
Recall   : 0.7379
Precision: 0.8538
F1 Score : 0.7384
------------------------


  classifier.fit(x_train, y_train.ravel())
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  classifier.fit(x_train, y_train.ravel())


Metrics for Logistic Regression
Accuracy : 0.9532
Recall   : 0.9477
Precision: 0.9588
F1 Score : 0.9519
------------------------
Metrics for Decision Tree
Accuracy : 0.9996
Recall   : 0.9996
Precision: 0.9996
F1 Score : 0.9996
------------------------

Performance after individual model tuning:
Metrics for Naive Bayes (Tuned)
Accuracy : 0.7714
Recall   : 0.7379
Precision: 0.8538
F1 Score : 0.7384
------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Metrics for Logistic Regression (Tuned)
Accuracy : 0.9491
Recall   : 0.9442
Precision: 0.9532
F1 Score : 0.9478
------------------------
Metrics for Decision Tree (Tuned)
Accuracy : 0.9996
Recall   : 0.9996
Precision: 0.9996
F1 Score : 0.9996
------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


After hard voting with tuned models.
Metrics for Voting Classifier (Tuned)
Accuracy : 0.9590
Recall   : 0.9531
Precision: 0.9658
F1 Score : 0.9578
------------------------


ValueError: Invalid parameter 'DecisionTree' for estimator VotingClassifier(estimators=[('Gaussian', GaussianNB()),
                             ('Decision Tree', DecisionTreeClassifier()),
                             ('Logistic',
                              LogisticRegression(C=10.0, max_iter=1000))]). Valid parameters are: ['estimators', 'flatten_transform', 'n_jobs', 'verbose', 'voting', 'weights'].