In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so later cells can read from it
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
import pandas as pd

from keras.utils import to_categorical
import os
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

Mounted at /content/drive


In [None]:
def read_data_from_csv(folder_path):
    """Load every ``.csv`` file directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted filename order so the row order of the result is
    the same on every run and every filesystem (``os.listdir`` makes no
    ordering guarantee, which would otherwise leak into any seeded split
    performed on the result).

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan. Only entries whose name ends with ".csv" are read;
        sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no ".csv" files.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".csv")
    )
    if not csv_names:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not say which folder was empty.
        raise ValueError(f"No .csv files found in {folder_path!r}")

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

In [None]:
# Load every CSV found in the training folder into a single DataFrame
train_folder = '/content/drive/My Drive/datasets/train'
combined_data = read_data_from_csv(train_folder)


In [None]:
# Map each distinct value of the 'label' column to an integer code and
# overwrite the column with those codes; `encoder` keeps the mapping.
encoder = LabelEncoder()
label_codes = encoder.fit_transform(combined_data['label'])
combined_data['label'] = label_codes


In [None]:
# Separate the predictors from the target before splitting
feature_frame = combined_data.drop(columns=['label'])
target_series = combined_data['label']

# Hold out 20% of the rows for testing; the split is seeded for
# reproducibility and stratified so both parts keep the label proportions.
(
    X_train_combined,
    X_test_combined,
    y_train_combined,
    y_test_combined,
) = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
    stratify=target_series,
)

In [None]:
# Standardize the features: the scaler's statistics are learned from the
# training split only, then the same transformation is applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_combined)
X_train_combined_scaled = scaler.transform(X_train_combined)
X_test_combined_scaled = scaler.transform(X_test_combined)

In [None]:
# Iteration budget for the logistic-regression solver. With the previous
# limit of 1000 the solver stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT"
# even though the features are already standardized, so the budget is raised.
# NOTE(review): if the convergence warning still appears, raise this further
# or try another solver.
LOGREG_MAX_ITER = 5000

# Define base learners
base_learners = [
    ('naive_bayes', GaussianNB()),
    ('logistic_regression', LogisticRegression(max_iter=LOGREG_MAX_ITER)),
]

# Create voting classifier with soft voting (combines the base learners'
# predicted class probabilities rather than their hard votes)
voting_classifier = VotingClassifier(estimators=base_learners, voting='soft')


In [20]:
# Fit the voting ensemble on the standardized training split
voting_classifier.fit(
    X=X_train_combined_scaled,
    y=y_train_combined,
)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


In [None]:
# Predict a class for every row of the standardized test split
voting_predictions = voting_classifier.predict(X=X_test_combined_scaled)


In [None]:
# Plot confusion matrix, labelling both axes with the original class names
# instead of the integer codes produced by the label encoder.
class_names = [str(name) for name in encoder.classes_]
conf_matrix = confusion_matrix(
    y_test_combined,
    voting_predictions,
    # Pin the row/column order to the encoder's code order (0..n_classes-1) so
    # the tick labels below always line up, even if a class is absent from
    # the test split.
    labels=list(range(len(class_names))),
)
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
# plt.title('Confusion Matrix')
plt.show()

In [None]:
# Share of test rows whose predicted class matches the true class
voting_accuracy = accuracy_score(
    y_true=y_test_combined,
    y_pred=voting_predictions,
)

# Show the score
print("Voting Classifier Accuracy:", voting_accuracy)


In [None]:
# Print classification report, showing the original class names rather than
# the integer codes produced by the label encoder.
class_names = [str(name) for name in encoder.classes_]
print("Classification Report for Voting Classifier:")
print(
    classification_report(
        y_test_combined,
        voting_predictions,
        # Explicit labels keep target_names aligned with the encoder's codes
        # even if some class does not occur in the test split.
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
)