In [None]:
###### RANDOM FOREST ######
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.svm import SVC

file_path = 'your_output_file_features_with_label.csv'

# Number of rows handed back per chunk by the CSV reader
chunk_size = 100000

# With chunksize set, read_csv returns an iterator of DataFrame chunks
# instead of a single DataFrame.
chunks = pd.read_csv(file_path, chunksize=chunk_size, low_memory=False)

# Concatenate every chunk in ONE call. Growing `df` with pd.concat inside a
# loop re-copies all previously accumulated rows on each iteration, which is
# quadratic in the number of chunks. pd.concat raises ValueError if the
# reader yields no chunks at all.
df = pd.concat(chunks)

# Total number of rows in the DataFrame
total_rows = len(df)
print("Total number of rows:", total_rows)

####### separating features and labels ######################
# 'Label' is the target column; every other column is treated as a feature.
y = df['Label']
X = df.drop(columns=['Label'])

########## splitting into train and test sets ###############
# 20% of the rows are held out for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

################ normalisation #################
# The min-max scaler is fitted on the training rows only and then applied
# unchanged to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
############# Random Forest #######################

# Seeded random forest. Note that it is fitted and evaluated on the raw
# X_train / X_test frames, not on the *_scaled arrays.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = rf_classifier.predict(X_test)

# Standard classification scores (default binary averaging)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Specificity is read off the first row of the confusion matrix:
# [0, 0] / ([0, 0] + [0, 1])
conf_matrix = confusion_matrix(y_test, y_pred)
first_row = conf_matrix[0]
specificity = first_row[0] / (first_row[0] + first_row[1])

# Report every score, then the raw matrix
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Specificity:", specificity),
):
    print(label, value)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
############# xgboost ############################
# Seeded gradient-boosted classifier, fitted on the raw (unscaled) features.
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# Class predictions on the testing set
y_pred = xgb_classifier.predict(X_test)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
sensitivity = recall  # sensitivity is just another name for recall

# Specificity comes from the first confusion-matrix row:
# [0, 0] / ([0, 0] + [0, 1])
conf_matrix = confusion_matrix(y_test, y_pred)
first_row = conf_matrix[0]
specificity = first_row[0] / (first_row[0] + first_row[1])

# Label -> value table, printed in insertion order
report = {
    "Accuracy:": accuracy,
    "Precision:": precision,
    "Recall (Sensitivity):": recall,
    "F1 Score:": f1,
    "Specificity:": specificity,
}
for label, value in report.items():
    print(label, value)
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
######## LOGISTIC REGRESSION ########
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn import svm

# NOTE(review): this is the same path an earlier cell already loads;
# consider loading the data once and reusing it.
file_path = 'your_output_file_features_with_label.csv'

# Rows per chunk returned by the CSV reader (adjust as needed)
chunk_size = 100000

# With chunksize set, read_csv yields DataFrame chunks lazily
chunks = pd.read_csv(file_path, chunksize=chunk_size, low_memory=False)

# Build the full frame with a single concat over all chunks. Appending to
# `df` chunk-by-chunk with pd.concat copies the accumulated rows again on
# every iteration (quadratic in the number of chunks). pd.concat raises
# ValueError if the reader yields no chunks.
df = pd.concat(chunks)

# Total number of rows in the DataFrame
total_rows = len(df)
print("Total number of rows:", total_rows)

####### separating features and labels ######################
# Target is the 'Label' column; the remaining columns form the feature matrix.
X, y = df.drop(columns=['Label']), df['Label']

########## splitting into train and test sets ###############
# 80/20 split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

################ normalisation #################
# Standardization statistics are learned from the training rows only and
# reused as-is for the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression with default settings, fitted on the standardized
# training features (fit returns the estimator itself).
clf = LogisticRegression().fit(X_train_scaled, y_train)

# Class predictions for the standardized test features
y_pred = clf.predict(X_test_scaled)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)  # a.k.a. recall / true positive rate
f1 = f1_score(y_test, y_pred)

# Flattened binary confusion matrix is ordered tn, fp, fn, tp
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.flatten()
specificity = tn / (tn + fp)  # true negative rate

# Emit the whole report in one write, one metric per line
print("\n".join([
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Sensitivity/Recall: {sensitivity:.4f}",
    f"F1 Score: {f1:.4f}",
    f"Specificity: {specificity:.4f}",
    "Confusion Matrix:",
]))
print(cm)

In [None]:
######## DECISION TREE (BEST) ########
# Initializing Decision Tree classifier.
# random_state is pinned (same seed as the train/test split and the other
# seeded models in this notebook) so that repeated runs build the same tree;
# without it the reported metrics can differ from run to run.
clf = DecisionTreeClassifier(random_state=42)

# Training the classifier on the standardized training features
clf.fit(X_train_scaled, y_train)

# Making predictions on the standardized testing set
y_pred = clf.predict(X_test_scaled)

# Calculating evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)  # Also known as recall or true positive rate
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# For a binary problem the flattened matrix is ordered tn, fp, fn, tp
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)  # True negative rate

# Printing the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Sensitivity/Recall: {sensitivity:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Specificity: {specificity:.4f}")
print("Confusion Matrix:")
print(cm)

In [None]:
######## Gaussian Naive Bayes ########
# Default Gaussian NB, fitted on the standardized training features
# (fit returns the estimator itself).
clf = GaussianNB().fit(X_train_scaled, y_train)

# Class predictions for the standardized test features
y_pred = clf.predict(X_test_scaled)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)  # a.k.a. recall / true positive rate
f1 = f1_score(y_test, y_pred)

# Unpack the 2x2 confusion matrix row by row: [[tn, fp], [fn, tp]]
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm
specificity = tn / (tn + fp)  # true negative rate

# One print call, one item per line
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Sensitivity/Recall: {sensitivity:.4f}",
    f"F1 Score: {f1:.4f}",
    f"Specificity: {specificity:.4f}",
    "Confusion Matrix:",
    cm,
    sep="\n",
)


In [None]:
######## ANN ########
import pandas as pd
from tensorflow import keras
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn import svm

# NOTE(review): earlier cells already load this same path; consider
# loading the data once and reusing it.
file_path = 'your_output_file_features_with_label.csv'

# Rows per chunk returned by the CSV reader (adjust as needed)
chunk_size = 100000

# With chunksize set, read_csv returns an iterator over DataFrame chunks
chunks = pd.read_csv(file_path, chunksize=chunk_size, low_memory=False)

# Assemble the full frame with one concat over the chunk iterator. Calling
# pd.concat([df, chunk]) inside a loop re-copies everything accumulated so
# far on each pass, which is quadratic in the number of chunks. pd.concat
# raises ValueError if the reader yields no chunks.
df = pd.concat(chunks)

# Total number of rows in the DataFrame
total_rows = len(df)
print("Total number of rows:", total_rows)

####### separating features and labels ######################
# The 'Label' column is the target; all other columns are features.
y = df['Label']
X = df.drop(columns=['Label'])

########## splitting into train and test sets ###############
# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

################ normalisation #################
# Fit the standardizer on the training rows, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the ANN model: two ReLU hidden layers (64, then 32 units) followed
# by a single sigmoid output unit. The input width equals the number of
# feature columns.
layer_stack = [
    keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
]
model = keras.Sequential(layer_stack)

# Compiling the model: Adam optimizer, binary cross-entropy loss, accuracy
# tracked during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the model on the STANDARDIZED training features. The scaler is
# fitted earlier in this cell, but the raw X_train / X_test were being fed to
# the network, so the scaled arrays were never used. 20% of the training
# rows are held back by Keras for validation.
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Predicting on the test set, scaled with the same fitted scaler
y_pred_proba = model.predict(X_test_scaled)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculating evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix, computed once and reused for specificity.
# For a binary problem the flattened matrix is ordered tn, fp, fn, tp.
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)  # true negative rate

# Printing the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Sensitivity:", sensitivity)
print("F1 Score:", f1)
print("Specificity:", specificity)
print("Confusion Matrix:")
print(conf_matrix)