In [None]:
#TruongNguyen for De novo Promoters

import pandas as pd
import numpy as np
import csv, os
from numpy import savetxt
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the training dataset.
# NOTE(review): the path below is an empty placeholder — point it at the
# training CSV before running.
df = pd.read_csv('')

# The last column is assumed to hold the label; every other column is a feature.
# (The frame is loaded as `df`; the previous `TSS` name was never defined.)
X_train = df.iloc[:, :-1]         # all columns except the last
y_train = df.iloc[:, -1].copy()   # last column, copied so it is independent of df
# Keep only the feature columns that contain no missing values.
X = X_train.dropna(axis='columns')

# Defining various steps required for the genetic algorithm
def initilization_of_population(size, n_feat):
    """Create the initial population of random boolean feature masks.

    Every chromosome is a boolean array of length ``n_feat`` in which
    ``int(0.3 * n_feat)`` genes are False and the remaining genes are True,
    placed at random positions via ``np.random.shuffle``.

    Parameters
    ----------
    size : int
        Number of chromosomes to create.
    n_feat : int
        Number of genes (one per feature) in each chromosome.

    Returns
    -------
    list of numpy.ndarray
        ``size`` boolean arrays, each of shape ``(n_feat,)``.
    """
    n_excluded = int(0.3 * n_feat)
    population = []
    for _ in range(size):
        # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        chromosome = np.ones(n_feat, dtype=bool)
        chromosome[:n_excluded] = False
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population

def fitness_score(population):
    """Score each chromosome by 5-fold cross-validated Random Forest accuracy.

    Reads the module-level ``X`` (feature DataFrame) and ``y_train`` (label
    Series). For every chromosome, a Random Forest is trained and evaluated on
    the columns the boolean mask selects, once per stratified fold.

    Parameters
    ----------
    population : sequence of boolean arrays
        One feature mask per chromosome, each of length ``X.shape[1]``.

    Returns
    -------
    tuple of seven lists
        ``(scores, population, weights, tp, fp, tn, fn)``, all ordered from the
        highest to the lowest score. ``scores`` is the mean fold accuracy,
        ``weights`` is each score divided by the sum of all scores, and the
        four count lists are confusion-matrix cells summed over the folds.

    Notes
    -----
    The four-value unpacking of the confusion matrix only works for binary
    labels.
    """
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = []
    newtp, newfp, newtn, newfn = [], [], [], []

    for chromosome in population:
        tp, fp, tn, fn, acc = [], [], [], [], []
        for train, test in kfold.split(X, y_train):
            # A fixed seed makes a chromosome's fitness depend on the mask
            # only, not on the forest's random draws.
            model = RandomForestClassifier(random_state=42)
            # ``train``/``test`` are positional indices, so index the labels
            # positionally as well.
            model.fit(X.iloc[train, chromosome], y_train.iloc[train])
            true_labels = np.asarray(y_train.iloc[test])
            predictions = model.predict(X.iloc[test, chromosome])

            # scikit-learn flattens a binary confusion matrix in the order
            # tn, fp, fn, tp.
            ntn, nfp, nfn, ntp = confusion_matrix(true_labels, predictions).ravel()
            tp.append(ntp)
            fp.append(nfp)
            tn.append(ntn)
            fn.append(nfn)
            acc.append(accuracy_score(true_labels, predictions))

        scores.append(np.mean(acc))
        newtp.append(np.sum(tp))
        newfp.append(np.sum(fp))
        newtn.append(np.sum(tn))
        newfn.append(np.sum(fn))

    scores, population = np.array(scores), np.array(population)
    newtp, newfp, newtn, newfn = np.array(newtp), np.array(newfp), np.array(newtn), np.array(newfn)

    # Selection weights proportional to accuracy; fall back to equal weights
    # if every score is zero so the division cannot produce NaN.
    total = np.sum(scores)
    weights = scores / total if total > 0 else np.ones_like(scores)

    # Indices that sort the scores from best to worst.
    order = np.argsort(scores)[::-1]

    return (
        list(scores[order]),
        list(population[order, :]),
        list(weights[order]),
        list(newtp[order]),
        list(newfp[order]),
        list(newtn[order]),
        list(newfn[order]),
    )

def selection(pop_after_fit, weights, k):
    """Pick ``k`` parents by weighted random sampling with replacement.

    Parameters
    ----------
    pop_after_fit : sequence
        Candidate chromosomes.
    weights : sequence of float
        Relative selection weight of each candidate, in the same order.
    k : int
        Number of parents to draw.

    Returns
    -------
    list
        The ``k`` drawn chromosomes (the same objects, not copies).
    """
    return list(random.choices(pop_after_fit, weights=weights, k=k))

def crossover(p1, p2, crossover_rate):
    """Produce two children from two parents by single-point crossover.

    With probability ``crossover_rate`` the parents are cut at one random
    interior point and their tails are swapped; otherwise the children are
    plain copies of the parents. The parents are never modified.

    Parameters
    ----------
    p1, p2 : array-like with ``.copy()``
        Parent chromosomes of equal length.
    crossover_rate : float
        Probability of recombining the pair.

    Returns
    -------
    list
        ``[c1, c2]``, the two children.
    """
    # Children are copies of the parents by default.
    c1, c2 = p1.copy(), p2.copy()
    # The random draw comes first so the random stream is consumed the same
    # way regardless of chromosome length. Chromosomes shorter than 3 genes
    # have no valid interior cut point (randint(1, len - 2) would raise
    # ValueError), so they are returned as copies.
    if random.random() < crossover_rate and len(p1) >= 3:
        # random.randint is inclusive at both ends: the cut is in [1, len - 2].
        pt = random.randint(1, len(p1) - 2)
        c1 = np.concatenate((p1[:pt], p2[pt:]))
        c2 = np.concatenate((p2[:pt], p1[pt:]))
    return [c1, c2]

def mutation(chromosome, mutation_rate):
    """Flip genes of ``chromosome`` in place, each with probability ``mutation_rate``.

    One random number is drawn per gene, in order. Nothing is returned; the
    caller's chromosome is modified directly.
    """
    for position, gene in enumerate(chromosome):
        if random.random() < mutation_rate:
            # Invert the gene that was just read at this position.
            chromosome[position] = not gene

def generations(size, n_feat, crossover_rate, mutation_rate, n_gen):
    """Run the genetic algorithm and record the best chromosome of every generation.

    Each generation is scored with ``fitness_score``. The two best chromosomes
    are carried over unchanged, and the rest of the next population is filled
    with children produced by ``selection``, ``crossover`` and ``mutation``.

    Parameters
    ----------
    size : int
        Population size.
    n_feat : int
        Number of genes per chromosome.
    crossover_rate : float
        Probability that a selected pair is recombined.
    mutation_rate : float
        Per-gene probability of a flip.
    n_gen : int
        Number of generations to run.

    Returns
    -------
    (list, list)
        ``best_chromo[g]`` is the top-scoring chromosome of generation ``g``
        and ``best_score[g]`` is its score.
    """
    n_elite = 2
    n_children = max(size - n_elite, 0)
    # Parents are consumed in pairs. An odd count would leave the last parent
    # without a partner (IndexError), so draw one extra parent and discard the
    # surplus child afterwards.
    n_parents = n_children + (n_children % 2)

    best_chromo = []
    best_score = []
    population_nextgen = initilization_of_population(size, n_feat)

    for gen in range(n_gen):
        # Only the sorted scores, population and weights are needed here.
        scores, pop_after_fit, weights, *_ = fitness_score(population_nextgen)
        score = scores[0]
        print('gen', gen, score)

        pop_after_sel = selection(pop_after_fit, weights, n_parents)

        # Create the next generation from consecutive pairs of parents.
        children = []
        for pair_start in range(0, len(pop_after_sel), 2):
            p1, p2 = pop_after_sel[pair_start], pop_after_sel[pair_start + 1]
            for c in crossover(p1, p2, crossover_rate):
                # crossover returns fresh arrays, so mutating the child in
                # place leaves the parents untouched.
                mutation(c, mutation_rate)
                children.append(c)

        # Elites first, then exactly enough children to restore the population size.
        population_nextgen = list(pop_after_fit[:n_elite]) + children[:n_children]

        best_chromo.append(pop_after_fit[0])
        best_score.append(score)

    return best_chromo, best_score

# Running Genetic Algorithm
# Seed the two random number generators the GA operators draw from
# (np.random for the initial population, `random` for selection, crossover
# and mutation) so repeated runs make the same draws.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

best_chromo, best_score = generations(size=50, n_feat=X.shape[1], crossover_rate=0.8, mutation_rate=0.05, n_gen=500)
print("Best Chromosome:", best_chromo)
print("Best Score:", best_score)

In [None]:
# best_chromo[g] / best_score[g] hold the top chromosome of generation g and
# its score, so index 0 is only the best mask of the random initial
# population. Take the mask from the generation with the highest score.
best_generation = int(np.argmax(best_score))
best_features = best_chromo[best_generation]
# Boolean mask over the columns of X: keep only the selected features.
X_train_best = X.loc[:, best_features]

In [None]:
# Train the final Random Forest on the GA-selected feature columns.
# fit() returns the estimator itself, so construction and fitting can be chained.
final_model = RandomForestClassifier().fit(X_train_best, y_train)
final_model


In [None]:
# Persist the fitted model to disk so it can be reloaded later without retraining.

import joblib

joblib.dump(value=final_model, filename='file_model.joblib')


In [None]:
# Load the test dataset; as with the training file, the last column is taken
# as the label and all other columns as features.
df_test = pd.read_csv('file.csv')

X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]
# Select the model's training columns by name. A positional boolean mask built
# over the NaN-filtered training columns is not guaranteed to line up with the
# raw test columns.
X_test_best = X_test.loc[:, X_train_best.columns]

# Load the final Random Forest model saved earlier.
final_model = joblib.load('file_model.joblib')

predictions = final_model.predict(X_test_best)

# Fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Visualize feature importances
feature_importance = final_model.feature_importances_
sorted_importance = np.argsort(feature_importance)[::-1]  # Sort in descending order
top_features_indices = sorted_importance[:20]  # Select the top 20 features (adjust as needed)

# The importances are ordered like the columns the model was fitted on
# (X_train_best), so the names must come from that frame, not the full X.
top_features_names = X_train_best.columns[top_features_indices]

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.bar(range(len(top_features_indices)), feature_importance[top_features_indices])
plt.xticks(range(len(top_features_indices)), top_features_names, rotation=45)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Top 20 Features Importance')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

# Predicted probability of the class in column 1 of predict_proba
# (treated as the positive class).
y_pred_prob = final_model.predict_proba(X_test_best)[:, 1]

# ROC points and the area under them.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# Draw the curve together with the chance diagonal.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='b', label='ROC curve (AUC = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# The model is already fitted and the test split already loaded, so this cell
# only predicts and tabulates the outcomes.
y_pred = final_model.predict(X_test_best)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

# Render the counts as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Unpack the binary confusion matrix (flattened order: TN, FP, FN, TP).
TN, FP, FN, TP = cm.ravel()

# Derive the summary metrics from the four counts.
total = TP + TN + FP + FN
accuracy = (TP + TN) / total
precision = TP / (TP + FP)
recall = TP / (TP + FN)
specificity = TN / (TN + FP)
f1 = 2 * (precision * recall) / (precision + recall)

# Report each ratio both as a fraction and as a percentage.
for template, value in (
    ("Accuracy: {:.3f} or {:.1f}%", accuracy),
    ("Precision: {:.3f} or {:.1f}%", precision),
    ("Recall (Sensitivity): {:.3f} or {:.1f}%", recall),
    ("Specificity: {:.3f} or {:.1f}%", specificity),
):
    print(template.format(value, value * 100))
print("F1-Score: {:.3f}".format(f1))

# Show the confusion matrix next to the numbers it produced.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
#Precision-Recall Curve:

from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Uses the test-set probabilities `y_pred_prob` computed for the ROC curve.
# Note: this rebinds `precision` and `recall` from scalars to arrays.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, color='blue', label='Precision-Recall Curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

In [None]:
#Learning Curves:
from sklearn.model_selection import learning_curve

# Features: every column but the last, with the same missing-value handling as
# the training features (columns containing NaN are dropped).
X = df.iloc[:, :-1].dropna(axis='columns')
# Label: the last column. A DataFrame needs .iloc for positional slicing;
# `df[:, -1]` is not valid indexing.
y = df.iloc[:, -1]

# NOTE(review): learning_curve is expected to refit copies of the estimator on
# growing subsets, so this describes the model type on X rather than the
# already-fitted final_model — confirm.
train_sizes, train_scores, test_scores = learning_curve(final_model, X, y, cv=5)

# Average the per-fold scores at each training size.
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color='r', label='Training Accuracy')
plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color='g', label='Validation Accuracy')
plt.xlabel('Training Size')
plt.ylabel('Accuracy')
plt.title('Learning Curves')
plt.legend()
plt.show()

In [None]:
# Class Distribution:
import seaborn as sns

# Count the rows per value of the 'ZTA' column of the training frame `df`.
# NOTE(review): 'ZTA' is presumably the label column — confirm against the data.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='ZTA', data=df, ax=ax)
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
plt.show()

In [None]:
# Feature distributions
# Overlay one histogram (with a KDE line) per column of X on a single axes,
# reading the values from the training frame `df`.
fig, ax = plt.subplots(figsize=(10, 6))
for feature in X.columns:
    sns.histplot(df[feature], kde=True, label=feature, ax=ax)
ax.set_xlabel('Feature Values')
ax.set_ylabel('Count')
ax.set_title('Feature Distributions')
ax.legend()
plt.show()

In [None]:
# Feature correlation matrix
# Pairwise correlation between the columns of the training frame `df`.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

# Leave the table as the cell's last expression so it is displayed too.
correlation_matrix

In [None]:
# Display the correlation table (the DataFrame returned by df.corr()) as this cell's output.
correlation_matrix

In [None]:
#Calibration Curve
from sklearn.calibration import calibration_curve

# Bin the test-set probabilities `y_pred_prob` into 10 bins and compare the
# mean predicted probability of each bin with its observed positive fraction.
prob_true, prob_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(prob_pred, prob_true, marker='o', linestyle='--', color='blue', label='Calibration Curve')
# Reference diagonal: predicted probability equals observed frequency.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')
ax.set_xlabel('Mean Predicted Probability')
ax.set_ylabel('Fraction of Positives')
ax.set_title('Calibration Curve')
ax.legend()
plt.show()

In [None]:
#Residual Plots (for regression tasks)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

# Residual = true value minus prediction, using `y_test` and `y_pred` from the
# test-set evaluation. A residual of 0 means the prediction equals the label.
# NOTE(review): the subtraction assumes numeric labels — confirm.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

# Tally how often each distinct residual value occurs.
unique_values, counts = np.unique(residuals, return_counts=True)

# Print one line per distinct residual.
for position, value in enumerate(unique_values):
    count = counts[position]
    print(f"Occurrences of {value}: {count}")