<a href="https://colab.research.google.com/github/widyamelia26/Genetic-Algorithm_Feature-Selection/blob/main/GA_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Final Project Sistem Terinspirasi Makhluk Hidup**


**Topik**: Penerapan Genetic Algorithms untuk Seleksi Fitur pada Pemodelan Support Vector Machine (Studi Kasus: Faktor Risiko Diabetes)


**Nama Kelompok 2**:
- Alifya Aisyah Ariyanto 23/528743/PPA/06685
- Alvendra Mahardika 24/547263/PPA/06876
- Ismi Nurul Na’imah 24/541118/PPA/06824
- Widya Amelia Putri 24/547828/PPA/06910


**Dosen**	:	Aina Musdholifah, S.Kom., M.Kom., Ph.D.

---

## Pemodelan SVM Tanpa GA

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the dataset from a CSV file expected in the working directory
diabetes_data = pd.read_csv('diabetes_risk_data.csv')

# Separate features and target: 'class_Positive' is the label column,
# every other column is used as a feature
X = diabetes_data.drop(columns=['class_Positive'])
y = diabetes_data['class_Positive']

# Split the data into training and test sets (80% train, 20% test);
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data: the scaler is fitted on the training split only and
# then applied unchanged to the test split
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames that carry the original column
# names (the GA code below selects columns from these frames by position)
X_train_scale_df = pd.DataFrame(X_train_scale, columns=X.columns)
X_test_scale_df = pd.DataFrame(X_test_scale, columns=X.columns)

# Define the parameter grid for each kernel: the outer key is the SVC kernel
# name, the inner dict is the hyper-parameter grid handed to GridSearchCV
param_grid = {
    'linear': {'C': [0.1, 1, 10, 100]},
    'rbf': {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
    'sigmoid': {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
    'poly': {'C': [0.1, 1, 10, 100], 'degree': [2, 3, 4], 'gamma': ['scale', 'auto']}
}

# Function to perform grid search for a specific kernel
def perform_grid_search(kernel, param_grid):
    """Tune an SVC with the given kernel and score the winner on the test split.

    Runs a 5-fold, accuracy-scored grid search on the module-level scaled
    training data (``X_train_scale`` / ``y_train``), then evaluates the best
    estimator on the module-level scaled test data (``X_test_scale`` /
    ``y_test``).

    Args:
        kernel: Kernel name passed to ``SVC(kernel=...)``.
        param_grid: Hyper-parameter grid for that kernel, in the format
            accepted by ``GridSearchCV``.

    Returns:
        A ``(best_params, accuracy)`` tuple: the parameter combination chosen
        by the search and the test-set accuracy of the corresponding model.
    """
    search = GridSearchCV(SVC(kernel=kernel), param_grid, cv=5, scoring='accuracy')
    search.fit(X_train_scale, y_train)

    # Score the refitted best model on the held-out test split.
    test_predictions = search.best_estimator_.predict(X_test_scale)
    test_accuracy = accuracy_score(y_test, test_predictions)

    return search.best_params_, test_accuracy

# Evaluate each kernel: run one grid search per entry of param_grid and keep
# the chosen hyper-parameters together with the resulting test accuracy
results = {}
for kernel, params in param_grid.items():
    best_params, accuracy = perform_grid_search(kernel, params)
    results[kernel] = {'Best Params': best_params, 'Accuracy': accuracy}

# Convert results dictionary to a DataFrame (one row per kernel); the kernel
# name starts out as the index and is moved into a regular 'Kernel' column
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.reset_index(inplace=True)
results_df.rename(columns={'index': 'Kernel'}, inplace=True)

# Display the DataFrame
print(results_df)


    Kernel                               Best Params  Accuracy
0   linear                                 {'C': 10}  0.932692
1      rbf                   {'C': 10, 'gamma': 0.1}  0.980769
2  sigmoid                  {'C': 10, 'gamma': 0.01}  0.913462
3     poly  {'C': 10, 'degree': 3, 'gamma': 'scale'}  0.961538


## Implementasi GA

In [None]:
import numpy as np
import random

class GeneticAlgorithm:
    """Genetic algorithm that searches for a feature subset for an SVM.

    A chromosome is a 1-D float array of length ``n_features`` holding 0/1
    flags; a 1 means the column at that position is given to the SVM.  The
    population is evolved with tournament selection, two-point crossover and
    bit-flip mutation, and the worst individuals of each generation are
    replaced by the offspring.

    The fitness function reads the module-level names ``y_train``, ``y_test``,
    ``SVC`` and ``accuracy_score`` and, unless frames are passed explicitly,
    the module-level ``X_train_scale_df`` / ``X_test_scale_df``.

    NOTE(review): fitness is measured on the *test* split, so the selected
    features are tuned against the same data later used to report accuracy.
    Consider scoring on a validation split or with cross-validation instead.
    """

    # Chromosomes that select fewer features than this get a fitness of 0 and
    # are repaired by the mutation operator.
    MIN_FEATURES = 3

    def __init__(self, X, y, kernel_params, pop_size, p_cross, p_mut, mating_pool_size):
        """Store the GA settings and create the initial population.

        Args:
            X: Feature matrix; only ``X.shape[1]`` (the number of features)
                is used by the algorithm.
            y: Target values; stored on the instance but not read by the
                fitness function, which uses the module-level train/test
                labels.
            kernel_params: Keyword arguments forwarded to ``SVC(...)``.
            pop_size: Number of chromosomes in the population.
            p_cross: Probability that a pair of parents is crossed over.
            p_mut: Per-gene probability of a bit flip during mutation.
            mating_pool_size: Number of parents selected per generation.
        """
        self.X = X
        self.y = y
        self.n_features = X.shape[1]
        self.kernel_params = kernel_params
        self.pop_size = pop_size
        self.p_cross = p_cross
        self.p_mut = p_mut
        self.mating_pool_size = mating_pool_size
        # Two best-fitness values closer than epsilon count as "no change".
        self.epsilon = 1e-4
        self.population = self.initialize_population()
        self.best_fitness = 0
        self.prev_best_fitness = 0
        self.fitness_history = []
        self.no_improvement_counter = 0

    def initialize_population(self):
        """Return a ``(pop_size, n_features)`` array of random chromosomes.

        Each chromosome selects between ``MIN_FEATURES`` and
        ``n_features // 2 - 1`` features.  When the dataset is too small for
        that range to exist, exactly ``min(MIN_FEATURES, n_features)``
        features are selected instead of letting ``randint`` raise.
        """
        upper = self.n_features // 2
        population = []
        for _ in range(self.pop_size):
            chromosome = np.zeros(self.n_features)
            if upper > self.MIN_FEATURES:
                n_selected = np.random.randint(self.MIN_FEATURES, upper)
            else:
                # randint(low, high) requires low < high; fall back to the
                # smallest valid subset size for narrow datasets.
                n_selected = min(self.MIN_FEATURES, self.n_features)
            selected_features = np.random.choice(self.n_features, n_selected, replace=False)
            chromosome[selected_features] = 1
            population.append(chromosome)
        return np.array(population)

    def fitness_function(self, chromosome, X_train_scale_df=None, X_test_scale_df=None):
        """Score a chromosome as SVM test accuracy minus a small size penalty.

        Args:
            chromosome: 0/1 array marking the selected feature columns.
            X_train_scale_df: Scaled training features as a DataFrame.  When
                omitted, the module-level ``X_train_scale_df`` is looked up at
                call time.
            X_test_scale_df: Scaled test features as a DataFrame.  When
                omitted, the module-level ``X_test_scale_df`` is looked up at
                call time.

        Returns:
            ``0`` if fewer than ``MIN_FEATURES`` features are selected,
            otherwise ``accuracy - 0.001 * (n_selected / n_features)``.
        """
        n_selected = int(np.count_nonzero(chromosome == 1))
        if n_selected < self.MIN_FEATURES:
            return 0

        # Resolve the defaults lazily: binding the frames in the signature
        # would freeze whatever objects existed when the class was defined.
        if X_train_scale_df is None:
            X_train_scale_df = globals()['X_train_scale_df']
        if X_test_scale_df is None:
            X_test_scale_df = globals()['X_test_scale_df']

        selected_feature_indices = np.flatnonzero(chromosome == 1)
        X_train_scaled = X_train_scale_df.iloc[:, selected_feature_indices].values
        X_test_scaled = X_test_scale_df.iloc[:, selected_feature_indices].values

        svm = SVC(**self.kernel_params)
        svm.fit(X_train_scaled, y_train)

        y_pred = svm.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        # Mild pressure towards smaller subsets: at most 0.001 when every
        # feature is selected.
        feature_penalty = 0.001 * (n_selected / self.n_features)
        return accuracy - feature_penalty

    def tournament_selection(self, tournament_size=4, fitness_values=None):
        """Pick one parent by tournament.

        Args:
            tournament_size: Number of contestants drawn (with replacement)
                from the population.
            fitness_values: Optional fitness of every individual in the
                current population, indexed like ``self.population``.  When
                given, contestants are compared using these values instead of
                retraining an SVM per contestant.

        Returns:
            A copy of the fittest contestant's chromosome.
        """
        tournament_idx = np.random.choice(self.pop_size, tournament_size)
        if fitness_values is None:
            scores = [self.fitness_function(self.population[i]) for i in tournament_idx]
        else:
            scores = np.asarray(fitness_values)[tournament_idx]
        winner = tournament_idx[int(np.argmax(scores))]
        # Copy so the parent is not a view that changes when population rows
        # are overwritten during replacement.
        return self.population[winner].copy()

    def two_point_crossover(self, parent1, parent2):
        """Swap one contiguous gene segment between two parents.

        Parents shorter than four genes are returned unchanged.

        Returns:
            A ``(child1, child2)`` tuple of new arrays.
        """
        if len(parent1) < 4:
            return parent1, parent2

        point1, point2 = sorted(np.random.choice(len(parent1) - 1, 2, replace=False))
        # Make the upper cut exclusive so the segment has at least two genes.
        point2 += 1

        child1 = np.copy(parent1)
        child2 = np.copy(parent2)

        child1[point1:point2] = parent2[point1:point2]
        child2[point1:point2] = parent1[point1:point2]

        return child1, child2

    def bit_flip_mutation(self, individual):
        """Flip each gene with probability ``p_mut`` and repair tiny subsets.

        If the mutated chromosome selects fewer than ``MIN_FEATURES``
        features, randomly chosen unselected genes are switched on until the
        minimum is reached (when enough unselected genes exist).

        Returns:
            A mutated copy; ``individual`` itself is not modified.
        """
        mutated = np.copy(individual)
        flip_mask = np.random.random(len(mutated)) < self.p_mut
        mutated[flip_mask] = 1 - mutated[flip_mask]

        n_selected = int(mutated.sum())
        if n_selected < self.MIN_FEATURES:
            n_needed = self.MIN_FEATURES - n_selected
            zero_indices = np.flatnonzero(mutated == 0)
            if len(zero_indices) >= n_needed:
                to_flip = np.random.choice(zero_indices, n_needed, replace=False)
                mutated[to_flip] = 1

        return mutated

    def evolve(self):
        """Advance the population by one generation.

        Returns:
            ``True`` once the best fitness has changed by less than
            ``epsilon`` for five consecutive generations, else ``False``.
        """
        # The population is not modified until the replacement step, so one
        # fitness evaluation per individual serves history, selection and
        # replacement alike.
        fitness_values = np.array(
            [self.fitness_function(ind) for ind in self.population], dtype=float
        )
        current_best_fitness = float(fitness_values.max())

        # Update fitness history
        self.fitness_history.append(current_best_fitness)

        # Create mating pool
        mating_pool = [
            self.tournament_selection(fitness_values=fitness_values)
            for _ in range(self.mating_pool_size)
        ]

        # Perform crossover and mutation on consecutive pairs; an unpaired
        # last parent is dropped.
        offspring = []
        for parent1, parent2 in zip(mating_pool[0::2], mating_pool[1::2]):
            if np.random.random() < self.p_cross:
                child1, child2 = self.two_point_crossover(parent1, parent2)
                offspring.extend([
                    self.bit_flip_mutation(child1),
                    self.bit_flip_mutation(child2)
                ])
            else:
                # Without crossover the parents pass through unmutated.
                offspring.extend([parent1, parent2])

        # Replace worst individuals (steady-state update): offspring overwrite
        # the population slots in ascending order of fitness.
        worst_first = np.argsort(fitness_values)
        for slot, child in zip(worst_first, offspring):
            self.population[slot] = child

        # Check convergence with epsilon
        if abs(current_best_fitness - self.best_fitness) < self.epsilon:
            self.no_improvement_counter += 1
        else:
            self.no_improvement_counter = 0
            self.best_fitness = current_best_fitness

        # Return whether converged
        return self.no_improvement_counter >= 5

    def run(self, max_generations=100):
        """Evolve until convergence or until ``max_generations`` is reached.

        Returns:
            A ``(best_chromosome, best_fitness, generation)`` tuple describing
            the fittest individual of the final population and the number of
            generations executed.
        """
        generation = 0
        converged = False

        while not converged and generation < max_generations:
            converged = self.evolve()
            generation += 1

        # Get best solution
        fitness_values = [self.fitness_function(ind) for ind in self.population]
        best_idx = int(np.argmax(fitness_values))
        best_chromosome = self.population[best_idx]
        best_fitness = fitness_values[best_idx]

        # Report the fitness that is actually returned; self.best_fitness is
        # only refreshed when the change exceeds epsilon and can lag behind.
        print(f"Generation {generation}, Best Fitness: {best_fitness:.6f}")
        return best_chromosome, best_fitness, generation

def run_experiment(X, y, kernel_name, kernel_params, pop_sizes, p_cross_values, feature_names):
    """Run the GA over a grid of settings for one kernel and report the best.

    For every population size, a range of mutation probabilities
    (``1 / (16 * pop_size)`` up to, but excluding, ``0.0625`` in steps of
    ``0.025``) is combined with every crossover probability; one
    ``GeneticAlgorithm`` run is executed per combination with a mating pool
    of ``pop_size // 2`` parents.

    Args:
        X: Feature matrix forwarded to ``GeneticAlgorithm``.
        y: Target values forwarded to ``GeneticAlgorithm``.
        kernel_name: Label used in the printed report and result records.
        kernel_params: Keyword arguments for ``SVC`` forwarded to the GA.
        pop_sizes: Iterable of population sizes to try.
        p_cross_values: Iterable of crossover probabilities to try.
        feature_names: Sequence of feature names, ordered like the columns
            of ``X``.

    Returns:
        A ``(results, best_config, best_features)`` tuple: the list of
        per-run result dicts, the dict of the best run, and the names of the
        features that run selected.

    Raises:
        ValueError: If the grid is empty, so no GA run was executed.
    """
    results = []
    # Accept any indexable sequence (list, pandas Index, ndarray) of names.
    feature_names = np.asarray(feature_names)

    # Variables to track best results for this kernel.  Start below any
    # possible fitness so the first run is always recorded, even if it
    # scored 0.
    best_accuracy = float('-inf')
    best_config = None
    best_features = None

    for pop_size in pop_sizes:
        p_mut_value = np.arange(1/(16 * pop_size), 0.0625, 0.025)
        mating_pool_size = pop_size // 2
        for p_mut in p_mut_value:
            for p_cross in p_cross_values:
                print(f"\nRunning with pop_size={pop_size}, p_cross={p_cross}, p_mut={p_mut}")

                ga = GeneticAlgorithm(
                    X=X,
                    y=y,
                    kernel_params=kernel_params,
                    pop_size=pop_size,
                    p_cross=p_cross,
                    p_mut=p_mut,
                    mating_pool_size=mating_pool_size
                )

                best_solution, best_fitness, generations = ga.run()

                # Get selected feature names
                selected_feature_indices = np.where(best_solution == 1)[0]
                selected_feature_names = feature_names[selected_feature_indices]

                current_result = {
                    'kernel': kernel_name,
                    'pop_size': pop_size,
                    'p_cross': p_cross,
                    'p_mut': p_mut,
                    # NOTE(review): this is the GA fitness as returned by
                    # ga.run(), i.e. accuracy minus the feature-count
                    # penalty, not the raw accuracy.
                    'accuracy': best_fitness,
                    'generations': generations,
                    # Chromosomes are float arrays; store the count as int.
                    'selected_features': int(np.sum(best_solution)),
                    'feature_names': selected_feature_names  # Store feature names
                }

                results.append(current_result)

                # Update best configuration if current result is better
                if best_fitness > best_accuracy:
                    best_accuracy = best_fitness
                    best_config = current_result
                    best_features = selected_feature_names

    if best_config is None:
        raise ValueError(
            "run_experiment received an empty parameter grid; no GA run was executed"
        )

    # Print best configuration for this kernel
    print(f"\n{'='*50}")
    print(f"Best Results for {kernel_name.upper()} kernel:")
    print(f"{'='*50}")
    print(f"Best Accuracy: {best_config['accuracy']:.4f}")
    print(f"Number of Generations: {best_config['generations']}")
    print(f"Number of Selected Features: {best_config['selected_features']}")
    # List the feature names here; the count is already printed above.
    print(f"Selected Features: {', '.join(best_features)}")
    print(f"Best Configuration:")
    print(f"- Population Size: {best_config['pop_size']}")
    print(f"- Crossover Probability: {best_config['p_cross']}")
    print(f"- Mutation Probability: {best_config['p_mut']:.6f}")
    print(f"{'='*50}\n")

    return results, best_config, best_features

# Load and prepare data (the CSV is read again here; X and y are rebound to
# the unscaled features and the 'class_Positive' label column)
diabetes_data = pd.read_csv('diabetes_risk_data.csv')
X = diabetes_data.drop(columns=['class_Positive'])
feature_names = np.array(X.columns)  # Get feature names
y = diabetes_data['class_Positive']

# Define parameters: every population size is combined with every crossover
# probability inside run_experiment
pop_sizes = [20, 30, 40, 50]
p_cross_values = [0.6, 0.7, 0.8, 0.9]

# Define kernel configurations: keyword arguments passed to SVC for each
# kernel (the values match the 'Best Params' printed by the grid-search cell)
kernel_configs = {
    'linear': {'kernel': 'linear', 'C': 10},
    'rbf': {'kernel': 'rbf', 'C': 10, 'gamma': 0.1},
    'sigmoid': {'kernel': 'sigmoid', 'C': 10, 'gamma': 0.01},
    'poly': {'kernel': 'poly', 'C': 10, 'degree': 3, 'gamma': 'scale'}
}

# Run experiments for each kernel and store best configurations
all_results = []
best_configs = {}
best_features_by_kernel = {}
# NOTE(review): 'features' is only added once some kernel beats accuracy 0;
# the summary prints below assume that has happened.
overall_best = {'accuracy': 0, 'kernel': None, 'config': None}

for kernel_name, params in kernel_configs.items():
    print(f"\nRunning experiments for {kernel_name} kernel...")
    # Plain NumPy arrays are passed for X and y
    results, best_config, best_features = run_experiment(
        X.values, y.values, kernel_name, params,
        pop_sizes, p_cross_values, feature_names
    )
    all_results.extend(results)
    best_configs[kernel_name] = best_config
    best_features_by_kernel[kernel_name] = best_features

    # Track overall best performance
    if best_config['accuracy'] > overall_best['accuracy']:
        overall_best['accuracy'] = best_config['accuracy']
        overall_best['kernel'] = kernel_name
        overall_best['config'] = best_config
        overall_best['features'] = best_features

# Print overall best results
print(f"\n{'='*50}")
print("OVERALL BEST CONFIGURATION")
print(f"{'='*50}")
print(f"Best Kernel: {overall_best['kernel'].upper()}")
print(f"Best Accuracy: {overall_best['config']['accuracy']:.4f}")
print(f"Number of Generations: {overall_best['config']['generations']}")
print(f"Number of Selected Features: {overall_best['config']['selected_features']}")
print(f"Selected Features: {', '.join(overall_best['features'])}")
print(f"Configuration:")
print(f"- Population Size: {overall_best['config']['pop_size']}")
print(f"- Crossover Probability: {overall_best['config']['p_cross']}")
print(f"- Mutation Probability: {overall_best['config']['p_mut']:.6f}")
print(f"{'='*50}")

# Print selected features for each kernel
print("\nSelected Features by Kernel:")
for kernel_name, features in best_features_by_kernel.items():
    print(f"\n{kernel_name.upper()} Kernel:")
    print(f"Number of features: {len(features)}")
    print(f"Features: {', '.join(features)}")

# Create summary DataFrame: one row per kernel, built from that kernel's best
# run; the feature names are joined into a single comma-separated string
summary_df = pd.DataFrame([{
    'Kernel': k,
    'Best Accuracy': config['accuracy'],
    'Generations': config['generations'],
    'Number of Selected Features': config['selected_features'],
    'Selected Features': ', '.join(best_features_by_kernel[k]),
    'Population Size': config['pop_size'],
    'Crossover Prob': config['p_cross'],
    'Mutation Prob': config['p_mut']
} for k, config in best_configs.items()])

print("\nSummary of Best Results for All Kernels:")
# Setting display options to show full content of cells
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
print(summary_df.to_string(index=False))


Running experiments for linear kernel...

Running with pop_size=20, p_cross=0.6, p_mut=0.003125
Generation 14, Best Fitness: 0.951236

Running with pop_size=20, p_cross=0.7, p_mut=0.003125
Generation 10, Best Fitness: 0.932255

Running with pop_size=20, p_cross=0.8, p_mut=0.003125
Generation 13, Best Fitness: 0.922702

Running with pop_size=20, p_cross=0.9, p_mut=0.003125
Generation 7, Best Fitness: 0.874750

Running with pop_size=20, p_cross=0.6, p_mut=0.028125
Generation 14, Best Fitness: 0.951298

Running with pop_size=20, p_cross=0.7, p_mut=0.028125
Generation 14, Best Fitness: 0.941933

Running with pop_size=20, p_cross=0.8, p_mut=0.028125
Generation 21, Best Fitness: 0.951361

Running with pop_size=20, p_cross=0.9, p_mut=0.028125
Generation 11, Best Fitness: 0.941933

Running with pop_size=20, p_cross=0.6, p_mut=0.053125000000000006
Generation 13, Best Fitness: 0.970654

Running with pop_size=20, p_cross=0.7, p_mut=0.053125000000000006
Generation 10, Best Fitness: 0.941808

Runn

## Uji Signifikansi

In [None]:
# Take the 'Kernel' and accuracy columns from summary_df (GA results) and
# from results_df (grid-search results without GA)
summary_df_ujistat = summary_df[['Kernel', 'Best Accuracy']]
results_df_ujistat = results_df[['Kernel', 'Accuracy']]
# Join the two DataFrames on the kernel name. The suffixes only apply to
# overlapping column names; besides the join key the two frames share none.
ujistat_df = pd.merge(results_df_ujistat, summary_df_ujistat,  on='Kernel', suffixes=('_summary', '_results'))

# Rename 'Accuracy' to "Akurasi Sebelum GA" (accuracy before GA) and
# 'Best Accuracy' to "Akurasi Setelah GA" (accuracy after GA)
ujistat_df = ujistat_df.rename(columns={'Accuracy': 'Akurasi Sebelum GA'})
ujistat_df = ujistat_df.rename(columns={'Best Accuracy': 'Akurasi Setelah GA'})
print(ujistat_df)

    Kernel  Akurasi Sebelum GA  Akurasi Setelah GA
0   linear            0.932692            0.970654
1      rbf            0.980769            0.989947
2  sigmoid            0.913462            0.970716
3     poly            0.961538            0.989947


In [None]:
import numpy as np
from scipy.stats import shapiro, ttest_rel, wilcoxon

# Normality test (Shapiro-Wilk) on the paired differences
# NOTE(review): ujistat_df appears to hold one row per kernel (four pairs in
# the printed table), which is a very small sample for these tests - confirm
# that this is intended.
differences = ujistat_df['Akurasi Setelah GA'] - ujistat_df['Akurasi Sebelum GA']
stat_diff, p_value_diff = shapiro(differences)

print("Uji Normalitas untuk Selisih Data Berpasangan:", "P-Value =", p_value_diff)

# Decide which paired test to run based on the normality result
alpha = 0.05  # Significance level

if p_value_diff > alpha:
    print("Data selisih berdistribusi normal. Lanjutkan dengan paired t-test satu arah.")
    # One-sided paired t-test; alternative='less' tests whether the "before"
    # accuracies are lower than the "after" accuracies
    t_stat, p_ttest = ttest_rel(ujistat_df['Akurasi Sebelum GA'], ujistat_df['Akurasi Setelah GA'], alternative='less')
    print("Hasil paired t-test satu arah:", "T-Statistic =", t_stat, "P-Value =", p_ttest)
else:
    print("Data selisih tidak berdistribusi normal. Lanjutkan dengan Wilcoxon signed-rank test satu arah.")
    # One-sided Wilcoxon signed-rank test with the same alternative
    w_stat, p_wilcoxon = wilcoxon(ujistat_df['Akurasi Sebelum GA'], ujistat_df['Akurasi Setelah GA'], alternative='less')
    print("Hasil Wilcoxon signed-rank test satu arah:", "W-Statistic =", w_stat, "P-Value =", p_wilcoxon)


Uji Normalitas untuk Selisih Data Berpasangan: P-Value = 0.9947030894389898
Data selisih berdistribusi normal. Lanjutkan dengan paired t-test satu arah.
Hasil paired t-test satu arah: T-Statistic = -3.318240613178009 P-Value = 0.02255666787021566
