In [46]:
import random
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.datasets import load_iris
import pandas as pd
import cProfile
import time


In [47]:
class GASearhCV:
    
    def __init__(self, model,param_grid,num_generations=50,scoring='accuracy',population_size=100, mutation_rate=0.05,elitism=False,elitism_size=0,tournament_size=5,cv=5):
        self.model=model
        self.X=None
        self.y=None
        self.population_size = population_size
        self.param_grid = param_grid
        self.scoring = scoring
        self.mutation_rate = mutation_rate
        self.population = [self.generate_random_parameters() for _ in range(self.population_size)]
        self.elitism = elitism
        self.tournament_size = tournament_size
        self.cv=cv
        self.elitism_size=elitism_size
        self.best_params=None
        self.best_score=None
        self.num_generations=num_generations
        
    def generate_random_parameters(self):
        return {param : random.choice(value_list) for param,value_list in self.param_grid.items()}
    
    def fit(self,X,y):
        self.X=X
        self.y=y
        
        new_population=[self.generate_random_parameters() for _ in range(self.population_size)]
        fitness_score = dict()
        
        if self.elitism and (self.population_size - self.elitism_size) % 2 == 1:
            self.elitism_size += 1
            
        for p in self.population:
            fitness_score[str(p)]=self.fitness(p)
            
        for _ in range(self.num_generations):
            if self.elitism:
                self.population.sort(key=lambda x: fitness_score[str(x)], reverse=True)
                new_population[:self.elitism_size] = self.population[:self.elitism_size]
                
            for i in range(self.elitism_size, self.population_size, 2):
                parent1_idx = self.selection(self.population, self.tournament_size,fitness_score)
                parent2_idx = self.selection(self.population, self.tournament_size,fitness_score)
            
                self.crossover(self.population[parent1_idx],
                          self.population[parent2_idx],
                          new_population[i],
                          new_population[i+1])
            
                self.mutation(new_population[i])
                self.mutation(new_population[i+1])
            
                fitness_score[str(new_population[i])] = self.fitness(new_population[i])
                fitness_score[str(new_population[i+1])] = self.fitness(new_population[i+1])
        
            self.population[:] = new_population[:]
            
        self.best_params = max(self.population, key=lambda x: fitness_score[str(x)])
        self.best_score = fitness_score[str(self.best_params)]
                
    def selection(self,population, tournament_size,fitness_score):
        chosen = random.sample(population, tournament_size)
        max_fitness = float('-inf')
        best_idx = -1
        for i,p in enumerate(chosen):
            if fitness_score[str(p)] > max_fitness:
                max_fitness = fitness_score[str(p)]
                best_idx = i
        return best_idx
    
    def mutation(self,parameters): 
        for param,values in self.param_grid.items():
            if random.random() < self.mutation_rate:
                parameters[param]=random.choice(values)
    
    def crossover(self,parent1, parent2, child1, child2):
        for param,value in parent1.items():
            if random.random() < 0.5:
                child1[param]=value
                child2[param]=parent2[param]
            else:
                child2[param]=value
                child1[param]=parent1[param]
    
    def fitness(self,params):
        self.model.set_params(**params)
        scores = cross_val_score(self.model, self.X, self.y, cv=self.cv, scoring=self.scoring)
        return scores.mean()

In [48]:
param_grid_1={
    'max_depth' : [4,5,6],
    'criterion' : ['gini','entropy'],
    'min_samples_leaf':[2,3,4],
    'min_samples_split':[3,4,5]
}
#3*2*3*3 = 54
param_grid_2={
    'max_depth' : [4,5,6],
    'criterion' : ['gini','entropy'],
    'min_samples_leaf':[2,3,4,5,6],
    'min_samples_split':[3,4,5,6]
}
# 3*2*5*4 = 120
param_grid_3={
    'max_depth' : [4,5,6,7,8],
    'criterion' : ['gini','entropy'],
    'min_samples_leaf':[2,3,4,5,6],
    'min_samples_split':[3,4,5,6]
}
# 5*2*5*4 = 200
param_grid_4={
    'max_depth' : [4,5,6,7,8,9,10],
    'criterion' : ['gini','entropy'],
    'min_samples_leaf':[2,3,4,5,6,7],
    'min_samples_split':[3,4,5,6,7]
}
# 7*2*6*5 = 420

In [44]:
7*2*6*5

420

In [49]:
data=pd.read_csv('airline.csv')
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

kolone_za_kodiranje = ["First Name", "Gender","Last Name", "Nationality", "Airport Name", "Airport Country Code", "Country Name", "Airport Continent", "Continents", "Arrival Airport", "Pilot Name", "Flight Status"]

for kolona in kolone_za_kodiranje:
    data[kolona] = label_encoder.fit_transform(data[kolona])
    
data["Departure Date"] = pd.to_datetime(data["Departure Date"])

data["Day"] = data["Departure Date"].dt.day
data["Month"] = data["Departure Date"].dt.month
data["Year"] = data["Departure Date"].dt.year

data = data[["Passenger ID", "First Name", "Last Name", "Gender", "Age", "Nationality",
             "Airport Name", "Airport Country Code", "Country Name", "Airport Continent",
             "Continents", "Day", "Month", "Year", "Arrival Airport",
             "Pilot Name", "Flight Status"]]

In [51]:
data_100=data.head(100)
data_1000=data.head(1000)
data_10000=data.head(10000)
data_100000=data
data_10000.head()

Unnamed: 0,Passenger ID,First Name,Last Name,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Day,Month,Year,Arrival Airport,Pilot Name,Flight Status
0,10856,2487,22626,0,62,105,1717,219,221,3,3,28,6,2022,1657,29123,2
1,43872,2629,6478,1,62,152,4228,34,37,3,3,26,12,2022,8499,30724,2
2,42633,2023,12947,1,67,177,3034,70,72,2,2,18,1,2022,2679,23677,2
3,78493,2298,31198,0,71,43,6031,34,37,3,3,16,9,2022,8673,26947,1
4,82072,828,29617,1,21,43,2870,219,221,3,3,25,2,2022,6724,9532,2


In [52]:
model=DecisionTreeClassifier()

Podaci velicine 100

In [53]:
X_100 = data_100.iloc[:,:-1]
y_100 = data_100.iloc[:,-1]

In [54]:
param_grid = [param_grid_1,param_grid_2,param_grid_3,param_grid_4]

for i,params in enumerate(param_grid):
    if i==0:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=5,elitism=True,elitism_size=3)
    elif i==1:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=10,elitism=True,elitism_size=3)
    elif i==2:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=15,elitism=True,elitism_size=3)
    else:
        ga=GASearhCV(model,param_grid=params,num_generations=10,population_size=15,elitism=True,elitism_size=3)

    start_time = time.time()
    ga.fit(X_100,y_100)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Izvršavanje koda za param_grid_{i+1} trajalo je {execution_time} sekundi. Best score: {ga.best_score}")

Izvršavanje koda za param_grid_1 trajalo je 0.4510917663574219 sekundi. Best score: 0.41000000000000003
Izvršavanje koda za param_grid_2 trajalo je 1.1902570724487305 sekundi. Best score: 0.4
Izvršavanje koda za param_grid_3 trajalo je 2.182286500930786 sekundi. Best score: 0.38
Izvršavanje koda za param_grid_4 trajalo je 4.036481142044067 sekundi. Best score: 0.38999999999999996


In [55]:
X_1000 = data_1000.iloc[:,:-1]
y_1000 = data_1000.iloc[:,-1]

In [56]:
for i,params in enumerate(param_grid):
    if i==0:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=5,elitism=True,elitism_size=3)
    elif i==1:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=10,elitism=True,elitism_size=3)
    elif i==2:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=15,elitism=True,elitism_size=3)
    else:
        ga=GASearhCV(model,param_grid=params,num_generations=10,population_size=15,elitism=True,elitism_size=3)

    start_time = time.time()
    ga.fit(X_1000,y_1000)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Izvršavanje koda za param_grid_{i+1} trajalo je {execution_time} sekundi. Best score: {ga.best_score}")

Izvršavanje koda za param_grid_1 trajalo je 0.7509894371032715 sekundi. Best score: 0.349
Izvršavanje koda za param_grid_2 trajalo je 2.421154737472534 sekundi. Best score: 0.348
Izvršavanje koda za param_grid_3 trajalo je 6.246211528778076 sekundi. Best score: 0.36
Izvršavanje koda za param_grid_4 trajalo je 8.265958070755005 sekundi. Best score: 0.361


In [10]:
X_10000 = data_10000.iloc[:,:-1]
y_10000 = data_10000.iloc[:,-1]

In [57]:
for i,params in enumerate(param_grid):
    if i==0:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=5,elitism=True,elitism_size=3)
    elif i==1:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=10,elitism=True,elitism_size=3)
    elif i==2:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=15,elitism=True,elitism_size=3)
    else:
        ga=GASearhCV(model,param_grid=params,num_generations=10,population_size=15,elitism=True,elitism_size=3)

    start_time = time.time()
    ga.fit(X_10000,y_10000)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Izvršavanje koda za param_grid_{i+1} trajalo je {execution_time} sekundi. Best score: {ga.best_score}")

Izvršavanje koda za param_grid_1 trajalo je 6.124116659164429 sekundi. Best score: 0.3387
Izvršavanje koda za param_grid_2 trajalo je 11.703721046447754 sekundi. Best score: 0.3388999999999999
Izvršavanje koda za param_grid_3 trajalo je 25.394129276275635 sekundi. Best score: 0.3391
Izvršavanje koda za param_grid_4 trajalo je 67.62477731704712 sekundi. Best score: 0.3424


In [58]:
X_100000 = data_100000.iloc[:,:-1]
y_100000 = data_100000.iloc[:,-1]

In [59]:
for i,params in enumerate(param_grid):
    if i==0:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=5,elitism=True,elitism_size=3)
    elif i==1:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=10,elitism=True,elitism_size=3)
    elif i==2:
        ga=GASearhCV(model,param_grid=params,num_generations=5,population_size=15,elitism=True,elitism_size=3)
    else:
        ga=GASearhCV(model,param_grid=params,num_generations=10,population_size=15,elitism=True,elitism_size=3)

    start_time = time.time()
    ga.fit(X_100000,y_100000)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Izvršavanje koda za param_grid_{i+1} trajalo je {execution_time} sekundi. Best score: {ga.best_score}")

Izvršavanje koda za param_grid_1 trajalo je 43.06265187263489 sekundi. Best score: 0.334398073734938
Izvršavanje koda za param_grid_2 trajalo je 118.81007933616638 sekundi. Best score: 0.33550335552389443
Izvršavanje koda za param_grid_3 trajalo je 297.2374222278595 sekundi. Best score: 0.3356351376110952
Izvršavanje koda za param_grid_4 trajalo je 730.5441136360168 sekundi. Best score: 0.33686211142556


In [39]:
grid=GridSearchCV(model,param_grid=param_grid_3,n_jobs=1,cv=5,scoring='accuracy')

In [42]:
start_time = time.time()
grid.fit(X_100,y_100)
end_time = time.time()
execution_time = end_time - start_time
print(f"Izvršavanje koda trajalo je {execution_time} sekundi. Best score: {grid.best_score_}")

Izvršavanje koda trajalo je 4.381105184555054 sekundi. Best score: 0.42000000000000004


In [112]:
start_time = time.time()
grid.fit(X_1000,y_1000)
end_time = time.time()
execution_time = end_time - start_time
print(f"Izvršavanje koda trajalo je {execution_time} sekundi. Best score: {ga.best_score}")

Izvršavanje koda trajalo je 9.455922365188599 sekundi. Best score: 0.3421


In [113]:
start_time = time.time()
grid.fit(X_10000,y_10000)
end_time = time.time()
execution_time = end_time - start_time
print(f"Izvršavanje koda trajalo je {execution_time} sekundi. Best score: {ga.best_score}")

Izvršavanje koda trajalo je 63.15167307853699 sekundi. Best score: 0.3421
