In [None]:
# Install the extra packages imported further down in this notebook.
# NOTE(review): no versions are pinned here, so a later run may pull different releases.
!pip install -q scikit-plot catboost optuna scikit-learn-intelex

In [None]:
# Mount Google Drive at /content/drive; every data/model path in this notebook lives below it.
from google.colab import drive
drive.mount('/content/drive')

# Gerekli Kütüphaneler

In [None]:
import numpy as np
import pandas as pd
import io
import pickle
import os
import time
from collections import Counter
from tqdm import tqdm

import logging

pd.options.mode.chained_assignment = None

np.random.seed(42)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from scikitplot.plotters import plot_feature_importances

sns.set_style("whitegrid")

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import optuna
from sklearn.base import clone
from sklearn.preprocessing import RobustScaler, OrdinalEncoder, LabelEncoder, label_binarize
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE

from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

from sklearnex import patch_sklearn
patch_sklearn()

from IPython.display import Markdown

def bold(string):
    """Show ``string`` in the cell output as bold Markdown."""
    emphasized = "**" + string + "**"
    display(Markdown(emphasized))

PROJECT_ROOT_DIR = '/content/drive/MyDrive/proje'
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, 'images')
def save_fig(title, year):
    """Save the current matplotlib figure as ``IMAGES_PATH/<year>/<title>.png``.

    Parameters
    ----------
    title : str
        File name without the ``.png`` extension.
    year : int or str
        Name of the sub-folder of ``IMAGES_PATH`` that receives the image.
    """
    target_dir = os.path.join(IMAGES_PATH, str(year))
    # Create the year folder on demand so saving does not depend on another
    # function having created it beforehand.
    os.makedirs(target_dir, exist_ok=True)
    path = os.path.join(target_dir, title + '.png')
    plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Veriyi yükleyelim

In [None]:
# Read the processed dataset from Drive; the first CSV column becomes the index.
data = pd.read_csv('/content/drive/MyDrive/proje/data/processed.csv', index_col=0)
# NOTE(review): no logging configuration is visible in this notebook, so this
# INFO-level message may never be displayed — confirm.
logging.info('Veri seti yüklendi.')
# Work on a copy so `data` keeps the frame exactly as it was loaded.
df = data.copy()
df.head()

In [None]:
def df_stats(data):
    """Print a sectioned overview of the DataFrame ``data``.

    Sections, in order: shape, column dtypes, missing values per column,
    number of duplicated rows, memory usage (parsed from ``data.info()``)
    and the transposed ``describe()`` table. Nothing is returned.
    """
    def header(label):
        # Section titles are centred in a 50-character run of '#' and shown in bold.
        bold(label.center(50, "#"))

    header(" SHAPE ")
    shape = data.shape
    print("ROWS: {}".format(shape[0]))
    print("COLUMNS: {}".format(shape[1]))

    header(" TYPES ")
    print(data.dtypes)

    header(" MISSING VALUES ")
    missing_per_column = data.isnull().sum()
    print(missing_per_column)

    header(" DUPLICATED VALUES ")
    duplicate_count = data.duplicated().sum()
    print("NUMBER OF DUPLICATED VALUES: {}".format(duplicate_count))

    header(" MEMORY USAGE ")
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    # The memory figure is read from the second-to-last element of the
    # newline-split info() text, taking what follows the first colon.
    memory_line = info_buffer.getvalue().split("\n")[-2]
    info = memory_line.split(":")[1].strip()
    print("Memory Usage: {}".format(info))

    header(" DESCRIBE ")
    print(data.describe().T)

In [None]:
# Print the overview report for the working copy of the dataset.
df_stats(df)

In [None]:
def plot_correlation_heatmap(df: pd.core.frame.DataFrame, title_name: str='Correlation Map') -> None:
    """Draw and save a lower-triangle correlation heatmap of ``df``.

    Only numeric and boolean columns take part in the correlation; text columns
    are left out, because ``DataFrame.corr`` raises on them in pandas >= 2.0.
    The figure is written to a file named ``correlation`` in the current
    working directory and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose pairwise column correlations are plotted.
    title_name : str
        Title placed above the heatmap.
    """
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    fig, ax = plt.subplots(figsize=(30, 30))
    # Hide the upper triangle (diagonal included): it only mirrors the lower one.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, mask=mask, linewidths=.5, cmap='viridis', annot=True, fmt='.2f', ax=ax)
    ax.set_title(title_name)
    fig.savefig('correlation')
    plt.show()

plot_correlation_heatmap(df, 'Dataset Correlation')

# Yıllara Göre Veri Setini Ayıralım

In [None]:
def seperate_by_year(year):
    """Write the rows of the global ``df`` whose 'Tarih' is at most ``year``.

    The subset is re-indexed from zero and saved as
    ``data/seperated/<year>/<year>.csv`` on Drive; the year folder is created
    when it does not exist yet.
    """
    out_dir = f'/content/drive/MyDrive/proje/data/seperated/{year}'
    up_to_year = df['Tarih'] <= year
    temp_df = df.loc[up_to_year].reset_index(drop=True)
    folder_missing = not os.path.exists(out_dir)
    if folder_missing:
        os.makedirs(out_dir)
    temp_df.to_csv(f'/content/drive/MyDrive/proje/data/seperated/{year}/{year}.csv')

In [None]:
# Write one cumulative CSV per cut-off year, 1997 through 2023.
for i in range(1997, 2024):
    seperate_by_year(i)

print('Yıllara göre ayrıldı.')

# Boş Değerleri Dolduralım

In [None]:
def fill_na_values(year):
    """Fill or drop missing values in the per-year CSV and save the result.

    Reads ``data/seperated/<year>/<year>.csv`` (treating '?' as missing),
    back-fills 'duration' and 'Publication Day' from the next valid row,
    drops every row that still has a missing value, re-indexes from zero and
    writes ``<year>_filled.csv`` into the same folder.

    Parameters
    ----------
    year : int or str
        Cut-off year that names both the folder and the files.
    """
    base_dir = f'/content/drive/MyDrive/proje/data/seperated/{year}'
    temp_df = pd.read_csv(f'{base_dir}/{year}.csv', index_col=0, na_values='?')

    # Assign the back-filled column explicitly: calling fillna(..., inplace=True)
    # on a column selection is deprecated and does not update the frame under
    # pandas copy-on-write.
    for column in ('duration', 'Publication Day'):
        temp_df[column] = temp_df[column].bfill()

    # Trailing gaps cannot be back-filled; those rows are dropped here together
    # with rows that have gaps in any other column.
    temp_df = temp_df.dropna().reset_index(drop=True)

    temp_df.to_csv(f'{base_dir}/{year}_filled.csv')

In [None]:
# Produce the `<year>_filled.csv` file for every cut-off year, 1997 through 2023.
for i in range(1997, 2024):
    fill_na_values(i)

print('Boş sütunlar dolduruldu.')

# Özellik Ölçekleme ve Temel Model Eğitimi

In [None]:
def plot_results(df, title, year):
    """Plot seven stacked horizontal bar charts comparing the models in ``df``.

    One panel per score column: the two timing panels are sorted in
    descending order, the five quality panels in ascending order. Every bar
    carries its value as a label. The finished figure is saved through
    ``save_fig(title, year)`` and then shown.
    """
    # (score column, sort ascending?, panel title)
    panels = [
        ("Train Time", False, "Model / Train Time"),
        ("Test Time", False, "Model / Test Time"),
        ("Train Accuracy", True, "Model / Train Accuracy"),
        ("Test Accuracy", True, "Model / Test Accuracy"),
        ("F1", True, "Model / F1"),
        ("Precision", True, "Model / Precision"),
        ("Recall", True, "Model / Recall"),
    ]

    plt.figure(figsize=(10, 30))

    for position, (metric, ascending, panel_title) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, position)
        ordered = df.sort_values(by=metric, ascending=ascending)
        ax = sns.barplot(data=ordered, y="Model Name", x=metric, palette='viridis')
        for bars in ax.containers:
            ax.bar_label(bars)
        plt.title(panel_title)
        plt.xlabel("")

    save_fig(title, year)
    plt.show()

In [None]:
def load_models():
    """Return a new list with one fresh, unfitted instance of each baseline classifier.

    LightGBM and CatBoost are created with their console output silenced; all
    other estimators use their default constructor arguments.
    """
    tree_based = [
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        ExtraTreesClassifier(),
    ]
    linear_and_boosting = [
        LogisticRegression(),
        SGDClassifier(),
        XGBClassifier(),
        LGBMClassifier(verbose=-100),
        CatBoostClassifier(verbose=0),
        MLPClassifier(),
    ]
    others = [
        GaussianNB(),
        SVC(),
        KNeighborsClassifier(),
    ]
    return tree_based + linear_and_boosting + others

# Hiperparametreler

In [None]:
def objective_lgbm(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``LGBMClassifier``.

    Samples one hyperparameter set from ``trial``, fits the classifier on
    ``(X_train, y_train)`` and returns its accuracy on ``(X_test, y_test)``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-uniform sampling of the original search space.
    param = {
        'objective': 'multiclass',
        # NOTE(review): hard-coded; presumably the target has three classes — confirm.
        'num_class': 3,
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
    }

    model = LGBMClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_xgb(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``XGBClassifier``.

    Samples one hyperparameter set from ``trial``, fits the classifier on
    ``(X_train, y_train)`` and returns its accuracy on ``(X_test, y_test)``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-uniform sampling of the original search space.
    param = {
        'objective': 'multi:softmax',
        # NOTE(review): hard-coded; presumably the target has three classes — confirm.
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 10.0, log=True),
    }

    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_rf(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``RandomForestClassifier``.

    Samples one hyperparameter set from ``trial``, fits the classifier on
    ``(X_train, y_train)`` and returns its accuracy on ``(X_test, y_test)``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        # suggest_float replaces the deprecated suggest_uniform (same uniform range).
        'max_features': trial.suggest_float('max_features', 0.6, 1.0)
    }

    model = RandomForestClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_ada(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``AdaBoostClassifier``.

    Samples one hyperparameter set from ``trial``, fits the classifier on
    ``(X_train, y_train)`` and returns its accuracy on ``(X_test, y_test)``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
    }

    model = AdaBoostClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_gb(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``GradientBoostingClassifier``.

    Samples one hyperparameter set from ``trial``, fits the classifier on
    ``(X_train, y_train)`` and returns its accuracy on ``(X_test, y_test)``.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-uniform sampling of the original search space.
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    }

    model = GradientBoostingClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_dt(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``DecisionTreeClassifier``.

    Samples one hyperparameter set from ``trial``, fits the classifier on
    ``(X_train, y_train)`` and returns its accuracy on ``(X_test, y_test)``.
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        # suggest_float replaces the deprecated suggest_uniform (same uniform range).
        'max_features': trial.suggest_float('max_features', 0.6, 1.0)
    }

    model = DecisionTreeClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_et(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``ExtraTreesClassifier``.

    Samples one hyperparameter set from ``trial``, fits the classifier on
    ``(X_train, y_train)`` and returns its accuracy on ``(X_test, y_test)``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        # suggest_float replaces the deprecated suggest_uniform (same uniform range).
        'max_features': trial.suggest_float('max_features', 0.6, 1.0)
    }

    model = ExtraTreesClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_sgd(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``SGDClassifier``.

    Samples the regularisation strength, loss and penalty from ``trial``, fits
    the classifier on ``(X_train, y_train)`` and returns its accuracy on
    ``(X_test, y_test)``.
    """
    param = {
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True),
        'loss': trial.suggest_categorical('loss', ['hinge', 'log_loss', 'modified_huber', 'squared_hinge']),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    }

    model = SGDClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_mlp(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``MLPClassifier``.

    Samples the activation, solver and L2 strength from ``trial``, fits the
    classifier on ``(X_train, y_train)`` and returns its accuracy on
    ``(X_test, y_test)``.
    """
    param = {
        'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True),
    }

    model = MLPClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_svc(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``SVC``.

    Samples ``C`` and ``gamma`` from ``trial`` (the kernel choice has the single
    option 'rbf'), fits the classifier on ``(X_train, y_train)`` and returns
    its accuracy on ``(X_test, y_test)``.
    """
    # suggest_float(log=True) replaces the deprecated suggest_loguniform.
    param = {
        'kernel': trial.suggest_categorical('kernel', ['rbf']),
        'C': trial.suggest_float('C', 1e-5, 100, log=True),
        'gamma': trial.suggest_float('gamma', 1e-5, 100, log=True),
    }

    model = SVC(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [None]:
def objective_knn(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for ``KNeighborsClassifier``.

    Samples the neighbour count, distance metric and weighting scheme from
    ``trial``, fits on ``(X_train, y_train)`` and returns the accuracy on
    ``(X_test, y_test)``.
    """
    n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'chebyshev'])
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])

    classifier = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, weights=weights)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

# Model Eğitme

In [None]:
def train_model(year, i, model, scores_df, X_train, X_test, y_train, y_test, class_names, sampler_name):
    """Select features, tune, calibrate, evaluate and persist one classifier.

    Pipeline:
      1. For every k from 10 (or fewer, when there are fewer columns) up to
         the number of columns, fit an untuned clone of ``model`` on the
         ``SelectKBest(f_classif, k)`` features and keep the k with the highest
         test accuracy.
      2. If an Optuna objective exists for the model family, tune the
         hyperparameters and apply the best ones to ``model``.
      3. Choose the calibration method ('sigmoid' or 'isotonic') with the
         lowest mean one-vs-rest Brier score on the test set.
      4. Fit the calibrated model, time fit and predict, compute the metrics,
         save the confusion-matrix figure and the pickled model, and append
         one row to ``scores_df``.

    NOTE(review): k, the hyperparameters and the calibration method are all
    chosen using (X_test, y_test), so the reported test metrics are
    optimistically biased; a separate validation split would avoid that.

    Parameters
    ----------
    year : int or str
        Sub-folder name for the saved figures and model.
    i : int
        Position of the model in the caller's list; not used here.
    model : estimator
        Unfitted scikit-learn compatible classifier; its parameters are
        updated in place with the tuned values.
    scores_df : pandas.DataFrame
        Results table; one row is appended in place.
    X_train, X_test
        Feature matrices (DataFrames in the visible caller).
    y_train, y_test
        Encoded class labels.
    class_names
        Not used here.
    sampler_name : str
        Label of the resampler, embedded in the saved model name.
    """
    images_dir = f'/content/drive/MyDrive/proje/images/{year}'
    models_dir = f'/content/drive/MyDrive/proje/models/{year}'
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)

    # --- 1. feature selection -------------------------------------------------
    n_features = X_train.shape[1]
    # Start at 10 features, or at all of them when fewer than 10 exist, so that
    # SelectKBest is never asked for more columns than are available.
    best_k = min(10, n_features)
    best_score = 0

    for k in range(best_k, n_features + 1):
        selector = SelectKBest(score_func=f_classif, k=k)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        candidate = clone(model)
        candidate.fit(X_train_selected, y_train)
        accuracy = accuracy_score(y_test, candidate.predict(X_test_selected))

        if accuracy > best_score:
            best_score = accuracy
            best_k = k

    selector = SelectKBest(score_func=f_classif, k=best_k)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    selected_features = selector.get_feature_names_out()

    # --- 2. hyperparameter tuning ---------------------------------------------
    model_name = type(model).__name__.replace("Classifier", "")

    # Model family -> (Optuna objective, number of trials). The key for
    # ExtraTreesClassifier is "ExtraTrees"; the previous "ExtraTree" never
    # matched, so that model was silently left untuned.
    tuners = {
        "LGBM": (objective_lgbm, 25),
        "AdaBoost": (objective_ada, 25),
        "DecisionTree": (objective_dt, 25),
        "ExtraTrees": (objective_et, 25),
        "GradientBoosting": (objective_gb, 25),
        "MLP": (objective_mlp, 25),
        "RandomForest": (objective_rf, 25),
        "SGD": (objective_sgd, 25),
        "XGB": (objective_xgb, 25),
        "SVC": (objective_svc, 5),
        "KNeighbors": (objective_knn, 25),
    }

    if model_name in tuners:
        objective, n_trials = tuners[model_name]
        study = optuna.create_study(direction='maximize')
        study.optimize(lambda trial: objective(trial, X_train, X_test, y_train, y_test), n_trials=n_trials)
        best_params = study.best_params
    else:
        # Families without an objective keep their default parameters.
        best_params = {}

    model = model.set_params(**best_params)

    # --- 3. calibration method ------------------------------------------------
    best_calibration_method = None
    # Start from +inf so the first method is always accepted and every later
    # one has to beat the best score seen so far.
    best_brier = np.inf
    for calibration in ['sigmoid', 'isotonic']:
        calibrated_classifier = CalibratedClassifierCV(clone(model), method=calibration, cv=5)
        calibrated_classifier.fit(X_train, y_train)
        # CalibratedClassifierCV always provides predict_proba.
        prob = calibrated_classifier.predict_proba(X_test)

        # Binarize against the classifier's own class order so that column c of
        # the indicator matrix lines up with column c of predict_proba.
        classes = calibrated_classifier.classes_
        y_test_binarized = label_binarize(y_test, classes=classes)
        if y_test_binarized.shape[1] == 1:
            # Two classes: label_binarize returns a single column for the
            # second class, which corresponds to probability column 1.
            brier = brier_score_loss(y_test_binarized[:, 0], prob[:, 1])
        else:
            brier = np.mean([
                brier_score_loss(y_test_binarized[:, c], prob[:, c])
                for c in range(y_test_binarized.shape[1])
            ])

        if brier < best_brier:
            best_brier = brier
            best_calibration_method = calibration

    # --- 4. final fit, evaluation and persistence -----------------------------
    model_name = f'{model_name} + {sampler_name} + k={best_k} + {best_calibration_method}'
    calibrated_model = CalibratedClassifierCV(model, method=best_calibration_method, cv=5)

    start = time.time()
    calibrated_model.fit(X_train, y_train)
    train_time = time.time() - start

    start = time.time()
    y_pred = calibrated_model.predict(X_test)
    test_time = time.time() - start

    train_pred = calibrated_model.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, y_pred)

    f1 = f1_score(y_test, y_pred, average="weighted")
    precision = precision_score(y_test, y_pred, average="weighted")
    # Recall was previously computed with precision_score by mistake.
    recall = recall_score(y_test, y_pred, average="weighted")

    cm = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(conf_mat=cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8))
    plt.title(model_name)
    save_fig(f"{model_name}_confusion_matrix", year)
    plt.show()

    bold(f" {model_name} ".center(75, "#"))
    print(classification_report(y_test, y_pred))
    print()

    with open(f'{models_dir}/{model_name}.pkl', 'wb') as file:
        pickle.dump(calibrated_model, file)

    scores_df.loc[len(scores_df)] = [model_name, selected_features, best_params, train_time, test_time, train_acc, test_acc, f1, precision, recall]

In [None]:
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if pickling fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def train_model_per_year(year):
    """Train, evaluate and persist every baseline model for one cut-off year.

    Reads ``<year>_filled.csv``, holds out an equally sized random sample of
    each class as the test set, encodes 'Publication Day', scales the numeric
    columns, label-encodes the target, oversamples the training set and calls
    ``train_model`` for every classifier from ``load_models``. The fitted
    encoders and scaler, the scores table and the summary figure are written
    below ``models/<year>`` and ``images/<year>`` on Drive.

    Parameters
    ----------
    year : int or str
        Cut-off year that names the input file and the output folders.
    """
    temp_df = pd.read_csv(f'/content/drive/MyDrive/proje/data/seperated/{year}/{year}_filled.csv', index_col=0)
    X = temp_df.drop(['Dizi Adı', 'state'], axis=1)
    y = temp_df['state']

    model_dir = f'/content/drive/MyDrive/proje/models/{year}'
    os.makedirs(model_dir, exist_ok=True)

    # Columns that are numeric as loaded; 'Publication Day' is ordinal-encoded
    # below and, when stored as text, is therefore not part of this list.
    numerical_variables = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])]

    scores_df = pd.DataFrame(columns=["Model Name", "Selected Features", "Parameters", "Train Time", "Test Time", "Train Accuracy", "Test Accuracy", "F1", "Precision", "Recall"])
    # Other imblearn samplers (SMOTE, ADASYN, BorderlineSMOTE) can be added to this list.
    samplers = [RandomOverSampler(random_state=42)]

    # --- balanced hold-out split ------------------------------------------------
    # Row positions per class; the test set takes half the size of the smallest
    # class from every class, so all classes are equally represented in it.
    class_index = {c: np.where(y == c)[0] for c in np.unique(y)}
    min_class_len = min(len(positions) for positions in class_index.values())
    test_index = []
    for positions in class_index.values():
        test_index.extend(np.random.choice(positions, size=min_class_len // 2, replace=False))

    # Everything is positional from here on, so the split does not depend on
    # the frame's index labels.
    train_index = np.setdiff1d(np.arange(len(X)), test_index)

    # Copy the slices: they are modified column-wise below.
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # --- preprocessing (fitted on the training part only) ------------------------
    day_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_train['Publication Day'] = day_encoder.fit_transform(X_train['Publication Day'].values.reshape(-1, 1))[:, 0]
    X_test['Publication Day'] = day_encoder.transform(X_test['Publication Day'].values.reshape(-1, 1))[:, 0]
    _dump_pickle(day_encoder, f'{model_dir}/day_encoder.pkl')

    rs = RobustScaler()
    X_train[numerical_variables] = rs.fit_transform(X_train[numerical_variables])
    X_test[numerical_variables] = rs.transform(X_test[numerical_variables])
    _dump_pickle(rs, f'{model_dir}/rs.pkl')

    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    _dump_pickle(le, f'{model_dir}/le.pkl')

    class_names = le.classes_

    # --- resample and train -------------------------------------------------------
    for sampler in samplers:
        sampler_name = type(sampler).__name__
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

        # Class counts of the training set before resampling and of the test set.
        print(Counter(y_train), '=', len(y_train))
        print(Counter(y_test), '=', len(y_test))

        models = load_models()
        for i, model in enumerate(models):
            train_model(year, i, model, scores_df, X_train_resampled, X_test, y_train_resampled, y_test, class_names, sampler_name)

    scores_df.to_csv(f'{model_dir}/scores.csv')
    plot_results(scores_df, f'{year} Base Models Results', year)

# Temel Modeller

In [None]:
# Run the full per-year training pipeline for the 1997 cut-off; the cells below
# repeat the same call for every following year up to 2023.
train_model_per_year(1997)

In [None]:
train_model_per_year(1998)

In [None]:
train_model_per_year(1999)

In [None]:
train_model_per_year(2000)

In [None]:
train_model_per_year(2001)

In [None]:
train_model_per_year(2002)

In [None]:
train_model_per_year(2003)

In [None]:
train_model_per_year(2004)

In [None]:
train_model_per_year(2005)

In [None]:
train_model_per_year(2006)

In [None]:
train_model_per_year(2007)

In [None]:
train_model_per_year(2008)

In [None]:
train_model_per_year(2009)

In [None]:
train_model_per_year(2010)

In [None]:
train_model_per_year(2011)

In [None]:
train_model_per_year(2012)

In [None]:
train_model_per_year(2013)

In [None]:
train_model_per_year(2014)

In [None]:
train_model_per_year(2015)

In [None]:
train_model_per_year(2016)

In [None]:
train_model_per_year(2017)

In [None]:
train_model_per_year(2018)

In [None]:
train_model_per_year(2019)

In [None]:
train_model_per_year(2020)

In [None]:
train_model_per_year(2021)

In [None]:
train_model_per_year(2022)

In [None]:
train_model_per_year(2023)

In [None]:
# Gather every year's scores.csv into one table, tagging each row with its year.
score_columns = ["Year", "Model Name", "Selected Features", "Parameters", "Train Time", "Test Time", "Train Accuracy", "Test Accuracy", "F1", "Precision", "Recall"]
yearly_scores = []
for year in range(1997, 2024):
    temp_df = pd.read_csv(f'/content/drive/MyDrive/proje/models/{year}/scores.csv', index_col=0)
    temp_df["Year"] = year
    yearly_scores.append(temp_df)

# Concatenate once instead of growing the frame inside the loop: this avoids
# repeated copying and keeps the numeric dtypes that concatenating onto an
# empty frame would degrade. Selecting score_columns puts 'Year' first.
base_scores_df = pd.concat(yearly_scores)[score_columns]

In [None]:
# Index the table by (Year, per-year row number). The rows are already in year
# order because the yearly files were concatenated from 1997 upward.
# The guard makes the cell safe to re-run: once 'Year' has moved into the
# index there is nothing left to do.
if "Year" in base_scores_df.columns:
    base_scores_df = base_scores_df.set_index("Year", append=True).swaplevel(0, 1)
base_scores_df

In [None]:
# Persist the combined scores of all years to Drive.
base_scores_df.to_csv('/content/drive/MyDrive/proje/models/base_models.csv')

In [None]:
# For every year keep the row(s) with the highest test accuracy; ties are all kept.
score_columns = ["Year", "Model Name", "Selected Features", "Parameters", "Train Time", "Test Time", "Train Accuracy", "Test Accuracy", "F1", "Precision", "Recall"]
best_per_year = []
for year in range(1997, 2024):
    temp_df = pd.read_csv(f'/content/drive/MyDrive/proje/models/{year}/scores.csv', index_col=0)
    is_best = temp_df['Test Accuracy'] == temp_df['Test Accuracy'].max()
    # assign() returns a new frame, so no column is written into a filtered slice.
    temp_df = temp_df[is_best].assign(Year=year)
    best_per_year.append(temp_df)

# Concatenate once instead of growing the frame inside the loop; selecting
# score_columns puts 'Year' first.
best_base_scores_df = pd.concat(best_per_year)[score_columns]

In [None]:
# Renumber the rows from zero so each row has a unique position across years.
best_base_scores_df = best_base_scores_df.reset_index(drop=True)
best_base_scores_df

In [None]:
# Persist the per-year best rows (ties included) to Drive.
best_base_scores_df.to_csv('/content/drive/MyDrive/proje/models/best_base_models.csv')

In [None]:
# Keep 27 hand-picked row positions of best_base_scores_df.
# NOTE(review): presumably one row per year (1997-2023) chosen among tied models;
# the positions only match the scores of one particular run — confirm, or derive
# the selection programmatically.
best_df = best_base_scores_df.iloc[[1, 2, 7, 9, 11, 12, 17, 26, 27, 29, 31, 32, 33, 36, 38, 40, 43, 48, 49, 50, 53, 56, 57, 59, 60, 64, 65]].reset_index(drop=True)
best_df

In [None]:
# Persist the final selection; copy_best_models reads this file.
best_df.to_csv('/content/drive/MyDrive/proje/models/best_df.csv')

In [None]:
def copy_best_models(year):
    """Copy the selected model of ``year`` and its preprocessing pickles.

    Looks up the model name for ``year`` in ``models/best_df.csv`` and copies
    ``<name>.pkl``, ``le.pkl``, ``day_encoder.pkl`` and ``rs.pkl`` from
    ``models/<year>`` to ``best_models/<year>`` (created if missing).

    Parameters
    ----------
    year : int
        Year whose selected model is copied.

    Raises
    ------
    IndexError
        If ``best_df.csv`` has no row for ``year``.
    FileNotFoundError
        If one of the four source files does not exist.
    """
    # Local import: shutil is not part of the notebook's import cell.
    import shutil

    temp_df = pd.read_csv('/content/drive/MyDrive/proje/models/best_df.csv', index_col=0)

    source_dir = f'/content/drive/MyDrive/proje/models/{year}'
    target_dir = f'/content/drive/MyDrive/proje/best_models/{year}'
    os.makedirs(target_dir, exist_ok=True)

    temp_df = temp_df[temp_df['Year'] == year]
    name = temp_df['Model Name'].values[0]

    # shutil.copy raises when a source file is missing, so the success message
    # below is only printed after all four files were really copied. The former
    # shell `cp` calls ignored failures and depended on shell quoting of the name.
    for file_name in (f'{name}.pkl', 'le.pkl', 'day_encoder.pkl', 'rs.pkl'):
        shutil.copy(os.path.join(source_dir, file_name), target_dir)

    print(f'{year} yılına ait en iyi model kopyalandı.')

In [None]:
# Copy the selected model and its preprocessing pickles for every year, 1997 through 2023.
for i in range(1997, 2024):
    copy_best_models(i)

In [None]:
# Print the Python interpreter version used for this run.
import sys
print(sys.version)