In [None]:
# Zignoruj wszystkie ostrzeżenia w Jupyter Notebook
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importowanie niezbędnych bibliotek
import time

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import auc, roc_curve
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

%matplotlib inline

In [None]:
# Load the training data
dftrain = pd.read_csv("Data/FINAL_CSV/train.csv")
# Keep an independent snapshot of the raw frame: a plain assignment would only
# alias `dftrain`, so the in-place encoding done in later cells would silently
# change `dftrain_roc` as well.
dftrain_roc = dftrain.copy()
dftrain.head()

In [None]:
# Load the test data
dftest = pd.read_csv("Data/FINAL_CSV/test.csv")
# Keep an independent snapshot of the raw frame: a plain assignment would only
# alias `dftest`, so the in-place encoding done in later cells would silently
# change `dftest_roc` as well.
dftest_roc = dftest.copy()
dftest.head()

In [None]:
# Encode the class labels as integers
encoder = LabelEncoder()
dftrain['target'] = encoder.fit_transform(dftrain['target'])
# Encode the test labels with the SAME fitted encoder. Fitting a separate
# encoder on the test labels can assign different integers to the same class
# (e.g. when a class is missing from the test set), which silently corrupts
# every metric. transform() raises if the test set holds an unseen label.
dftest['target'] = encoder.transform(dftest['target'])
encoder.classes_

In [None]:
# Encode categorical (object-typed) feature columns as integers.
# For columns that are categorical in both frames, one encoder is fitted on
# the values of BOTH frames, so a given category always maps to the same
# integer in the training and the test data.
for column in dftrain.columns:
    if column == "target" or dftrain[column].dtype != object:
        continue
    labelencoder = LabelEncoder()
    if column in dftest.columns and dftest[column].dtype == object:
        labelencoder.fit(pd.concat([dftrain[column], dftest[column]], ignore_index=True))
        dftest[column] = labelencoder.transform(dftest[column])
    else:
        labelencoder.fit(dftrain[column])
    dftrain[column] = labelencoder.transform(dftrain[column])

# Class labels must go through the encoder fitted on the training labels
# (`encoder`, created in the label-encoding cell); a fresh encoder could number
# the classes differently.
for frame in (dftrain, dftest):
    if frame["target"].dtype == object:
        frame["target"] = encoder.transform(frame["target"])

# Any object column left in the test frame has no categorical counterpart in
# the training frame; encode it on its own.
for column in dftest.columns:
    if dftest[column].dtype == object:
        labelencoder = LabelEncoder()
        dftest[column] = labelencoder.fit_transform(dftest[column])

X_train = dftrain.drop("target", axis='columns')
y_train = dftrain["target"]

X_test = dftest.drop("target", axis='columns')
y_test = dftest["target"]

In [None]:
# Check completeness and dtypes of the data
print("Sprawdzanie danych treningowych")
print("Informacje o danych treningowych:")
# info() writes its report itself and returns None; wrapping it in print()
# would append a stray "None" line.
dftrain.info()
print("\nPodsumowanie brakujących wartości w danych treningowych:")
print(dftrain.isnull().sum())
print("\nPodsumowanie podstawowych statystyk dla danych treningowych:")
print(dftrain.describe(include='all'))

print("\nSprawdzanie danych testowych")
print("Informacje o danych testowych:")
dftest.info()
print("\nPodsumowanie brakujących wartości w danych testowych:")
print(dftest.isnull().sum())
print("\nPodsumowanie podstawowych statystyk dla danych testowych:")
print(dftest.describe(include='all'))

# Drop rows with missing values, if there are any
dftrain.dropna(inplace=True)
dftest.dropna(inplace=True)

# The feature matrices and label vectors were extracted before the rows above
# were dropped; rebuild them so the models never see the removed rows.
X_train = dftrain.drop("target", axis='columns')
y_train = dftrain["target"]
X_test = dftest.drop("target", axis='columns')
y_test = dftest["target"]

# Verify that the missing values are gone
print("\nPo usunięciu brakujących danych:")
print("Podsumowanie brakujących wartości w danych treningowych:")
print(dftrain.isnull().sum())
print("Podsumowanie brakujących wartości w danych testowych:")
print(dftest.isnull().sum())

Correlation 

In [None]:
# Correlation analysis
# Work on a copy so the training frame used by the models is left untouched.
data = dftrain.copy()

# `target` has already been integer-encoded by an earlier cell, so replacing
# the string class names directly would match nothing. Decode the integers
# back to class names first, then merge the listed classes into 'other'.
target_names = data['target']
if target_names.dtype != object:
    target_names = pd.Series(encoder.inverse_transform(target_names), index=data.index)
merged_classes = ['legitimate', 'malformed', 'bruteforce', 'slowite', 'flood']
data['target'] = target_names.replace(merged_classes, 'other')

# Use a dedicated encoder here: re-fitting the global `encoder` would discard
# the original class names that the evaluation cells rely on.
corr_encoder = LabelEncoder()
data['target'] = corr_encoder.fit_transform(data['target'])

numerical_data = data.select_dtypes(include=['int64', 'float64'])

# Correlation matrix; rows/columns that are entirely NaN (constant features)
# are removed
corr_matrix = numerical_data.corr().dropna(axis=0, how='all').dropna(axis=1, how='all')

print(corr_matrix)

# Mask the upper triangle (including the diagonal) so each pair is shown once
my_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the correlation matrix
plt.figure(figsize=(18, 18))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, vmin=0, vmax=1, square=True, mask=my_mask)
plt.show()

KNN

In [None]:
# K-Nearest Neighbors: pick the number of neighbours k
# NOTE(review): k is selected on the test set, so the accuracy reported for the
# chosen k is optimistic; consider cross-validating on the training data.
k_values = list(range(1, 100))
# The search and the final model must use the same weighting scheme, otherwise
# the selected k was optimised for a different classifier than the one kept.
knn_weights = 'distance'
mean_accuracy = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, weights=knn_weights)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    mean_accuracy.append(np.mean(y_pred == y_test))
    print(k, mean_accuracy[-1])

optimal_k = k_values[np.argmax(mean_accuracy)]
print(f"The optimal number of neighbors is k={optimal_k}")
# Plot against k_values itself so the x-axis cannot drift from the searched range
plt.plot(k_values, mean_accuracy)
plt.xlabel("Number of Neighbors k")
plt.ylabel("Mean accuracy")
plt.show()

# Final model with the selected k
knn = KNeighborsClassifier(n_neighbors=optimal_k, weights=knn_weights)
knn.fit(X_train, y_train)


In [None]:
# Evaluate the KNN model
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Decode the integer labels back to the encoder's classes for reporting
y_test_labels = encoder.inverse_transform(y_test)
y_pred_labels = encoder.inverse_transform(y_pred)

clf_report = classification_report(y_test_labels, y_pred_labels, labels=encoder.classes_)
conf_matrix = confusion_matrix(y_test_labels, y_pred_labels, labels=encoder.classes_)

print(f'Accuracy: {accuracy:.4f}')
print()
print(f"Classification Report:\n{clf_report}")
print(f"Confusion Matrix:\n{conf_matrix}")
print("#"*150)

# Confusion-matrix heatmap. Reuse the matrix built with labels=encoder.classes_
# so its rows/columns always line up with the tick labels, even when a class
# is absent from both y_test and y_pred.
plt.figure(figsize=(10, 8))
cm = conf_matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

SVM

In [None]:
# Support Vector Machine with several kernel functions
kernels = ['poly', 'rbf', 'linear', 'sigmoid']
# The decoded true labels do not depend on the kernel, so decode them once
y_test_labels = encoder.inverse_transform(y_test)
for kernel in kernels:
    svm = SVC(kernel=kernel, cache_size=500, random_state=42)
    svm.fit(X_train, y_train)

    # Predict once and reuse the result for every metric (score() would run a
    # second, identical prediction pass)
    y_pred = svm.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Dokładność modelu SVM ({kernel}):", accuracy)

    y_pred_labels = encoder.inverse_transform(y_pred)

    # Pass labels= explicitly (as the KNN / Random Forest cells do): with
    # target_names alone the report raises when a class is missing from both
    # y_test and y_pred, or when the class names are not strings.
    clf_report = classification_report(y_test_labels, y_pred_labels, labels=encoder.classes_)
    conf_matrix = confusion_matrix(y_test_labels, y_pred_labels, labels=encoder.classes_)

    print(f"Classification Report:\n{clf_report}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print("#"*150)

    # Confusion-matrix heatmap; reuse the labelled matrix so rows/columns
    # always match the tick labels
    plt.figure(figsize=(10, 8))
    cm = conf_matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix ({kernel})')
    plt.show()

Decision tree

In [None]:
# Decision Tree: hyper-parameter search
# Fix the seed so the grid search and the resulting tree are reproducible
# (consistent with the SVM and Random Forest cells, which use random_state=42).
decision_tree = DecisionTreeClassifier(random_state=42)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(decision_tree, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Najlepsze parametry:", grid_search.best_params_)

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the whole training set, so reuse that fitted estimator
# instead of rebuilding and refitting it by hand.
clf = grid_search.best_estimator_

# Evaluate the decision tree: predict once and reuse the predictions for
# every metric below
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Dokładność modelu Decision Tree:", accuracy)

y_test_labels = encoder.inverse_transform(y_test)
y_pred_labels = encoder.inverse_transform(y_pred)

# Pass labels= explicitly (as the KNN / Random Forest cells do): with
# target_names alone the report raises when a class is missing from both
# y_test and y_pred, or when the class names are not strings.
clf_report = classification_report(y_test_labels, y_pred_labels, labels=encoder.classes_)
conf_matrix = confusion_matrix(y_test_labels, y_pred_labels, labels=encoder.classes_)

print(f"Classification Report:\n{clf_report}")
print(f"Confusion Matrix:\n{conf_matrix}")
print("#"*150)

# Confusion-matrix heatmap; reuse the labelled matrix so rows/columns always
# match the tick labels
plt.figure(figsize=(10, 8))
cm = conf_matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Horizontal bar chart of the fitted tree's feature importances,
# sorted ascending so the most important feature ends up at the top
importances = clf.feature_importances_
indices = np.argsort(importances)

bar_positions = range(len(indices))
bar_colour = (61/255, 111/255, 201/255)
sorted_feature_names = [X_train.columns[position] for position in indices]

plt.figure(figsize=(10, 12))
plt.barh(
    bar_positions,
    importances[indices],
    align='center',
    color=bar_colour,
)
plt.yticks(bar_positions, sorted_feature_names)
plt.xlabel('Udział cech w predykcji')
plt.show()

# Draw the fitted tree
plt.figure(figsize=(20, 10))
# Pass plain lists of strings: some scikit-learn releases reject a pandas
# Index / NumPy array for these parameters, and class names must be strings.
plot_tree(
    clf,
    feature_names=list(X_train.columns),
    class_names=[str(class_name) for class_name in encoder.classes_],
    filled=False,
)
plt.show()

Random forest

In [None]:
# Random Forest: hyper-parameter search
rf_classifier = RandomForestClassifier(random_state=42)

# Hyper-parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best hyper-parameters
print("Najlepsze parametry: ", grid_search.best_params_)

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the whole training set; fitting it again would only repeat
# the same (seeded) training run.
best_rf_classifier = grid_search.best_estimator_

# Predict and evaluate the tuned Random Forest
y_pred = best_rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Decode the integer labels back to the encoder's classes for reporting
y_test_labels = encoder.inverse_transform(y_test)
y_pred_labels = encoder.inverse_transform(y_pred)

clf_report = classification_report(y_test_labels, y_pred_labels, labels=encoder.classes_)
conf_matrix = confusion_matrix(y_test_labels, y_pred_labels, labels=encoder.classes_)

print(f"Classification Report:\n{clf_report}")
print(f"Confusion Matrix:\n{conf_matrix}")
print("#"*150)

# Confusion-matrix heatmap. Reuse the matrix built with labels=encoder.classes_
# so its rows/columns always line up with the tick labels, even when a class
# is absent from both y_test and y_pred.
plt.figure(figsize=(10, 8))
cm = conf_matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

Random forest ROC

In [None]:
# Cross-validated ROC curves for the tuned Random Forest.
# `roc_curve` and `auc` come from sklearn.metrics (imported in the import cell).
# NOTE(review): `grid_search` must still be the Random Forest search at this
# point (the Decision Tree cell uses the same variable name).
cv = KFold(n_splits=10, shuffle=True, random_state=42)
X = X_train.to_numpy()
y = y_train.to_numpy()

clfs = {
    "Random Forest": RandomForestClassifier(n_estimators= grid_search.best_params_['n_estimators'], 
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
}

# Common FPR grid on which the per-fold curves are interpolated and averaged
mean_fpr = np.linspace(0, 1, 100)
plt.figure(figsize=(15,8))
for clf_name, clf in clfs.items():
    # Per-classifier accumulators, so curves of different classifiers are
    # never averaged together
    tprs = []
    aucs = []
    for i, (train, test) in enumerate(cv.split(X, y), start=1):
        probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])
        # Column 1 of predict_proba holds the probability of clf.classes_[1];
        # naming that class as pos_label keeps score and label consistent and
        # lets roc_curve work (one-vs-rest) when the target is not binary.
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1], pos_label=clf.classes_[1])
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0  # every ROC curve starts at (0, 0)
        tprs.append(interp_tpr)
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.6,
                 label='Wartość ROC dla podziału nr %d' % i)

    # Chance-level diagonal
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Klasyfikacja losowa', alpha=.8)

    # Mean curve across folds, forced to end at (1, 1)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, 
            label='Wartości uśrednione ROC',
            lw=2, alpha=1)

    # Band of +/- one standard deviation around the mean curve, clipped to [0, 1]
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Cross-Validation ROC', fontsize=18)
plt.legend(loc="lower right", prop={'size': 12})
plt.show()