In [None]:
import pandas as pd

import numpy as np

# Load the star dataset; the relative path is resolved against the notebook's
# working directory.
df = pd.read_csv('cleaned_star_data.csv')

# Preview the first rows to sanity-check how the columns were parsed.
df.head()

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
print(f"Розмір датасета: {df.shape}")

# Summarise column dtypes and non-null counts to spot columns that need cleaning.
df.info()

In [None]:
# Placeholder strings that stand for "no value" in the raw file. Only cells that
# are exactly equal to one of these markers are turned into NaN.
missing_markers = [' ', '-', 'Unknown', 'none', 'NaN']
df = df.replace(to_replace=missing_markers, value=np.nan)

In [None]:
# Lift pandas' row-display limit so the frame below is rendered in full.
# NOTE(review): this is a global option and stays in effect for every later cell
# that displays a DataFrame.
pd.options.display.max_rows = None

df

In [None]:
# Discard the row labelled 0 and renumber the remaining rows from 0.
# NOTE(review): because the index is reset, re-running this cell removes one more
# row each time; run it exactly once per fresh kernel.
df = df.drop(index=0).reset_index(drop=True)

In [None]:
# Re-check dtypes and non-null counts after the first row was removed.
df.info()

In [None]:
# Measurement columns that must be numeric for the analysis below.
cols_to_fix = ['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)', 'Absolute magnitude(Mv)']

# Convert every listed column to a numeric dtype in one pass; values that cannot
# be parsed as numbers become NaN (errors='coerce') instead of raising.
df = df.assign(**{
    column: pd.to_numeric(df[column], errors='coerce')
    for column in cols_to_fix
})

# Confirm the resulting dtypes and how many values are now missing.
df.info()

In [None]:
# Rows without a target label cannot be used for supervised learning. Filling the
# label with a median (as a blanket numeric fill would do) can even invent a class
# that does not exist, e.g. 2.5, so such rows are dropped instead of imputed.
df = df.dropna(subset=['Star type']).reset_index(drop=True)

# Numeric features: fill gaps with the per-column median. The target column is
# excluded so that it is never imputed.
medians = df.drop(columns='Star type').median(numeric_only=True)

df = df.fillna(medians)

# Categorical features: fill gaps with the most frequent value of the column.
df['Star color'] = df['Star color'].fillna(df['Star color'].mode()[0])
df['Spectral Class'] = df['Spectral Class'].fillna(df['Spectral Class'].mode()[0])

# Every count printed below should now be zero.
print("Кількість пропусків після заповнення:")
print(df.isnull().sum())
df

In [None]:
from sklearn.preprocessing import LabelEncoder

text_cols = ['Star color', 'Spectral Class']

# One fitted encoder per column. A single shared encoder is refit on every
# iteration, which throws away the mapping of all but the last column and makes
# it impossible to translate the integer codes back to the original categories.
# Use encoders[col].classes_ / encoders[col].inverse_transform(...) to decode.
encoders = {}

for col in text_cols:

    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col].astype(str))

# Kept for backward compatibility: `le` ends up, as before, fitted on the last
# column of `text_cols`.
le = encoders[text_cols[-1]]

# NOTE(review): this cell overwrites the text columns with integer codes, so it
# should be run once per fresh kernel; a second run would encode the codes.
df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

# Pairwise correlation of all columns, drawn as an annotated heatmap on an
# explicitly created axes.
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Теплова карта кореляції ознак", fontsize=16)
plt.show()

# One histogram per column; DataFrame.hist creates its own figure, which becomes
# the current figure and is then titled and laid out below.
df.hist(bins=20, figsize=(15, 12), color='steelblue', edgecolor='black')
hist_fig = plt.gcf()
hist_fig.suptitle("Гістограми розподілу всіх ознак", fontsize=16)
# Reserve a strip at the top (and a little at the bottom) so the suptitle does
# not overlap the subplots.
hist_fig.tight_layout(rect=(0, 0.03, 1, 0.95))
plt.show()

features_to_plot = ['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)', 'Absolute magnitude(Mv)']

# One figure per feature: distribution of the feature within each star type.
for feature in features_to_plot:
    fig, ax = plt.subplots(figsize=(10, 5))
    # hue mirrors x only to colour each box from the palette; the legend would be
    # redundant with the x axis and is therefore disabled.
    sns.boxplot(
        data=df,
        x='Star type',
        y=feature,
        hue='Star type',
        palette='viridis',
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Boxplot: {feature} відносно Типу зірки", fontsize=14)
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the target column from the predictors.
y = df['Star type']
X = df.drop(columns='Star type')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified, so class proportions in the test
# set may differ from those in the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only and apply the same
# transformation to both parts. From here on X_train / X_test are NumPy arrays.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Shallow tree used as the weak learner inside AdaBoost.
base_tree = DecisionTreeClassifier(random_state=42, max_depth=2)

# Baseline classifiers with fixed hyper-parameters, keyed by display name.
models = {
    "kNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "AdaBoost": AdaBoostClassifier(
        estimator=base_tree,
        random_state=42,
        n_estimators=100,
    ),
}

# Train each model on the scaled training data and report its accuracy on the
# held-out test set.
print("Результати точності моделей:")
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: {accuracy:.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# kNN: search the neighbourhood size with 5-fold cross-validation on the
# training data.
knn_params = {'n_neighbors': range(1, 21)}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_grid.fit(X_train, y_train)

# SVM: one sub-grid per kernel. `gamma` has no effect on the linear kernel, so
# crossing it with kernel='linear' only repeats identical fits and can make
# best_params_ report a gamma value that means nothing. With a list of grids,
# gamma is searched for the RBF kernel only.
svm_params = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
]
svm_grid = GridSearchCV(SVC(random_state=42), svm_params, cv=5)
svm_grid.fit(X_train, y_train)

# Estimators refit on the whole training set with the best parameters found.
best_knn = knn_grid.best_estimator_
best_svm = svm_grid.best_estimator_

print(f"Найкращі параметри kNN: {knn_grid.best_params_}")
print(f"Найкращі параметри SVM: {svm_grid.best_params_}")

# Choose the tuned model with the higher mean cross-validation score; on a tie
# the kNN model is kept.
if svm_grid.best_score_ > knn_grid.best_score_:
    final_model = best_svm
else:
    final_model = best_knn

# Evaluate the chosen model once on the held-out test set.
y_pred = final_model.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Create the axes explicitly and hand it to the display. Without `ax`, the
# display creates a figure of its own, which leaves a separately created figure
# empty and ignores its requested size.
fig, ax = plt.subplots(figsize=(8, 6))

# from_predictions derives the tick labels from the labels that actually occur
# in y_test / y_pred, so the axes show real class labels rather than positional
# indices.
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='Blues', ax=ax)
cm = disp.confusion_matrix  # raw counts, kept available under the usual name

ax.set_title("Confusion matrix")
plt.show()