Натренувати класифікатор на датасеті

https://archive.ics.uci.edu/ml/datasets/credit+approval

DecisionTreeClassifier

Провести всі етапи (підготовка, графічний аналіз, нові фічі…)

Додатково: порівняти із LogisticRegression і метричною моделлю


* A1:	b, a.
* A2:	continuous.
* A3:	continuous.
* A4:	u, y, l, t.
* A5:	g, p, gg.
* A6:	c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
* A7:	v, h, bb, j, n, z, dd, ff, o.
* A8:	continuous.
* A9:	t, f.
* A10:	t, f.
* A11:	continuous.
* A12:	t, f.
* A13:	g, p, s.
* A14:	continuous.
* A15:	continuous.
* A16: +,-         (class attribute) - target

In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [61]:
# crx.data ships without a header row, so read it with header=None and supply
# the attribute names directly. Without this, pandas treats the first record
# as column labels and that sample is silently lost once the columns are renamed.
column_names = [f'A{i}' for i in range(1, 17)]
df = pd.read_csv('hw-8/credit+approval/crx.data', header=None, names=column_names)
df

In [62]:
# A1 / A2: treat the '?' placeholder as a missing value and drop those rows.
for column in ('A1', 'A2'):
    df[column] = df[column].replace('?', np.nan)
df = df.dropna(subset=['A1', 'A2'])
# A4: keep only rows whose value is 'u' or 'y'; every other A4 value is discarded.
allowed_a4 = ['u', 'y']
df = df[df['A4'].isin(allowed_a4)].copy()
df.dtypes

In [63]:
# A2 is listed as continuous; cast it to float now that the '?' rows are gone.
df = df.astype({'A2': float})
print(df.dtypes)

In [64]:
# Continuous attributes that are inspected graphically in the cells below.
numerical_features = ['A2', 'A3', 'A8', 'A11', 'A15']
# One histogram per numerical feature (trailing ';' hides the notebook echo).
df.hist(column=numerical_features, figsize=(10, 6));

In [65]:
# Draw one boxplot per numerical feature, each on its own figure.
for feature in numerical_features:
    fig, ax = plt.subplots()
    sns.boxplot(data=df, x=feature, ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    plt.show()

In [66]:
# A15 looks like it needs its extreme values handled first: preview what
# trimming the upper-tail outliers (classic IQR rule) would do.
# NOTE(review): df_filtered built here is reassigned by the A11 cell below, so
# this A15 filter does not appear to reach the modelling data - confirm intent.
Q1, Q3 = df['A15'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Only the upper side is trimmed: values above Q3 + 1.5 * IQR count as outliers.
upper_bound = Q3 + 1.5 * IQR
df_filtered = df.loc[df['A15'] <= upper_bound]

print(f"Original data size: {df.shape}")
print(f"Filtered data size: {df_filtered.shape}")

# Side-by-side histograms of A15 before and after removing the outliers.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))
ax_before.hist(df['A15'], bins=30, edgecolor='black')
ax_before.set_title('Before Outlier Removal')
ax_after.hist(df_filtered['A15'], bins=30, edgecolor='black')
ax_after.set_title('After Outlier Removal')


In [67]:
# Review what the A11 outliers look like and trim only the most extreme ones.
# Q1 / Q3 hold the 5th and 95th percentiles here (not quartiles), so the
# resulting cut-off is much looser than the classic IQR rule.
Q1, Q3 = df['A11'].quantile([0.05, 0.95])
IQR = Q3 - Q1

# Only the upper side is trimmed: values above Q3 + 1.5 * IQR count as outliers.
upper_bound = Q3 + 1.5 * IQR
df_filtered = df.loc[df['A11'] <= upper_bound]

print(f"Original data size: {df.shape}")
print(f"Filtered data size: {df_filtered.shape}")

# Side-by-side histograms of A11 before and after removing the outliers.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))
ax_before.hist(df['A11'], bins=30, edgecolor='black')
ax_before.set_title('Before Outlier Removal')
ax_after.hist(df_filtered['A11'], bins=30, edgecolor='black')
ax_after.set_title('After Outlier Removal')

In [68]:
# Continue with the outlier-filtered frame produced by the previous cell.
df = df_filtered
# Pairwise correlation of the numerical features, shown as a heatmap and as text.
corr_matrix = df.loc[:, numerical_features].corr()
sns.heatmap(data=corr_matrix)
print(corr_matrix)
# Author's reading of the output: no strong correlation between numerical features.

In [69]:
# Scatter-matrix of the numerical features (trailing ';' hides the notebook echo).
sns.pairplot(data=df, vars=numerical_features);

In [70]:
# Review and convert categorical features.
# A14 is documented as continuous, so it must not be one-hot encoded: doing so
# creates one dummy column per distinct value. It may still be stored as text
# (presumably because of '?' missing-value markers), so parse it as a number
# and drop the rows that cannot be parsed.
df = df.assign(A14=pd.to_numeric(df['A14'], errors='coerce')).dropna(subset=['A14'])
categorical_features = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [71]:
# Number of distinct categories observed for each categorical feature.
category_counts = {
    feature: df[feature].nunique() for feature in categorical_features
}

# Tabular view of the counts for easier reading in the notebook.
category_counts_df = pd.DataFrame(
    {
        'Feature': list(category_counts.keys()),
        'Unique Categories': list(category_counts.values()),
    }
)
category_counts_df

In [72]:
# One-hot encode the categorical features; all other columns pass through unchanged.
df_encoded = pd.get_dummies(data=df, columns=categorical_features)
df_encoded

In [74]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn import tree

In [81]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Separate the class label (A16) from the predictors.
target = 'A16'
y = df_encoded[target]
X = df_encoded.drop(target, axis=1)

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a decision tree with default hyper-parameters and predict the hold-out rows.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the predictions: overall accuracy plus per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:", report, sep="\n")

In [79]:
# Draw the fitted tree on a large canvas with feature and class labels so the
# split rules are legible; the default call produces a small figure whose
# nodes are labelled only by column position (X[i]).
plt.figure(figsize=(24, 12))
tree.plot_tree(
    clf,
    feature_names=list(X.columns),
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    fontsize=8,
);

In [92]:
# Baseline random forest restricted to very shallow trees (max_depth=2).
clf_rf = RandomForestClassifier(max_depth=2, random_state=0)
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
# Generic display names for the two classes in the report.
report_labels = ['class_0', 'class_1']
print(classification_report(y_test, y_pred_rf, target_names=report_labels))

# As a result it is a little bit better 
def find_best_model(X_train, y_train):
    """Grid-search the random forest ``max_depth`` with 5-fold cross-validation.

    Tries the even depths from 2 to 20 on a ``RandomForestClassifier`` created
    with ``random_state=0``, scored by accuracy, and prints the winning
    parameters together with their cross-validation score.

    Args:
        X_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.

    Returns:
        The ``best_params_`` dict of the fitted search (holds ``max_depth``).
    """
    from sklearn.model_selection import GridSearchCV

    depth_grid = {'max_depth': list(range(2, 21, 2))}
    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=0),
        param_grid=depth_grid,
        cv=5,
        scoring='accuracy',
    ).fit(X_train, y_train)

    # Pull out the winning configuration and its score for reporting.
    best_params, best_score = search.best_params_, search.best_score_
    print(f"Best parameters: {best_params}")
    print(f"Best cross-validation score: {best_score}")
    return best_params

# Tune max_depth by cross-validation, then refit a forest with the best value
# and report its performance on the hold-out set.
best_params = find_best_model(X_train, y_train)

clf_rf = RandomForestClassifier(random_state=0, **best_params)
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf, target_names=['class_0', 'class_1']))


In [99]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

# Logistic regression needs scaled inputs to converge within max_iter, and
# k-NN is distance-based, so unscaled large-range columns would dominate the
# neighbour search. Both models therefore get a StandardScaler in front.
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(random_state=42, max_iter=1000)),
])
clf_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Train the models
pipeline_lr.fit(X_train, y_train)
clf_knn.fit(X_train, y_train)

# Make predictions. The logistic-regression predictions must come from the
# fitted pipeline: no `clf_lr` estimator is defined in the visible cells, so
# using that name either raises NameError or reads a stale kernel object.
y_pred_lr = pipeline_lr.predict(X_test)
y_pred_knn = clf_knn.predict(X_test)

# Evaluate the models (y_pred_rf comes from the tuned random forest cell).
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

report_rf = classification_report(y_test, y_pred_rf)
report_lr = classification_report(y_test, y_pred_lr)
report_knn = classification_report(y_test, y_pred_knn)

conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)

# Print the results: accuracy first, then the per-class report and the
# confusion matrix that were previously computed but never displayed.
print("Random Forest Classifier")
print(f"Accuracy: {accuracy_rf}")
print(report_rf)
print(conf_matrix_rf)

print("\nLogistic Regression")
print(f"Accuracy: {accuracy_lr}")
print(report_lr)
print(conf_matrix_lr)

print("\nK-Nearest Neighbors")
print(f"Accuracy: {accuracy_knn}")
print(report_knn)
print(conf_matrix_knn)

In [None]:
# As we can see, Random Forest has the best results as a classifier model.