In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import datasets
from sklearn.tree import plot_tree

In [None]:
def sklearn_to_df(sklearn_dataset):
    """Convert a scikit-learn dataset bundle into a pandas DataFrame.

    Parameters
    ----------
    sklearn_dataset
        Object exposing ``data``, ``feature_names``, ``target`` and ``DESCR``
        attributes (for example the result of ``datasets.load_wine()``).

    Returns
    -------
    tuple
        ``(frame, description)`` where ``frame`` holds one column per feature
        plus a ``'target'`` column, and ``description`` is the dataset's
        ``DESCR`` attribute.
    """
    features = pd.DataFrame(
        data=sklearn_dataset.data,
        columns=sklearn_dataset.feature_names,
    )
    frame = features.assign(target=sklearn_dataset.target)
    return frame, sklearn_dataset.DESCR

# Decision Tree

In [None]:
# Name of the column that holds the class label in the assembled DataFrame
label = "target"

# Load the wine dataset bundled with scikit-learn, then unpack it into a
# feature/target DataFrame and the dataset's textual description
wine_bunch = datasets.load_wine()
df, descr = sklearn_to_df(wine_bunch)

In [None]:
# Print the dataset's DESCR text that sklearn_to_df returned alongside the DataFrame
print(descr)

In [None]:
# Preview the first rows of the DataFrame (feature columns plus 'target')
df.head()

In [None]:
# List the column labels: the dataset's feature names plus the added 'target' column
df.columns

In [None]:
# Summary statistics for each column, useful for spotting differences in scale between features
df.describe()

In [None]:
# Separate the label column from the feature matrix
y = df[label]
X = df.drop(columns=[label])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a decision tree on the training portion (seeded for repeatable results)
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted class for every held-out row
y_pred = model.predict(X_test)


In [None]:
# Class labels known to the fitted decision tree
model.classes_

In [None]:
# Evaluate the decision tree on the held-out test set.
# The confusion matrix and the classification report span several lines, so
# each one starts on its own line to keep its rows and column headers aligned.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Plot the decision tree
plt.figure(figsize=(25, 10))
plot_tree(
    model,
    filled=True,
    rounded=True,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here
    feature_names=list(X_train.columns),
    # Derive the labels from the fitted model instead of hard-coding them, so
    # they always match the classes (and their order) the tree was trained on
    class_names=[str(cls) for cls in model.classes_],
)
plt.title('Decision Tree')

# Show the plot
plt.show()


Note: The Gini index, also known as Gini impurity, measures the probability that a randomly chosen sample from a node would be misclassified if it were labelled at random according to the class distribution in that node: $G = 1 - \sum_{k} p_k^2$, where $p_k$ is the fraction of the node's samples that belong to class $k$. When all of the samples in a node belong to a single class, the node is referred to as pure. The Gini index ranges from 0 to $1 - 1/K$ for $K$ classes (so it is always below 1), where 0 represents a perfectly pure node and the maximum is reached when the samples are spread evenly across all classes. For example, a Gini index of 0.5 corresponds to an even split between two classes, while for the three wine classes used here the maximum is $2/3 \approx 0.67$.

---

A small Gini index at a node means that the node is purer, i.e. most or all of its samples belong to a single class. This is desirable because the tree separates the classes more cleanly and is usually better able to assign new data to the correct class. A larger Gini index indicates that the samples in a node are more mixed, so predictions made from that node are less reliable. This can lead to a decrease in the accuracy of the model.

# Random Forest

Random Forest is an ensemble learning method for classification and regression problems. It builds on the decision tree algorithm, using multiple decision trees together to make predictions.

The basic idea behind Random Forest is to create multiple decision trees and combine their predictions to make a final prediction. Each tree is trained on a random sample of the data points (drawn with replacement) and considers only a random subset of the features when choosing each split. This process is repeated multiple times to create a collection of different decision trees.

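When making a prediction, the Random Forest algorithm combines the predictions made by all the decision trees in the forest: a majority vote over the trees for classification (scikit-learn averages the trees' predicted class probabilities), and the average of their outputs for regression. This approach mainly helps to reduce the variance, i.e. the tendency to overfit, of the predictions made by a single decision tree.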

Random Forest also includes a feature called "feature importance," which measures the importance of each feature in the dataset. This helps to identify the most important features in the dataset and can be used for feature selection.

In the case of a COVID test, Random Forest can be used to predict the probability of a patient testing positive for COVID-19 based on various features such as age, symptoms, and medical history. The algorithm can then be trained on a dataset of patients with known COVID-19 test results and used to predict the results of new patients. The feature importance can also be used to identify which symptoms or medical history are most important in predicting a positive test result.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest classifier: n_estimators is the number of trees in the
# forest; their individual predictions are combined into the ensemble's vote
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Plot the confusion matrix on its own labelled axes so the figure can be read
# without the surrounding code and never draws over an earlier plot
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set(
    title="Random Forest (10 trees) - confusion matrix",
    xlabel="Predicted class",
    ylabel="True class",
)
plt.show()

# Print accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
# Print the classification report
print(classification_report(y_test, y_pred))

In [None]:
# Train a random forest classifier with 100 trees, to compare against the
# smaller 10-tree forest
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Plot the confusion matrix on its own labelled axes so the figure can be read
# without the surrounding code and never draws over an earlier plot
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set(
    title="Random Forest (100 trees) - confusion matrix",
    xlabel="Predicted class",
    ylabel="True class",
)
plt.show()

# Print accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
# Print the classification report
print(classification_report(y_test, y_pred))