# TP2 - Fondamentaux de l'apprentissage automatique

Executed on Colab

## Pre-execution

### Verify GPU is available

In [None]:
!nvidia-smi

### pip installation

In [None]:
!pip install umap-learn

In [None]:
!pip install \
    --extra-index-url=https://pypi.nvidia.com \
    cudf-cu12==24.10.* dask-cudf-cu12==24.10.* cuml-cu12==24.10.* \
    cugraph-cu12==24.10.* nx-cugraph-cu12==24.10.* cuspatial-cu12==24.10.* \
    cuproj-cu12==24.10.* cuxfilter-cu12==24.10.* cucim-cu12==24.10.* \
    pylibraft-cu12==24.10.* raft-dask-cu12==24.10.* cuvs-cu12==24.10.* \
    nx-cugraph-cu12==24.10.*

### Imports

In [None]:
from time import time
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Only CPU
from sklearn.ensemble import RandomForestClassifier as RFC

# GPU-accelerated
from cuml.manifold import TSNE as cumlTSNE
from cuml.manifold import UMAP as cumlUMAP



### Load data

In [None]:
print("1. Chargement des données...")

# Column names for the UCI Covertype file, in file order:
#   10 quantitative terrain features,
#    4 one-hot wilderness-area flags,
#   40 one-hot soil-type flags,
#    1 target column (Cover_Type).
columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    *[f"Wilderness_Area{i}" for i in range(1, 5)],
    *[f"Soil_Type_{i}" for i in range(1, 41)],
    "Cover_Type",
]

# The file has 55 columns. A shorter name list does not make read_csv fail:
# the surplus leading columns end up as the index and every remaining feature
# is mislabelled, so check the count explicitly.
assert len(columns) == 55, f"expected 55 column names, got {len(columns)}"

print(columns)

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz', header=None, names=columns)

# Every name must map to exactly one data column (no implicit index columns).
assert data.shape[1] == len(columns), "column names do not match the file layout"

### Split data

In [None]:
# The target is the cover-type label; the features are every other column.
y = data['Cover_Type']
X = data.drop(columns='Cover_Type')

# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Standardize data

In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits so no
# statistics from the test split are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Functions

In [None]:
# Seven-colour palette built from the first seven entries of matplotlib's "tab10".
custom_cmap = ListedColormap(plt.cm.tab10.colors[:7])


def visualize_embeddings(X, y, title):
    """Scatter-plot a 2-D embedding, colouring each point by its label.

    Args:
        X: array indexable as ``X[:, 0]`` / ``X[:, 1]``; the first two columns
            are used as the x and y coordinates.
        y: per-point values passed to the scatter plot as colours (``c=y``).
        title: text shown as the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    points = ax.scatter(X[:, 0], X[:, 1], c=y, cmap=custom_cmap, s=1)
    fig.colorbar(points, ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
def evaluate(rfc, X_test, y_test):
    """Print accuracy, weighted F1 and a classification report for a fitted model.

    Args:
        rfc: fitted estimator exposing ``predict``.
        X_test: features passed to ``rfc.predict``.
        y_test: true labels the predictions are compared against.

    Returns:
        None. All results are written to standard output.
    """
    predicted = rfc.predict(X_test)

    # Compute both scalar metrics first, then print them in a fixed order.
    scores = (
        ("Accuracy", accuracy_score(y_test, predicted)),
        ("F1-Score", f1_score(y_test, predicted, average='weighted')),
    )
    for label, value in scores:
        print(f"{label}: {value:.4f}")

    print("\nRapport de classification:")
    print(classification_report(y_test, predicted))

In [None]:
def convert(seconds):
    """Format a duration given in seconds as a zero-padded ``mm:ss`` string.

    Minutes are not wrapped into hours (3600 s gives ``"60:00"``) and any
    fractional part is truncated by the ``%d`` formatting.

    Args:
        seconds: duration in seconds (int or float).

    Returns:
        The duration formatted as ``"mm:ss"``.
    """
    whole_minutes, remainder = divmod(seconds, 60)
    return "%02d:%02d" % (whole_minutes, remainder)

## Computing

In [None]:
# Optionally keep only the first rows of the training split so the GPU
# embeddings and the classifier run quickly; set is_reduced to False to use
# the whole training split.
print("2. Réduction de la dimensionnalité...")
is_reduced = True

if is_reduced:
    n = 500

    # Slice features and labels by position with the same bound so they stay aligned.
    X_train_scaled, y_train = X_train_scaled[:n], y_train.iloc[:n]

### No Sampling

In [None]:
# 2-D t-SNE embedding of the training features with cuML (GPU), timed.
# The start time lives in its own variable so time_tsne_no_sampling only ever
# holds an elapsed duration in seconds, never an absolute timestamp (which a
# failed fit would otherwise leave behind).
start_tsne_no_sampling = time()

tsne_no_sampling = cumlTSNE(n_components=2, random_state=42)
X_tsne_no_sampling = tsne_no_sampling.fit_transform(X_train_scaled)

time_tsne_no_sampling = time() - start_tsne_no_sampling


In [None]:
# 2-D UMAP embedding of the training features with cuML (GPU), timed.
# The start time lives in its own variable so time_umap_no_sampling only ever
# holds an elapsed duration in seconds, never an absolute timestamp (which a
# failed fit would otherwise leave behind).
start_umap_no_sampling = time()

umap_no_sampling = cumlUMAP(n_components=2, random_state=42)
X_umap_no_sampling = umap_no_sampling.fit_transform(X_train_scaled)

time_umap_no_sampling = time() - start_umap_no_sampling

In [None]:
# Random forest on the training features, timed.
# RFC is scikit-learn's RandomForestClassifier, so this step runs on the CPU
# (unlike the cuML embeddings).
# The start time lives in its own variable so time_rfc_no_sampling only ever
# holds an elapsed duration in seconds, never an absolute timestamp.
start_rfc_no_sampling = time()

rfc_no_sampling = RFC(n_estimators=100, random_state=42)
rfc_no_sampling.fit(X_train_scaled, y_train)

time_rfc_no_sampling = time() - start_rfc_no_sampling

### Under Sampling

In [None]:
# Balance the classes by random under-sampling of the training set
# (seeded so the selection is reproducible).
rus = RandomUnderSampler(random_state=42)
X_train_scaled_under, y_train_under = rus.fit_resample(
    X_train_scaled,
    y_train,
)

In [None]:
# 2-D t-SNE embedding of the under-sampled training features with cuML (GPU), timed.
# The start time lives in its own variable so time_tsne_under_sampling only
# ever holds an elapsed duration in seconds, never an absolute timestamp
# (which a failed fit would otherwise leave behind).
start_tsne_under_sampling = time()

tsne_under_sampling = cumlTSNE(n_components=2, random_state=42)
X_tsne_under_sampling = tsne_under_sampling.fit_transform(X_train_scaled_under)

time_tsne_under_sampling = time() - start_tsne_under_sampling

In [None]:
# 2-D UMAP embedding of the under-sampled training features with cuML (GPU), timed.
# The start time lives in its own variable so time_umap_under_sampling only
# ever holds an elapsed duration in seconds, never an absolute timestamp
# (which a failed fit would otherwise leave behind).
start_umap_under_sampling = time()

umap_under_sampling = cumlUMAP(n_components=2, random_state=42)
X_umap_under_sampling = umap_under_sampling.fit_transform(X_train_scaled_under)

time_umap_under_sampling = time() - start_umap_under_sampling

In [None]:
# Random forest on the under-sampled training features, timed.
# RFC is scikit-learn's RandomForestClassifier, so this step runs on the CPU
# (unlike the cuML embeddings).
# The start time lives in its own variable so time_rfc_under_sampling only
# ever holds an elapsed duration in seconds, never an absolute timestamp.
start_rfc_under_sampling = time()

rfc_under_sampling = RFC(n_estimators=100, random_state=42)
rfc_under_sampling.fit(X_train_scaled_under, y_train_under)

time_rfc_under_sampling = time() - start_rfc_under_sampling

### Over Sampling

In [None]:
# SMOTE builds synthetic samples from each minority sample's nearest neighbours
# of the same class, so k_neighbors must be smaller than the size of the rarest
# class. With a small (reduced) training set the library default of 5 can exceed
# that and make fit_resample fail, so cap it; the default is kept whenever every
# class has at least 6 samples.
smallest_class_size = int(pd.Series(y_train).value_counts().min())
if smallest_class_size < 2:
    raise ValueError(
        "SMOTE needs at least 2 samples in every class present in y_train; "
        f"the rarest class has {smallest_class_size}."
    )

smote = SMOTE(random_state=42, k_neighbors=min(5, smallest_class_size - 1))
X_train_scaled_over, y_train_over = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# 2-D t-SNE embedding of the over-sampled training features with cuML (GPU), timed.
# The start time lives in its own variable so time_tsne_over_sampling only
# ever holds an elapsed duration in seconds, never an absolute timestamp
# (which a failed fit would otherwise leave behind).
start_tsne_over_sampling = time()

tsne_over_sampling = cumlTSNE(n_components=2, random_state=42)
X_tsne_over_sampling = tsne_over_sampling.fit_transform(X_train_scaled_over)

time_tsne_over_sampling = time() - start_tsne_over_sampling

In [None]:
# 2-D UMAP embedding of the over-sampled training features with cuML (GPU), timed.
# The start time lives in its own variable so time_umap_over_sampling only
# ever holds an elapsed duration in seconds, never an absolute timestamp
# (which a failed fit would otherwise leave behind).
start_umap_over_sampling = time()

umap_over_sampling = cumlUMAP(n_components=2, random_state=42)
X_umap_over_sampling = umap_over_sampling.fit_transform(X_train_scaled_over)

time_umap_over_sampling = time() - start_umap_over_sampling

In [None]:
# Random forest on the over-sampled training features, timed.
# RFC is scikit-learn's RandomForestClassifier, so this step runs on the CPU
# (unlike the cuML embeddings).
# The start time lives in its own variable so time_rfc_over_sampling only
# ever holds an elapsed duration in seconds, never an absolute timestamp.
start_rfc_over_sampling = time()

rfc_over_sampling = RFC(n_estimators=100, random_state=42)
rfc_over_sampling.fit(X_train_scaled_over, y_train_over)

time_rfc_over_sampling = time() - start_rfc_over_sampling

## Results

### Speed comparison

In [None]:
# Elapsed times (seconds) grouped by sampling strategy, in display order:
# (t-SNE, UMAP, random forest).
timings_by_strategy = (
    ("No Sampling", (time_tsne_no_sampling, time_umap_no_sampling, time_rfc_no_sampling)),
    ("Under Sampling", (time_tsne_under_sampling, time_umap_under_sampling, time_rfc_under_sampling)),
    ("Over Sampling", (time_tsne_over_sampling, time_umap_over_sampling, time_rfc_over_sampling)),
)

for strategy, elapsed in timings_by_strategy:
    print(f"---- {strategy} ----")
    # Pair each model label with its duration and print it as mm:ss.
    for model_label, seconds_taken in zip(("TSNE", "UMAP", "RFC"), elapsed):
        print(f"{model_label}: {convert(seconds_taken)}")

### No Sampling

In [None]:
# Plot both 2-D embeddings of the original training set, then score the
# forest trained on it against the held-out test split.
for embedding, method in ((X_tsne_no_sampling, "T-SNE"), (X_umap_no_sampling, "UMAP")):
    visualize_embeddings(embedding, y_train, f"{method} (No Sampling)")

evaluate(rfc_no_sampling, X_test_scaled, y_test)

### Under Sampling

In [None]:
# Plot both 2-D embeddings of the under-sampled training set, then score the
# forest trained on it against the held-out test split.
for embedding, method in ((X_tsne_under_sampling, "T-SNE"), (X_umap_under_sampling, "UMAP")):
    visualize_embeddings(embedding, y_train_under, f"{method} (Under Sampling)")

evaluate(rfc_under_sampling, X_test_scaled, y_test)

### Over Sampling

In [None]:
# Plot both 2-D embeddings of the over-sampled training set, then score the
# forest trained on it against the held-out test split.
for embedding, method in ((X_tsne_over_sampling, "T-SNE"), (X_umap_over_sampling, "UMAP")):
    visualize_embeddings(embedding, y_train_over, f"{method} (Over Sampling)")

evaluate(rfc_over_sampling, X_test_scaled, y_test)