In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from lazypredict.Supervised import LazyClassifier

In [2]:
# load dataset

train = pd.read_csv("./rs-data-science-wise-202324/train.csv")
test = pd.read_csv("./rs-data-science-wise-202324/test.csv")

In [3]:
train.head()

Unnamed: 0,UniqueID,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40,Cover_Type
0,0,3291.0,358.0,18.0,30.0,10.0,2161.0,189.0,204.0,150.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1,1,2563.0,334.0,18.0,60.0,5.0,1084.0,177.0,214.0,174.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,2,3340.0,187.0,24.0,90.0,21.0,1041.0,215.0,,154.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,7
3,3,2922.0,103.0,4.0,67.0,3.0,5057.0,227.0,234.0,141.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,1
4,4,2728.0,15.0,15.0,446.0,86.0,,205.0,207.0,136.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [4]:
train.columns

Index(['UniqueID', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area_1', 'Wilderness_Area_2', 'Wilderness_Area_3',
       'Wilderness_Area_4', 'Soil_Type_1', 'Soil_Type_2', 'Soil_Type_3',
       'Soil_Type_4', 'Soil_Type_5', 'Soil_Type_6', 'Soil_Type_7',
       'Soil_Type_8', 'Soil_Type_9', 'Soil_Type_10', 'Soil_Type_11',
       'Soil_Type_12', 'Soil_Type_13', 'Soil_Type_14', 'Soil_Type_15',
       'Soil_Type_16', 'Soil_Type_17', 'Soil_Type_18', 'Soil_Type_19',
       'Soil_Type_20', 'Soil_Type_21', 'Soil_Type_22', 'Soil_Type_23',
       'Soil_Type_24', 'Soil_Type_25', 'Soil_Type_26', 'Soil_Type_27',
       'Soil_Type_28', 'Soil_Type_29', 'Soil_Type_30', 'Soil_Type_31',
       'Soil_Type_32', 'Soil_Type_33', 'Soil_Type_34', 'Soil_Type_35',
       'Soil_Type_36', 'Soil

In [5]:
# extract binary features
binary_features = [col for col in train.columns if train[col].nunique() == 2]

# extract categorical features (i.e., integer features with <= 10 unique values)
categorical_features = [
    col for col in train.columns if train[col].dtype == "int64" and train[col].nunique() <= 10
]

# extract numerical features
numerical_features = [
    col for col in train.columns if train[col].dtype in ["int64", "float64"] and col not in binary_features
]


In [6]:
numerical_features

['UniqueID',
 'Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'Soil_Type_15',
 'Cover_Type']

In [7]:
categorical_features

['Cover_Type']

In [8]:
binary_features

['Wilderness_Area_1',
 'Wilderness_Area_2',
 'Wilderness_Area_3',
 'Wilderness_Area_4',
 'Soil_Type_1',
 'Soil_Type_2',
 'Soil_Type_3',
 'Soil_Type_4',
 'Soil_Type_5',
 'Soil_Type_6',
 'Soil_Type_7',
 'Soil_Type_8',
 'Soil_Type_9',
 'Soil_Type_10',
 'Soil_Type_11',
 'Soil_Type_12',
 'Soil_Type_13',
 'Soil_Type_14',
 'Soil_Type_16',
 'Soil_Type_17',
 'Soil_Type_18',
 'Soil_Type_19',
 'Soil_Type_20',
 'Soil_Type_21',
 'Soil_Type_22',
 'Soil_Type_23',
 'Soil_Type_24',
 'Soil_Type_25',
 'Soil_Type_26',
 'Soil_Type_27',
 'Soil_Type_28',
 'Soil_Type_29',
 'Soil_Type_30',
 'Soil_Type_31',
 'Soil_Type_32',
 'Soil_Type_33',
 'Soil_Type_34',
 'Soil_Type_35',
 'Soil_Type_36',
 'Soil_Type_37',
 'Soil_Type_38',
 'Soil_Type_39',
 'Soil_Type_40']

In [9]:
train['Soil_Type_15'].value_counts()

Soil_Type_15
0.00    9634
Name: count, dtype: int64

In [10]:
# add soil type 15 to binary features
binary_features.append('Soil_Type_15')

In [11]:
soil_type_features = [col for col in train.columns if col.startswith('Soil_Type')]
wilderness_area_features = [col for col in train.columns if col.startswith('Wilderness_Area')]


In [12]:
# remove cover type from categorical features
categorical_features.remove('Cover_Type')

In [13]:
# drop nan rows
train = train.dropna()

In [15]:
# take the binary features of the soil type and make them one categorical feature
train['Soil_Type'] = train[soil_type_features].idxmax(axis=1)

# take the binary features of the wilderness area and make them one categorical feature
train['Wilderness_Area'] = train[wilderness_area_features].idxmax(axis=1)

# drop the binary features of the soil type
train = train.drop(soil_type_features, axis=1)

# drop the binary features of the wilderness area
train = train.drop(wilderness_area_features, axis=1)

In [16]:
train.head()

Unnamed: 0,UniqueID,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Soil_Type,Wilderness_Area
0,0,3291.0,358.0,18.0,30.0,10.0,2161.0,189.0,204.0,150.0,1405.0,1,Soil_Type_38,Wilderness_Area_3
1,1,2563.0,334.0,18.0,60.0,5.0,1084.0,177.0,214.0,174.0,577.0,2,Soil_Type_29,Wilderness_Area_1
7,7,3077.0,64.0,13.0,60.0,12.0,5079.0,232.0,212.0,110.0,757.0,1,Soil_Type_30,Wilderness_Area_1
10,10,2407.0,300.0,22.0,182.0,63.0,1024.0,155.0,225.0,209.0,1518.0,6,Soil_Type_10,Wilderness_Area_4
11,11,2801.0,237.0,18.0,30.0,5.0,2507.0,186.0,253.0,201.0,470.0,2,Soil_Type_13,Wilderness_Area_3


In [17]:
X = train.drop(['Cover_Type', 'UniqueID'], axis=1)
y = train['Cover_Type']

In [18]:
# split data into train and validation sets
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# convert categorical features to string and binary features to bool
#for col in categorical_features:
#    X_train[col] = X_train[col].astype(str)
#    X_valid[col] = X_valid[col].astype(str)

#for col in binary_features:
#    X_train[col] = X_train[col].astype(bool)
#    X_valid[col] = X_valid[col].astype(bool)

In [23]:
# train model using lazy classifier and dont normalize data binary_features and categorical_features
# use F1 score as metric to rank models
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_valid, y_train, y_valid)

models.sort_values('F1 Score', ascending=False)

 93%|████████████████████████████████████████   | 27/29 [00:03<00:00,  7.68it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1987
[LightGBM] [Info] Number of data points in the train set: 2430, number of used features: 15
[LightGBM] [Info] Start training from score -1.006675
[LightGBM] [Info] Start training from score -0.723225
[LightGBM] [Info] Start training from score -2.839819
[LightGBM] [Info] Start training from score -4.962433
[LightGBM] [Info] Start training from score -4.184729
[LightGBM] [Info] Start training from score -3.668512
[LightGBM] [Info] Start training from score -3.141686


100%|███████████████████████████████████████████| 29/29 [00:04<00:00,  7.03it/s]






Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.79,0.56,,0.78,0.3
RandomForestClassifier,0.77,0.54,,0.77,0.54
LGBMClassifier,0.76,0.54,,0.76,0.38
BaggingClassifier,0.73,0.5,,0.73,0.19
LogisticRegression,0.73,0.42,,0.71,0.09
LabelPropagation,0.7,0.51,,0.7,0.3
LabelSpreading,0.7,0.51,,0.7,0.42
KNeighborsClassifier,0.7,0.5,,0.69,0.06
DecisionTreeClassifier,0.69,0.52,,0.69,0.05
SVC,0.7,0.32,,0.67,0.26
