Load data with encode:

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [2]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Encode the target variable
label_encoder = LabelEncoder()
train_df['nforest_type_encoded'] = label_encoder.fit_transform(train_df['nforest_type'])

# Define features and target
X = train_df.drop(columns=['id', 'nforest_type', 'nforest_type_encoded'])
y = train_df['nforest_type_encoded']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optionally, use PCA for dimensionality reduction
pca = PCA(n_components=10)  # Adjust n_components as needed
X_pca = pca.fit_transform(X_scaled)

# Split the transformed data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)


Choose model to ensemble:
    catboost classifier
    XGboost
    LightLLGBM

In [3]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score

In [7]:
# Initialize different classifiers
clf1 = RandomForestClassifier(n_estimators=100, random_state=42)
clf2 = LogisticRegression(random_state=42)
clf2 = SVC(probability=True, random_state=42)

# Create an ensemble model
ensemble = VotingClassifier(estimators=[
    ('rf', clf1), ('lr', clf2), ('svc', clf3)], voting='soft')

# Train the ensemble model
ensemble.fit(X_train, y_train)

# Evaluate the model using cross-validation
cv_scores = cross_val_score(ensemble, X_pca, y, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean()}")

# Validate the ensemble model on the validation set
y_pred = ensemble.predict(X_val)
print(classification_report(y_val, y_pred))
print('Validation Accuracy:', accuracy_score(y_val, y_pred))

Cross-Validation Scores: [0.692072   0.67483723 0.68173114 0.6835249  0.68084291]
Mean CV Score: 0.6826016367534349
              precision    recall  f1-score   support

           0       0.69      0.62      0.65       930
           1       0.81      0.68      0.74       509
           2       0.68      0.78      0.72      1172

    accuracy                           0.70      2611
   macro avg       0.72      0.69      0.70      2611
weighted avg       0.71      0.70      0.70      2611

Validation Accuracy: 0.7024128686327078


Prediction:

In [5]:
# Prepare the test data
test_X = test_df.drop(columns=['id'])
test_X_scaled = scaler.transform(test_X)
test_X_pca = pca.transform(test_X_scaled)

# Make predictions
test_predictions = ensemble.predict(test_X_pca)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)


get submition:

In [6]:

# Load your sample submission file
sample_submission = pd.read_csv('sample_submission.csv')

# Merge the test data with sample submission to fill in the predicted values
predictions_df = pd.DataFrame({'id': test_df['id'], 'nforest_type': test_predictions_decoded})
final_submission = sample_submission.merge(predictions_df, on='id', how='left', suffixes=('', '_predicted'))

# Fill the missing values in sample submission with the predicted values
final_submission['nforest_type'] = final_submission['nforest_type'].combine_first(final_submission['nforest_type_predicted'])

# Drop the predicted column as it's no longer needed
final_submission = final_submission.drop(columns=['nforest_type_predicted'])

# Save the final submission
final_submission.to_csv('ensemble.csv', index=False)

In [1]:
%pip install autogluon.tabular

Collecting autogluon.tabular
  Downloading autogluon.tabular-0.0.16b20210206-py3-none-any.whl.metadata (7.7 kB)
Collecting scipy<1.5.0,>=1.3.3 (from autogluon.tabular)
  Downloading scipy-1.4.1.tar.gz (24.6 MB)
     ---------------------------------------- 0.0/24.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/24.6 MB ? eta -:--:--
     ---------------------------------------- 0.1/24.6 MB 2.4 MB/s eta 0:00:11
     ---------------------------------------- 0.2/24.6 MB 2.1 MB/s eta 0:00:12
      --------------------------------------- 0.3/24.6 MB 2.1 MB/s eta 0:00:12
      --------------------------------------- 0.4/24.6 MB 2.2 MB/s eta 0:00:12
      --------------------------------------- 0.5/24.6 MB 2.1 MB/s eta 0:00:12
     - -------------------------------------- 0.6/24.6 MB 2.3 MB/s eta 0:00:11
     - -------------------------------------- 0.8/24.6 MB 2.3 MB/s eta 0:00:11
     - -------------------------------------- 0.9/24.6 MB 2.3 MB/s eta 0:00:11
     - ------

  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [466 lines of output]
      Ignoring numpy: markers 'python_version == "3.5" and platform_system != "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.6" and platform_system != "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.7" and platform_system != "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.5" and platform_system == "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.6" and platform_system == "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version == "3.7" and platform_system == "AIX"' don't match your environment
      Ignoring numpy: markers 'python_version >= "3.8" and platform_system == "AIX"' don't match your environment
      Collecting wheel
        Using cach

In [2]:
# inference ensemble model with autogluon
from autogluon.tabular import TabularPredictor

# Define the task
predictor = TabularPredictor(label='nforest_type_encoded').fit(train_df)

# Make predictions
test_predictions = predictor.predict(test_df)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)

# Merge the test data with sample submission to fill in the predicted values
predictions_df = pd.DataFrame({'id': test_df['id'], 'nforest_type': test_predictions_decoded})
final_submission = sample_submission.merge(predictions_df, on='id', how='left', suffixes=('', '_predicted'))

# Fill the missing values in sample submission with the predicted values
final_submission['nforest_type'] = final_submission['nforest_type'].combine_first(final_submission['nforest_type_predicted'])

# Drop the predicted column as it's no longer needed
final_submission = final_submission.drop(columns=['nforest_type_predicted'])

# Save the final submission
final_submission.to_csv('ensemble_autogluon.csv', index=False)

ModuleNotFoundError: No module named 'autogluon'