In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import os, sys, math
%matplotlib inline

In [2]:
# Read the three CSV files used by this notebook into DataFrames, in the
# order: training set, test set, sample submission.
train_data, test_data, sample_submission = map(
    pd.read_csv,
    (
        '../Data/train_yaOffsB.csv',
        '../Data/test_pFkWwen.csv',
        '../Data/sample_submission_O1oDc4H.csv',
    ),
)

In [3]:
# Names of the two non-feature columns: the row identifier and the label
# column that the classifiers below are trained to predict.
ID, target = 'ID', 'Crop_Damage'

In [4]:
# Columns that are one-hot encoded instead of being numerically rescaled.
categorical_columns = [
    'Season',
    'Pesticide_Use_Category',
]

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelBinarizer

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [8]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, roc_curve

# Data Preprocessing

## Missing Value Imputation

In [9]:
# Replace missing 'Number_Weeks_Used' values with 0 in both data sets.
for frame in (train_data, test_data):
    frame['Number_Weeks_Used'] = frame['Number_Weeks_Used'].fillna(0)

## Scaling

In [10]:
# Add five rescaled copies of every numeric feature: min-max (`_mms`),
# standard (`_ss`), robust (`_rs`), power-transformed (`_pt`) and log1p
# (`_ft_log`). Each transformer is fitted on the training column only and then
# applied unchanged to the matching test column, so no test statistics leak
# into the fit.

# Decide which columns to scale BEFORE adding anything. A column is skipped
# when it is categorical, the ID, the target, or when its name is an existing
# column name plus one of the suffixes below (i.e. it was produced by an
# earlier run of this cell). Without that last guard, re-running the cell
# would rescale the already-rescaled columns and create `<col>_mms_mms`, etc.
scaler_suffixes = ('mms', 'ss', 'rs', 'pt', 'ft_log')
non_scaled_columns = set(categorical_columns) | {ID, target}
already_derived = {
    f'{existing}_{suffix}'
    for existing in train_data.columns
    for suffix in scaler_suffixes
}
columns_to_scale = [
    name for name in train_data.columns
    if name not in non_scaled_columns and name not in already_derived
]

for column in columns_to_scale:
    # Fresh transformers per column: each one holds that column's fitted state.
    mms = MinMaxScaler()
    ss = StandardScaler()
    rs = RobustScaler()
    pt = PowerTransformer()
    ft_log = FunctionTransformer(np.log1p)

    # The tuple order fixes the order in which the new columns are appended.
    for suffix, transformer in (
        ('mms', mms),
        ('ss', ss),
        ('rs', rs),
        ('pt', pt),
        ('ft_log', ft_log),
    ):
        train_data[f'{column}_{suffix}'] = transformer.fit_transform(train_data[[column]])
        test_data[f'{column}_{suffix}'] = transformer.transform(test_data[[column]])

## One-Hot Encoding

In [11]:
# Tag every row with its origin so that train and test rows can be separated
# again after they are stacked together for one-hot encoding.
for frame, flag in ((train_data, True), (test_data, False)):
    frame['is_train'] = flag

In [12]:
# Step 1: prefix every category value with its column name, so the dummy
# columns created below are named "<column>_<value>".
for frame in (train_data, test_data):
    for column in categorical_columns:
        frame[column] = frame[column].apply(lambda value: f'{column}_{value}')

# Step 2: build the dummies on train and test stacked together, so both data
# sets end up with exactly the same set of dummy columns.
origin_flags = pd.concat([train_data[['is_train']], test_data[['is_train']]])
dummy_blocks = [
    pd.get_dummies(pd.concat([train_data[column], test_data[column]]))
    for column in categorical_columns
]
_tmp_final = pd.concat([origin_flags] + dummy_blocks, axis=1)

# Step 3: split the stacked dummies back by origin and attach them in place of
# the raw categorical columns. The `is_train` flag column comes back in with
# the dummy block.
is_train_mask = _tmp_final['is_train']
raw_columns_to_drop = categorical_columns + ['is_train']
train_data = pd.concat(
    [train_data.drop(columns=raw_columns_to_drop), _tmp_final[is_train_mask]],
    axis=1,
)
test_data = pd.concat(
    [test_data.drop(columns=raw_columns_to_drop), _tmp_final[~is_train_mask]],
    axis=1,
)

In [13]:
# Hold out 30% of the labelled rows for validation, stratified on the target
# so both splits keep the same class proportions. The label is kept as a
# one-column DataFrame.
feature_frame = train_data.drop(columns=[ID, target, 'is_train'])
target_frame = train_data[[target]]

X_train, X_val, y_train, y_val = train_test_split(
    feature_frame,
    target_frame,
    test_size=0.3,
    random_state=1234,
    stratify=train_data[target],
)

In [14]:
# Number of trees / boosting stages passed to the ensemble models.
no_of_estimators: int = 500

In [38]:
# Candidate classifiers, keyed by the short label printed in the training log.
# Every model is seeded with the same random_state so that repeated runs give
# the same scores (the decision tree was previously left unseeded).
models = {
    'DT': DecisionTreeClassifier(random_state=1234),
    'RF': RandomForestClassifier(n_estimators = no_of_estimators, max_depth=10, random_state=1234),
    # validation_fraction / n_iter_no_change configure early stopping for GBM.
    'GBM_ES': GradientBoostingClassifier(n_estimators = no_of_estimators, max_depth=10, random_state=1234, 
                                      validation_fraction=0.2, n_iter_no_change=50)
}

In [39]:
# Evaluate every candidate model:
#   1. 5-fold cross-validated accuracy on the training split;
#   2. accuracy on the held-out validation split after refitting the model on
#      the whole training split.
# The second figure is accuracy (accuracy_score), so it is reported as such;
# it was previously mislabelled as a ROC score.

# The label frames have a single column; flatten them once instead of on
# every iteration.
y_train_flat = y_train.values.ravel()
y_val_flat = y_val.values.ravel()

for clf, model in models.items():
    print(f"Building {clf} model starts")
    cv_scores = cross_val_score(model, X_train, y_train_flat, cv=5, scoring='accuracy')
    model.fit(X_train, y_train_flat)
    y_pred = model.predict(X_val)
    val_score = accuracy_score(y_val_flat, y_pred)
    # The spread printed here is the standard deviation relative to the mean, in percent.
    print(f"The average accuracy of {clf} model on training data is {np.round(cv_scores.mean(),2)} with a std of +/- {np.round((cv_scores.std()/cv_scores.mean())*100,2)} %")
    print(f"The accuracy of the {clf} model on validation data is {np.round(val_score,2)}")
    print(f"Building {clf} model ends")
    print("---------------------------------------------------------------------------------------------------------")

Building DT model starts
The average accuracy of DT model on training data is 0.75 with a std of +/- 0.16 %
The average ROC score of the DT model on validation data is 0.75
Building DT model ends
---------------------------------------------------------------------------------------------------------
Building RF model starts
The average accuracy of RF model on training data is 0.84 with a std of +/- 0.17 %
The average ROC score of the RF model on validation data is 0.85
Building RF model ends
---------------------------------------------------------------------------------------------------------
Building GBM_ES model starts
The average accuracy of GBM_ES model on training data is 0.83 with a std of +/- 0.28 %
The average ROC score of the GBM_ES model on validation data is 0.84
Building GBM_ES model ends
---------------------------------------------------------------------------------------------------------


In [41]:
# Display the full hyper-parameter set of the random forest model.
random_forest = models['RF']
random_forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1234,
 'verbose': 0,
 'warm_start': False}