In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import os, sys, math
%matplotlib inline

In [2]:
train_data = pd.read_csv('../Data/train_yaOffsB.csv')
test_data = pd.read_csv('../Data/test_pFkWwen.csv')
sample_submission = pd.read_csv('../Data/sample_submission_O1oDc4H.csv')

In [3]:
ID = 'ID'
target = 'Crop_Damage'

In [4]:
categorical_columns = ['Season', 'Pesticide_Use_Category']

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelBinarizer

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [8]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, roc_curve

# Data Preprocessing

## Missing Imputation

In [9]:
train_data['Number_Weeks_Used'] = train_data['Number_Weeks_Used'].fillna(0)
test_data['Number_Weeks_Used'] = test_data['Number_Weeks_Used'].fillna(0)

## Scaling

In [10]:
for column in train_data.columns:
    if (column not in categorical_columns) & (column!=ID) & (column!=target):
        
        mms = MinMaxScaler()
        ss = StandardScaler()
        rs = RobustScaler()
        pt = PowerTransformer()
        ft_log = FunctionTransformer(np.log1p)
        
        train_data[f'{column}_mms'] = mms.fit_transform(train_data[[column]])
        test_data[f'{column}_mms'] = mms.transform(test_data[[column]])
        
        train_data[f'{column}_ss'] = ss.fit_transform(train_data[[column]])
        test_data[f'{column}_ss'] = ss.transform(test_data[[column]])
        
        train_data[f'{column}_rs'] = rs.fit_transform(train_data[[column]])
        test_data[f'{column}_rs'] = rs.transform(test_data[[column]])
        
        train_data[f'{column}_pt'] = pt.fit_transform(train_data[[column]])
        test_data[f'{column}_pt'] = pt.transform(test_data[[column]])
        
        train_data[f'{column}_ft_log'] = ft_log.fit_transform(train_data[[column]])
        test_data[f'{column}_ft_log'] = ft_log.transform(test_data[[column]])

## One Hot Encoding

In [11]:
train_data['is_train'] = True
test_data['is_train'] = False

In [12]:
for column in categorical_columns:
    train_data[column] = train_data[column].apply(lambda x: f'{column}_{x}')
    test_data[column] = test_data[column].apply(lambda x: f'{column}_{x}')
    
for idx, column in enumerate(categorical_columns):
    _tmp = pd.concat([train_data[['is_train', column]], test_data[['is_train', column]]])
    _tmp_ohe = pd.get_dummies(_tmp[column])

    if idx==0:
        _tmp_final = pd.concat([_tmp[['is_train']], _tmp_ohe], axis=1)
    else:
        _tmp_final = pd.concat([_tmp_final, _tmp_ohe], axis=1)

train_data = pd.concat([train_data.drop(categorical_columns+['is_train'], axis=1), _tmp_final[_tmp_final['is_train']]], axis=1)
test_data = pd.concat([test_data.drop(categorical_columns+['is_train'], axis=1), _tmp_final[~_tmp_final['is_train']]], axis=1)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(train_data.drop(['ID','Crop_Damage','is_train'], axis=1), 
                                                  train_data[['Crop_Damage']], test_size = 0.3, 
                                                  random_state = 1234, stratify = train_data['Crop_Damage'])

In [14]:
no_of_estimators = 500

In [38]:
models = {
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(n_estimators = no_of_estimators, max_depth=10, random_state=1234),
    'GBM_ES': GradientBoostingClassifier(n_estimators = no_of_estimators, max_depth=10, random_state=1234, 
                                      validation_fraction=0.2, n_iter_no_change=50)
}

In [39]:
for clf, model in models.items():
    print(f"Building {clf} model starts")
    cv_scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=5, scoring='accuracy')
    model.fit(X_train,y_train.values.ravel())
    y_pred = model.predict(X_val)
    val_score = accuracy_score(y_val.values.ravel(), y_pred)
    print(f"The average accuracy of {clf} model on training data is {np.round(cv_scores.mean(),2)} with a std of +/- {np.round((cv_scores.std()/cv_scores.mean())*100,2)} %")
    print(f"The average ROC score of the {clf} model on validation data is {np.round(val_score,2)}")
    print(f"Building {clf} model ends")
    print(f"---------------------------------------------------------------------------------------------------------")

Building DT model starts
The average accuracy of DT model on training data is 0.75 with a std of +/- 0.16 %
The average ROC score of the DT model on validation data is 0.75
Building DT model ends
---------------------------------------------------------------------------------------------------------
Building RF model starts
The average accuracy of RF model on training data is 0.84 with a std of +/- 0.17 %
The average ROC score of the RF model on validation data is 0.85
Building RF model ends
---------------------------------------------------------------------------------------------------------
Building GBM_ES model starts
The average accuracy of GBM_ES model on training data is 0.83 with a std of +/- 0.28 %
The average ROC score of the GBM_ES model on validation data is 0.84
Building GBM_ES model ends
---------------------------------------------------------------------------------------------------------


In [45]:
model_imp = pd.DataFrame({'var':X_train.columns,'imp':models['RF'].feature_importances_})

In [49]:
from sklearn.model_selection import RandomizedSearchCV

In [65]:
models['RF'].get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1234,
 'verbose': 0,
 'warm_start': False}

In [67]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(20,2000,10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3,20,10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,5,10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4] 
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [71]:
random_grid = {
    'bootstrap': bootstrap,
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators
}

In [74]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(rf, random_grid, n_iter=5, scoring='accuracy', cv=5, n_jobs=-1, random_state=1234,
                               verbose=1)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 32.3min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=5,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 4, 6, 8, 10, 12, 14,
                                                      16, 18, 20, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=1234, scoring='accuracy', verbose=1)

In [75]:
rf_random.best_params_

{'n_estimators': 680,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [76]:
rf_random.best_score_

0.8431672025723472

In [77]:
rf_random.best_estimator_.predict(X_val)

array([0, 0, 0, ..., 0, 0, 0])

In [78]:
y_pred = rf_random.best_estimator_.predict(X_val)
val_score = accuracy_score(y_val.values.ravel(), y_pred)
print(f"The average ROC score of the tuned RF model on validation data is {np.round(val_score,2)}")

The average ROC score of the tuned RF model on validation data is 0.85


In [79]:
test_data

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Estimated_Insects_Count_mms,Estimated_Insects_Count_ss,Estimated_Insects_Count_rs,...,Number_Weeks_Quit_rs,Number_Weeks_Quit_pt,Number_Weeks_Quit_ft_log,is_train,Season_1,Season_2,Season_3,Pesticide_Use_Category_1,Pesticide_Use_Category_2,Pesticide_Use_Category_3
0,F00000002,188,1,1,0,0.0,0,0.009628,-1.426324,-0.877464,...,-0.4375,-1.272764,0.000000,False,0,1,0,1,0,0
1,F00000007,410,1,1,0,0.0,0,0.065873,-1.164854,-0.687232,...,-0.4375,-1.272764,0.000000,False,0,1,0,1,0,0
2,F00000011,626,1,0,0,0.0,0,0.120598,-0.910450,-0.502142,...,-0.4375,-1.272764,0.000000,False,0,1,0,1,0,0
3,F00000013,731,1,0,0,0.0,0,0.147200,-0.786782,-0.412168,...,-0.4375,-1.272764,0.000000,False,0,1,0,1,0,0
4,F00000014,789,0,0,0,0.0,0,0.161895,-0.718469,-0.362468,...,-0.4375,-1.272764,0.000000,False,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59305,F00155937,3337,1,0,20,34.0,12,0.807449,2.282553,1.820908,...,0.3125,0.613424,2.564949,False,1,0,0,0,1,0
59306,F00155940,3516,1,0,20,32.0,10,0.852800,2.493378,1.974293,...,0.1875,0.464956,2.397895,False,0,1,0,0,1,0
59307,F00155941,3702,1,0,10,0.0,48,0.899924,2.712448,2.133676,...,2.5625,1.949070,3.891820,False,1,0,0,0,1,0
59308,F00155943,3702,1,0,10,28.0,17,0.899924,2.712448,2.133676,...,0.6250,0.914595,2.890372,False,0,1,0,0,1,0


In [95]:
final_pred = models['GBM_ES'].predict(test_data.drop(['ID','is_train'], axis=1))

In [96]:
len(final_pred)

59310

In [97]:
sample_submission['Crop_Damage'] = final_pred

In [98]:
sample_submission.to_csv('submission_gbm_es.csv',index=False)

In [99]:
from sklearn.feature_selection import RFE

In [100]:
rf_random.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=10, max_features='sqrt',
                       min_samples_leaf=4, min_samples_split=5,
                       n_estimators=680)

In [102]:
rfe = RFE(rf_random.best_estimator_, n_features_to_select=25, verbose=1)

In [103]:
selector = rfe.fit(X_train, y_train.values.ravel())

Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.


In [104]:
selector.support_

array([ True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False, False,  True, False, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
       False, False, False, False,  True,  True])

In [113]:
X_train.columns

Index(['Estimated_Insects_Count', 'Crop_Type', 'Soil_Type',
       'Number_Doses_Week', 'Number_Weeks_Used', 'Number_Weeks_Quit',
       'Estimated_Insects_Count_mms', 'Estimated_Insects_Count_ss',
       'Estimated_Insects_Count_rs', 'Estimated_Insects_Count_pt',
       'Estimated_Insects_Count_ft_log', 'Crop_Type_mms', 'Crop_Type_ss',
       'Crop_Type_rs', 'Crop_Type_pt', 'Crop_Type_ft_log', 'Soil_Type_mms',
       'Soil_Type_ss', 'Soil_Type_rs', 'Soil_Type_pt', 'Soil_Type_ft_log',
       'Number_Doses_Week_mms', 'Number_Doses_Week_ss', 'Number_Doses_Week_rs',
       'Number_Doses_Week_pt', 'Number_Doses_Week_ft_log',
       'Number_Weeks_Used_mms', 'Number_Weeks_Used_ss', 'Number_Weeks_Used_rs',
       'Number_Weeks_Used_pt', 'Number_Weeks_Used_ft_log',
       'Number_Weeks_Quit_mms', 'Number_Weeks_Quit_ss', 'Number_Weeks_Quit_rs',
       'Number_Weeks_Quit_pt', 'Number_Weeks_Quit_ft_log', 'Season_1',
       'Season_2', 'Season_3', 'Pesticide_Use_Category_1',
       'Pesticide_Use_

In [114]:
selected = pd.DataFrame({'feature':X_train.columns, 'selected': selector.support_})

In [119]:
list(selected[selected['selected']==True]['feature'])

['Estimated_Insects_Count',
 'Number_Doses_Week',
 'Number_Weeks_Used',
 'Number_Weeks_Quit',
 'Estimated_Insects_Count_mms',
 'Estimated_Insects_Count_ss',
 'Estimated_Insects_Count_rs',
 'Estimated_Insects_Count_pt',
 'Estimated_Insects_Count_ft_log',
 'Crop_Type_ft_log',
 'Number_Doses_Week_mms',
 'Number_Doses_Week_ss',
 'Number_Doses_Week_rs',
 'Number_Doses_Week_pt',
 'Number_Doses_Week_ft_log',
 'Number_Weeks_Used_mms',
 'Number_Weeks_Used_ss',
 'Number_Weeks_Used_rs',
 'Number_Weeks_Used_pt',
 'Number_Weeks_Used_ft_log',
 'Number_Weeks_Quit_mms',
 'Number_Weeks_Quit_ss',
 'Number_Weeks_Quit_pt',
 'Pesticide_Use_Category_2',
 'Pesticide_Use_Category_3']

In [105]:
selector.ranking_

array([ 1,  8, 11,  1,  1,  1,  1,  1,  1,  1,  1,  9,  5,  6,  7,  1, 10,
       15, 13, 17, 18,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,
        1,  2, 14, 12, 16,  4,  1,  1])

In [106]:
X_test_transform = selector.transform(test_data.drop(['ID','is_train'], axis=1))

In [110]:
final_pred = selector.predict(test_data.drop(['ID','is_train'], axis=1))

In [111]:
sample_submission['Crop_Damage'] = final_pred

In [112]:
sample_submission.to_csv('rf_tuned_rfe.csv',index=False)

In [117]:
selector.estimator_.feature_importances_

array([0.04320464, 0.04018929, 0.04211441, 0.02420291, 0.04025785,
       0.03934638, 0.03770165, 0.03837314, 0.03870244, 0.02034771,
       0.04301679, 0.04020243, 0.03858843, 0.039634  , 0.03809755,
       0.04507152, 0.05293414, 0.04302467, 0.04702803, 0.04426941,
       0.02274737, 0.02466028, 0.02393084, 0.08917578, 0.04317835])

In [120]:
model_imp_v = pd.DataFrame({'var':list(selected[selected['selected']==True]['feature']),'imp':selector.estimator_.feature_importances_})

In [121]:
model_imp_v

Unnamed: 0,var,imp
0,Estimated_Insects_Count,0.043205
1,Number_Doses_Week,0.040189
2,Number_Weeks_Used,0.042114
3,Number_Weeks_Quit,0.024203
4,Estimated_Insects_Count_mms,0.040258
5,Estimated_Insects_Count_ss,0.039346
6,Estimated_Insects_Count_rs,0.037702
7,Estimated_Insects_Count_pt,0.038373
8,Estimated_Insects_Count_ft_log,0.038702
9,Crop_Type_ft_log,0.020348


In [122]:
model_imp

Unnamed: 0,var,imp
0,Estimated_Insects_Count,0.042927
1,Crop_Type,0.00502
2,Soil_Type,0.002597
3,Number_Doses_Week,0.036682
4,Number_Weeks_Used,0.040675
5,Number_Weeks_Quit,0.022575
6,Estimated_Insects_Count_mms,0.043428
7,Estimated_Insects_Count_ss,0.038152
8,Estimated_Insects_Count_rs,0.041731
9,Estimated_Insects_Count_pt,0.041737
