In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.options.display.max_columns = 30
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score, make_scorer

In [None]:
# Read the train and test CSV files; the `Id` column becomes the row index.
INPUT_DIR = '../input/learn-together'
data_train = pd.read_csv(f'{INPUT_DIR}/train.csv', index_col='Id')
data_test = pd.read_csv(f'{INPUT_DIR}/test.csv', index_col='Id')

In [None]:
# Preview the first rows of the training data as loaded from the CSV.
data_train.head()

In [None]:
# Report (rows, columns) for both datasets.
for label, frame in (("Train", data_train), ("Test", data_test)):
    print(f"{label} dataset shape:", frame.shape)

In [None]:
# True if any cell in the dataset is NaN, False otherwise.
for label, frame in (("train", data_train), ("test", data_test)):
    has_missing = frame.isna().any().any()
    print(f"Missing Values in {label} dataset: {has_missing}")

In [None]:
# Distinct column dtypes present in each dataset.
for label, frame in (("Train", data_train), ("Test", data_test)):
    column_types = set(frame.dtypes)
    print(f"{label} Column Types: {column_types}")

In [None]:
# Summary statistics of the training data, transposed so each column becomes a row.
data_train.describe().T

In [None]:
# Show the distinct values of the two soil-type columns in both datasets.
# `data_test` is the test set (a separate validation split is created later),
# so it is labelled "test data" here.
for column in ('Soil_Type7', 'Soil_Type15'):
    print(f"Unique values in '{column}' column in training data:\n", data_train[column].unique())
    print(f"\nUnique values in '{column}' column in test data:\n", data_test[column].unique())
    print()

In [None]:
# Remove both soil-type columns from the train and test datasets.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError because
# the columns are already gone.
UNUSED_SOIL_COLUMNS = ["Soil_Type7", "Soil_Type15"]
data_train = data_train.drop(columns=UNUSED_SOIL_COLUMNS, errors='ignore')
data_test = data_test.drop(columns=UNUSED_SOIL_COLUMNS, errors='ignore')

## Creating new features

In [None]:
def new_futures(data):
    """Add engineered distance/elevation features to ``data`` and return it.

    The frame is modified in place (columns are appended in a fixed order) and
    the very same object is returned, so ``df = new_futures(df)`` and a bare
    ``new_futures(df)`` are equivalent.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Elevation``, ``Slope``,
        ``Vertical_Distance_To_Hydrology``, ``Horizontal_Distance_To_Hydrology``,
        ``Horizontal_Distance_To_Fire_Points`` and
        ``Horizontal_Distance_To_Roadways``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with 20 additional feature columns.
    """
    elevation = data['Elevation']
    hydro_h = data['Horizontal_Distance_To_Hydrology']
    hydro_v = data['Vertical_Distance_To_Hydrology']
    fire_h = data['Horizontal_Distance_To_Fire_Points']
    road_h = data['Horizontal_Distance_To_Roadways']

    # Slope is handled as an angle in degrees (Vertical_Angle below is
    # 90 - Slope), so it is converted to radians and passed to the
    # trigonometric tangent. np.tanh is the hyperbolic tangent: applied to
    # degree values it is ~1 for nearly every row, which made the "vertical"
    # distances mere copies of the horizontal ones.
    # NOTE(review): tan() blows up as Slope approaches 90 degrees; confirm the
    # data never gets close to that.
    slope_tan = np.tan(np.radians(data['Slope']))

    # --- hydrology -----------------------------------------------------
    data['Bottom_Hydrology'] = hydro_v < 0
    data['Elevation_Of_Hydrology'] = elevation - hydro_v
    data['Decreased_Elevation'] = elevation - hydro_h
    data['Direct_Dist_to_Hydrology'] = (hydro_h ** 2 + hydro_v ** 2) ** 0.5

    # --- slope-derived vertical offsets ---------------------------------
    data['Vertical_Angle'] = 180 - 90 - data['Slope']
    data['Vertical_Distance_To_Fire_Point'] = fire_h * slope_tan
    data['Vertical_Distance_To_Roadways'] = road_h * slope_tan

    fire_v = data['Vertical_Distance_To_Fire_Point']
    road_v = data['Vertical_Distance_To_Roadways']

    data['Elevation_Of_Fire_Point'] = elevation - fire_v
    data['Elevation_Of_Roadways'] = elevation - road_v

    # Straight-line distances from the right triangle (horizontal, vertical).
    data['Direct_Dist_to_Fire_Point'] = (fire_v ** 2 + fire_h ** 2) ** 0.5
    data['Direct_Dist_to_Roadways'] = (road_v ** 2 + road_h ** 2) ** 0.5

    # --- pairwise combinations of the horizontal distances ----------------
    data['Hydrology_Fire_Point'] = abs(hydro_h - fire_h)
    data['Hydrology_Fire_Point_Plus'] = hydro_h + fire_h
    data['Hydrology_Road_Plus'] = hydro_h + road_h
    data['Hydrology_Road_Minus'] = abs(hydro_h - road_h)
    data['Fire_Roadways_Plus'] = fire_h + road_h
    data['Fire_Roadways_Minus'] = abs(fire_h - road_h)

    data['Hydrology_Fire_Point_Mean'] = (hydro_h + fire_h) / 2
    data['Hydrology_Road_Mean'] = (hydro_h + road_h) / 2
    data['Fire_Roadways_Mean'] = (fire_h + road_h) / 2

    return data

# new_futures() adds its columns to the frame it is given and returns that same frame.
data_train = new_futures(data_train)
data_test = new_futures(data_test)

In [None]:
# Preview the training frame again, now including the engineered columns.
data_train.head()

In [None]:
# Re-check shapes and missing values after the engineered columns were added.
datasets = {"Train": data_train, "Test": data_test}
for label, frame in datasets.items():
    print(f"{label} dataset shape:", frame.shape)
for label, frame in datasets.items():
    print(f"Missing Values in {label.lower()} dataset: {frame.isna().any().any()}")

In [None]:
# Per-column summary of the training frame after feature engineering.
data_train.info()

In [None]:
# Separate the target column from the predictor columns.
TARGET = 'Cover_Type'
X = data_train.drop(columns=[TARGET])
y = data_train[TARGET]

In [None]:
# 80% of the rows go to training, the rest to validation; random_state makes
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=1)
for part in (X_train, y_train, X_val, y_val):
    print(part.shape)

# RF Model

In [None]:
%%time
# looking for best n_estimators using cross validation and GridSearch
# Base estimator: its n_estimators=100 is replaced by each value in `params`
# during the search.
rf_model = RandomForestClassifier(n_estimators = 100, n_jobs=-1, random_state=1)
params = {'n_estimators': [100, 500, 1000, 2000]}
# 3-fold cross-validation scored by accuracy.
# NOTE(review): both the forest and the grid search request n_jobs=-1 - confirm
# this nesting does not oversubscribe the CPU.
grid_rf = GridSearchCV(rf_model, params, n_jobs=-1, cv=3, scoring='accuracy', verbose=True)
# The search is fitted on the full X / y, not only on the X_train split.
# NOTE(review): `score_rf` receives the return value of fit(), which is
# presumably the fitted search object rather than a score - confirm.
score_rf = grid_rf.fit(X, y)
print(grid_rf.best_score_) 
print(grid_rf.best_params_)

0.801058201058201
{'n_estimators': 500}
CPU times: user 37.2 s, sys: 488 ms, total: 37.7 s
Wall time: 2min 35s

In [None]:
# Fit the forest on the training split and score it on the validation split.
# n_estimators=1500 is a manual choice: it is not one of the values tried in
# the grid search above.
rf_model = RandomForestClassifier(n_estimators=1500, n_jobs=-1, random_state=1)
rf_model.fit(X_train, y_train)
rf_model_pred = rf_model.predict(X_val)
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, so the
# number is unchanged, but the correct order matters if this line is reused
# with an asymmetric metric.
print("RandomForest Val accuracy: ", accuracy_score(y_val, rf_model_pred))

RandomForest Val accuracy:  0.888558201058201

In [None]:
def plot_feature_importances(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Bars are labelled with the columns of the global ``X_train`` in their
    original order, so ``model`` is expected to have been fitted on
    ``X_train``. Nothing is returned; the chart is drawn on a new figure.

    Note: a later cell in this notebook defines a function with the same
    name, which replaces this one once that cell has run.
    """
    feature_names = X_train.columns
    n_features = len(feature_names)
    plt.figure(figsize=(12, 18))
    plt.barh(range(n_features), model.feature_importances_, align='center')
    # Labels must stay in column order: the importances are positional, so
    # sorting the names alphabetically (as was done before) attached them to
    # the wrong bars.
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Importance of feature")
    plt.ylabel("Feature")

plot_feature_importances(rf_model)

In [None]:
def plot_feature_importances(model, figsize=(12, 18)):
    """Plot the model's feature importances, largest first, and return them.

    Feature names are taken from the columns of the global ``X_train``.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``
    figsize : tuple
        Size passed to ``plt.figure``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Features`` and ``Importances``, sorted by descending
        importance.
    """
    ranked = pd.DataFrame(
        {'Features': X_train.columns, 'Importances': model.feature_importances_}
    ).sort_values(by='Importances', ascending=False)

    plt.figure(figsize=figsize)
    sns.barplot(data=ranked, x='Importances', y='Features')
    plt.yticks(rotation='horizontal')
    plt.show()
    return ranked

importances = plot_feature_importances(rf_model)

## Experimenting with dropping low-importance features

### Outcome: Dropping the low-importance feature columns did not increase the accuracy!

# XGBoost Model

In [None]:
#%%time
# searching best parameters with GridSearch method

#xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.6, n_jobs=-1, random_state=1)
#params = {'n_estimators': [500, 1000, 2000], 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1]}    
#grid_xgb = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, scoring='accuracy', verbose=True)
#score_xgb = grid_xgb.fit(X, y)
#print(grid_xgb.best_score_) 
#print(grid_xgb.best_params_)

In [None]:
#%%time
#xgb_model = XGBClassifier(n_estimators=2000, learning_rate=0.6, n_jobs=-1, random_state=1)
#xgb_model.fit(X_train, y_train, early_stopping_rounds=30, 
#              eval_set=[(X_val, y_val)], verbose=True)
#xgb_model_pred = xgb_model.predict(X_val)
#print("XGBoost Val accuracy: ", accuracy_score(xgb_model_pred, y_val))

# File for submission

In [None]:
# predictions for RF model
# data_test went through the same column drops and new_futures() call as the
# training frame. rf_model here is the forest fitted on the X_train split.
preds_test = rf_model.predict(data_test)

In [None]:
# Build the submission table (test-set Id plus predicted Cover_Type) and write
# it to CSV without the DataFrame index.
submission_columns = {'Id': data_test.index, 'Cover_Type': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
# Quick look at the first rows of the submission frame.
output.head()