In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Show up to 30 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 30)

In [None]:
# Load the train and test tables; both use the 'Id' column as their row index.
data_train, data_test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('../input/learn-together/train.csv',
                     '../input/learn-together/test.csv')
)

In [None]:
# Preview the first rows of the training data to see the available columns.
data_train.head()

# Checking shape of datasets

In [None]:
# Report (rows, columns) for both datasets.
for label, frame in (("Train", data_train), ("Test", data_test)):
    print(f"{label} dataset shape:", frame.shape)

In [None]:
# Print the column names, non-null counts and dtypes of the training data.
data_train.info()

# Looking for missing values

In [None]:
# True if at least one cell anywhere in the dataset is NaN, False otherwise.
for label, frame in (("train", data_train), ("test", data_test)):
    has_missing = frame.isna().any().any()
    print(f"Missing Values in {label} dataset: {has_missing}")

# Checking columns type

In [None]:
# Collect the distinct dtypes that occur among the columns of each dataset.
for label, frame in (("Train", data_train), ("Test", data_test)):
    distinct_dtypes = set(frame.dtypes)
    print(f"{label} Column Types: {distinct_dtypes}")

# Reviewing description of each column in train dataset

In [None]:
# Summary statistics of the training columns, transposed so each column becomes a row.
data_train.describe().T

# Checking quantity of different values in each column of train and test datasets

In [None]:
# Number of distinct values in every column of the train dataset.
for column, distinct_count in data_train.nunique().items():
    print(column, distinct_count)

In [None]:
# Number of distinct values in every column of the test dataset.
for column, distinct_count in data_test.nunique().items():
    print(column, distinct_count)

In [None]:
# Compare the distinct values of the two suspicious soil-type columns in the train
# and test datasets. The second frame is data_test, not the validation split that
# is carved out of the training data later, so it is labelled "test data".
for column in ('Soil_Type7', 'Soil_Type15'):
    print(f"Unique values in '{column}' column in training data:\n", data_train[column].unique())
    print(f"\nUnique values in '{column}' column in test data:\n", data_test[column].unique())
    # Blank line so the reports for the two columns do not run together.
    print()

Observation: columns *Soil_Type7* and *Soil_Type15* contain only the value "0" in the train dataset, while the same columns in the test dataset contain both "0" and "1". Since these columns are constant in the training data, I will drop them from both datasets. This decision also takes into account the number of "1" values in the test set.

In [None]:
# removing both columns from train and test datasets
# Rebind the names instead of mutating in place, and ignore labels that are already
# gone, so this cell can be re-run without raising KeyError.
constant_soil_columns = ["Soil_Type7", "Soil_Type15"]
data_train = data_train.drop(columns=constant_soil_columns, errors="ignore")
data_test = data_test.drop(columns=constant_soil_columns, errors="ignore")

In [None]:
# choosing categorical columns: every Wilderness_Area* and Soil_Type* column.
# Soil types 7 and 15 are skipped because those two columns were dropped from
# both datasets.
dropped_soil_types = (7, 15)
wilderness_columns = [f'Wilderness_Area{area}' for area in range(1, 5)]
soil_columns = [f'Soil_Type{soil}' for soil in range(1, 41)
                if soil not in dropped_soil_types]
categorical_columns = wilderness_columns + soil_columns

In [None]:
# creating train and test datasets without categorical columns
data_train_num, data_test_num = (
    frame.drop(columns=categorical_columns)
    for frame in (data_train, data_test)
)

# Creating Heatmap with numerical data

In [None]:

# Annotated heatmap of the pairwise correlations between the non-categorical training columns.
correlations = data_train_num.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(correlations, annot=True, linewidths=.5, ax=ax)
plt.show()

The higher the absolute value of the number, the stronger the correlation between the two columns.

# Creating scatterplots showing relationship between data

## Below we may see distribution of data in the training set

In [None]:
%%time
# Pairwise scatter plots (and per-column distributions) of the non-categorical training columns.
sns.pairplot(data_train_num)

In [None]:
# Scatter plot: Elevation for each Cover_Type value
data_train.plot.scatter(x="Cover_Type", y="Elevation")

In [None]:
# Scatter plot: Slope for each Cover_Type value
data_train.plot.scatter(x="Cover_Type", y='Slope')

In [None]:
# Scatter plot: Vertical_Distance_To_Hydrology for each Cover_Type value
data_train.plot.scatter(x="Cover_Type", y='Vertical_Distance_To_Hydrology')

In [None]:
# Scatter with a fitted regression line: vertical vs. horizontal distance to hydrology.
sns.regplot(data=data_train,
            x='Vertical_Distance_To_Hydrology',
            y='Horizontal_Distance_To_Hydrology',
            color='darkblue')

Nice correlation

In [None]:
# Scatter with a fitted regression line: Aspect vs. Hillshade_3pm.
sns.regplot(data=data_train,
            x='Aspect',
            y='Hillshade_3pm',
            color='darkred')

In [None]:
# Scatter with a fitted regression line: horizontal distance to hydrology vs. Elevation.
sns.regplot(data=data_train,
            x='Horizontal_Distance_To_Hydrology',
            y='Elevation',
            color='blue')

Also good correlation

In [None]:
# Scatter with a fitted regression line: horizontal distance to roadways vs. Elevation.
sns.regplot(data=data_train,
            x='Horizontal_Distance_To_Roadways',
            y='Elevation',
            color='darkred')

Good correlation

In [None]:
# train dataset
# Column-wise totals of everything from position 14 up to (not including) the last
# column, drawn as one bar per column.
soil_type_totals = data_train.iloc[:, 14:-1].sum(axis=0)

fig, ax = plt.subplots(figsize=(18, 9))
sns.barplot(x=soil_type_totals.index, y=soil_type_totals.values, ax=ax)
plt.xticks(rotation=75)
ax.set_ylabel('Total')
ax.set_title('Count of Soil Types With Value 1 in Train Dataset', color='blue', fontsize=16)

In [None]:
# test dataset
# Select the soil-type columns by name rather than by position. data_test is later
# passed to predict() as-is, i.e. it has no trailing Cover_Type column, so the
# positional slice 14:-1 used for the train set would silently leave out the last
# soil-type column here.
soil_types = data_test.filter(like='Soil_Type').sum(axis=0)

plt.figure(figsize=(18,9))
sns.barplot(x=soil_types.index, y=soil_types.values)
plt.xticks(rotation= 75)
plt.ylabel('Total')
plt.title('Count of Soil Types With Value 1 in Test Dataset',color='blue',fontsize=16)

In [None]:
# Column-wise totals of the four columns at positions 10-13 of the train set,
# plotted under the title 'Wilderness Areas'.
area_totals = data_train.iloc[:, 10:14].sum(axis=0)

fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x=area_totals.index, y=area_totals.values, palette="rocket", ax=ax)
plt.xticks(rotation=90)
ax.set_title('Wilderness Areas', color='blue', fontsize=16)
ax.set_ylabel('Total')
plt.show()

Looking at the target column: does each cover type occur with a different frequency?

In [None]:
import plotly.express as px

# Count the training rows per Cover_Type value and show the counts as a bar chart.
cover_type_counts = data_train["Cover_Type"].value_counts()
cover_type_table = cover_type_counts.rename_axis('CoverType').reset_index(name='Total')
fig = px.bar(cover_type_table, x='CoverType', y='Total', height=400, width=650)
fig.show()

The target classes are balanced.

# Checking and comparing data distributions in the train and test sets

In [None]:
# Side-by-side histograms of Vertical_Distance_To_Hydrology: train (left) vs. test (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
panels = [
    (axes[0], data_train, 'Train Dataset \nVertical Distance To Hydrology',
     'crimson', list(range(-150, 350, 50))),
    (axes[1], data_test, 'Test Dataset \n Vertical Distance To Hydrology',
     'darkmagenta', list(range(-150, 350, 50))),
]
for panel_ax, frame, panel_title, bar_color, tick_positions in panels:
    frame['Vertical_Distance_To_Hydrology'].plot.hist(
        ax=panel_ax, bins=30, edgecolor='black', color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(tick_positions)
plt.show()

In [None]:
# Side-by-side histograms of Elevation: train (left) vs. test (right).
# The two panels use different x-tick spacing (200 vs. 100).
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
panels = [
    (axes[0], data_train, 'Train Dataset \nElevation',
     'blue', list(range(1800, 4000, 200))),
    (axes[1], data_test, 'Test Dataset \n Elevation',
     'darkblue', list(range(1800, 4000, 100))),
]
for panel_ax, frame, panel_title, bar_color, tick_positions in panels:
    frame['Elevation'].plot.hist(
        ax=panel_ax, bins=30, edgecolor='black', color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(tick_positions)
plt.show()

In [None]:
# Side-by-side histograms of Horizontal_Distance_To_Hydrology: train (left) vs. test (right).
# The two panels use different x-tick spacing (200 vs. 100).
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
panels = [
    (axes[0], data_train, 'Train Dataset \nHorizontal Distance To Hydrology',
     'green', list(range(0, 1400, 200))),
    (axes[1], data_test, 'Test Dataset \n Horizontal Distance To Hydrology',
     'darkgreen', list(range(0, 1400, 100))),
]
for panel_ax, frame, panel_title, bar_color, tick_positions in panels:
    frame['Horizontal_Distance_To_Hydrology'].plot.hist(
        ax=panel_ax, bins=30, edgecolor='black', color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(tick_positions)
plt.show()

In [None]:
# Side-by-side histograms of Slope: train (left) vs. test (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
panels = [
    (axes[0], data_train, 'Train Dataset \nSlope',
     'grey', list(range(0, 60, 5))),
    (axes[1], data_test, 'Test Dataset \n Slope',
     'darkgrey', list(range(0, 60, 5))),
]
for panel_ax, frame, panel_title, bar_color, tick_positions in panels:
    frame['Slope'].plot.hist(
        ax=panel_ax, bins=30, edgecolor='black', color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(tick_positions)
plt.show()

# Building Random Forest Classifier model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score, make_scorer, roc_auc_score

In [None]:
# Separate the target (Cover_Type) from the feature columns.
y = data_train['Cover_Type']
X = data_train.drop(columns='Cover_Type')

In [None]:
# Keep 80% of the rows for training and the rest for validation; the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=1)
for split_part in (X_train, y_train, X_val, y_val):
    print(split_part.shape)

In [None]:
# looking for best n_estimators using cross validation and GridSearch
#rf_model = RandomForestClassifier(n_estimators = 50, n_jobs=-1, random_state=1)
#params = {'n_estimators': [650, 3500]}
#grid_rf = GridSearchCV(rf_model, params, n_jobs=-1, cv=3, scoring='accuracy', verbose=True)
#score_rf = grid_rf.fit(X, y)
#print(grid_rf.best_score_) 
#print(grid_rf.best_params_)

Best outcome:
0.7856481481481481
{'n_estimators': 3500}

In [None]:
# application of the n_estimators: 3500 to the model
# Fit the forest on the training split, then predict the validation rows.
rf_model = RandomForestClassifier(
    n_estimators=3500, n_jobs=-1, random_state=1
).fit(X_train, y_train)
rf_model_pred = rf_model.predict(X_val)

In [None]:
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print("RandomForest Val accuracy: ", accuracy_score(y_val, rf_model_pred))

RandomForest Val accuracy:  0.8630952380952381

## Plotting the feature importances of the RF model fitted on the training dataset

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` array.
    feature_names : sequence of str, optional
        Bar labels, in the same order as the importances. When omitted, the
        columns of the notebook-level ``X_train`` are used, which is what the
        function always did before this parameter existed.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.
    """
    importances = model.feature_importances_
    if feature_names is None:
        # Backward-compatible default: label the bars with the training columns.
        feature_names = X_train.columns
    n_features = len(importances)
    if len(feature_names) != n_features:
        raise ValueError(
            f"Got {len(feature_names)} feature names for {n_features} importances"
        )
    plt.figure(figsize=(12, 16))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Importance of feature")
    plt.ylabel("Feature")

plot_feature_importances(rf_model, feature_names=X_train.columns)

# Building XGBClassifier Model

In [None]:
#%%time
# searching best parameters with GridSearch method

#xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.6, n_jobs=-1, random_state=1)
#params = {'n_estimators': [500, 1000, 2000], 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1]}    
#grid_xgb = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, scoring='accuracy', verbose=True)
#score_xgb = grid_xgb.fit(X, y)
#print(grid_xgb.best_score_) 
#print(grid_xgb.best_params_)

0.7702380952380953
{'learning_rate': 0.5, 'n_estimators': 1000}
CPU times: user 2min 46s, sys: 796 ms, total: 2min 47s
Wall time: 1h 9min 17s

In [None]:
%%time
# Train XGBoost with a budget of 2000 trees; early stopping ends training once the
# metric on the validation split has not improved for 30 rounds.
# NOTE(review): the same validation split drives early stopping and the accuracy
# printed below, so that score is likely optimistic — confirm on held-out data.
xgb_model = XGBClassifier(n_estimators=2000, learning_rate=0.6, n_jobs=-1, random_state=1)
xgb_model.fit(X_train, y_train, early_stopping_rounds=30, 
              eval_set=[(X_val, y_val)], verbose=True)
xgb_model_pred = xgb_model.predict(X_val)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print("XGBoost Val accuracy: ", accuracy_score(y_val, xgb_model_pred))

XGBoost Val accuracy:  0.8429232804232805
CPU times: user 32.2 s, sys: 252 ms, total: 32.5 s
Wall time: 32.3 s

In [None]:
# Echo the classifier so the notebook shows its parameters.
xgb_model

xgb_model = XGBClassifier(n_estimators=2000, learning_rate=0.6, n_jobs=-1, random_state=1)
xgb_model.fit(X_train, y_train, early_stopping_rounds=10, 
              eval_set=[(X_val, y_val)], verbose=False)
              
XGBoost Val accuracy:  0.8343253968253969
CPU times: user 20.7 s, sys: 68 ms, total: 20.7 s
Wall time: 20.7 s

xgb_model = XGBClassifier(n_estimators=2000, learning_rate=0.2, n_jobs=-1, random_state=1)    
xgb_model.fit(X_train, y_train, early_stopping_rounds=10, 
              eval_set=[(X_val, y_val)], verbose=False)

Outcome: XGBoost Val accuracy:  0.8072089947089947
CPU times: user 24.4 s, sys: 88 ms, total: 24.5 s
Wall time: 24.5 s

# File for submission

## Creating predictions:

In [None]:
# predictions for RF model
#preds_test = rf_model.predict(data_test)

In [None]:
# predictions for XGB model
# Predict a Cover_Type for every row of the test dataset; the result is written
# to the submission file below.
preds_test = xgb_model.predict(data_test)

In [None]:
# Creating file with predictions for submission
# Pair each test row's Id (the frame's index) with its predicted Cover_Type and
# write the table without the DataFrame's own row index.
submission_columns = {
    'Id': data_test.index,
    'Cover_Type': preds_test,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table that was written to submission.csv.
output.head()