In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',None)
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier ,RandomForestClassifier ,GradientBoostingClassifier
from xgboost import XGBClassifier 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge,Lasso
from sklearn.metrics import roc_auc_score ,mean_squared_error,accuracy_score,classification_report,roc_curve,confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from scipy.stats.mstats import winsorize
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the training CSV into a DataFrame named `train`.
train_csv_path = '../input/forest-cover-type/forest-cover-type-prediction/train.csv'
train = pd.read_csv(train_csv_path)

In [3]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

In [4]:
# Display summary statistics for the columns of the training frame.
train.describe()

In [5]:
# Display the (rows, columns) dimensions of the training frame.
train.shape

In [6]:
# Remove the `Id` column from `train` in place.
train.drop(columns='Id', inplace=True)

In [7]:
# Check the class distribution of the target with a bar chart of the
# number of rows per `Cover_Type` value.
train['Cover_Type'].value_counts().plot(kind='bar')
plt.title('Class Distribution of target')
plt.xlabel('Classes in target')
plt.ylabel('Distribution')
# Render the figure explicitly; this also keeps the cell from echoing the
# repr of the last plotting call, without emitting a stray blank line the
# way a bare print() does.
plt.show()

In [8]:
# Compute the skewness of every column of `train`, then print it under a heading.
feature_skewness = train.skew()
print('Skewness for the different features is as shown below: ')
print(feature_skewness)

In [9]:
# Drop the columns Soil_Type7 and Soil_Type15 from `train` in place.
# NOTE(review): presumably these two columns carry no information (e.g. they
# are constant in the training data) — confirm before relying on this.
#
# `axis` must be passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
train.drop(['Soil_Type7', 'Soil_Type15'], axis=1, inplace=True)

In [10]:
# Plot a violin plot of every feature against the target.
# The target is taken to be the last column of `train`; all other columns
# are treated as features.
target_column = train.columns[-1]
for feature_column in train.columns[:-1]:
    sns.violinplot(data=train, x=target_column, y=feature_column)
    plt.show()

# Compute the correlation matrix once and reuse it for both the heatmap and
# the pair listing below.
corr_matrix = train.corr()

# Plot a heatmap to check for correlation between the columns
sns.heatmap(corr_matrix)


# Selecting upper and lower threshold
upper_threshold = 0.5
lower_threshold = -0.5


# List the correlation pairs, sorted by correlation value.
correlation = corr_matrix.unstack().sort_values(kind='quicksort')

# The unstacked matrix contains every pair twice, (a, b) and (b, a), plus each
# column paired with itself. Keeping only pairs whose first label sorts before
# the second leaves each unordered pair exactly once and removes the self
# pairs, without comparing the float correlation to exactly 1 (which would
# also discard two distinct, perfectly correlated columns).
is_unique_pair = [first < second for first, second in correlation.index]
correlation = correlation[is_unique_pair]

# Select the pairs whose correlation is above the upper threshold or below the lower threshold
corr_var_list = correlation[(correlation > upper_threshold) | (correlation < lower_threshold)]
print(corr_var_list)

#### Feature Selection and Model building
Separate the features from the target, then split the training data into training and validation sets.
Fit one or more models of your choice, predict on the validation data, and compute the accuracy_score of those predictions.
Try to improve the accuracy_score with feature selection techniques (e.g. wrapper methods), dimensionality reduction (e.g. PCA), and hyperparameter tuning to find the parameters that give the best accuracy.

In [11]:
# Features are every column except the last; the target is the last column.
X, y = train.iloc[:, :-1], train.iloc[:, -1]

# Hold out 20% of the rows as test (validation) data, with a fixed seed so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Recursive feature elimination (RFE) driven by a random forest: select 5
# features using the training split, then reduce both splits to those features.
model = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=17)
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Fit the random forest itself on the reduced training features, predict on
# the reduced held-out features, and report the accuracy.
model.fit(X_train_rfe, y_train)
y_Pred_rfe = model.predict(X_test_rfe)
rfe_accuracy = accuracy_score(y_test, y_Pred_rfe)
print(rfe_accuracy)

##### Prediction on the test data and creating the sample submission file.
Load the test data and store its Id column in a separate variable.
Apply to the test data the same operations that you applied to the training data.
Create the submission as a CSV file whose first column is the Id column from the test data and whose second column is your prediction.

In [13]:
# Read the test CSV into a DataFrame named `test`.
test_csv_path = '../input/forest-cover-type/forest-cover-type-prediction/test.csv'
test = pd.read_csv(test_csv_path)

In [14]:
# Print the column names, non-null counts and dtypes of the test frame.
test.info()

In [15]:
# Display summary statistics for the columns of the test frame.
test.describe()

In [16]:
# Display the (rows, columns) dimensions of the test frame.
test.shape

In [17]:
# Keep the test `Id` column in its own variable; it is needed for the
# submission file after `Id` is dropped from `test`.
id_ = test.loc[:, 'Id']

In [18]:
# Drop `Id` and the same two soil-type columns that were dropped from the
# training data, so the test frame has the same feature columns.
#
# `axis` must be passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
test.drop(['Id', 'Soil_Type7', 'Soil_Type15'], axis=1, inplace=True)

In [19]:
# Reduce the test data to the features selected by the fitted RFE
test_rfe = rfe.transform(test)

# Predict the cover type for every test row
y_pred_test = model.predict(test_rfe)

# Assemble the submission table: the stored test ids next to the predictions
submission_columns = {
    'Id': id_,
    'Cover_Type': y_pred_test,
}
sample_submission = pd.DataFrame(submission_columns)

# Write the submission table to a CSV file, leaving out the DataFrame index
sample_submission.to_csv('sample_submission.csv', index=False)