<a href="https://colab.research.google.com/github/sunitadhotre/Artificial-Intelligence/blob/main/Day9_Assignment_SSD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier

In [None]:
import os
os.chdir(")

In [None]:
df = pd.read_csv("Satellite.csv", sep=";")
df.head(3)

Unnamed: 0,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10,...,x.28,x.29,x.30,x.31,x.32,x.33,x.34,x.35,x.36,classes
0,92,115,120,94,84,102,106,79,84,102,...,104,88,121,128,100,84,107,113,87,grey soil
1,84,102,106,79,84,102,102,83,80,102,...,100,84,107,113,87,84,99,104,79,grey soil
2,84,102,102,83,80,102,102,79,84,94,...,87,84,99,104,79,84,99,104,79,grey soil


In [None]:
le = LabelEncoder()
df["classes"] = le.fit_transform(df["classes"])
print(le.classes_)

['cotton crop' 'damp grey soil' 'grey soil' 'red soil'
 'vegetation stubble' 'very damp grey soil']


In [None]:
X = df.drop('classes', axis=1)
y = df["classes"]

### Grid Search for Decision Tree

In [None]:
params = {'max_depth':[2,3,4,5,6,None],
         'min_samples_split':[2, 5, 10, 30],
         'min_samples_leaf': [1, 10, 50]}
dtc = DecisionTreeClassifier(random_state=23)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
gcv_tree = GridSearchCV(dtc, param_grid=params, cv=kfold, scoring='neg_log_loss')
gcv_tree.fit(X, y)

In [None]:
print(gcv_tree.best_params_)
print(gcv_tree.best_score_)

{'max_depth': 6, 'min_samples_leaf': 50, 'min_samples_split': 2}
-0.5720877740971643


### Grid Search for Random Forest

In [None]:
params = {'max_depth':[2,3,4,5,6,None],
         'min_samples_split':[2, 5, 10, 30],
         'min_samples_leaf': [1, 10, 50],
         'max_features': [2,3,4]}
rf = RandomForestClassifier(random_state=23, n_estimators=25)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
gcv_rf = GridSearchCV(rf, param_grid=params, cv=kfold, scoring='neg_log_loss')
gcv_rf.fit(X, y)

In [None]:
print(gcv_rf.best_params_)
print(gcv_rf.best_score_)

{'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 10}
-0.28284728668263054


### Grid Search for XG Boost

In [None]:
gbm = XGBClassifier(random_state=23)
params = {'learning_rate':[0.01, 0.1, 0.4, 0.9],
          'max_depth':[3, 5, None],
          'n_estimators':[25, 50]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
gcv_gbm = GridSearchCV(gbm, param_grid=params, cv=kfold, scoring='neg_log_loss')
gcv_gbm.fit(X, y)

In [None]:
print(gcv_gbm.best_params_)
print(gcv_gbm.best_score_)

{'learning_rate': 0.4, 'max_depth': 3, 'n_estimators': 50}
-0.24102128345654622


#### Scores:

In [None]:
print("Log Loss for Tree =", gcv_tree.best_score_)
print("Log Loss for RF =", gcv_rf.best_score_)
print("Log Loss for XGB =", gcv_gbm.best_score_)

Log Loss for Tree = -0.5720877740971643
Log Loss for RF = -0.28284728668263054
Log Loss for XGB = -0.24102128345654622


### Stack Ensembler

In [None]:
stack = StackingClassifier(estimators=[('TREE', gcv_tree),('RF',gcv_rf),('XGB',gcv_gbm)],
                           final_estimator=gcv_rf, passthrough=True)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
results = cross_val_score(stack, X, y, scoring='neg_log_loss', cv=kfold)
score_stack = results.mean()

In [None]:
print("Log Loss with Stacking =", score_stack)

## Best Model

The model with best score here seems to be XG Boost as the log loss score is closer to zero for XG Boost.

In [None]:
best_model = gcv_gbm.best_estimator_

## Inferencing

#### Unlabeled Data:

In [None]:
tst = pd.read_csv("tst_satellite.csv")
tst.head()

Unnamed: 0,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10,...,x.27,x.28,x.29,x.30,x.31,x.32,x.33,x.34,x.35,x.36
0,104,97,106,79,94,91,85,87,106,92,...,100,80,110,102,87,105,89,81,100,75
1,99,105,99,95,101,110,91,101,96,83,...,80,84,75,107,85,94,100,96,79,110
2,98,78,91,104,105,103,84,91,106,82,...,95,81,76,99,97,95,88,78,103,75
3,75,98,98,104,89,90,100,81,88,88,...,108,86,88,86,106,89,76,79,79,91
4,92,108,89,89,92,108,78,94,84,88,...,91,106,84,106,96,81,91,76,84,106


In [None]:
predictions = best_model.predict(tst)
le.inverse_transform(predictions)

array(['grey soil', 'grey soil', 'grey soil', 'grey soil', 'grey soil',
       'grey soil', 'grey soil', 'grey soil', 'grey soil', 'grey soil',
       'grey soil', 'grey soil', 'very damp grey soil', 'grey soil',
       'grey soil', 'grey soil', 'grey soil', 'grey soil'], dtype=object)

Loading the MNIST Fashion dataset