In [1]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this notebook — confirm the magic is still needed.
%matplotlib inline

In [2]:
from IPython.display import HTML

# Notebook-wide CSS override: scale SVG output and the page container to 70 %.
# The <style> element must be closed exactly once, otherwise a stray closing
# tag is injected into the page.
style = (
    "<style>"
    "svg{width:70% !important;height:70% !important;}"
    ".container{width:70% !important;}"
    "</style>"
)
# Last expression of the cell, so the CSS is rendered (and thereby applied).
HTML(style)

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [4]:
# Labelled training data: the target is in the "class" column, the rest are numeric features.
df = pd.read_csv(filepath_or_buffer='../../../resources/invasion.csv')
# Preview the first five rows.
df.head(5)

Unnamed: 0,class,g_reflection,i_reflection,speed,brightness,time_of_observance,volume
0,transport,2.190672,6.716633,62.168208,0.347465,158221,44.932446
1,transport,3.453276,8.995909,62.994707,0.590094,385972,41.5683
2,transport,2.432994,6.938691,62.245807,0.329288,446482,40.123467
3,fighter,6.083763,3.019459,18.474555,0.174738,210125,11.384865
4,fighter,12.876769,2.45295,195.805771,0.150446,23109,11.328806


In [6]:
# Unlabelled observations to classify: same feature columns as the training data, no "class".
x_testing = pd.read_csv(filepath_or_buffer='../../../resources/operative_information.csv')
# Preview the first five rows.
x_testing.head(5)

Unnamed: 0,g_reflection,i_reflection,speed,brightness,time_of_observance,volume
0,7.516543,3.916691,513.954279,0.177247,105908,13.267224
1,4.322988,6.967689,63.75297,0.545922,277855,39.83313
2,4.595724,9.098297,62.233948,0.389201,160662,42.014556
3,2.689675,7.964869,62.475495,0.541081,162092,42.056829
4,8.075576,5.169719,336.441261,0.174757,466853,11.779813


In [7]:
# Target vector: the class label of every training row.
y = df.loc[:, "class"]
y.head(5)

0    transport
1    transport
2    transport
3      fighter
4      fighter
Name: class, dtype: object

In [8]:
# Feature matrix: everything except the target column (df itself is left untouched).
x = df.drop(columns=["class"])
x.head(5)

Unnamed: 0,g_reflection,i_reflection,speed,brightness,time_of_observance,volume
0,2.190672,6.716633,62.168208,0.347465,158221,44.932446
1,3.453276,8.995909,62.994707,0.590094,385972,41.5683
2,2.432994,6.938691,62.245807,0.329288,446482,40.123467
3,6.083763,3.019459,18.474555,0.174738,210125,11.384865
4,12.876769,2.45295,195.805771,0.150446,23109,11.328806


In [9]:
# Base random-forest classifier handed to the grid search; random_state pins the RNG seed.
clf = RandomForestClassifier(random_state=0)

In [10]:
# Hyper-parameter grid for the search; each range is half-open (stop value excluded).
parametrs = dict(
    n_estimators=range(10, 50, 10),     # 10, 20, 30, 40
    max_depth=range(1, 12, 2),          # 1, 3, 5, 7, 9, 11
    min_samples_leaf=range(1, 7),       # 1 .. 6
    min_samples_split=range(2, 9, 2),   # 2, 4, 6, 8
)

In [11]:
# Cross-validated grid search over `parametrs`: 3 folds, all available CPU cores.
grid_search_cv_clf = GridSearchCV(
    estimator=clf,
    param_grid=parametrs,
    cv=3,
    n_jobs=-1,
)

In [12]:
# Run the cross-validated search on the training features and labels.
grid_search_cv_clf.fit(X=x, y=y)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [13]:
# Hyper-parameter combination that scored best in the cross-validated search.
grid_search_cv_clf.best_params_

{'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [15]:
# Best model found by the search; with GridSearchCV's default refit setting it
# has been refit on the whole of (x, y) using the best parameters.
best_clf = grid_search_cv_clf.best_estimator_

In [16]:
# Per-feature importances of the selected forest, in the column order of x.
best_clf.feature_importances_

array([0.08272264, 0.12437216, 0.17144063, 0.36380346, 0.        ,
       0.25766112])

In [17]:
# Pair every feature name with its importance ...
feature_importances_df = pd.DataFrame(
    {
        "features": x.columns,
        "feature_importances": best_clf.feature_importances_,
    }
)
# ... and show the most influential features first.
feature_importances_df.sort_values(by="feature_importances", ascending=False)

Unnamed: 0,features,feature_importances
3,brightness,0.363803
5,volume,0.257661
2,speed,0.171441
1,i_reflection,0.124372
0,g_reflection,0.082723
4,time_of_observance,0.0


In [18]:
# Predict a class label for every row of the unlabelled data.
pred = best_clf.predict(x_testing)
pred

array(['fighter', 'transport', 'transport', ..., 'transport', 'fighter',
       'transport'], dtype=object)

In [19]:
# Number of observations assigned to each predicted class.
pd.Series(pred).value_counts()

fighter      675
transport    595
cruiser      230
dtype: int64

In [21]:
# Predictions for the unlabelled data as a one-column table named "class".
pd.DataFrame({"class": best_clf.predict(x_testing)})

Unnamed: 0,class
0,fighter
1,transport
2,transport
3,transport
4,fighter
...,...
1495,fighter
1496,fighter
1497,transport
1498,fighter


In [29]:
from sklearn import preprocessing

In [30]:
# Scratch example, independent of the model above: LabelEncoder maps labels to integer codes.
le = preprocessing.LabelEncoder()

In [31]:
# Learn the label vocabulary; the repeated "paris" yields a single class.
le.fit(["paris", "paris", "tokyo", "amsterdam"])

LabelEncoder()

In [32]:
# Learned classes in sorted order; a label's position here is its integer code.
list(le.classes_)

['amsterdam', 'paris', 'tokyo']

In [33]:
# Encode labels as their integer codes.
le.transform(["tokyo", "tokyo", "paris"]) 

array([2, 2, 1])

In [34]:
# Decode integer codes back to the original labels.
list(le.inverse_transform([2, 2, 1]))

['tokyo', 'tokyo', 'paris']