In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
from plotly.offline import iplot
from sklearn.datasets import load_wine, load_digits, load_iris
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [2]:
def split_data(data, split=0.8, shuffle=False, random_state=0):
    """Cut ``data`` into consecutive train / predict / score parts.

    The first ``split`` fraction of the rows becomes the training part and
    the remaining rows are divided in half between the predict part and the
    score part.

    Parameters
    ----------
    data : sequence
        Anything supporting ``len`` and slicing (list, ndarray, ...).
    split : float, default 0.8
        Fraction of rows assigned to the training part; must lie in [0, 1].
    shuffle : bool, default False
        When True the rows are permuted before slicing. Leave it False to
        keep the rows in their stored order.
    random_state : int, default 0
        Seed of the permutation used when ``shuffle`` is True. The
        permutation depends only on this seed and ``len(data)``, so calling
        the function on features and on labels of equal length with the same
        seed keeps their rows aligned. Passing ``None`` draws a fresh
        permutation on every call and therefore breaks that alignment.

    Returns
    -------
    dict
        ``{'Train': ..., 'Predict': ..., 'Score': ...}``.

    Raises
    ------
    ValueError
        If ``split`` is not within [0, 1].
    """
    if not 0 <= split <= 1:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")

    n_rows = len(data)
    if shuffle:
        order = np.random.default_rng(random_state).permutation(n_rows)
        # pandas objects need positional indexing; everything else is
        # converted to an array so it can be indexed by the permutation.
        data = data.iloc[order] if hasattr(data, 'iloc') else np.asarray(data)[order]

    # Compute each boundary once so the three slices can never disagree.
    train_end = int(n_rows * split)
    predict_end = int(n_rows * (split + (1 - split)/2))
    return {
        'Train': data[:train_end],
        'Predict': data[train_end:predict_end],
        'Score': data[predict_end:],
    }

In [3]:
# Seed shared by every estimator that draws random numbers, so that re-running
# the notebook reproduces the same scores.
RANDOM_STATE = 42

# Estimators to benchmark, keyed by the name shown in the reports.
# NOTE(review): 'RandomForestRegressor' is a regressor, not a classifier: its
# score() is R^2 rather than accuracy and its predictions are continuous, so
# its numbers are not comparable with the other entries.
# NOTE(review): 'KNN' and 'KNeighborsClassifier' are two default-configured
# instances of the same class and will report identical results.
classificators = {
    'LogisticRegression': LogisticRegression(max_iter=5000),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVM': SVC(),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
}

In [4]:
def test_clf(classifiers, X, y):
    results = {}
    for name, classificator in classifiers.items():
        classificator.fit(X['Train'], y['Train'])
        cl_score = classificator.score(X['Score'], y['Score'])
        cl_result = classificator.predict(X['Predict'])
        df = pd.DataFrame({
            'label': y['Predict'],
            'cl_result': cl_result,
            'result': y['Predict']==cl_result}
        )
        cl_pred_result = df[df['result'] == True].count()['result'] / len(y['Predict'])
        results[name] = {'Train score': cl_score, 'Predict_result': cl_pred_result, 'Classif_predict':cl_result}
    return results

# Wine

In [5]:
# Load the wine dataset as a (features, labels) pair.
wine_data, wine_label = load_wine(return_X_y=True)

In [6]:
# Apply the same 90 % cut to features and labels so the parts stay row-aligned.
# NOTE(review): rows are sliced in stored order; the expected labels printed
# for the 'Predict' part further down are all class 2, so the held-out parts
# are not class-balanced - consider shuffling before splitting.
X, y = (split_data(part, split=0.9) for part in (wine_data, wine_label))

In [7]:
# Fit and evaluate every registered estimator on the wine split.
results = test_clf(classificators, X, y)

In [8]:
# Show the expected labels of the 'Predict' part, then one summary line per model.
# Subscripts inside the f-strings use double quotes: reusing the f-string's own
# quote character inside a replacement field is a SyntaxError before Python 3.12.
print(f'Expected result:{76*" "}{y["Predict"]}')
for k, v in results.items():
    print(f'{k:<30}, Train Score: {v["Train score"]:>1.5f}, Predict Score:{v["Predict_result"]:>0.4f}, Classif_predict:{v["Classif_predict"]}')


Expected result:                                                                            [2 2 2 2 2 2 2 2 2]
LogisticRegression            , Train Score: 1.00000, Predict Score:1.0000, Classif_predict:[2 2 2 2 2 2 2 2 2]
DecisionTreeClassifier        , Train Score: 1.00000, Predict Score:0.8889, Classif_predict:[2 1 2 2 2 2 2 2 2]
KNeighborsClassifier          , Train Score: 0.11111, Predict Score:0.1111, Classif_predict:[1 1 1 1 2 1 1 1 0]
SVM                           , Train Score: 0.00000, Predict Score:0.0000, Classif_predict:[1 1 1 1 1 1 1 1 1]
RandomForestClassifier        , Train Score: 1.00000, Predict Score:1.0000, Classif_predict:[2 2 2 2 2 2 2 2 2]
RandomForestRegressor         , Train Score: 0.00000, Predict Score:0.3333, Classif_predict:[1.98 1.7  1.84 2.   2.   1.99 1.77 2.   1.86]
KNN                           , Train Score: 0.11111, Predict Score:0.1111, Classif_predict:[1 1 1 1 2 1 1 1 0]


<html>
<table>
    <tr>
        <td colspan="3"><b>Split 0.7</b></td>
    </tr>
    <tr>
        <td>LogisticRegression</td><td>Train Score: 0.00000</td><td>Predict Score: 0.2222</td>
    </tr>
    <tr>
        <td>DecisionTreeClassifier</td><td>Train Score: 0.00000</td><td>Predict Score: 0.2222</td>
    </tr>
    <tr>
        <td>KNeighborsClassifier</td><td>Train Score: 0.00000</td><td>Predict Score: 0.2222</td>
    </tr>
    <tr>
        <td>SVM</td><td>Train Score: 0.00000</td><td>Predict Score: 0.2222</td>
    </tr>
    <tr>
        <td>RandomForestClassifier</td><td>Train Score: 0.00000</td><td>Predict Score: 0.2222</td>
    </tr>
    <tr>
        <td>RandomForestRegressor</td><td>Train Score: 0.00000</td><td>Predict Score: 0.1481</td>
    </tr>
    <tr>
        <td>KNN</td><td>Train Score: 0.00000</td><td>Predict Score: 0.2222</td>
    </tr>
    <tr>
        <td colspan="3"><b>Split 0.8</b></td>
    </tr>
    <tr>
        <td>LogisticRegression</td><td>Train Score: 1.00000</td><td>Predict Score: 1.0000</td>
    </tr>
    <tr>
        <td>DecisionTreeClassifier</td><td>Train Score: 0.88889</td><td>Predict Score: 0.7222</td>
    </tr>
    <tr>
        <td>KNeighborsClassifier</td><td>Train Score: 0.00000</td><td>Predict Score: 0.0000</td>
    </tr>
    <tr>
        <td>SVM</td><td>Train Score: 0.00000</td><td>Predict Score: 0.0000</td>
    </tr>
    <tr>
        <td>RandomForestClassifier</td><td>Train Score: 1.00000</td><td>Predict Score: 0.9444</td>
    </tr>
    <tr>
        <td>RandomForestRegressor</td><td>Train Score: 0.00000</td><td>Predict Score: 0.0000</td>
    </tr>
    <tr>
        <td>KNN</td><td>Train Score: 0.00000</td><td>Predict Score: 0.0000</td>
    </tr>
</table>
</html>


In [9]:
# Grouped bar chart of the two scores collected for each model.
df = pd.DataFrame(results).T.reset_index()  # one row per model; names end up in 'index'
fig = go.Figure(data=[
    go.Bar(x=df['index'], y=df['Train score'], name='Train Score'),
    go.Bar(x=df['index'], y=df['Predict_result'], name='Predict Score'),
])
iplot(fig)

# Do rozmowy
### Przy splicie danych 0.7 wszystkie modele są niedouczone, przeuczenie nie występuje; dla dużych danych wejściowych Regresja i RandomForest sprawdzają się najlepiej

# Digits

In [10]:
# Load the digits dataset as a (features, labels) pair.
digits_data, digits_labels = load_digits(return_X_y=True)

In [11]:
# Display the digits feature array.
digits_data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [12]:
# Apply the same 80 % cut to features and labels so the parts stay row-aligned.
X, y = (split_data(part, split=0.8) for part in (digits_data, digits_labels))

In [13]:
# Fit and evaluate every registered estimator on the digits split.
results = test_clf(classificators, X, y)

In [14]:
# Show the expected labels of the 'Predict' part, then one summary line per model.
# Subscripts inside the f-strings use double quotes: reusing the f-string's own
# quote character inside a replacement field is a SyntaxError before Python 3.12.
print(f'Expected result:{76*" "}{y["Predict"]}')
for k, v in results.items():
    print(f'{k:<30}, Train Score: {v["Train score"]:>1.5f}, Predict Score:{v["Predict_result"]:>0.4f}, Classif_predict:{v["Classif_predict"]}')

Expected result:                                                                            [2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6
 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9 5 2 8 2 0 0 1 7 6 3 2 1 7 4 6 3 1 3 9 1 7 6
 8 4 3 1 4 0 5 3 6 9 6 1 7 5 4 4 7 2 8 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2
 3 4 5 6 7 8 9 0 1 2 3 4 5 6 9 0 1 2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4
 1 7 7 3 5 1 0 0 2 2 7 8 2 0 1 2 6 3 3 7 3 3 4 6 6 6 4 9 1 5 0 9]
LogisticRegression            , Train Score: 0.92778, Predict Score:0.8833, Classif_predict:[2 3 4 5 6 7 8 9 0 9 5 5 6 5 0 9 8 9 8 4 1 7 7 3 5 1 0 0 2 2 7 8 2 0 9 2 6
 3 3 7 3 3 4 6 6 6 4 9 9 5 0 9 5 2 5 2 0 0 9 7 6 3 2 3 7 4 6 3 1 3 9 1 7 6
 8 4 3 9 4 0 5 3 6 9 6 9 7 5 4 4 7 2 5 2 2 5 7 9 5 4 8 8 4 9 0 8 9 8 0 1 2
 3 4 5 1 8 1 9 0 1 2 3 4 5 6 9 0 1 2 3 4 5 6 7 1 9 4 9 1 5 6 5 0 9 8 1 8 4
 1 7 7 3 5 1 6 0 2 2 1 8 2 0 1 2 6 8 7 7 7 3 4 6 6 6 9 9 1 5 0 9]
DecisionTreeClassifier        , Train Score: 0.81667, Predict Score:0.7889, Classif_

In [15]:
# Grouped bar chart of the two scores collected for each model.
df = pd.DataFrame(results).T.reset_index()  # one row per model; names end up in 'index'
fig = go.Figure(data=[
    go.Bar(x=df['index'], y=df['Train score'], name='Train Score'),
    go.Bar(x=df['index'], y=df['Predict_result'], name='Predict Score'),
])
iplot(fig)

In [16]:
# Display the per-model results table built for the chart.
df

Unnamed: 0,index,Train score,Predict_result,Classif_predict
0,LogisticRegression,0.927778,0.883333,"[2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, ..."
1,DecisionTreeClassifier,0.816667,0.788889,"[2, 3, 4, 3, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, ..."
2,KNeighborsClassifier,0.961111,0.966667,"[2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, ..."
3,SVM,0.938889,0.944444,"[2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, ..."
4,RandomForestClassifier,0.933333,0.916667,"[2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, ..."
5,RandomForestRegressor,0.797063,0.055556,"[2.07, 3.44, 3.98, 4.91, 5.94, 6.57, 7.2, 7.07..."
6,KNN,0.961111,0.966667,"[2, 3, 4, 5, 6, 7, 8, 9, 0, 9, 5, 5, 6, 5, 0, ..."


# Iris

In [17]:
# Load the iris dataset as a (features, labels) pair.
iris_data, iris_labels = load_iris(return_X_y=True)

In [18]:
# Apply the same 80 % cut to features and labels so the parts stay row-aligned.
# NOTE(review): rows are sliced in stored order; the expected labels printed
# for the 'Predict' part further down are all class 2, so the held-out parts
# are not class-balanced - consider shuffling before splitting.
X, y = (split_data(part, split=0.8) for part in (iris_data, iris_labels))

In [19]:
# Fit and evaluate every registered estimator on the iris split.
results = test_clf(classificators, X, y)

In [20]:
# Show the expected labels of the 'Predict' part, then one summary line per model.
# Subscripts inside the f-strings use double quotes: reusing the f-string's own
# quote character inside a replacement field is a SyntaxError before Python 3.12.
print(f'Expected result:{76*" "}{y["Predict"]}')
for k, v in results.items():
    print(f'{k:<30}, Train Score: {v["Train score"]:>1.5f}, Predict Score:{v["Predict_result"]:>0.4f}, Classif_predict:{v["Classif_predict"]}')

Expected result:                                                                            [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
LogisticRegression            , Train Score: 0.93333, Predict Score:0.7333, Classif_predict:[2 2 2 1 2 2 1 1 2 2 2 2 2 1 2]
DecisionTreeClassifier        , Train Score: 0.86667, Predict Score:0.7333, Classif_predict:[2 2 2 1 2 2 1 1 2 1 2 2 2 2 2]
KNeighborsClassifier          , Train Score: 0.86667, Predict Score:0.7333, Classif_predict:[2 2 2 1 2 2 1 1 2 2 2 2 2 1 2]
SVM                           , Train Score: 0.73333, Predict Score:0.6667, Classif_predict:[2 1 2 1 2 2 1 1 2 2 2 2 2 1 2]
RandomForestClassifier        , Train Score: 0.93333, Predict Score:0.5333, Classif_predict:[2 1 2 1 2 2 1 1 2 1 2 2 2 1 1]
RandomForestRegressor         , Train Score: 0.00000, Predict Score:0.4000, Classif_predict:[2.   1.29 2.   1.32 2.   1.96 1.26 1.2  2.   1.21 1.98 2.   2.   1.39
 1.43]
KNN                           , Train Score: 0.86667, Predict Score:0.7333, Classif_pr

### Przy splicie 0.9
Expected result:                                                                            [2 2 2 2 2 2 2]

LogisticRegression            , Train Score: 1.00000, Predict Score:0.8571, Classif_predict:[2 2 2 1 2 2 2]

DecisionTreeClassifier        , Train Score: 1.00000, Predict Score:1.0000, Classif_predict:[2 2 2 2 2 2 2]

KNeighborsClassifier          , Train Score: 1.00000, Predict Score:1.0000, Classif_predict:[2 2 2 2 2 2 2]

SVM                           , Train Score: 1.00000, Predict Score:0.8571, Classif_predict:[2 2 2 1 2 2 2]

RandomForestClassifier        , Train Score: 1.00000, Predict Score:1.0000, Classif_predict:[2 2 2 2 2 2 2]

RandomForestRegressor         , Train Score: 0.00000, Predict Score:0.8571, Classif_predict:[2.   2.   2.   1.52 2.   2.   2.  ]

KNN                           , Train Score: 1.00000, Predict Score:1.0000, Classif_predict:[2 2 2 2 2 2 2]


In [21]:
# Display the raw results dictionary for the iris run.
results

{'LogisticRegression': {'Train score': 0.9333333333333333,
  'Predict_result': 0.7333333333333333,
  'Classif_predict': array([2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2])},
 'DecisionTreeClassifier': {'Train score': 0.8666666666666667,
  'Predict_result': 0.7333333333333333,
  'Classif_predict': array([2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2])},
 'KNeighborsClassifier': {'Train score': 0.8666666666666667,
  'Predict_result': 0.7333333333333333,
  'Classif_predict': array([2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2])},
 'SVM': {'Train score': 0.7333333333333333,
  'Predict_result': 0.6666666666666666,
  'Classif_predict': array([2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2])},
 'RandomForestClassifier': {'Train score': 0.9333333333333333,
  'Predict_result': 0.5333333333333333,
  'Classif_predict': array([2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1])},
 'RandomForestRegressor': {'Train score': 0.0,
  'Predict_result': 0.4,
  'Classif_predict': array([2.  , 1.29, 2.  , 1.32, 2.  , 

In [22]:
from plotly.offline import iplot
import plotly.graph_objects as go

In [23]:
# One row per model, with the model name moved into the 'index' column.
df = pd.DataFrame(results).T.reset_index()

In [24]:
# Grouped bar chart of both scores per model, with model names on a
# categorical x axis and the legend forced on.
fig = go.Figure(data=[
    go.Bar(x=df['index'], y=df['Train score'], name='Train Score'),
    go.Bar(x=df['index'], y=df['Predict_result'], name='Predict Score'),
])
fig = fig.update_xaxes(type='category').update_layout(showlegend=True)
iplot(fig)