# Data Analysis with Python
by Ladislav Vrbsky  
for Estácio

<a href='https://portal.estacio.br/'> <img src='https://portal.estacio.br/imgs/logo-estacio.png' /></a>

In [None]:
import pandas as pd
import math

from sklearn import __version__ as sklearn_version
print(f'Scikit learn version: {sklearn_version} should be >= 0.23')
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix, precision_recall_fscore_support, classification_report
import seaborn as sns
%matplotlib inline

# If necessary, make some updates
#!pip install --upgrade pip
#!pip install --upgrade scikit-learn

# Get and Explore Data

In [None]:
# Load the zoo dataset from CSV and preview its first 10 rows.
df = pd.read_csv("data/zoo.csv")
df.head(10)

In [None]:
# Count how many rows carry each value of the 'class_type' column.
df['class_type'].value_counts()

In [None]:
# Load the class description table (its 'Class_Type' and 'Class_Number'
# columns are used further below) and preview its first 10 rows.
df_class = pd.read_csv("data/class.csv")
df_class.head(10)

## Choose a type of classification here
Multi-class or binary

In [None]:
# Settings read by the following cells:
#   multi_class -- True keeps the original class labels; False turns the task
#                  into a binary "my_class vs. everything else" problem.
#   my_class    -- value of df_class['Class_Type'] singled out in binary mode.
multi_class = True
my_class = 'Mammal'
# Work on a copy so the loaded DataFrame `df` is not modified.
df_model = df.copy()

In [None]:
# Prepare the target column and the human-readable class names.
if not multi_class:
    # Look up the numeric label of `my_class`. The filtered Series keeps the
    # original row labels, so positional access (.iloc[0]) is required;
    # `[0]` only works when the matching row happens to have index label 0.
    matches = df_class.loc[df_class['Class_Type'] == my_class, 'Class_Number']
    if matches.empty:
        raise ValueError(f'Unknown class name: {my_class!r}')
    class_number = matches.iloc[0]
    print(f'Class number to identify: {class_number}')
    # Binarize the target: 1 for the chosen class, 0 for every other class.
    df_model['class_type'] = (df_model['class_type'] == class_number).astype(int)
    target_names = ['Other', my_class]
else:
    # Multi-class: keep the labels as they are and use all class names.
    target_names = list(df_class['Class_Type'])

print(target_names)
df_model.head(30)

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# Non-numeric columns are dropped explicitly: pandas >= 2.0 no longer skips
# them silently in DataFrame.corr() and raises on string data instead.
# NOTE(review): assumes the first column holds text identifiers — confirm.
sns.heatmap(df_model.select_dtypes(include=['number', 'bool']).corr(), cmap='coolwarm')

# Model

Split data

In [None]:
# Features: every column except the first and the last one.
X = df_model.iloc[:, 1:-1]
# Target: the last column.
y = df_model.iloc[:, -1]
# A fixed random_state keeps the train/test split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
# Sanity check of the resulting shapes.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Define the model or classifier

In [None]:
# Support-vector classifier created with scikit-learn's default settings.
model = svm.SVC()

Train on the training data

In [None]:
# Fit the classifier on the training split only.
model.fit(X_train, y_train) 

## Evaluate

In [None]:
# Score the fitted model on the held-out test split
# (for scikit-learn classifiers `score` is the mean accuracy — see its docs).
model.score(X_test, y_test)

In [None]:
# Predict a label for every row of the test split.
y_pred = model.predict(X_test)

In [None]:
# Peek at predictions 15..24 to compare them with the true labels shown next.
y_pred[15:25]

In [None]:
# True labels at the same positions (15..24) as the predictions shown above.
y_test.iloc[15:25]

In [None]:
# Side-by-side table of true and predicted labels for the test split.
df_results = pd.DataFrame({'ground truth':y_test,'prediction': y_pred})
df_results.head()

In [None]:
# Number of test rows whose prediction differs from the ground truth.
mismatch_mask = df_results['ground truth'] != df_results['prediction']
int(mismatch_mask.sum())

In [None]:
# Confusion matrix of true labels (first argument) vs. predicted labels.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

In [None]:
# Accuracy = correctly classified samples (matrix diagonal) / all samples.
accuracy = conf_mat.trace() / conf_mat.sum()
print(f'Model Accuracy: {accuracy}')

In [None]:
# Draw the confusion matrix as a heatmap (the matrix is recomputed here).
sns.heatmap(confusion_matrix(y_test, y_pred))

`precision_recall_fscore_support` [docs](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html)

In [None]:
# Plot the confusion matrix directly from the fitted model and the test split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions the replacement is
# ConfusionMatrixDisplay.from_estimator(model, X_test, y_test) — confirm
# against the installed version before changing the import.
plot_confusion_matrix(model, X_test, y_test)

### Example only:
<img src='http://rasbt.github.io/mlxtend/user_guide/evaluate/lift_score_files/lift_cm_1.png'>

You can usually ignore the warnings, but it is a good idea to read them.

In [None]:
# Per-class precision / recall / F-score / support as a table.
# `labels` is passed explicitly so each metric array has exactly one entry per
# name in `target_names`, in the same order. Without it, a class missing from
# both y_test and y_pred is dropped and the index length no longer matches.
report_labels = list(df_class['Class_Number']) if multi_class else [0, 1]
pd.DataFrame(
    dict(
        zip('Precision Recall F_score Support'.split(),
            precision_recall_fscore_support(y_test, y_pred, labels=report_labels)
           )
    ),
    index=target_names
)

`classification_report` [docs](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)

You can usually ignore the warnings, but it is a good idea to read them.

In [None]:
# Text report of the per-class metrics.
# `labels` is passed explicitly so it always lines up with `target_names`,
# even when a class does not occur in the test split or the predictions.
report_labels = list(df_class['Class_Number']) if multi_class else [0, 1]
print(classification_report(y_test, y_pred, labels=report_labels, target_names=target_names))

In [None]:
# Repeat the accuracy computed from the confusion matrix, for comparison with
# the report above.
print(f'Model Accuracy: {accuracy}')