<a href="https://colab.research.google.com/github/visiont3lab/project-work-ifoa/blob/main/colab/ClassifierZone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classificazore colore zone (regioni) durante il periodo covid-19 in italia

> Obbiettivo: Creare un classificatore capace di prevedere il colore della zona di una regione.

## Creazione dataset

In [174]:
# Importare il dataset
import pandas as pd
from datetime import datetime
import numpy as np

df = pd.read_csv("https://raw.githubusercontent.com/visiont3lab/project-work-ifoa/main/data/dpc-covid19-ita-regioni-zone.csv")
df["data"] = [ datetime.strptime(d, "%Y-%m-%d %H:%M:%S").date() for d in  df["data"]]
df = df[df["zona"]!="unknown"].copy()
# Semplicazione e scelta degli input
inputs = ["ricoverati_con_sintomi","terapia_intensiva",
        "totale_ospedalizzati","totale_positivi","isolamento_domiciliare",
        "deceduti","dimessi_guariti","nuovi_positivi","totale_casi","tamponi"]

df_X = df[inputs].copy()

oneHot = pd.get_dummies(df["denominazione_regione"], prefix='R')
for k in oneHot.keys():
    df_X[k] = oneHot[k]

display(df_X.head())

df_Y = df["zona"]
display(df_Y.head())

Unnamed: 0,ricoverati_con_sintomi,terapia_intensiva,totale_ospedalizzati,totale_positivi,isolamento_domiciliare,deceduti,dimessi_guariti,nuovi_positivi,totale_casi,tamponi,R_Abruzzo,R_Basilicata,R_Calabria,R_Campania,R_Emilia-Romagna,R_Friuli Venezia Giulia,R_Lazio,R_Liguria,R_Lombardia,R_Marche,R_Molise,R_P.A. Bolzano,R_P.A. Trento,R_Piemonte,R_Puglia,R_Sardegna,R_Sicilia,R_Toscana,R_Umbria,R_Valle d'Aosta,R_Veneto
5376,468,42,510,8581,8071,584,4340,395,13505,308505,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5377,100,16,116,2335,2219,59,778,249,3172,112980,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5378,212,15,227,4481,4254,132,2101,264,6714,292222,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5379,1677,180,1857,62196,60339,796,15017,4508,78009,1075201,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5380,1673,177,1850,33730,31880,4752,28559,1953,67041,1695309,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


5376    bianca
5377    bianca
5378     rossa
5379    gialla
5380    bianca
Name: zona, dtype: object

In [175]:
# Classi
dict_names = {"bianca":0,"gialla": 1, "arancione": 2, "rossa": 3}
names = list(dict_names)

X = df_X.values
Y = np.array([dict_names[d] for d in df_Y],dtype=np.float)

print("X shape: ", X.shape)
print("Y shape: ", Y.shape)


X shape:  (2667, 31)
Y shape:  (2667,)


## Training

In [192]:
from sklearn import datasets
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC,SVC
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.decomposition import PCA
import joblib
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier

X_train, X_test, Y_train, Y_test  = train_test_split(X, Y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ("sc", StandardScaler()), # 0-1 features
    #('polinomial', PolynomialFeatures(degree=3)),
    #("pca", PCA(n_components=0.99)),
    #("model", SVC(kernel="rbf",C=10,gamma=1,probability=True) ) # Probability true slow down dataset
    #("model", SVC(kernel="linear",C=1000,gamma=100,probability=True) ) # Probability true slow down dataset
    #('model', GradientBoostingClassifier(learning_rate=0.05,n_estimators=150))
    ('model',RandomForestClassifier(n_estimators=150))
])

# Fit
pipeline.fit(X_train,Y_train)

# Score
score = pipeline.score(X_test,Y_test)
print("Test: f1_weighted score : ", score)

# Save trained model
joblib.dump(pipeline, "model.pkl") 

'''
tuned_parameters = [
            {'pca__n_components': [0.97,0.98,0.99]},
            {'model__kernel': ['rbf'],  'model__C': [1000,1500,2000] , 'model__gamma' : [100,200,300]},
        ]

# Grid search
grid_search = RandomizedSearchCV(
    pipeline, tuned_parameters, scoring='f1_macro'
)

# Train
grid_search.fit(X_train,Y_train)

# Score
score = grid_search.best_estimator_.score(X,Y)
print("f1_weighted score : ", score)

# Save trained model
joblib.dump(grid_search.best_estimator_, "model.pkl") 

print(grid_search.best_estimator_.named_steps["model"])
'''

print(pipeline)


Test: f1_weighted score :  0.9176029962546817
Pipeline(memory=None,
         steps=[('sc',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=150, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
 

## Testing


In [181]:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix,classification_report

class Inference:
    def __init__(self,model_path="model.pkl"):
        dict_names = {"bianca":0,"gialla": 1, "arancione": 2, "rossa": 3}
        self.names = list(dict_names)
        self.model = joblib.load(model_path)
    def predict(self,X):
        Y_hat = self.model.predict(X)
        return Y_hat
    def report(self,X,Y):
        Y_hat = self.predict(X)
        names_pred = [ "Pred: " + n for n in self.names]
        #print("Confusion Matrix")
        cm = confusion_matrix(Y,Y_hat)
        df = pd.DataFrame(cm, columns=names_pred, index=names)
        print(df)
        #print("Report")
        #print(classification_report(Y, Y_hat))

inf = Inference()
print("\n ------- Training Results\n")
inf.report(X_train,Y_train)
print("\n ------- Test Results\n")
inf.report(X_test,Y_test)



 ------- Training Results

           Pred: bianca  Pred: gialla  Pred: arancione  Pred: rossa
bianca              760             0                0            0
gialla                0           683                0            0
arancione             0             0              549            0
rossa                 0             0                0          141

 ------- Test Results

           Pred: bianca  Pred: gialla  Pred: arancione  Pred: rossa
bianca              156             3               12            3
gialla               11           178                1            0
arancione             8             1              115            0
rossa                 0             0                2           44


## Feature Importance

In [193]:
import joblib
import numpy as np
import plotly.graph_objects as go

inputs = df_X.keys()

model = joblib.load("model.pkl")
model = pipeline.named_steps["model"]
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_],axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
#print("Feature ranking:")
xplot = []
yplot = []
for f in range(X.shape[1]):
    xplot.append(inputs[indices[f]])
    yplot.append(np.round(importances[indices[f]],3))
    #print("%s %s (%s)" % (f + 1,inputs[indices[f]] , np.round(importances[indices[f]],3)))

# Plot the impurity-based feature importances of the forest
fig = go.Figure()
fig.add_traces(go.Bar(x=xplot, y=yplot))
fig.update_layout(title="Input features Importance")
fig.show()