<a href="https://colab.research.google.com/github/visiont3lab/flask-static-website/blob/master/Regression_Code_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Riassunto Regressione

## Importare librerie e funzioni

In [2]:
# Create a class to select numerical or categorical columns 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error  # MSE
from sklearn.metrics import mean_absolute_error # MAE
from sklearn.metrics import median_absolute_error # MedAE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RANSACRegressor, SGDRegressor, HuberRegressor, TheilSenRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
import seaborn as sn
import matplotlib.pyplot as plt
import random
import numpy as np
import plotly.graph_objects as go
import pickle
import json
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn import datasets
from google.colab import data_table
import seaborn as sns
import plotly.express as px

  import pandas.util.testing as tm


In [1]:
def plot_fig(Ys, names):
    # Ys list of output to plot [Y_real, Y_pred]
    n = np.linspace(0,len(Ys[0]), len(Ys[0]), dtype=int)
    fig = go.Figure()
    for yh,nm in zip(Ys,names):
        fig.add_trace(go.Scatter(x=n, y=yh,
                      mode='lines',#mode='lines+markers',
                      name=nm))
    fig.update_layout(
      hovermode = "x",
      paper_bgcolor = "rgb(0,0,0)" ,
      plot_bgcolor = "rgb(10,10,10)" , 
      title=dict(
          x = 0.5,
          text = "Risultati",
          font=dict(
              size = 20,
              color = "rgb(255,255,255)"
          )
      )
    )
    return fig

def validate(Y_test,Y_pred,name):
    mse = mean_squared_error(Y_test,Y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(Y_test,Y_pred)
    medae = median_absolute_error(Y_test,Y_pred)
    print("[" + name + "]" + " MSE: ", round(mse,4), "RMSE  : ", round(rmse,4), "MAE: ", round(mae,4), "MedAE: ", round(medae,4))

def compare_models(X,Y):
    # Split data into training and validation set
    #X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.01,  shuffle=True, random_state=0) 
    #print("Shapes: X_train: ", X_train.shape, "Y_train: ", Y_train.shape, "X_test: ", X_test.shape, "Y_test", Y_test.shape)
    #print("Metric : negative mean square error (MSE)")

    # Scaling
    sc = StandardScaler()
    sc.fit(X)
    X_train = sc.transform(X)
    Y_train = Y
    #X_test = sc.transform(X_test)

    # PCA
    pc = PCA(n_components=0.98)
    pc.fit(X_train)
    X_train = pc.transform(X_train)
    #X_test = pc.transform(X_test)
    print (pc.explained_variance_)
    print (pc.explained_variance_ratio_)
    
    # Polinomial degree
    '''
    poly = PolynomialFeatures(degree=2)
    poly.fit(X_train)
    X_train = poly.transform(X_train)
    X_test = poly.transform(X_test)
    '''

    # user variables to tune
    seed    = 5
    folds   = 5 # 10 = 10%, 5 = 20% for testing
    #5-fold cross validation. This means that 20% of the data is used for testing, this is usually pretty accurate.
    metric  = "neg_mean_squared_error"

    # hold different regression models in a single dictionary
    models = {}
    models["Linear"]        = LinearRegression()
    #models["RANSAC"]        = RANSACRegressor()
    models["Huber"]         = HuberRegressor(max_iter=1000)
    models["TheilSen"]      = TheilSenRegressor()
    #models["SGD"]           = SGDRegressor(max_iter=500,penalty=None, eta0=0.01, tol=0.00001)
    models["Ridge"]         = Ridge()
    models["Lasso"]         = Lasso()
    models["ElasticNet"]    = ElasticNet()
    models["KNN"]           = KNeighborsRegressor(n_neighbors=5)
    models["DecisionTree"]  = DecisionTreeRegressor()
    models["SVR"]           = SVR(gamma="auto")
    models["AdaBoost"]      = AdaBoostRegressor(n_estimators=50)
    models["GradientBoost"] = GradientBoostingRegressor(n_estimators=100)
    models["RandomForest"]  = RandomForestRegressor(n_estimators=100)
    models["ExtraTrees"]    = ExtraTreesRegressor(n_estimators=100)

    # 10-fold cross validation for each model
    model_results = []
    model_names   = []
    for model_name in models:
        model   = models[model_name]
        k_fold  = KFold(n_splits=folds, random_state=seed,shuffle=True)
        results = cross_val_score(model, X_train, Y_train, cv=k_fold, scoring=metric)

        model_results.append(results)
        model_names.append(model_name)
        print("{}: {}, {}".format(model_name, round(results.mean(), 3), round(results.std(), 3)))

    fig = go.Figure()
    for name,res in zip(model_names,model_results):    
        fig.add_trace(go.Box(y=res,name=name, boxpoints='all'))
    #fig.show()
    return fig

## Caricare il dataset

In [35]:
df = pd.read_csv("https://raw.githubusercontent.com/visiont3lab/flask-static-website/master/dataset_ready.csv")
display(df.head())
Y = df.pop("Specific Gravity")
X = df

Unnamed: 0,Age,Albumin,Anemia-no,Anemia-yes,Appetite-good,Appetite-poor,Bacteria-notpresent,Bacteria-present,Blood Glucose Random,Blood Pressure,Blood Urea,Class,Coronary Artery Disease-no,Coronary Artery Disease-yes,Diabetes Mellitus-no,Diabetes Mellitus-yes,Hemoglobin,Hypertension-no,Hypertension-yes,Packed Cell Volume,Pedal Edema-no,Pedal Edema-yes,Potassium,Pus Cell clumps-notpresent,Pus Cell clumps-present,Pus Cell-abnormal,Pus Cell-normal,Red Blood Cell Count,Red Blood Cells-abnormal,Red Blood Cells-normal,Serum Creatinine,Sodium,Specific Gravity,Sugar,White Blood Cell Count
0,48.0,4.0,0,1,0,1,1,0,117.0,70.0,56.0,1,1,0,1,0,11.2,0,1,32.0,0,1,2.5,0,1,1,0,3.9,0,1,3.8,111.0,1.005,0.0,6700.0
1,53.0,2.0,0,1,0,1,1,0,70.0,90.0,107.0,1,1,0,0,1,9.5,0,1,29.0,1,0,3.7,0,1,1,0,3.7,1,0,7.2,114.0,1.02,0.0,12100.0
2,63.0,3.0,1,0,0,1,1,0,380.0,70.0,60.0,1,1,0,0,1,10.8,0,1,32.0,0,1,4.2,0,1,1,0,3.8,1,0,2.7,131.0,1.01,0.0,4500.0
3,68.0,3.0,1,0,0,1,0,1,157.0,80.0,90.0,1,0,1,0,1,5.6,0,1,16.0,0,1,6.4,0,1,1,0,2.6,0,1,4.1,130.0,1.01,2.0,11000.0
4,61.0,2.0,0,1,0,1,1,0,173.0,80.0,148.0,1,0,1,0,1,7.7,0,1,24.0,0,1,5.2,1,0,1,0,3.2,1,0,3.9,135.0,1.015,0.0,9200.0


## Scegliere il modello

In [36]:
fig_compare_models = compare_models(X,Y)
fig_compare_models.show()

[16.17141223  2.82187329  2.13442141  2.01519095  1.60148488  1.4082671
  1.05357624  0.99035349  0.8349257   0.76928615  0.70434213  0.60671406
  0.51952845  0.47183479  0.41251882  0.36542019  0.30087643  0.2753285
  0.20724591]
[0.47261946 0.08247098 0.06237978 0.05889519 0.04680438 0.04115747
 0.03079141 0.02894369 0.02440122 0.02248286 0.02058483 0.01773159
 0.01518354 0.01378966 0.01205612 0.01067963 0.0087933  0.00804664
 0.00605689]
Linear: -0.0, 0.0
Huber: -0.0, 0.0
TheilSen: -0.093, 0.036
Ridge: -0.0, 0.0
Lasso: -0.0, 0.0
ElasticNet: -0.0, 0.0
KNN: -0.0, 0.0
DecisionTree: -0.0, 0.0
SVR: -0.0, 0.0
AdaBoost: -0.0, 0.0
GradientBoost: -0.0, 0.0
RandomForest: -0.0, 0.0
ExtraTrees: -0.0, 0.0


## Allenare e salvare il modello

In [37]:
# Dentro quello che passo a questa pipeline NON ci devono essere variabili categoriche. Tutto quello
# che c'è qua dentro è un numero.

#X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25, random_state=42)
X_train = X_test = X
Y_train = Y_test =  Y

pipeline_preprocess = Pipeline([
    #("myd", DataFrameSelector()),
    #("null_row",SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
    #('polinomial', PolynomialFeatures(degree=3)),
    #("pca", PCA(n_components=0.98)),
    #("random_forest", RandomForestRegressor()),
    ("gradient_boost", GradientBoostingRegressor() )
])
pipeline_preprocess.fit(X_train,Y_train)
#pipeline_preprocess.fit_transform(df)
#pickle.dump(pipeline_preprocess, open("mymodel.sav", 'wb'))

#pipeline_preprocess_new = pickle.load(open("mymodel.sav",'rb'))
#score = pipeline_preprocess_new.score(X,Y)
#print("Score: ", score)
#Y_pred = pipeline_preprocess_new.predict(X)

# Fine tuning
param_grid = {
              #'pca__n_components' : [12,0.96,0.97,0.98], 
              #'random_forest__n_estimators': [3,30,100],
              #'random_forest__max_features': [2,4,6,8],
              #'random_forest__criterion': ["mse","mae"],
              'gradient_boost__n_estimators' : [3,30,100],
              'gradient_boost__learning_rate' : [0.01,0.1]
            }
grid_search = RandomizedSearchCV(pipeline_preprocess, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, Y_train)
print(grid_search.best_estimator_)


from sklearn.externals import joblib
joblib.dump(grid_search.best_estimator_, "my_model_joblib.pkl") # DIFF
my_model_loaded = joblib.load("my_model_joblib.pkl") # DIFF
score = my_model_loaded.score(X_test,Y_test)
print("Score: ", score)
Y_pred = my_model_loaded.predict(X_test)


'''
# name step concetto
X_sc =  my_model_loaded.named_steps["sc"].transform(X)
X_pca1 = my_model_loaded.named_steps["pca"].transform(X_sc)
print(np.round(X_pca-X_pca1,1))
'''

fig = plot_fig([Y_test,Y_pred], ["Y", "Y_pred"])
fig.show()


The total space of parameters 6 is smaller than n_iter=10. Running 6 iterations. For exhaustive searches, use GridSearchCV.



Pipeline(memory=None,
         steps=[('sc',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gradient_boost',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_rate=0.1, loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=30,
                                           n_iter_no_change=None,
                   

## Salvare i risultati come excel/csv

In [41]:
from sklearn.externals import joblib

# Carichiamo i nuovi dati
df = pd.read_csv("https://raw.githubusercontent.com/visiont3lab/flask-static-website/master/dataset_ready.csv")
Y = df.pop("Specific Gravity")
X = df

# Usiamo il modello
my_model_loaded = joblib.load("my_model_joblib.pkl") # DIFF
Y_pred = my_model_loaded.predict(X)

df["Specifity Gradivity"] = Y
df["Specifity Gradivity Pred"] = Y_pred
df.to_excel("result.xlsx", index=None)
df.to_csv("result.csv", index=None)
