# Prediction of wine type using random forest classifier

Before you run this notebook you must first install the dependencies using the dependencies.ipynb notebook.

Only needs to be done once, but if you start a new Jupyter instance then you will have to load them again.

In [1]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


In [2]:
from jaqpotpy import Jaqpot
from jaqpotpy.models import MolecularModel

In [3]:
# see here for info on the wine dataset https://scikit-learn.org/stable/datasets/toy_dataset.html
# and https://archive.ics.uci.edu/ml/datasets/Wine
data = load_wine()

In [4]:
# create the data frame
df_X = pd.DataFrame(data['data'], columns=data['feature_names'])
print(df_X)

     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0      14.23        1.71  2.43               15.6      127.0           2.80   
1      13.20        1.78  2.14               11.2      100.0           2.65   
2      13.16        2.36  2.67               18.6      101.0           2.80   
3      14.37        1.95  2.50               16.8      113.0           3.85   
4      13.24        2.59  2.87               21.0      118.0           2.80   
..       ...         ...   ...                ...        ...            ...   
173    13.71        5.65  2.45               20.5       95.0           1.68   
174    13.40        3.91  2.48               23.0      102.0           1.80   
175    13.27        4.28  2.26               20.0      120.0           1.59   
176    13.17        2.59  2.37               20.0      120.0           1.65   
177    14.13        4.10  2.74               24.5       96.0           2.05   

     flavanoids  nonflavanoid_phenols  proanthocyan

In [5]:
df_Y = pd.DataFrame(data['target'], columns = ["target"])
print(df_Y)

     target
0         0
1         0
2         0
3         0
4         0
..      ...
173       2
174       2
175       2
176       2
177       2

[178 rows x 1 columns]


In [6]:
# create the model
model = RandomForestClassifier(max_depth=10, random_state=0)
model.fit(df_X, df_Y)
# this gives a strange warning about "column-vector y was passed when a 1d array was expected" which can be ignored, but need to work out how to avoid this. 

  model.fit(df_X, df_Y)


In [10]:
# Create the Jaqpot instance and prompt for the API key
a = input()
jaqpot = Jaqpot('https://jaqpotapi.informaticsmatters.org/jaqpot/services/')
jaqpot.set_api_key(a)
# TODO - Jaqpot prints the key which is a security risk. Avoid doing this.

eyJhbGciOiJSUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICI3RC1USHRaTVdNRElhV3gxX0NXamhCVHpWdEpaejRBTnd2dGh3QWx2OFRrIn0.eyJleHAiOjE2NzE2NTM5MzksImlhdCI6MTY3MTYzMjMzOSwiYXV0aF90aW1lIjoxNjcxNjMyMzM4LCJqdGkiOiIyMjJjNjRhMy0xZDA2LTQwMzUtYTIyNy1iOThkZDE2MWNhNGEiLCJpc3MiOiJodHRwczovL3NxdW9uay5pbmZvcm1hdGljc21hdHRlcnMub3JnL2F1dGgvcmVhbG1zL3NxdW9uayIsImF1ZCI6WyJzcXVvbmstam9iZXhlY3V0b3IiLCJzcXVvbmstcG9ydGFsIiwiYWNjb3VudCJdLCJzdWIiOiJmZGMwNTc0OS03YzZkLTQyMmMtYTU0ZS01MjFkMTA5NzI0YTMiLCJ0eXAiOiJCZWFyZXIiLCJhenAiOiJqYXFwb3QtdWkiLCJub25jZSI6ImYyZTcyZjcwOGYyYzM1N2I0MTg3YTE3YWE5OTk5YmIwZGF6Tk5BS3FHIiwic2Vzc2lvbl9zdGF0ZSI6IjhjMDk3YzFkLTk4MjEtNDFiMC05YTExLTQyYjY3NDI2NTVhMCIsImFjciI6IjEiLCJhbGxvd2VkLW9yaWdpbnMiOlsiaHR0cHM6Ly9qYXFwb3QuaW5mb3JtYXRpY3NtYXR0ZXJzLm9yZyJdLCJyZWFsbV9hY2Nlc3MiOnsicm9sZXMiOlsiYWNjb3VudC1zZXJ2ZXItYWRtaW4iLCJzdGFuZGFyZC11c2VyIiwiZGF0YS1tYW5hZ2VyLXVzZXIiLCJkYXRhLW1hbmFnZXItYWRtaW4iLCJvZmZsaW5lX2FjY2VzcyIsImFjY291bnQtc2VydmVyLXVzZXIiLCJ1bWFfYXV0aG9yaXphdGlvbiIsImZyYWduZXQtc2VhcmNoIl19LCJyZXNv

  2022-12-21 14:22:10,813 - INFO - api key is set


In [12]:
# Deploy the model to Jaqpot along with its title and description
model_id = jaqpot.deploy_sklearn(model, df_X, df_Y, "RF wine model from Squonk", "Random Forest wine category predictive model from Squonk")

  2022-12-21 14:45:18,137 - INFO - Model with id: 9wlQ0whRHvTK2vOy8avd created. Please visit the application to proceed


In [13]:
# Run the predictions using the uploaded model. The Jaqpot client handles this for us.
df_pred, predicts = jaqpot.predict(df_X, model_id)

  2022-12-21 14:45:29,297 - INFO - completed 20.0
  2022-12-21 14:45:30,449 - INFO - completed 100.0


In [14]:
print(predicts)
print(df_pred)
print(df_pred[predicts])

['target']
     magnesium  alcalinity_of_ash  color_intensity  nonflavanoid_phenols  \
0          127               15.6             5.64                  0.28   
1          100               11.2             4.38                  0.26   
2          101               18.6             5.68                  0.30   
3          113               16.8             7.80                  0.24   
4          118               21.0             4.32                  0.39   
..         ...                ...              ...                   ...   
173         95               20.5             7.70                  0.52   
174        102               23.0             7.30                  0.43   
175        120               20.0            10.20                  0.43   
176        120               20.0             9.30                  0.53   
177         96               24.5             9.20                  0.56   

     proline   ash  proanthocyanins  od280/od315_of_diluted_wines  flavanoid