In [1]:
!pip install kaggle
!kaggle datasets download mrmars1010/grape-quality -p /home/ubuntu/machine-learning-zoomcamp/MyMachineLearningRepo/Capstone1/data --unzip

Dataset URL: https://www.kaggle.com/datasets/mrmars1010/grape-quality
License(s): apache-2.0
grape-quality.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
!pip install seaborn
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)
  Downloading matplotlib-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached contourpy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading fonttools-4.55.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (165 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading pyparsing-3.2.1-py3-none-a

In [3]:
# Read the grape-quality CSV from the sibling data directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/GRAPE_QUALITY.csv")
# Preview the first five rows (the default size of head()).
df.head(n=5)

Unnamed: 0,sample_id,variety,region,quality_score,quality_category,sugar_content_brix,acidity_ph,cluster_weight_g,berry_size_mm,harvest_date,sun_exposure_hours,soil_moisture_percent,rainfall_mm
0,1,Riesling,Barossa Valley,2.11,Medium,13.7,3.17,251.81,18.58,2023-09-04,8.4,55.9,223.3
1,2,Pinot Noir,Loire Valley,2.83,High,20.84,3.15,289.0,22.08,2023-09-16,8.8,36.9,312.1
2,3,Sauvignon Blanc,Napa Valley,3.52,Premium,27.46,4.07,125.14,21.4,2023-08-25,11.1,64.9,698.7
3,4,Riesling,Napa Valley,2.28,Medium,24.64,3.15,237.46,12.31,2023-09-13,8.0,54.5,660.1
4,5,Merlot,Napa Valley,2.9,High,26.13,3.18,221.52,12.24,2023-08-11,11.2,36.3,410.4


In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['sample_id', 'variety', 'region', 'quality_score', 'quality_category',
       'sugar_content_brix', 'acidity_ph', 'cluster_weight_g', 'berry_size_mm',
       'harvest_date', 'sun_exposure_hours', 'soil_moisture_percent',
       'rainfall_mm'],
      dtype='object')

In [5]:
# Print the per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sample_id              1000 non-null   int64  
 1   variety                1000 non-null   object 
 2   region                 1000 non-null   object 
 3   quality_score          1000 non-null   float64
 4   quality_category       1000 non-null   object 
 5   sugar_content_brix     1000 non-null   float64
 6   acidity_ph             1000 non-null   float64
 7   cluster_weight_g       1000 non-null   float64
 8   berry_size_mm          1000 non-null   float64
 9   harvest_date           1000 non-null   object 
 10  sun_exposure_hours     1000 non-null   float64
 11  soil_moisture_percent  1000 non-null   float64
 12  rainfall_mm            1000 non-null   float64
dtypes: float64(8), int64(1), object(4)
memory usage: 101.7+ KB


In [6]:
# Summary statistics for the numeric columns. The min and max of each feature
# are reused later as the slider bounds in the Streamlit app.
df.describe()

Unnamed: 0,sample_id,quality_score,sugar_content_brix,acidity_ph,cluster_weight_g,berry_size_mm,sun_exposure_hours,soil_moisture_percent,rainfall_mm
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,2.49436,20.14015,3.50341,197.11247,17.41162,8.9926,49.7319,489.423
std,288.819436,0.52896,5.843944,0.406907,85.946831,4.302279,1.736462,11.591189,170.95025
min,1.0,1.1,10.06,2.8,50.2,10.0,6.0,30.1,200.1
25%,250.75,2.1,14.8175,3.1675,122.5725,13.6775,7.5,39.6,341.6
50%,500.5,2.5,20.375,3.5,195.285,17.415,8.9,49.4,490.4
75%,750.25,2.87,25.2125,3.86,271.265,21.0975,10.4,60.3,628.65
max,1000.0,3.97,29.99,4.2,349.87,24.97,12.0,70.0,799.3


In [7]:
# Count the missing values in each column (isna is the alias behind isnull).
df.isna().sum()

sample_id                0
variety                  0
region                   0
quality_score            0
quality_category         0
sugar_content_brix       0
acidity_ph               0
cluster_weight_g         0
berry_size_mm            0
harvest_date             0
sun_exposure_hours       0
soil_moisture_percent    0
rainfall_mm              0
dtype: int64

In [8]:
# Display the frame's dimensions as (rows, columns).
df.shape

(1000, 13)