<a href="https://colab.research.google.com/github/tanishavaishya18/python-basics/blob/main/Day13_MiniProject_EndToEnd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PROBLEM STATEMENT**

Predict wine class quality using chemical properties, and build a robust ML pipeline with proper evaluation

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine

data = load_wine()
x=data.data
y=data.target
feature_names=data.feature_names

df = pd.DataFrame(x, columns=feature_names)
df['class']=y
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [4]:
df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


In [3]:


df.isnull().sum()

Unnamed: 0,0
alcohol,0
malic_acid,0
ash,0
alcalinity_of_ash,0
magnesium,0
total_phenols,0
flavanoids,0
nonflavanoid_phenols,0
proanthocyanins,0
color_intensity,0


In [5]:

df['acid_ratio']=df['malic_acid']/(df['total_phenols']+1e-6)
df['flav_color_interaction']=df['flavanoids']*df['color_intensity']
df['alcohol_proline_ratio']=df['alcohol']/(df['proline']+ 1e-6)

In [6]:
x=df.drop('class', axis =1)
y=df['class']

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42))
])

In [11]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(pipe, x, y, cv=5, scoring='accuracy')
print('CV scores', scores)
print('mean accuracy', scores.mean())
print('STD Dev', scores.std())

CV scores [0.97222222 0.94444444 0.94444444 0.97142857 0.97142857]
mean accuracy 0.9607936507936508
STD Dev 0.013352216409405144


In [16]:
from sklearn.model_selection import GridSearchCV

param_grid={
    'model__n_estimators' : [50, 100, 200],
    'model__max_depth' : [None, 5, 10]
    }

grid = GridSearchCV(
    pipe, param_grid, cv=5, scoring='accuracy'
)
grid.fit(x,y)


In [18]:
grid.best_params_


{'model__max_depth': 5, 'model__n_estimators': 50}

In [19]:
grid.best_score_

np.float64(0.9665079365079364)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

best_model = grid.best_estimator_
best_model.fit(x_train, y_train)

pred = best_model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [22]:
#Reflection

**Why Pipleines prevent leakage**

Pipelines ensure that preprocessing steps like scaling and feature transformation are learned only from the training data and then applied to the test data, preventing information from leaking and inflating performance metrics.

**Why cross validation is important**

Cross-validation provides a more reliable estimate of model performance by evaluating it across multiple train-test splits, reducing dependence on a single random split and improving result stability.

**How feature emgineering helped**

Feature engineering created more informative representations of the data by capturing ratios and interactions between variables, which improved class separability and helped the model learn more meaningful patterns.

**WHy Random Forest worked well**

Random Forest performed well because it combines multiple decision trees, reducing overfitting while capturing non-linear relationships and interactions present in the dataset.

**How this approach generalises real data**

This workflow—feature engineering, pipelines, cross-validation, and tuning—is robust to noisy and complex real-world datasets, making it suitable for applications such as battery health estimation, renewable energy mapping, and sensor-based systems.