In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [38]:
data = pd.read_csv('apple_quality.csv')
data.head()

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809367,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723217,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good


In [39]:
data.tail()

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
3996,3996.0,-0.293118,1.949253,-0.20402,-0.640196,0.024523,-1.0879,1.854235285,good
3997,3997.0,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611391,bad
3998,3998.0,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229719806,good
3999,3999.0,0.27854,-1.715505,0.121217,-1.154075,1.266677,-0.776571,1.599796456,good
4000,,,,,,,,Created_by_Nidula_Elgiriyewithana,


In [40]:
data.drop(4000,axis=0,inplace=True)

In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   object 
 8   Quality      4000 non-null   object 
dtypes: float64(7), object(2)
memory usage: 281.4+ KB


In [42]:
data['Acidity'] = data['Acidity'].astype('Float64')

In [43]:
data.describe()

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,1999.5,-0.503015,-0.989547,-0.470479,0.985478,0.512118,0.498277,0.076877
std,1154.844867,1.928059,1.602507,1.943441,1.402757,1.930286,1.874427,2.11027
min,0.0,-7.151703,-7.149848,-6.894485,-6.055058,-5.961897,-5.864599,-7.010538
25%,999.75,-1.816765,-2.01177,-1.738425,0.062764,-0.801286,-0.771677,-1.377424
50%,1999.5,-0.513703,-0.984736,-0.504758,0.998249,0.534219,0.503445,0.022609
75%,2999.25,0.805526,0.030976,0.801922,1.894234,1.835976,1.766212,1.510493
max,3999.0,6.406367,5.790714,6.374916,7.619852,7.364403,7.237837,7.404736


In [44]:
data.isnull().sum()

A_id           0
Size           0
Weight         0
Sweetness      0
Crunchiness    0
Juiciness      0
Ripeness       0
Acidity        0
Quality        0
dtype: int64

In [45]:
data.columns

Index(['A_id', 'Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness',
       'Ripeness', 'Acidity', 'Quality'],
      dtype='object')

In [46]:
X = data[['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness',
       'Ripeness', 'Acidity']]
y = data['Quality']

In [47]:
numerical_features = [x for x in X.columns if X.dtypes[x] != 'object']
categorical_features = [x for x in X.columns if X.dtypes[x] == 'object']

In [48]:
numerical_features

['Size',
 'Weight',
 'Sweetness',
 'Crunchiness',
 'Juiciness',
 'Ripeness',
 'Acidity']

In [49]:
categorical_features

[]

In [50]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [51]:
X_train

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity
1693,2.903090,-0.748370,-1.608414,0.836871,-0.558577,0.723190,0.818821
1954,0.485271,-0.009211,-4.028833,0.804455,1.069991,4.018541,0.554639
2213,-0.334986,-1.667268,-0.104787,2.107930,-3.644783,0.045082,0.198689
2281,-4.759438,-2.516415,0.587103,1.199243,3.218731,2.725957,5.568768
744,0.122972,-2.433090,1.315824,1.138016,-0.401675,0.828748,-1.031509
...,...,...,...,...,...,...,...
1740,0.332102,-2.729038,-0.213953,2.392896,0.282413,-2.144787,-0.15461
838,1.487181,-1.822071,0.814733,0.802819,1.933550,-1.841884,2.44817
1557,4.485782,0.014331,-1.958082,0.118627,1.627263,1.115202,1.66707
3525,-0.636076,-3.746856,4.377382,-0.886810,-0.655860,2.587461,-0.482244


In [52]:
X_test

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity
2740,-0.111685,0.489788,-0.333828,-1.265075,0.343857,1.031916,-4.86575
3180,1.978805,-1.668787,0.160050,1.923281,-1.968770,0.502737,1.177379
3900,0.047628,-2.035774,-1.706659,1.760040,1.669244,1.843260,2.753851
3897,-3.236927,-0.204709,1.172163,1.953748,-0.882900,0.627905,1.358348
2550,-2.037482,1.275922,-1.686684,2.481398,-0.557054,0.648534,2.008481
...,...,...,...,...,...,...,...
1646,0.320943,-4.124308,3.894007,-0.250682,0.524530,2.891306,-0.832732
3296,1.339489,-0.984984,-0.245295,1.863707,1.636745,-2.173060,1.108448
3290,-2.520364,0.233835,-5.671517,1.611891,-2.915728,2.817676,-1.021048
3915,-4.140536,0.454972,1.667803,-0.341601,3.355342,-0.622478,-0.308599


In [54]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'Decison Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier()
}

for x in range(len(list(models))):
    model = list(models.values())[x]

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(model)
    print('Accuracy : ',accuracy_score(y_test,y_pred))
    print('='*50)

LogisticRegression()
Accuracy :  0.73625
DecisionTreeClassifier()
Accuracy :  0.82125
RandomForestClassifier()
Accuracy :  0.8975


In [75]:
param_grid = {'n_estimators':[50,100,150,200,300],
              'criterion':['gini','entropy'],
              'max_depth':[1,3,5,8,None],
              'min_samples_split':[0.1,0.3,0.5,0.7,0.9],
              'min_samples_leaf':[1,2,3,4,5]
}
grid = GridSearchCV(RandomForestClassifier(),param_grid=param_grid,cv=5,verbose=True,n_jobs=-1,scoring='accuracy')

grid.fit(X_train,y_train)

Fitting 5 folds for each of 1250 candidates, totalling 6250 fits


In [77]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'min_samples_leaf': 3,
 'min_samples_split': 0.1,
 'n_estimators': 200}

In [90]:
best_model = RandomForestClassifier(criterion='gini',max_depth=None,min_samples_leaf=3,min_samples_split=0.1,
n_estimators=200)

In [100]:
best_model.fit(X_train,y_train)
y_pred = best_model.predict(X_test)

print('Accuracy : ',accuracy_score(y_test,y_pred))

Accuracy :  0.82375
