# Fruit

A farmer bought a machine which is supposed to separate fruits by their characteristics. Create a model that lets the machine do that.



# Data Set and Missing Data Check

In [30]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Read the tab-separated fruit measurements into a DataFrame
data = pd.read_csv('fruit.txt', sep='\t')

# Preview the first five rows (head() returns 5 rows by default)
data.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [31]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fruit_label    59 non-null     int64  
 1   fruit_name     59 non-null     object 
 2   fruit_subtype  59 non-null     object 
 3   mass           59 non-null     int64  
 4   width          59 non-null     float64
 5   height         59 non-null     float64
 6   color_score    59 non-null     float64
dtypes: float64(3), int64(2), object(2)
memory usage: 3.4+ KB


In [32]:
# Count the null entries in every column and report them under a heading
missing_per_column = data.isna().sum()
print("Number of Missing Data:", missing_per_column, sep="\n")

Number of Missing Data:
fruit_label      0
fruit_name       0
fruit_subtype    0
mass             0
width            0
height           0
color_score      0
dtype: int64


In [33]:
# Display the row index of the loaded frame
data.index

RangeIndex(start=0, stop=59, step=1)

#  Model Building and Training

In [34]:
# Transform Categorical Variables
# Each column gets its own encoder so both string<->integer mappings remain
# available for inverse_transform. Refitting one shared encoder on
# fruit_subtype would discard the fruit_name classes.
# Note: neither encoded column is used as a feature or as the target below.
fruit_name_encoder = LabelEncoder()
fruit_subtype_encoder = LabelEncoder()
data['fruit_name'] = fruit_name_encoder.fit_transform(data['fruit_name'])
data['fruit_subtype'] = fruit_subtype_encoder.fit_transform(data['fruit_subtype'])
# Keep the historical name bound to the last-fitted (fruit_subtype) encoder.
label_encoder = fruit_subtype_encoder

# Tunable settings for the split, the forest and the cross-validation
FEATURE_COLUMNS = ['mass', 'width', 'height', 'color_score']
TEST_SIZE = 0.2
RANDOM_STATE = 42  # shared seed so the split and the forest are reproducible
N_ESTIMATORS = 100
CV_FOLDS = 5

# Specify Features and Labels
# Features are the four numeric measurements; the target is the integer fruit label.
X = data[FEATURE_COLUMNS]
y = data['fruit_label']

# Split the Dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Create and Train the Random Forest Model
random_forest_model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE
)
random_forest_model.fit(X_train, y_train)

# Evaluate the Model on the Test Set
# The hold-out set is only 20% of a small dataset, so treat this single number
# with caution and prefer the cross-validated estimate below.
y_pred_rf = random_forest_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf}")

# Evaluate the Model with Cross-Validation
# cross_val_score fits fresh clones of the estimator on each fold of the full
# data, so the fit on X_train above does not leak into these scores.
cross_val_scores = cross_val_score(random_forest_model, X, y, cv=CV_FOLDS)
print("Cross-Validation Scores:", cross_val_scores)
print("Mean Cross-Validation Score:", cross_val_scores.mean())


Random Forest Model Accuracy: 1.0
Cross-Validation Scores: [1.         0.91666667 0.91666667 0.91666667 1.        ]
Mean Cross-Validation Score: 0.95
