In [1]:
# pre-requisites
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# load the data
df = pd.read_csv('glass.csv')
df.head(3)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float64
 7   Ba      214 non-null    float64
 8   Fe      214 non-null    float64
 9   Type    214 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 16.8 KB


In [6]:
df['Type'].value_counts()

Type
2    76
1    70
7    29
3    17
5    13
6     9
Name: count, dtype: int64

In [7]:
df['Type'].unique()

array([1, 2, 3, 5, 6, 7])

In [5]:
df.corr()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
RI,1.0,-0.191885,-0.122274,-0.407326,-0.542052,-0.289833,0.810403,-0.000386,0.14301,-0.164237
Na,-0.191885,1.0,-0.273732,0.156794,-0.069809,-0.266087,-0.275442,0.326603,-0.241346,0.502898
Mg,-0.122274,-0.273732,1.0,-0.481799,-0.165927,0.005396,-0.44375,-0.492262,0.08306,-0.744993
Al,-0.407326,0.156794,-0.481799,1.0,-0.005524,0.325958,-0.259592,0.479404,-0.074402,0.598829
Si,-0.542052,-0.069809,-0.165927,-0.005524,1.0,-0.193331,-0.208732,-0.102151,-0.094201,0.151565
K,-0.289833,-0.266087,0.005396,0.325958,-0.193331,1.0,-0.317836,-0.042618,-0.007719,-0.010054
Ca,0.810403,-0.275442,-0.44375,-0.259592,-0.208732,-0.317836,1.0,-0.112841,0.124968,0.000952
Ba,-0.000386,0.326603,-0.492262,0.479404,-0.102151,-0.042618,-0.112841,1.0,-0.058692,0.575161
Fe,0.14301,-0.241346,0.08306,-0.074402,-0.094201,-0.007719,0.124968,-0.058692,1.0,-0.188278
Type,-0.164237,0.502898,-0.744993,0.598829,0.151565,-0.010054,0.000952,0.575161,-0.188278,1.0


#### Data cleansing
- as there is no NA record, no need to handle missing values
- as there is not categorical feature, no need of transformation
- as we are going to use RandomForest, we will use all the features except 'Type'

#### split the data 

In [25]:
# create x
x= df.drop('Type',axis=1)

# create y
y=df['Type']

In [26]:
from sklearn.model_selection import train_test_split

# split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=12345)


In [27]:
from sklearn.ensemble import RandomForestClassifier

# create model
model = RandomForestClassifier(n_estimators=100)

# train model
model.fit(x_train,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
y_pred = model.predict(x_test)

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(f"accuracy = {accuracy_score(y_test, y_pred) * 100: .2f}%")
print(f"precision = {precision_score(y_test, y_pred, average='macro') * 100: .2f}%")
print(f"recall = {recall_score(y_test, y_pred, average='macro') * 100: .2f}%")
print(f"f1 = {f1_score(y_test, y_pred, average='macro') * 100: .2f}%")

accuracy =  72.31%
precision =  81.25%
recall =  71.23%
f1 =  70.58%


In [30]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.71      0.81      0.76        21
           2       0.67      0.76      0.71        21
           3       1.00      0.29      0.44         7
           5       1.00      0.75      0.86         4
           6       0.50      1.00      0.67         3
           7       1.00      0.67      0.80         9

    accuracy                           0.72        65
   macro avg       0.81      0.71      0.71        65
weighted avg       0.78      0.72      0.72        65



#### build decision tree

In [32]:
from sklearn.tree import DecisionTreeClassifier

# create a model
model = DecisionTreeClassifier(max_depth=30)

# train the model
model.fit(x_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,30
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [34]:
y_pred = model.predict(x_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.67      0.67      0.67        21
           2       0.65      0.62      0.63        21
           3       0.50      0.57      0.53         7
           5       0.60      0.75      0.67         4
           6       0.50      1.00      0.67         3
           7       1.00      0.56      0.71         9

    accuracy                           0.65        65
   macro avg       0.65      0.69      0.65        65
weighted avg       0.68      0.65      0.65        65



In [None]:
# you will built a model using decision tree then compare the accuracy
# visualize the output

# build the models using hearing_test.csv-
# 1. decision tree
# 2. random forest
# (compare the accuracy and visualize)