In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import plotly.express as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
# Prepare plotly for rendering inside the notebook and switch cufflinks to offline mode.
# NOTE(review): `connected=True` presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm this is acceptable for offline viewing.
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
import warnings
# Silences every warning for the rest of the session: no category or module
# filter is given, so library warnings raised by later cells are hidden as well.
# NOTE(review): consider narrowing this to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
# from sklearn.preprocessing import StandardScaler

In [5]:
# Read the glass dataset from the current working directory and preview the
# first five rows.
df = pd.read_csv('glass.csv')
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [6]:
# (rows, columns) of the frame as loaded.
df.shape

(214, 10)

In [7]:
# Number of rows that exactly repeat an earlier row (vectorised count,
# converted to a plain int for display).
int(df.duplicated().sum())

1

In [8]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Rebinding `df` instead of using `inplace=True` avoids mutating the frame
# object in place. The original index labels are kept, so the index is no
# longer contiguous after this step.
df = df.drop_duplicates()

In [9]:
# (rows, columns) again, to compare against the shape before de-duplication.
df.shape

(213, 10)

In [10]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213 entries, 0 to 213
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      213 non-null    float64
 1   Na      213 non-null    float64
 2   Mg      213 non-null    float64
 3   Al      213 non-null    float64
 4   Si      213 non-null    float64
 5   K       213 non-null    float64
 6   Ca      213 non-null    float64
 7   Ba      213 non-null    float64
 8   Fe      213 non-null    float64
 9   Type    213 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 18.3 KB


In [11]:
# Missing-value count per column.
df.isna().sum()

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0
mean,1.518348,13.404085,2.679202,1.449484,72.65507,0.498873,8.954085,0.175869,0.057277,2.788732
std,0.003033,0.816662,1.443691,0.495925,0.773998,0.653185,1.425882,0.498245,0.097589,2.10513
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.51652,12.9,2.09,1.19,72.28,0.13,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.56,8.6,0.0,0.0,2.0
75%,1.51915,13.81,3.6,1.63,73.09,0.61,9.15,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [13]:
# Rows per glass type as a two-column frame (Type, Count), ordered by Type.
count = (
    df.groupby('Type')
      .size()
      .reset_index(name='Count')
)
count

Unnamed: 0,Type,Count
0,1,69
1,2,76
2,3,17
3,5,13
4,6,9
5,7,29


In [14]:
# Bar chart of the per-type row counts held in `count`
# (`py` is this notebook's alias for plotly.express).
bar = py.bar(count, x='Type', y='Count', title='Glass Type')
bar.show()

In [15]:
# Scatter of the Type column.
# NOTE(review): given a bare Series, plotly express presumably uses the row
# index for x and the values for y — confirm, and consider adding a title.
plot = py.scatter(df['Type'])
plot.show()

#### Build the feature matrix (x) and the target (y)

In [16]:
# Features: every column except the target. Selecting by name rather than the
# positional slice 0:9 keeps `x` correct if columns are added or reordered.
x = df.drop(columns='Type')
# Target, kept as a single-column DataFrame (double brackets) so its shape and
# display are unchanged for the cells that follow.
y = df[['Type']]

In [17]:
# Peek at the first feature row to confirm the target column is excluded.
x.head(1)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0


In [18]:
# Peek at the first target row.
y.head(1)

Unnamed: 0,Type
0,1


#### Splitting Train & Test data

In [19]:
# Hold out a test set; `random_state` pins the shuffle so the split is reproducible.
# NOTE(review): an integer `test_size` is an absolute sample count (20 rows),
# not a percentage — confirm that 20 rows rather than 20% (0.2) was intended.
# NOTE(review): the split is not stratified; the classification report further
# down lists only classes 1, 2, 5 and 7 in the test set, so types 3 and 6 are
# never evaluated. Consider `stratify=y`.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=20, random_state=4)

#### Check accuracy for each value of k

In [20]:
def getNeighborsData(x_train, y_train, x_test, y_test, k_values=range(1, 8)):
    """Score a KNN classifier on the test split for each candidate k.

    Parameters
    ----------
    x_train, y_train
        Training features and labels handed to ``KNeighborsClassifier.fit``.
    x_test, y_test
        Held-out features and labels used to compute accuracy.
    k_values : iterable of int, optional
        Neighbour counts to evaluate. Defaults to ``range(1, 8)`` (k = 1..7),
        which is what this function always used before the parameter existed.

    Returns
    -------
    list of dict
        One ``{'n_neighbors': k, 'accuracy': acc}`` entry per k, in the order
        the values were supplied.
    """
    # A single-column 2-D target (e.g. ``df[['Type']]``) makes scikit-learn
    # emit a DataConversionWarning on every fit. Flatten it once, up front;
    # any other target shape is passed through untouched.
    y_fit = np.asarray(y_train)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()
    else:
        y_fit = y_train

    results = []
    for k in k_values:
        model = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_fit)
        acc = accuracy_score(y_test, model.predict(x_test))
        results.append({'n_neighbors': k, 'accuracy': acc})

    return results

In [21]:
# Test-split accuracy for each k in the sweep (k = 1..7).
getNeighborsData(x_train, y_train, x_test, y_test)

[{'n_neighbors': 1, 'accuracy': 0.85},
 {'n_neighbors': 2, 'accuracy': 0.9},
 {'n_neighbors': 3, 'accuracy': 0.85},
 {'n_neighbors': 4, 'accuracy': 0.85},
 {'n_neighbors': 5, 'accuracy': 0.8},
 {'n_neighbors': 6, 'accuracy': 0.75},
 {'n_neighbors': 7, 'accuracy': 0.8}]

#### k = 2 gives the highest accuracy (0.90)

In [22]:
# Refit with k = 2, the best-scoring value in the sweep output above.
# NOTE(review): k was chosen on the same test split it is scored on here, so
# this accuracy is optimistic — consider selecting k by cross-validation on
# the training data only.
model = KNeighborsClassifier(n_neighbors=2)
# `y_train` is a single-column frame; flatten it to the 1-D shape the
# classifier expects so the fit does not trigger a DataConversionWarning.
model.fit(x_train, np.ravel(y_train))

pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
acc

0.9

In [23]:
# Per-class precision, recall and F1 for the test-set predictions.
# The report is a multi-line string, so it is printed rather than left as the
# cell's last expression (which would show the raw repr with escaped newlines).
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           1       0.75      1.00      0.86         6
           2       1.00      0.75      0.86         8
           5       1.00      1.00      1.00         2
           7       1.00      1.00      1.00         4

    accuracy                           0.90        20
   macro avg       0.94      0.94      0.93        20
weighted avg       0.93      0.90      0.90        20

