In [1]:
import warnings

import numpy as np

import pandas as pd

import sweetviz as sv

import seaborn as sns

import matplotlib.pyplot as plt

from xgboost import XGBClassifier

warnings.filterwarnings ( 'ignore' )

from sklearn.impute import SimpleImputer

from imblearn.over_sampling import SMOTE

pd.set_option ( 'display.max_columns' , None )

from sklearn.tree import DecisionTreeClassifier 

from sklearn.linear_model import LogisticRegression , Ridge , Lasso 

from sklearn.model_selection import train_test_split , GridSearchCV , KFold

from sklearn.preprocessing import LabelEncoder , MinMaxScaler , Normalizer , StandardScaler

from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier , StackingClassifier 

from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , confusion_matrix , roc_curve , roc_auc_sc

ModuleNotFoundError: No module named 'sweetviz'

In [None]:
# Column labels passed to read_csv via `names=`, in file order.
glass_columns = [
    'ID', 'Refractive_Index', 'Sodium_%', 'Magnesium_%', 'Aluminium_%',
    'Silicon_%', 'Potassium_%', 'Calcium_%', 'Barium_%', 'Iron_%',
    'Type_Of_Glass',
]

# Load the raw data set; every later cell derives its data from DF.
DF = pd.read_csv("glass.data", names=glass_columns)

DF

In [None]:
# Summary statistics with one row per column of DF (describe() output, transposed).
Description = DF.describe().T

In [None]:
# Line plot of refractive index against glass type, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 10))

sns.lineplot(
    data=DF,
    x='Type_Of_Glass',
    y='Refractive_Index',
    color='#7E4E60',
    marker='*',
    ax=ax,
)

ax.set_xlabel("Type of Glass")
ax.set_ylabel("Refractive Index")
ax.set_title("Comparing Types of Glass and Refractive Index");

In [None]:
# Per-glass-type column means, transposed so each row is a column of DF
# and each column is one glass type.
mean_by_type = DF.groupby(by=DF["Type_Of_Glass"]).mean()
Chemicals = mean_by_type.reset_index().T

In [None]:
# Grouped bar chart of the mean chemical composition per glass type.
#
# DataFrame.plot opens its own figure unless it is handed an Axes, so the
# figure is created here and passed in explicitly; otherwise the sized
# figure stays empty and the chart is drawn on a second, default-sized one.
fig, ax = plt.subplots(figsize=(10, 10))

# Human-readable legend names, one per column of `Chemicals`.
# NOTE(review): assumes the grouped glass types appear in exactly this
# order and that there are six of them -- confirm against the data.
glass_type_names = [
    'building_windows_float_processed',
    'building_windows_non_float_processed',
    'vehicle_windows_float_processed',
    'containers',
    'tableware',
    'headlamps',
]

# Rows from position 3 onwards are plotted (same positional slice as before).
Chemicals.iloc[3:, :].plot(kind='bar', ax=ax)

# A list passed as `label=` to DataFrame.plot is not a supported way to name
# the series of a multi-column frame; set the legend entries on the Axes.
ax.legend(glass_type_names)

ax.set_title("The chemical composition in each glass")
ax.set_xlabel("Chemical Percentage")
ax.set_ylabel("Percentage");

In [None]:
# Annotated correlation heatmap of the columns from position 2 onwards.
# NOTE(review): the positional slice skips the first two columns of DF
# (ID and Refractive_Index at this point) -- confirm that is intended.
plt.figure(figsize=(15, 15))

# Correlation coefficients lie in [-1, 1]; the previous '.0f' format rounded
# every annotation to -1, 0 or 1, so two decimals are shown instead.
sns.heatmap(data=DF.iloc[:, 2:].corr(), annot=True, fmt='.2f', cmap='BuPu');

In [None]:
DF.iloc [ : , 2 : ].plot ( kind = 'box' , subplots = True , layout = ( 3 , 3 ) , sharex = False , sharey = False , figsize = (

In [None]:
# Working frame for outlier treatment: columns from position 2 up to (but not
# including) the last column of DF.
# An explicit copy is taken because this frame is modified in a later cell;
# writing into a bare slice risks SettingWithCopy behaviour and, depending on
# the pandas version, could write through to DF itself.
Outliers = DF.iloc[:, 2:-1].copy()

In [None]:
def _replace_tails_with_mean(column, lower_q=0.10, upper_q=0.90):
    """Return `column` with values outside the quantile band set to its mean.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to treat.
    lower_q, upper_q : float
        Quantiles that delimit the band of values kept unchanged.

    Returns
    -------
    pandas.Series
        A new Series; the input is not modified.
    """
    # All statistics are taken from the untouched column so that the upper
    # bound and the mean are not skewed by the lower-tail replacement.
    lower_bound = column.quantile(lower_q)
    upper_bound = column.quantile(upper_q)
    outside_band = (column < lower_bound) | (column > upper_bound)
    return column.mask(outside_band, column.mean())


# The previous `Outliers[row_mask] = value` form overwrote EVERY column of the
# selected rows with one column's mean; the replacement is now applied
# column by column, and a new frame is produced instead of mutating a slice.
Outliers = Outliers.apply(_replace_tails_with_mean)

In [None]:
# One box plot per column of the treated frame, on a 3x3 grid with independent axes.
Outliers.plot.box(
    subplots=True,
    layout=(3, 3),
    sharex=False,
    sharey=False,
    figsize=(15, 15),
)

In [None]:
# Drop the identifier column. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell re-runnable: the in-place version
# raised KeyError on a second execution because 'ID' was already gone.
DF = DF.drop(columns=['ID'], errors='ignore')

# Hold out 15% of the rows for testing. The split is stratified on the target
# so that every glass type keeps roughly the same share in both partitions,
# which matters for a small multiclass data set.
Train, Test = train_test_split(
    DF,
    test_size=0.15,
    random_state=42,
    stratify=DF['Type_Of_Glass'],
)

# Features are every column except the target; the target is Type_Of_Glass
# (the last column of DF).
Train_X = Train.drop(columns=['Type_Of_Glass'])

Train_Y = Train['Type_Of_Glass']

Test_X = Test.drop(columns=['Type_Of_Glass'])

Test_Y = Test['Type_Of_Glass']

print ( "The shapes of the data sets are : " )

print ( "\nTrain >> {} , Test >> {}\n\nTrain_X >> {} , Train_Y >> {}\n\nTest_X >> {} , Test_Y >> {}".format ( Train.shape 

In [None]:
# Baseline: logistic regression with default settings, scored on the test split.
LR = LogisticRegression().fit(Train_X, Train_Y)

Y_LR = LR.predict(Test_X)

# Accuracy expressed as a percentage with two decimals.
Accuracy_LR = np.round(100 * accuracy_score(Test_Y, Y_LR), 2)

print("The accuracy of the Test Data is : ", Accuracy_LR)

In [None]:
# Confusion matrix of the logistic-regression test predictions, shown as a heatmap.
Confusion_Matrix_LR = confusion_matrix(Test_Y, Y_LR)

lr_cm_axes = sns.heatmap(Confusion_Matrix_LR, annot=True, fmt='.0f', cmap='BuPu')

lr_cm_axes.set_xlabel("Predicted")
lr_cm_axes.set_ylabel("Actual")
lr_cm_axes.set_title("Confusion Matrix")

In [None]:
# Compare logistic-regression accuracy on the training and the test split.
train_predictions, test_predictions = LR.predict(Train_X), LR.predict(Test_X)

# Both accuracies are percentages rounded to two decimals.
train_accuracy = np.round(100 * accuracy_score(Train_Y, train_predictions), 2)
test_accuracy = np.round(100 * accuracy_score(Test_Y, test_predictions), 2)

accuracy_report = "Train Accuracy >> {}\n\nTest Accuracy >> {}"
print(accuracy_report.format(train_accuracy, test_accuracy))

In [None]:
# Hyper-parameter search for the random forest with 5-fold cross-validation.
# The forest is seeded (same seed as the train/test split) so that the search
# result is reproducible from a fresh kernel; unseeded, the selected
# parameters could change from run to run.
RF = RandomForestClassifier(random_state=42)

# Candidate values; every combination is evaluated.
Parameters = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10, 15],
}

GS = GridSearchCV(estimator=RF, param_grid=Parameters, cv=5)

GS.fit(Train_X, Train_Y)

# `Parameters` is rebound from the search grid to the winning combination;
# the following cell unpacks it into a new RandomForestClassifier.
Parameters = GS.best_params_

print("The best parameters are : \n\n", Parameters)

In [None]:
# Refit a random forest with the selected hyper-parameters and score it on the
# test split. The model is seeded so the reported accuracy (and the confusion
# matrix built from Y_RF) is the same on every run; `Parameters` holds only
# the grid keys, so it cannot collide with random_state.
RF = RandomForestClassifier(random_state=42, **Parameters)

RF.fit(Train_X, Train_Y)

Y_RF = RF.predict(Test_X)

# Accuracy as a percentage rounded to two decimals.
Accuracy_RF = np.round(accuracy_score(Test_Y, Y_RF) * 100, 2)

print("The accuracy of the Test Data is : ", Accuracy_RF)

In [None]:
# Heatmap of the random-forest confusion matrix on the test split.
# Confusion_Matrix_RF ends up bound to the Axes returned by seaborn, not to
# the matrix itself.
Confusion_Matrix_RF = sns.heatmap(
    confusion_matrix(Test_Y, Y_RF),
    fmt='0.0f',
    annot=True,
    cmap='RdGy',
)

Confusion_Matrix_RF

In [None]:
# Compare random-forest accuracy on the training and the test split.
train_predictions, test_predictions = RF.predict(Train_X), RF.predict(Test_X)

# Both accuracies are percentages rounded to two decimals.
train_accuracy = np.round(100 * accuracy_score(Train_Y, train_predictions), 2)
test_accuracy = np.round(100 * accuracy_score(Test_Y, test_predictions), 2)

accuracy_report = "Train Accuracy >> {}\n\nTest Accuracy >> {}"
print(accuracy_report.format(train_accuracy, test_accuracy))

In [None]:
# Hyper-parameter search for the decision tree with 5-fold cross-validation.
# The tree is seeded because the 'random' splitter and feature sub-sampling
# make an unseeded search non-reproducible.
DT = DecisionTreeClassifier(random_state=42)

# Candidate values; every combination is evaluated.
# 'auto' was dropped from max_features: for DecisionTreeClassifier it was a
# deprecated alias of 'sqrt', and newer scikit-learn releases reject it, which
# makes those candidates fail (silently here, since warnings are filtered).
Parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15],
    'max_features': ["sqrt", "log2"],
}

GS = GridSearchCV(estimator=DT, param_grid=Parameters, cv=5)

GS.fit(Train_X, Train_Y)

# `Parameters` is rebound from the search grid to the winning combination;
# the following cell unpacks it into a new DecisionTreeClassifier.
Parameters = GS.best_params_

print("The best parameters of decision tree are : \n\n", Parameters)

In [None]:
# Refit a decision tree with the selected hyper-parameters and score it on the
# test split. The tree is seeded so the reported accuracy does not change
# between runs; `Parameters` holds only the grid keys, so it cannot collide
# with random_state.
DT = DecisionTreeClassifier(random_state=42, **Parameters)

DT.fit(Train_X, Train_Y)

Y_DT = DT.predict(Test_X)

# Accuracy as a percentage rounded to two decimals.
Accuracy_DT = np.round(accuracy_score(Test_Y, Y_DT) * 100, 2)

print("The accuracy of the test data is : ", Accuracy_DT)

# Heatmap of the test confusion matrix; the name is bound to the seaborn Axes.
Confusion_Matrix_DT = sns.heatmap(confusion_matrix(Test_Y, Y_DT), fmt='0.0f', annot=True, cmap='PuBu')

Confusion_Matrix_DT

In [None]:
# Overfitting heuristic for the decision tree: flag the model when training
# accuracy exceeds test accuracy by more than 5 percentage points.
Y_DT_Train = DT.predict(Train_X)

Train_Accuracy = np.round(accuracy_score(Train_Y, Y_DT_Train) * 100, 2)

# The gap is compared as a float. Wrapping it in int() truncated it first, so
# a gap such as 5.9 points became 5 and was reported as "no overfitting".
if Train_Accuracy - Accuracy_DT > 5.0:

    print("There is overfitting")

else:

    print("There is no overfitting\n")

print("\nThe Training accuracy is : {} \n\nThe Testing accuracy is : {}".format(Train_Accuracy, Accuracy_DT))

In [None]:
# Hyper-parameter search for gradient boosting with 5-fold cross-validation.
GB = GradientBoostingClassifier()

# Candidate values; every combination is evaluated.
# Only 'log_loss' is kept for `loss`:
#   * 'deviance' is the deprecated former name of 'log_loss', so it either
#     duplicates that candidate or is rejected by newer scikit-learn;
#   * 'exponential' is only supported for binary targets, and this target
#     appears to be multiclass (six glass types are named in this notebook).
# Failing candidates only produce warnings, which this notebook filters out,
# so they were wasting search time without any visible sign.
Parameters = {
    'loss': ['log_loss'],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
}

# cv=5 is the GridSearchCV default; spelled out to match the other searches.
GS = GridSearchCV(estimator=GB, param_grid=Parameters, cv=5)

GS.fit(Train_X, Train_Y)

# `Parameters` is rebound from the search grid to the winning combination;
# the following cell unpacks it into a new GradientBoostingClassifier.
Parameters = GS.best_params_

print("The the most optimum parameters are : \n\n", Parameters)

In [None]:
# Refit gradient boosting with the selected hyper-parameters and score it on
# the test split.
GB = GradientBoostingClassifier(**Parameters).fit(Train_X, Train_Y)

Y_GB = GB.predict(Test_X)

# Accuracy as a percentage rounded to two decimals.
Accuracy_GB = np.round(100 * accuracy_score(Test_Y, Y_GB), 2)

print("Accuracy of gradient boosting is : ", Accuracy_GB)

# Heatmap of the test confusion matrix; the name is bound to the seaborn Axes.
Confusion_Matrix_GB = sns.heatmap(
    confusion_matrix(Test_Y, Y_GB),
    annot=True,
    fmt='0.0f',
    cmap='RdGy_r',
)

Confusion_Matrix_GB

In [None]:
# Overfitting heuristic for gradient boosting: flag the model when training
# accuracy exceeds test accuracy by more than 5 percentage points.
Y_GB_Train = GB.predict(Train_X)

Train_Accuracy = np.round(accuracy_score(Train_Y, Y_GB_Train) * 100, 2)

# The gap is compared as a float. Wrapping it in int() truncated it first, so
# a gap such as 5.9 points became 5 and was reported as "no overfitting".
if Train_Accuracy - Accuracy_GB > 5.0:

    print("There is overfitting")

else:

    print("There is no overfitting\n")

print("\nThe Training accuracy is : {} \n\nThe Testing accuracy is : {}".format(Train_Accuracy, Accuracy_GB))
There is overfitting

In [None]:
# Hyper-parameter search for XGBoost (GridSearchCV default cross-validation).
XGB = XGBClassifier()

# Candidate values; every combination is evaluated.
Parameters = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.5, 0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
}

# Integer-encode the target for XGBoost.
# NOTE(review): presumably needed because XGBClassifier expects consecutive
# labels starting at 0 -- confirm for the installed version.
LE = LabelEncoder()

Train_Y = LE.fit_transform(Train_Y)

# The test labels must go through the mapping learned on the training labels.
# Calling fit_transform here refitted the encoder on the test split, so a
# class missing from the test rows shifted every later code and the test
# labels no longer matched what the model was trained on.
Test_Y = LE.transform(Test_Y)

# The estimator created above is used for the search (GridSearchCV clones it).
GS = GridSearchCV(estimator=XGB, param_grid=Parameters)

GS.fit(Train_X, Train_Y)

# `Parameters` is rebound from the search grid to the winning combination;
# the following cell unpacks it into a new XGBClassifier.
Parameters = GS.best_params_

print("The the most optimum parameters are : \n\n", Parameters)

In [None]:
# Refit XGBoost with the selected hyper-parameters and score it on the test split.
XGB = XGBClassifier(**Parameters)
XGB.fit(Train_X, Train_Y)

Y_XGB = XGB.predict(Test_X)

# Accuracy as a percentage rounded to two decimals.
Accuracy_XGB = np.round(100 * accuracy_score(Test_Y, Y_XGB), 2)

print("Accuracy of extreme gradient boosting is : ", Accuracy_XGB)

# Heatmap of the test confusion matrix; the name is bound to the seaborn Axes.
Confusion_Matrix_XGB = sns.heatmap(
    confusion_matrix(Test_Y, Y_XGB),
    annot=True,
    fmt='0.0f',
    cmap='twilight',
)

Confusion_Matrix_XGB

In [None]:
# Overfitting heuristic for XGBoost: flag the model when training accuracy
# exceeds test accuracy by more than 5 percentage points.
Y_XGB_Train = XGB.predict(Train_X)

Train_Accuracy = np.round(accuracy_score(Train_Y, Y_XGB_Train) * 100, 2)

# The gap is compared as a float. Wrapping it in int() truncated it first, so
# a gap such as 5.9 points became 5 and was reported as "no overfitting".
if Train_Accuracy - Accuracy_XGB > 5.0:

    print("There is overfitting")

else:

    print("There is no overfitting\n")

print("\nThe Training accuracy is : {} \n\nThe Testing accuracy is : {}".format(Train_Accuracy, Accuracy_XGB))

In [None]:
# Summary table of the test accuracies of all five models.
Accuracies_Names = ['Accuracy_DT', 'Accuracy_LR', 'Accuracy_GB', 'Accuracy_RF', 'Accuracy_XGB']

# Same order as the names above.
Accuracies = [Accuracy_DT, Accuracy_LR, Accuracy_GB, Accuracy_RF, Accuracy_XGB]

# NOTE(review): the column headed 'Values' holds the names and the column
# headed 'Accuracies' holds the numbers -- confirm the headers are as intended.
Performance_Metrics = pd.DataFrame({'Accuracies': Accuracies, 'Values': Accuracies_Names})

Performance_Metrics