In [1]:
# Performance metrics: Check the various performance metrics of an algorithm for different parameters(k-fold &split ratio)

# The performance metrics are chosen based on the kind of algorithms that are being used, they are different for algorithms
# Classification algorithms: where the label consists of categorical data (nominal, ordinal)
#            1. Logistic regression (supervised)
#            2. Naive-Bayes Classifier (")
#            3. Nearest Neighbors (")
#            4. Decision tree (")
#            5. Random Forest (")
#            6. Neural Network (")
#            7. SVM - SVC (")
# Regression algorithms: where the label is continous and numeric 
#            1. Linear regression (supervised)
#                  - LASSO regression
#                  - Polynomial Regression
#            2. Naive-Bayes Regressor (")
#            3. Nearest Neighbors (")
#            4. Decision tree Regressor (")
#            5. Random Forest (")

In [2]:
# Classification Algorithm Performance metrics:
# 1. Classification Accuracy - Calculates ratio of correct predictions to all predictions
#                            - Should only be used when there are equal samples of data
# 2. Classification Report
# 3. Confusion Matrix
# 4. Area under ROC curve
# 5. Logarithmic Loss

In [104]:
# Classification Accuracy:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import sklearn
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
df = pd.read_csv('C:\\Users\\vardh\\Vardhan\\ED\\breast_cancer_data.csv')
df = df.drop(columns=df.iloc[:,[0,-1]])
X = df.iloc[:,1::]
Y = df.iloc[:,0]
kf = KFold(n_splits=10,shuffle=True,random_state=7)
model = SVC(kernel='linear')
model1= LogisticRegression()
accuracy = cross_val_score(model,X,Y,cv=kf,scoring='accuracy')
accuracy1 = cross_val_score(model1,X,Y,cv=kf,scoring='neg_log_loss')
accuracy2 = cross_val_score(model1,X,Y,cv=kf,scoring='roc_auc')
models = [accuracy,accuracy1,accuracy2]
print('accuracy: ' , accuracy.mean()*100,'Std deviation: ', accuracy.std()*100)
print('accuracy: ' , accuracy1.mean()*100,'Std deviation: ', accuracy1.std()*100)

accuracy:  94.73370927318297 Std deviation:  3.1348505402369167
accuracy:  -11.643444192931762 Std deviation:  3.690869303269288


In [15]:
for i in models:
    print('accuracy: ' , i.mean()*100,'Std deviation: ', i.std()*100)

accuracy:  94.73370927318297 Std deviation:  3.1348505402369167
accuracy:  -11.643444192931762 Std deviation:  3.690869303269288
accuracy:  99.08338005200189 Std deviation:  0.71293166595404


In [27]:
# Classification Accuracy and Confusion matrix
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20,random_state=7)
fit1 = model.fit(X_train,Y_train)
predict = fit1.predict(X_test)
confusion = confusion_matrix(Y_test,predict)
class_accuracy = classification_report(Y_test,predict)
print(confusion,class_accuracy)

[[74  0]
 [ 6 34]]               precision    recall  f1-score   support

           B       0.93      1.00      0.96        74
           M       1.00      0.85      0.92        40

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



In [103]:
# Regression Metrics

# 1. Mean Squared Error
# 2. Mean Absolute Error
# 3. R^2 (Goodness of Fit)
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
df2 = pd.read_csv('C:\\Users\\vardh\\Vardhan\\ED\\housing.csv')
df2 = df2.dropna()
dummy = pd.get_dummies(df2['ocean_proximity'])
df2 = pd.concat((df2,dummy),axis=1)
df2 = df2.drop(columns='ocean_proximity')
X = df2.iloc[:,df2.columns != 'median_house_value']
Y = df2.iloc[:,8]
kf= KFold(n_splits=10,shuffle=True,random_state=7)
model_preprocess = Normalizer()
fit1 = model_preprocess.fit_transform(X)
model = KNeighborsRegressor()
fit2 = model.fit(X,Y)
new_data = [['-120','35','20','7000','1000','2100','1100','9.7','0','0','0','1','0']]
new_data = model_preprocess.fit_transform(new_data)
prediction = fit2.predict(new_data)
metric1 = 'neg_mean_squared_error'
metric2 = 'neg_mean_absolute_error'
metric3 = 'r2'
metrics = [metric1,metric2,metric3]
for i in metrics:
    print(cross_val_score(model,X,Y,cv=kf,scoring=i))

[-9.89291514e+09 -9.62235918e+09 -1.01635661e+10 -1.00690823e+10
 -1.01219079e+10 -1.02076960e+10 -9.98472428e+09 -9.33428217e+09
 -1.00708823e+10 -9.45122026e+09]
[-76917.89794521 -76104.61242661 -78189.56829746 -77816.93470387
 -77556.42721488 -77820.52207538 -77090.31678904 -74142.73666177
 -77420.04052863 -74835.11923642]
[0.30666989 0.25961662 0.23772368 0.22090808 0.23815569 0.27000195
 0.27069482 0.28448713 0.27556317 0.19195744]
