<a href="https://colab.research.google.com/github/vdnew/Performance-Metric-without-any-library/blob/main/5_Performance_metrics_Instructions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Compute performance metrics for the given Y and Y_score without sklearn

In [None]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data the number of positive points >> the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
# Load dataset A from the hard-coded /content path; later cells read its
# 'y' (true label) and 'proba' (score) columns.
data_a = pd.read_csv("/content/5_a.csv")
# Last expression of the cell: shows (number of rows, number of columns).
data_a.shape

In [None]:
# predict definition
def predict(data, y, thresh_hold):
  '''
  Derive binary class labels from a column of scores.

  Argument:
  data = DataFrame holding the scores
  y = name of the score column in `data` (e.g. 'proba'), not the label column
  thresh_hold = cut-off value; a score strictly below it becomes class 0

  Output:
  list of ints in row order: 0 if score < thresh_hold else 1
  '''
  # Vectorised comparison instead of a Python-level loop. np.where keeps the
  # original rule exactly: everything that is not "< thresh_hold" maps to 1.
  return np.where(data[y] < thresh_hold, 0, 1).tolist()

# confusion matrix calculation method
def calculate_confusion(data, y, y_pred):
  '''
    Count the four confusion-matrix cells for binary (0/1) labels.

    Argument:
    data = DataFrame
    y = name of the column holding the true labels
    y_pred = name of the column holding the predicted labels

    AIM: calculate confusion matrix: TP, TN, FN, FP

    Output: dictionary with 'tn', 'tp', 'fn', 'fp' as keys and their counts
    (plain ints) as values. Rows whose label or prediction is neither 0 nor 1
    are not counted in any cell.
  '''
  # Use the column names that were passed in (the previous version ignored
  # them), and compare whole arrays positionally so the result does not
  # depend on the DataFrame's index labels.
  actual = data[y].to_numpy()
  predicted = data[y_pred].to_numpy()

  actual_pos, actual_neg = actual == 1, actual == 0
  pred_pos, pred_neg = predicted == 1, predicted == 0

  return {
    'tn': int(np.count_nonzero(pred_neg & actual_neg)),
    'tp': int(np.count_nonzero(pred_pos & actual_pos)),
    'fn': int(np.count_nonzero(pred_neg & actual_pos)),
    'fp': int(np.count_nonzero(pred_pos & actual_neg)),
  }

In [None]:
# Decision threshold: scores below it are labelled 0, all others 1.
thresh_hold = 0.5
# Derive class labels from the 'proba' scores and store them as the 'y_pred' column.
data_a['y_pred'] = predict(data_a, 'proba', thresh_hold)

# Count TP / TN / FN / FP for dataset A.
confusion_matrix = calculate_confusion(data_a, 'y', 'y_pred')

In [None]:
# Show the confusion-matrix counts (dict with keys 'tn', 'tp', 'fn', 'fp').
print("the confusion matrix is:", confusion_matrix)

In [None]:
# F1 score for dataset A.
# P = number of rows whose true label is 1 (all actual positives).
x = data_a.y.value_counts()
P = x[1]

# precision = TP / (TP + FP)
precision = confusion_matrix['tp']/(confusion_matrix['tp']+confusion_matrix['fp'])
# recall = TP / P
recall = confusion_matrix['tp']/P

# F1 = 2 * precision * recall / (precision + recall), the harmonic mean of the two
F1 = 2* precision * recall / (precision + recall)

# display result of F1 score
print("the F1 score is:", F1)

In [None]:
# Accuracy = (TP + TN) / total number of rows in dataset A.
Acc = (confusion_matrix['tp']+confusion_matrix['tn'])/data_a.shape[0]
print("The accuracy is:",Acc)

In [None]:
# AUC score calculation defination
from tqdm import tqdm_notebook 
def auc(data):
  '''
  Argument:
  data: DataFrame with a 'y' column (true labels, 0/1) and a 'proba'
        column (scores). Rows may be in any order; `data` is not modified.

  AIM: This method calculates the AUC (area under the ROC curve).
      Every distinct score is used as a threshold (a row is predicted 1
      when its score >= threshold), which gives one (fpr, tpr) point per
      threshold; the curve starts at the origin (0, 0).

  Output: Definite integral of tpr over fpr as approximated by the
      trapezoidal rule.

  Raises: ValueError when `data` has no row labelled 1 or no row labelled 0,
      because tpr or fpr would then be undefined.
  '''
  scores = data['proba'].to_numpy()
  labels = data['y'].to_numpy()

  # Sort by descending score here so the result does not depend on the
  # caller having sorted the frame beforehand.
  order = np.argsort(-scores, kind='stable')
  scores = scores[order]
  is_pos = labels[order] == 1
  is_neg = labels[order] == 0

  P = int(is_pos.sum())
  N = int(is_neg.sum())
  if P == 0 or N == 0:
    raise ValueError("auc needs at least one row with y == 1 and one with y == 0")

  # Running TP / FP counts after admitting the k highest-scoring rows. This
  # replaces re-predicting and re-counting the whole frame per threshold.
  cum_tp = np.cumsum(is_pos)
  cum_fp = np.cumsum(is_neg)

  # Rows sharing a score are admitted together by "score >= threshold", so
  # only the last row of each run of equal scores is a real ROC point.
  last_of_tie = np.append(np.flatnonzero(np.diff(scores)), len(scores) - 1)

  tpr = np.concatenate(([0.0], cum_tp[last_of_tie] / P))
  fpr = np.concatenate(([0.0], cum_fp[last_of_tie] / N))

  # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
  integrate = getattr(np, 'trapezoid', None) or np.trapz
  return integrate(tpr, fpr)

In [None]:
# Order the rows by descending 'proba' score and remove the 0.5-threshold
# predictions. drop() returns a new frame, so its result must be assigned for
# the 'y_pred' column to actually go away; errors='ignore' keeps this cell
# re-runnable once the column is already gone.
data_a = data_a.sort_values(by='proba', ascending=False).drop(columns=['y_pred'], errors='ignore')

# Preview the highest-scoring rows.
data_a.head()

In [None]:
# Area under the ROC curve for dataset A, computed by auc() from the
# 'y' and 'proba' columns.
AUC_score=auc(data_a)

# display auc score
print("The AUC Score is :",AUC_score)

<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data the number of positive points << the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
# Part B: same metrics as part A, on dataset B.

# Load dataset B from the hard-coded /content path; later cells read its
# 'y' (true label) and 'proba' (score) columns.
data_b = pd.read_csv("/content/5_b.csv")

# Preview the first rows as the cell output.
data_b.head()

In [None]:
# Decision threshold: scores below it are labelled 0, all others 1.
thresh_hold = 0.5

# Derive class labels from the 'proba' scores and store them as the 'y_pred' column.
data_b['y_pred'] = predict(data_b, 'proba', thresh_hold)

# Count TP / TN / FN / FP for dataset B.
confusion_matrix_b = calculate_confusion(data_b, 'y', 'y_pred')

In [None]:
# Show the confusion-matrix counts for dataset B (dict with keys 'tn', 'tp', 'fn', 'fp').
print("The confusion matrix is :",confusion_matrix_b)

In [None]:
# F1 score for dataset B.
# P = number of rows whose true label is 1 (all actual positives).
# Note: this rebinds the notebook-level names x and P used in the part A cell.
x = data_b.y.value_counts()
P = x[1]

# precision = TP / (TP + FP)
precision_b = confusion_matrix_b['tp']/(confusion_matrix_b['tp']+confusion_matrix_b['fp'])

# recall = TP / P
recall_b = confusion_matrix_b['tp'] / P

# F1 = 2 * precision * recall / (precision + recall)
f1_b = 2*precision_b*recall_b / (precision_b + recall_b)

print("The F1 Score is :", f1_b)

In [None]:
# Accuracy = (TP + TN) / total number of rows in dataset B.
# The sum must be parenthesised as a whole: without that, only TN was divided
# by the row count and the raw TP count was added on top.
Acc_b = (confusion_matrix_b['tp']+confusion_matrix_b['tn'])/data_b.shape[0]
print("The accuracy is :", Acc_b)

In [None]:
# AUC score calculation for dataset B.
# Order the rows by descending 'proba' score before handing them to auc().
data_b = data_b.sort_values(by='proba', ascending=False)

# Compute and report the AUC. This cell previously re-printed the accuracy
# and never called auc().
AUC_score_b = auc(data_b)
print("The AUC Score is :", AUC_score_b)

<font color='red'><b>C.</b></font> Compute the best probability threshold (searched over candidate thresholds, as in the ROC curve computation) that gives the lowest value of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [None]:
 # Part C: find the threshold that minimises metric A on dataset C.

 # Load dataset C from the hard-coded /content path; later cells read its
 # 'y' (true label) and 'prob' (score) columns.
 data_c = pd.read_csv("/content/5_c.csv")
 # Last expression of the cell: shows (number of rows, number of columns).
 data_c.shape

In [None]:
# min metric calculation method

def Min_MetricCalculation(data):
  '''
  Argument:
  data: DataFrame with a 'y' column (true labels, 0/1) and a 'prob'
        column (scores). `data` is not modified.

  AIM:
  Evaluate the cost metric A = 500 * (false negatives) + 100 * (false positives)
  for every score in 'prob' used as a threshold, where a row is predicted 1
  when its score >= threshold and 0 otherwise.

  Output:
  metric as dictionary {threshold: A}, with keys in the order the scores
  appear in `data` (a repeated score keeps its first position).
  '''
  scores = data['prob'].to_numpy()
  labels = data['y'].to_numpy()

  # Work on an ascending copy so that, for any threshold t, the rows with
  # score < t (the predicted negatives) are exactly the first k sorted rows.
  order = np.argsort(scores, kind='stable')
  sorted_scores = scores[order]
  sorted_labels = labels[order]

  # Prefix counts: entry k = number of positives / negatives among the k
  # lowest-scoring rows (entry 0 is the empty prefix).
  pos_below = np.concatenate(([0], np.cumsum(sorted_labels == 1)))
  neg_below = np.concatenate(([0], np.cumsum(sorted_labels == 0)))
  total_neg = neg_below[-1]

  # k = how many rows score strictly below each candidate threshold.
  k = np.searchsorted(sorted_scores, scores, side='left')
  fn = pos_below[k]               # positives predicted 0
  fp = total_neg - neg_below[k]   # negatives predicted 1
  metric_values = 500 * fn + 100 * fp

  # tolist() yields plain Python floats / ints for the keys and values.
  return dict(zip(scores.tolist(), metric_values.tolist()))

In [None]:
# Order the rows by descending 'prob' score.
data_c = data_c.sort_values(by='prob', ascending=False)

# Evaluate metric A for every candidate threshold; result maps threshold -> A.
result = Min_MetricCalculation(data_c)

In [None]:
# Smallest value of metric A over all candidate thresholds.
temp = min(result.values())
# Every threshold that attains that minimum (there may be more than one).
res = [key for key in result if result[key] == temp]

print("The key:value pair for min value of the specified metric is :", res, temp)

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 1:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 2:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 (coefficient of determination): https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [None]:
# Load dataset D from the hard-coded /content path; later cells read its
# 'y' (actual value) and 'pred' (predicted value) columns.
data_d = pd.read_csv("/content/5_d.csv")
# Preview the first rows as the cell output.
data_d.head()

In [None]:
# row-wise error helper
def Error_Calculation(data, column1, column2):
  '''
  Return a list with data[column1] - data[column2] for every row, in row order.
  '''
  return [left - right for left, right in zip(data[column1], data[column2])]

# absolute value helper
def absolute_error(data,column):
    '''
    Return a list with the absolute value of every entry of data[column], in row order.
    '''
    return [abs(entry) for entry in data[column]]

# mean square error helper
def mean_sqaure_error(data, column):
  '''
  Return ss_res(data, column) divided by the number of entries in data[column].
  '''
  squared_total = ss_res(data, column)
  row_count = len(data[column])
  return squared_total / row_count

# MAPE helper
def mape(data, column1, column2):
  '''
  Return the sum of data[column1] after dividing each entry by the total of
  data[column2], i.e. sum(column1) / sum(column2).

  In this notebook it is called with the absolute errors as column1 and the
  actual values as column2.
  '''
  denominator = sum(data[column2])
  scaled = data[column1] / denominator
  return sum(scaled)

# method for ss_res
def ss_res(data, column):
  '''
  Sum of squares of the entries in data[column].

  Argument:
  data = DataFrame
  column = name of the column to square and add up (the residuals
           y - pred when used for SS_res)

  Output: the sum of squares as a float (0.0 for an empty column)
  '''
  # The previous loop reused one name for both the accumulator and the loop
  # variable, so it returned only last + last**2 instead of the full sum.
  residuals = data[column]
  return float((residuals * residuals).sum())

# method for ss_tot
def ss_tot(data, column):
  '''
  Total sum of squares: sum of squared deviations of data[column] from its mean.

  Argument:
  data = DataFrame
  column = name of the column holding the actual values

  Output: the total sum of squares as a float
  '''
  # Take the mean from the frame and column that were passed in (the previous
  # version read the notebook-level data_d['y']). The previous loop also
  # overwrote its accumulator with the loop variable on every iteration.
  values = data[column]
  deviations = values - values.mean()
  return float((deviations * deviations).sum())

In [None]:
# Row-wise error: 'y' minus 'pred', stored as the 'error' column.
data_d['error'] = Error_Calculation(data_d, 'y', 'pred')

# Absolute value of each error, stored as the 'abs_error' column.
data_d['abs_error'] = absolute_error(data_d,'error')

In [None]:
# Mean squared error: mean_sqaure_error divides ss_res of the 'error' column
# by the number of rows.
MSE = mean_sqaure_error(data_d, 'error')
print("The Mean sqaured error is :", MSE)

In [None]:
# MAPE as computed by mape(): sum of the 'abs_error' column divided by the
# sum of the 'y' column.
MAPE = mape(data_d, 'abs_error', 'y')
print("The MAPE value is :", MAPE)

In [None]:
# Coefficient of determination: R^2 = 1 - SS_res / SS_tot, where SS_res comes
# from the 'error' column and SS_tot from the 'y' column.
SS_RES = ss_res(data_d, 'error')
SS_TOT = ss_tot(data_d, 'y')
R_square = 1 - (SS_RES/ SS_TOT)
print("The Co-efficient of determination value is :", R_square)

In [None]:
# Reference 

# https://towardsdatascience.com/understanding-confusion-matrix-a9ad42dcfd62 - for different way to understand
# 