# DATA 6545 Project 1 Evaluation Code
- ver. 1.1
Developed by: Dr. Jie Tao

This is the sample evaluation code provided for your project 1. 
- You should evaluated your processed data using this code whenever possible, and record the results;
- Do not modify this code here - create a __copy__ if you decide to do so.
- Note that due to randomness, although I will use the same code to evaluate your final submissions, the results might be slightly different.

In [1]:
# import required package for data ingestion
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression

# balance the data
from imblearn.over_sampling import SMOTE

## Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Read in the Data

In [55]:
#### you should change data_path to point to your OWN data file
data_path = '/content/drive/MyDrive/Classroom/DATA 6545: Data Science and MLOps SP2023/P1B_data.xlsx'
data_df = pd.read_excel(data_path, index_col=0)
data_df.head()

Unnamed: 0,T3^,T4^,T5^,S1^,S2^,S3^,C1,C2,C3^,C5^,C6^,C7,I3^,Manufacturing,Service,Other,Y1,Y2
0,0.609278,0.103944,0.277849,-0.138698,0.237424,-0.080271,0.071304,1,1,0.094766,1.297007,-0.433805,0,1,0,0,0,1
1,0.10538,0.165734,-0.132115,-1.310689,0.860353,-0.102199,1.191919,0,0,2.223764,-0.782061,-0.786938,0,0,0,1,1,0
2,0.634245,0.052862,1.417398,2.613285,-1.040773,0.110575,-0.383053,1,0,-0.197203,-0.782061,-1.382472,0,0,0,1,1,0
3,1.060307,-1.176327,1.396308,2.08291,-0.61239,0.792523,0.873145,1,0,-0.241948,-0.782061,-1.317933,0,0,0,1,1,1
4,-0.347273,-0.606195,-0.528624,-0.374041,0.532102,0.054568,-0.559219,1,1,0.017792,0.993785,1.011921,0,0,0,1,0,1


In [None]:
# get a list of feature names
list(data_df.columns)

In [None]:
data_df.dtypes

### NOTE:

1. This code only include 1 target here - you can only evaluate 1 target at a time. If you want to evaluate another target, define another `y`.
2. It is the norm you arrange your features as *continuous*, *categorical*, and *target* features. If you do not do it this way, you should use indexing similar to below:
```python
y = data_df['Y']
X = data_df.drop['Y']
```

In [None]:
# # define features and target
# X = data_df.iloc[:,:-1].values
# y = data_df.iloc[:,-1].values
# # if you want a secondary target
# ### y1 = ...

In [None]:
# # define features and Y1 target
# X = data_df[cont_cols]
# y = data_df['Y1'].values
# # if you want a secondary target
# ### y1 = ...

In [100]:
a1 = ['T3^', 'T4^', 'T5^', 'S1^', 'S2^', 'C7']
a2 = ['S2^', 'S3^', 'C2', 'C5^', 'C6^', 'I3^']

In [69]:
X1 = data_df[a1]
y1 = data_df['Y1']

In [101]:
# define features and Y2 target
X2 = data_df[a2]
y2 = data_df['Y2']

In [102]:
### y should be binary
assert len(np.unique(y2)) == 2

In [103]:
X2.shape, y2.shape

((660, 6), (660,))

In [104]:
# resample/balance the data
# note although we do not balance data this way, 
# this works the best for this project
sm = SMOTE(random_state = 2022) 
X_res, y_res = sm.fit_resample(X2, y2) 

In [105]:
X_res.shape, y_res.shape

((922, 6), (922,))

In [106]:
# define the model
clf = LogisticRegression(max_iter=2000)

# Evaluation

In [107]:
def my_eval(X, y, classifer = clf, k=10, scoring = 'f1'):
  '''
  return evaluation results (f1-score or ROC_AUC). 
  Built in k-fold evaluation.
  INPUTS:
  ----
  - X: features; DataFrame or Numpy ndarray;
  - y: target; DataFrame or Numpy ndarray;
  - classifier: any sklearn (or its add-on) based classifier
  - k: number of folds in cross validation
  - scoring: evaluation metric ('f1' default or 'roc_auc')
  OUTPUT:
  ----
  bias/variance score of selected metric. Both lower the better
  - bias: mean of the metric over cross validation, measure the accruracy
  - variance: std.ev. of the metric, measure the consistency.
  '''
  scores = []
  for i in range(100):
    #### generate random numbers to shuffle the data for training and test
    np.random.seed(2021)
    random_int = np.random.randint(0,3000)
    #### create cross validation folds
    kfold = model_selection.KFold(n_splits=k, random_state=random_int, shuffle=True)
    #### record the score
    score = model_selection.cross_val_score(clf, X=X, y=y, cv=kfold, scoring=scoring)
    scores.append(score)
  scores = np.array(scores)
  #### we need to calculate the bias (average score) and viariance (std)
  bias, variance = 1 - round(scores.mean(),4), round(scores.std(),4)
  return(bias, variance)

In [108]:
# getting averaged f1_score from 10-fold CV (default)
my_eval(X_res, y_res, clf, 10)

(0.41379999999999995, 0.0595)

In [109]:
# getting averaged ROC_AUC from 10-fold CV
my_eval(X_res, y_res, clf, 10, 'roc_auc')

(0.3254, 0.0535)