## File content:
- Using SMOTE Boost algorithm on a single patient dataset (patient Id: SB-011) to generate synthetic data and K-NN as the base classifier.
- Checking the quality of the generated synthetic data using the gretel.ai tool.

#### Quality report:

* Raw score: 67.08148148148148
* Grade: Good
* Score: 67

In [35]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import random
from random import randrange
import matplotlib.pyplot as plt

In [37]:
# Considering a single patient dataset (patient id : SB-011)
csv_path = '/content/gdrive/MyDrive/schasRepo/REU2022/ColabNotebooks/SB-011_dataset.csv'

p11 = pd.read_csv(csv_path)
p11.head()

Unnamed: 0.1,Unnamed: 0,income,road_dist,cooking,y_am_pef,tempin,humidin,pm25in,co2in,tempdiffin,...,windsd,humid,varp,dewpt,airp,seap,solrhr,solramnt,grdt,class
0,1,0.571,0.25,0.333,0.823,0.566,0.587,0.413,0.279,0.229,...,0.226,0.762,0.628,0.842,0.566,0.551,0.294,0.191,0.748,1.0
1,2,0.571,0.25,0.333,0.725,0.426,0.673,0.363,0.279,0.213,...,0.281,0.834,0.677,0.867,0.489,0.476,0.078,0.105,0.729,1.0
2,3,0.571,0.25,0.333,0.823,0.437,0.555,0.471,0.336,0.391,...,0.692,0.444,0.299,0.601,0.668,0.658,0.543,0.221,0.576,1.0
3,4,0.571,0.25,0.333,0.823,0.414,0.502,0.454,0.502,0.435,...,0.19,0.286,0.201,0.505,0.905,0.891,0.704,0.258,0.538,1.0
4,5,0.571,0.25,0.333,0.823,0.353,0.447,0.359,0.479,0.324,...,0.089,0.475,0.348,0.658,0.827,0.81,0.465,0.219,0.635,1.0


In [38]:
# Dropping unnamed column
p11 = p11.drop('Unnamed: 0', axis=1)
p11.head()

Unnamed: 0,income,road_dist,cooking,y_am_pef,tempin,humidin,pm25in,co2in,tempdiffin,humidiffin,...,windsd,humid,varp,dewpt,airp,seap,solrhr,solramnt,grdt,class
0,0.571,0.25,0.333,0.823,0.566,0.587,0.413,0.279,0.229,0.33,...,0.226,0.762,0.628,0.842,0.566,0.551,0.294,0.191,0.748,1.0
1,0.571,0.25,0.333,0.725,0.426,0.673,0.363,0.279,0.213,0.31,...,0.281,0.834,0.677,0.867,0.489,0.476,0.078,0.105,0.729,1.0
2,0.571,0.25,0.333,0.823,0.437,0.555,0.471,0.336,0.391,0.631,...,0.692,0.444,0.299,0.601,0.668,0.658,0.543,0.221,0.576,1.0
3,0.571,0.25,0.333,0.823,0.414,0.502,0.454,0.502,0.435,0.757,...,0.19,0.286,0.201,0.505,0.905,0.891,0.704,0.258,0.538,1.0
4,0.571,0.25,0.333,0.823,0.353,0.447,0.359,0.479,0.324,0.534,...,0.089,0.475,0.348,0.658,0.827,0.81,0.465,0.219,0.635,1.0


In [39]:
p11.columns

Index(['income', 'road_dist', 'cooking', 'y_am_pef', 'tempin', 'humidin',
       'pm25in', 'co2in', 'tempdiffin', 'humidiffin', 'pm25diffin', 'pm10',
       'pm25', 'o3', 'no2', 'co', 'so2', 'temp', 'windsd', 'humid', 'varp',
       'dewpt', 'airp', 'seap', 'solrhr', 'solramnt', 'grdt', 'class'],
      dtype='object')

In [40]:
class_counts = p11['class'].value_counts()
print(class_counts)

1.0    172
0.0     38
Name: class, dtype: int64


In [41]:
from sklearn.neighbors import NearestNeighbors
def findNeighbors(value, allValues, k):
    nn = NearestNeighbors(n_neighbors = k+1, metric = "euclidean").fit(allValues)
    #using k+1 since first nearest neighbor is itself
    dist, indices = nn.kneighbors(value, return_distance = True)
    return dist, indices

In [42]:
#SMOTE Boost Algorithm

def SMOTEBoost(dataFrame, printDebug = True):
    #Divide data set into Majority (MA) and Minority (MI) Classes
    MA = dataFrame[dataFrame['class'] == 1]
    MI = dataFrame[dataFrame['class'] == 0]

    #Record number of instances for each class
    MA_num = MA.shape[0]
    print("Class 1:", MA_num)
    MI_num = MI.shape[0]
    print("Class 0:", MI_num)

    # Save original training data (MI) as CSV
    MI.to_csv('original_training_data.csv', index=False)

    #Calculate how much data to generate
    numToSynthesize = MA_num - MI_num

    #Calculate k neighbors to search for
    if (MI_num - 1) < 5:
        k = MI_num - 1
    else:
        k = 5

    if printDebug == True:
        print("~~~~~~~~~~~ MI_num = ", MI_num, " ~~~~~~~~~~")
        print("~~~~~~~~~~~ k = ", k, " ~~~~~~~~~~")
        print("~~~~~~~~~~~ numToSynthesize = ", numToSynthesize, " ~~~~~~~~~~")

    # make copy of MI
    newMI = MI
    generate = numToSynthesize

    while generate > 0:
        #Convert minority class to NumPy array for synthesizing
        mi = newMI.to_numpy()
        numAttributes = mi.shape[1]

        #Create NumPy array for synthetic data
        synth_num = numToSynthesize // 10
        syntheticArray = np.empty((0, numAttributes))
        print("syntheticArray shape: ", syntheticArray.shape)

        print("~~~~~~~~~~~ synth_num = ", synth_num, " ~~~~~~~~~~")


        for j in range(5):
          for i in range(synth_num):
              if generate == 0:
                  continue
              #Select and instance x in minority class randomly
              x = random.choice(mi)
              x = np.reshape(x, (-1, numAttributes))

              #Find indices of k nearest neighbors of x
              _, knn = findNeighbors(x, mi, k)

              #Select one knn of sample and record it as y
              y = randrange(1, k+1)

              #Generate new minority instance w/ equation
              diff = mi[knn[0, y]] - x
              gap = random.uniform(0, 1)
              xnew = x + gap * diff
              syntheticArray = np.concatenate((syntheticArray, xnew))
              generate -= 1

          print("Iteration Done!")

          #Convert synthetic array into DataFrame
          syntheticData = pd.DataFrame(syntheticArray, columns=MI.columns.values)
          # print("syntheticData: ")
          # display(syntheticData)
          newMI = pd.concat([newMI, syntheticData], ignore_index=True)

          # Save synthetic data (newMI) as CSV
          newMI.to_csv('synthetic_training_data.csv', index=False)

    #Join imbalance data with synthetic data set
    newDF = pd.concat([MA, newMI], ignore_index=True)

    X_train_balanced = newDF.drop('class', axis=1)
    y_train_balanced = newDF['class']

    # if printDebug == True:
    #     print("Original Data Scatter Plot:")
    #     majority = dataFrame[dataFrame['class'] == 1]
    #     minority = dataFrame[dataFrame['class'] == 0]
    #     fig, ax = plt.subplots()
    #     majority.plot('grdt', 'tempin', kind = 'scatter', ax = ax, c ='royalblue', figsize=(10,10))
    #     minority.plot('grdt', 'tempin', kind = 'scatter', ax = ax, c ='orange', figsize=(10,10))
    #     ax.set_title("Original Data Scatter Plot - grdt vs. tempin")
    #     #fig.savefig("original_data_scatter_grdt_tempin.png")
    #     plt.show()

    #     print("Balanced Data Scatter Plot:")
    #     majority = dataFrame[dataFrame['class'] == 1]
    #     minority = dataFrame[dataFrame['class'] == 0]
    #     fig, ax = plt.subplots()
    #     majority.plot('grdt', 'tempin', kind = 'scatter', ax = ax, c ='royalblue', figsize=(10,10))
    #     newMI.plot('grdt', 'tempin', kind = 'scatter', ax = ax, c ='red', figsize=(10,10))
    #     #minority.plot('grdt', 'tempin', kind = 'scatter', ax = ax, c ='orange', figsize=(10,10))
    #     ax.set_title("Balanced Data Scatter Plot - grdt vs. tempin")
    #     #fig.savefig("balanced_data_scatter_grdt_tempin.png")
    #     plt.show()

    return X_train_balanced, y_train_balanced

In [43]:
# Split data into training and testing

X = p11.drop('class', axis=1)
y = p11['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTEBoost to the training set
X_train_balanced, y_train_balanced = SMOTEBoost(pd.concat([X_train, y_train], axis=1))

X_train.count()
y_train.count()

# print("Synthetic Data:")
# print(X_train_balanced[y_train_balanced == 0])

# Create a KNN classifier
knn = KNeighborsClassifier()

# Perform 3-fold cross-validation
cv_results = cross_validate(knn, X_train_balanced, y_train_balanced, cv=3,
                            scoring=['accuracy', 'precision', 'recall', 'f1', 'balanced_accuracy'])


Class 1: 140
Class 0: 28
~~~~~~~~~~~ MI_num =  28  ~~~~~~~~~~
~~~~~~~~~~~ k =  5  ~~~~~~~~~~
~~~~~~~~~~~ numToSynthesize =  112  ~~~~~~~~~~
syntheticArray shape:  (0, 28)
~~~~~~~~~~~ synth_num =  11  ~~~~~~~~~~
Iteration Done!
Iteration Done!
Iteration Done!
Iteration Done!
Iteration Done!
syntheticArray shape:  (0, 28)
~~~~~~~~~~~ synth_num =  11  ~~~~~~~~~~
Iteration Done!
Iteration Done!
Iteration Done!
Iteration Done!
Iteration Done!
syntheticArray shape:  (0, 28)
~~~~~~~~~~~ synth_num =  11  ~~~~~~~~~~
Iteration Done!
Iteration Done!
Iteration Done!
Iteration Done!
Iteration Done!


### Quality of the synthetic data using Gretel

In [44]:
%%capture
!pip install -U gretel-client

import json
from gretel_client.projects.models import read_model_config
from gretel_client.helpers import poll
from gretel_client.config import RunnerMode
from gretel_client.evaluation.quality_report import QualityReport
from gretel_client import configure_session
import IPython
from smart_open import open
from gretel_client.projects import create_or_get_unique_project

In [45]:
pd.set_option("max_colwidth", None)
configure_session(api_key="your_key", cache="yes", validate=True)
project = create_or_get_unique_project(name="trial")

config = read_model_config("synthetics/default")

# Set the model epochs to 50
config["models"][0]["synthetics"]["params"]["epochs"] = 50

print(json.dumps(config, indent=2))

Using endpoint https://api.gretel.cloud
Logged in as siddheshwaribankar24@gmail.com ✅




{
  "schema_version": "1.0",
  "name": "default-config",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 50,
          "vocab_size": 20000,
          "learning_rate": 0.01,
          "validation_split": false
        },
        "generate": {
          "num_records": 5000
        },
        "privacy_filters": {
          "outliers": "auto",
          "similarity": "auto"
        }
      }
    }
  ]
}


In [46]:
# Specify your Gretel API Key

pd.set_option("max_colwidth", None)
configure_session(api_key="your_key", cache="yes", validate=True)

Using endpoint https://api.gretel.cloud
Logged in as siddheshwaribankar24@gmail.com ✅


In [47]:
# Load and preview real-world data
real_data = 'original_training_data.csv'

real_df = pd.read_csv(real_data)
real_df.head()

Unnamed: 0,income,road_dist,cooking,y_am_pef,tempin,humidin,pm25in,co2in,tempdiffin,humidiffin,...,windsd,humid,varp,dewpt,airp,seap,solrhr,solramnt,grdt,class
0,0.571,0.25,0.333,0.554,0.803,0.496,0.411,0.478,0.312,0.586,...,0.093,0.453,0.386,0.689,0.537,0.528,0.472,0.568,0.672,0.0
1,0.571,0.25,0.333,0.043,0.411,0.498,0.523,0.475,0.509,0.546,...,1.0,0.494,0.216,0.521,0.296,0.304,0.457,0.423,0.409,0.0
2,0.571,0.25,0.333,0.197,0.707,0.264,0.17,0.3,0.473,0.634,...,0.334,0.266,0.183,0.485,0.797,0.789,0.868,0.715,0.511,0.0
3,0.571,0.25,0.333,0.444,0.395,0.137,0.176,0.233,0.515,0.658,...,0.987,0.167,0.125,0.392,0.652,0.651,0.333,0.463,0.379,0.0
4,0.571,0.25,0.333,0.414,0.298,0.062,0.441,0.454,0.692,0.345,...,0.226,0.433,0.103,0.351,0.609,0.613,0.619,0.203,0.18,0.0


In [48]:
# Load and preview synthetic data
synth_data = 'synthetic_training_data.csv'

synth_df = pd.read_csv(synth_data)
synth_df.head()

Unnamed: 0,income,road_dist,cooking,y_am_pef,tempin,humidin,pm25in,co2in,tempdiffin,humidiffin,...,windsd,humid,varp,dewpt,airp,seap,solrhr,solramnt,grdt,class
0,0.571,0.25,0.333,0.554,0.803,0.496,0.411,0.478,0.312,0.586,...,0.093,0.453,0.386,0.689,0.537,0.528,0.472,0.568,0.672,0.0
1,0.571,0.25,0.333,0.043,0.411,0.498,0.523,0.475,0.509,0.546,...,1.0,0.494,0.216,0.521,0.296,0.304,0.457,0.423,0.409,0.0
2,0.571,0.25,0.333,0.197,0.707,0.264,0.17,0.3,0.473,0.634,...,0.334,0.266,0.183,0.485,0.797,0.789,0.868,0.715,0.511,0.0
3,0.571,0.25,0.333,0.444,0.395,0.137,0.176,0.233,0.515,0.658,...,0.987,0.167,0.125,0.392,0.652,0.651,0.333,0.463,0.379,0.0
4,0.571,0.25,0.333,0.414,0.298,0.062,0.441,0.454,0.692,0.345,...,0.226,0.433,0.103,0.351,0.609,0.613,0.619,0.203,0.18,0.0


In [49]:
# Create a Quality Report
report = QualityReport(data_source=synth_data, ref_data=real_data)
report.run()

In [50]:
# Synthetic Data Quality Score (SQS)
report.peek()

{'raw_score': 67.08148148148148, 'grade': 'Good', 'score': 67}

In [51]:
# Quality Report as HTML
import IPython

IPython.display.HTML(report.as_html, metadata=dict(isolated=True))

0,1,2,3,4,5
How to interpret your SQS,Excellent,Good,Moderate,Poor,Very Poor
Suitable for machine learning or statistical analysis,,,,,
Suitable for balancing or augmenting machine learning data sources,,,,,
Suitable for pre-production testing environments,,,,,
Suitable for demo environments or mock data,,,,,
Improve your model using our tips and advice,,,,,
Significant tuning required to improve model,,,,,

0,1,2,3,4,5
Data Sharing Use Case,Excellent,Very Good,Good,Normal,Poor
"Internally, within the same team",,,,,
"Internally, across different teams",,,,,
"Externally, with trusted partners",,,,,
"Externally, public availability",,,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,28,28
Column Count,28,28
Training Lines Duplicated,--,2

Default Privacy Protections,Advanced Protections

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
income,1,0,5.0,Numeric,Excellent
road_dist,1,0,4.0,Numeric,Excellent
class,1,0,3.0,Numeric,Excellent
cooking,1,0,5.0,Numeric,Excellent
y_am_pef,10,0,4.46,Numeric,Poor
dewpt,28,0,4.93,Numeric,Poor
varp,28,0,4.89,Numeric,Moderate
no2,20,0,4.93,Numeric,Moderate
humidin,27,0,4.93,Numeric,Moderate
solramnt,26,0,4.86,Numeric,Moderate
