In [1]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever pip is first on the shell PATH.
# NOTE(review): the recorded run of this cell downgraded smart-open and reported
# conflicts with pathy and flair -- consider a dedicated environment for this notebook.
%pip install -U gretel-client

Collecting smart-open<3.0,>=2.1.0
  Using cached smart_open-2.2.1-py3-none-any.whl
Installing collected packages: smart-open
  Attempting uninstall: smart-open
    Found existing installation: smart-open 5.2.1
    Uninstalling smart-open-5.2.1:
      Successfully uninstalled smart-open-5.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pathy 0.6.1 requires smart-open<6.0.0,>=5.0.0, but you have smart-open 2.2.1 which is incompatible.
flair 0.7 requires transformers<=3.5.1,>=3.5.0, but you have transformers 4.15.0 which is incompatible.[0m
Successfully installed smart-open-2.2.1


In [2]:
import pandas as pd
import numpy as np
from getpass import getpass
from gretel_client import configure_session, ClientConfig, create_project
from gretel_client.helpers import poll
import json
import yaml

In [3]:
# Read the raw student-engagement CSV and show its (rows, columns) shape.
raw_csv_path = "./dataset/student_engagement_level.csv"
df_students = pd.read_csv(raw_csv_path)
df_students.shape

(486, 14)

In [4]:
# Map the verbose source headers to short names; only the mapped columns are kept.
column_aliases = {
    "# Quiz Reviews before submission": "reviews",
    "Assignment 3 lateness indicator": "a1",
    "Assignment 3 duration to submit (in hours)": "a2",
    "Average time to submit assignment (in hours)": "a3",
    "Engagement Level": "engagement",
}
# Dict order is insertion order, so this yields reviews, a1, a2, a3, engagement.
kept_columns = list(column_aliases.values())
df_students = df_students.rename(columns=column_aliases)[kept_columns]

In [5]:
# Add two randomly drawn integer columns. The draws come from a locally seeded
# generator so that Restart & Run All reproduces the same values (and therefore
# the same CSV that is later uploaded for training).
rng = np.random.default_rng(42)
n_students = df_students.index.size
df_students = df_students.assign(
    gender=rng.integers(2, size=n_students),  # values in {0, 1}
    grade=rng.integers(3, size=n_students),   # values in {0, 1, 2}
)
df_students.head()

Unnamed: 0,reviews,a1,a2,a3,engagement,gender,grade
0,3,0,116.166667,129.016667,H,0,0
1,4,0,217.75,235.955556,M,1,1
2,3,0,260.333333,221.888889,M,1,2
3,6,0,271.216667,286.088889,M,0,1
4,1,0,260.733333,274.172222,M,1,1


In [6]:
# Distinct engagement labels, in order of first appearance.
pd.unique(df_students["engagement"])


array(['H', 'M', 'L'], dtype=object)

In [7]:
# Authenticate against the Gretel API. The key is requested interactively via
# getpass so that it is never stored in the notebook source or its output.
# NOTE(review): an API key was previously pasted into a comment in this cell;
# treat that key as compromised and revoke/rotate it.
configure_session(ClientConfig(api_key=getpass(prompt="Enter Gretel API key"),
                               endpoint="https://api.gretel.cloud"))

Enter Gretel API key········


In [8]:
# Load and adjust the model configuration BEFORE creating the cloud project, so a
# missing or malformed config file does not leave an orphaned project behind.
with open("gretel-config.yml", 'r') as stream:
    config = yaml.safe_load(stream)

synthetics_config = config['models'][0]['synthetics']

# Train for 60 epochs.
synthetics_config['params']['epochs'] = 60

# Request 10,000 generated records.
synthetics_config['generate']['num_records'] = 10000

project = create_project(display_name="synthetic-data-students")

# Show the effective configuration that will be submitted.
print(json.dumps(config, indent=2))

{
  "schema_version": "1.0",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 60,
          "batch_size": 64,
          "vocab_size": 20000,
          "reset_states": false,
          "learning_rate": 0.01,
          "rnn_units": 256,
          "dropout_rate": 0.2,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 1.0,
          "predict_batch_size": 64,
          "validation_split": false,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1,
          "data_upsample_limit": 10000
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 10000,
          "max_invalid": null
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": "medium"
        }
      }
    }
  ]
}


In [9]:
# Write the prepared frame to disk; this file is what gets uploaded as training data.
clean_csv_path = './dataset/clean_student_engagement_level.csv'
df_students.to_csv(clean_csv_path, index=False)

# Create the model from the config, point it at the CSV, submit it, and poll the job.
model = project.create_model_obj(model_config=config)
model.data_source = clean_csv_path
model.submit(upload_data_source=True)
poll(model)

# Read the gzip-compressed "data_preview" artifact into a DataFrame and display it.
preview_link = model.get_artifact_link("data_preview")
synthetic_df = pd.read_csv(preview_link, compression='gzip')
synthetic_df

[32mINFO: [0mStarting poller


{
    "uid": "61e11421f26c370a837cd372",
    "model_name": "oafish-entertaining-wolf",
    "runner_mode": "cloud",
    "user_id": "61d90774bff621712241f5de",
    "project_id": "61e1141e0d7ef82693f93da5",
    "status_history": {
        "created": "2022-01-14T06:11:45.787674Z"
    },
    "last_modified": "2022-01-14T06:11:45.876602Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:2f104a33478cca62f466303cf5750c05d5b4b9b81954821f091f66d5fb36a89b",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "data_source": [
                        "gretel_babe4fcb419043df9f1261f9f54b77bd_clean_student_engagement_level.csv"
                    ],
                    

[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2022-01-14T06:12:02.490045Z  Starting synthetic model training
2022-01-14T06:12:02.492077Z  Loading training data
2022-01-14T06:12:02.635880Z  Training data loaded
{
    "record_count": 486,
    "field_count": 7,
    "upsample_count": 9514
}
2022-01-14T06:12:05.992992Z  Creating semantic validators and preparing training data
2022-01-14T06:12:13.811590Z  Beginning ML model training
2022-01-14T06:12:21.182653Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.2444,
    "loss": 3.8884,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-01-14T06:12:22.132059Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.2471,
    "loss": 3.4786,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-01-14T06:12:23.070716Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.

2022-01-14T06:13:03.600484Z  Training epoch completed
{
    "epoch": 44,
    "accuracy": 0.8925,
    "loss": 0.4277,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-01-14T06:13:04.510080Z  Training epoch completed
{
    "epoch": 45,
    "accuracy": 0.893,
    "loss": 0.428,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-01-14T06:13:05.425811Z  Training epoch completed
{
    "epoch": 46,
    "accuracy": 0.8924,
    "loss": 0.4301,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-01-14T06:13:06.378149Z  Training epoch completed
{
    "epoch": 47,
    "accuracy": 0.8933,
    "loss": 0.4262,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-01-14T06:13:07.323541Z  Training epoch completed
{
    "epoch": 48,
    "accuracy": 0.8929,
    "loss": 0.4257,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-01-14T06:13:08.273117Z  Training epoch completed
{
    "epoch": 49,
    "accuracy": 0.8936,
    "loss": 0.425,
    "v

2022-01-14T06:15:17.423035Z  Generation in progress
{
    "current_valid_count": 5985,
    "current_invalid_count": 223,
    "new_valid_count": 1114,
    "new_invalid_count": 37,
    "completion_percent": 62.41
}
2022-01-14T06:15:22.428204Z  Generation in progress
{
    "current_valid_count": 6990,
    "current_invalid_count": 242,
    "new_valid_count": 1005,
    "new_invalid_count": 19,
    "completion_percent": 72.89
}
2022-01-14T06:15:27.434045Z  Generation in progress
{
    "current_valid_count": 8104,
    "current_invalid_count": 280,
    "new_valid_count": 1114,
    "new_invalid_count": 38,
    "completion_percent": 84.5
}
2022-01-14T06:15:32.438277Z  Generation in progress
{
    "current_valid_count": 9085,
    "current_invalid_count": 325,
    "new_valid_count": 981,
    "new_invalid_count": 45,
    "completion_percent": 94.73
}
2022-01-14T06:15:35.441144Z  Generation in progress
{
    "current_valid_count": 9590,
    "current_invalid_count": 342,
    "new_valid_count": 505,
 

2022-01-14T06:18:13.454489Z  Generation in progress
{
    "current_valid_count": 3868,
    "current_invalid_count": 154,
    "new_valid_count": 1035,
    "new_invalid_count": 33,
    "completion_percent": 43.75
}
2022-01-14T06:18:18.461312Z  Generation in progress
{
    "current_valid_count": 5003,
    "current_invalid_count": 182,
    "new_valid_count": 1135,
    "new_invalid_count": 28,
    "completion_percent": 56.59
}
2022-01-14T06:18:23.466932Z  Generation in progress
{
    "current_valid_count": 5992,
    "current_invalid_count": 216,
    "new_valid_count": 989,
    "new_invalid_count": 34,
    "completion_percent": 67.78
}
2022-01-14T06:18:28.473472Z  Generation in progress
{
    "current_valid_count": 7117,
    "current_invalid_count": 244,
    "new_valid_count": 1125,
    "new_invalid_count": 28,
    "completion_percent": 80.5
}
2022-01-14T06:18:33.478540Z  Generation in progress
{
    "current_valid_count": 8155,
    "current_invalid_count": 281,
    "new_valid_count": 1038,


MaxRetryError: HTTPSConnectionPool(host='api.gretel.cloud', port=443): Max retries exceeded with url: /projects/yvesconst-e5eb8/models/61e11421f26c370a837cd372?expand=logs (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fce343dc790>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [None]:
# Save the synthetic records on their own, without the index column.
gen_csv_path = './dataset/gen_student_engagement_level.csv'
synthetic_df.to_csv(gen_csv_path, index=False)

In [None]:
# Stack the original rows and the synthetic rows into one frame and save it.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True produces the same renumbered result.
merged_df = pd.concat([df_students, synthetic_df], ignore_index=True)
merged_df.to_csv('./dataset/merged_student_engagement_level.csv', index=False)