# Synthesize Data 

## 0 - Setup

### 0.1 - Packages 

In [18]:
import pandas as pd
from numpy import repeat

from ydata.metadata import Metadata
from ydata.synthesizers.regular import RegularSynthesizer
from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType
from ydata.dataset import Dataset

## 1 - Load Data

In [12]:
# Load the processed data and keep only the unusual observations (Unusual == 1).
# `data` feeds the synthesizer training and later supplies the column names
# for the sampled frame.
data = pd.read_csv('data_processed.csv').query("Unusual == 1")

In [15]:
# Remove the 'Time' and 'CellName' columns before building the dataset.
# Note: drop() raises KeyError if these columns are already gone, so do not
# re-run this cell without re-running the load cell first.
data = data.drop(columns=['Time', 'CellName'])

## 2 - Synthesis

### 2.1 - Train the Synthesizer 

In [19]:
# Wrap the pandas frame in a ydata Dataset, the input type the synthesizer is fitted on below.
# This allows scaling and integration with YData's platform SDKs.
dataset = Dataset(data)
# Compute the metadata that is passed to fit() together with the dataset.
m = Metadata(dataset)

[########################################] | 100% Completed |  0.8s


In [20]:
# Print the metadata summary (dataset type, column count, per-column data and variable types).
print(m)

[1mMetadata Summary 
 
[0m[1mDataset type: [0mTABULAR
[1mDataset attributes: [0m
[1mNumber of columns: [0m9
[1m% of duplicate rows: [0m0
[1mTarget column: [0m

[1mColumn detail: [0m
        Column    Data type Variable type
0   PRBUsageUL    numerical         float
1   PRBUsageDL    numerical         float
2   meanThr_DL    numerical         float
3   meanThr_UL    numerical         float
4    maxThr_DL    numerical         float
5    maxThr_UL    numerical         float
6    meanUE_UL    numerical         float
7  maxUE_UL+DL  categorical           int
8      Unusual  categorical           int




In [21]:
# Initialize a new synthesizer, train it on the dataset, and save the fitted model to disk.
network_synth = RegularSynthesizer()

network_synth.fit(dataset, metadata=m)
network_synth.save('trained_model.pkl')

# Alternative: to reuse a previously saved model instead of retraining,
# skip the fit/save lines above and uncomment the line below.
# NOTE(review): the .pkl extension suggests a pickle file — only load model files you trust.
#network_synth = network_synth.load('trained_model.pkl')

INFO: 2022-07-25 23:46:05,758 [SYNTHESIZER] - Number columns considered for synth: 9
INFO: 2022-07-25 23:46:20,623 [SYNTHESIZER] - Starting the synthetic data modeling process over 1x1 blocks.
INFO: 2022-07-25 23:46:20,629 [SYNTHESIZER] - Preprocess segment
INFO: 2022-07-25 23:46:24,777 [SYNTHESIZER] - Synthesizer init.
INFO: 2022-07-25 23:46:24,778 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.


### 2.2 - Sample Data 

In [22]:
# Draw 5000 synthetic rows and convert them to a pandas DataFrame.
sampled = network_synth.sample(5000)
synth_sample = sampled.to_pandas()

# Flag the synthetic rows by giving every one of them the index value 999999.
synth_sample.index = repeat(999999, synth_sample.shape[0])
# Relabel the columns, by position, with the training frame's column names.
# NOTE(review): assumes the sampler returns columns in the same order as `data` — confirm.
synth_sample.columns = data.columns

INFO: 2022-07-25 23:47:13,908 [SYNTHESIZER] - Start generating model samples.


## 3 - Store Data

In [23]:
# Write the synthetic samples to CSV. index=False leaves the index out of the file,
# so the 999999 flag placed on the index when sampling is not persisted.
synth_sample.to_csv('data_sampled.csv', index=False)