# Creation of a dataset

In [None]:
import random as rd
import pandas as pd

from cosapp.drivers import NonLinearSolver, RungeKutta, MonteCarlo, LinearDoE, RunSingleCase
from cosapp.utils.distributions import Normal
from cosapp.recorders import DataFrameRecorder
from cosapp_lab.widgets import SysExplorer
from time import time

from cpu.systems import CPUSystem

Selection of the parameters used to create the dataset:
- sampleNumbers: number of samples created with the cpu
- datasetSize: size of the dataset used for training
- percentageBroken: percentage of broken samples
- end: suffix appended to the file name; can be "" if there is nothing to add

In [None]:
# Seed Python's `random` module so the random draws made later are reproducible.
rd.seed(9)

# Number of DoE samples simulated with the cpu for each scenario.
sampleNumbers = 10001
# Number of cases written to the training set.
datasetSize = 10000
# Share of broken-CPU cases in the training set, in percent.
percentageBroken = 50
# Suffix appended to the output file names ("" adds nothing).
end = ""

# Creation of the cpu and its drivers

Creation of the CPU.

In [None]:
# Build the CPU system with the fan mass-flow scalar set to 1.
cpu = CPUSystem("cpu")
cpu.fan.mass_flow_scalar = 1.0

# Design step: a non-linear solver extended with the system's
# "exchanger_surface" design method.
design = cpu.add_driver(NonLinearSolver("solver"))
design.extend(cpu.design_methods["exchanger_surface"])

# Operating point imposed on the design run.
design.runner.set_values(
    {
        "fan.T_air": 40.0,
        "T_cpu": 80.0,
        "cpu.usage": 100.0,
    }
)

# Run the design
cpu.run_drivers()

To create the dataset we use a linear DoE so we add a driver to the CPU.

In [None]:
# Linear design-of-experiments driver attached to the cpu system; its input
# variable, child drivers and recorder are configured in the following cells.
doe = cpu.add_driver(LinearDoE('doe'))

To run the CPU we need a time driver, so we add a child to the doe and give it an interval and a time delta between each point.

In [None]:
# Time integration nested under the DoE, with a non-linear solver as its child.
time_driver = doe.add_child(RungeKutta(order=3, history=True))
solver = time_driver.add_child(
    NonLinearSolver("solver", max_iter=10, factor=1.0)
)

# Simulated time window and time step.
time_driver.time_interval = [0, 30]
time_driver.dt = 0.5

# Simulation scenario for the working CPU: the fan mass-flow scalar starts at 1
# and the CPU usage is 100 while time < 20, then 0.
time_driver.set_scenario(
    init={
        "T_cpu": 30,
        "fan.mass_flow_scalar": 1.0,
    },
    values={
        # "fan.T_air": 40.,  <- keep commented out, otherwise it is reset
        # when time_driver is called
        "cpu.usage": "100 if time <20 else 0.",
    },
)

This creates the number of samples chosen earlier (`sampleNumbers`), with `fan.T_air` ranging between 0 and 30 °C.

In [None]:
# Sweep the fan inlet air temperature from 0 to 30 with `sampleNumbers` points.
doe.add_input_var(
    {
        "fan.T_air": {
            "lower": 0.0,
            "upper": 30.0,
            "count": sampleNumbers,
        },
    }
)

In [None]:
# Record the variables needed for the dataset into a pandas DataFrame.
doe.add_recorder(
    DataFrameRecorder(
        includes=[
            "fan.tension",
            "cpu.usage",
            "T_cpu",
            "fan.T_air",
            "exchanger.surface",
        ]
    )
)

In [None]:
# Run the drivers attached to cpu, now including the DoE and its nested time
# simulation (working-fan scenario).
# NOTE(review): the 'solver' design driver added earlier is never removed, so
# presumably it runs again here before the DoE - confirm this is intended.
cpu.run_drivers()

# Dataset creation

## Train set data creation

We create a dataset with only data of a working CPU first.

In [None]:
# Recorded DoE results for the working CPU (fan.mass_flow_scalar initialised to 1).
dfW = doe.recorder.data

In [None]:
# One `True` label per recorded working-CPU case.
# Fix: the length is taken from `dfW`; the previous `len(df)` referred to a
# name that is never defined in this notebook.
working = [True] * len(dfW)

In [None]:
# Add the `working` label column to the working-CPU results.
dfW = dfW.assign(working=working)

## Test set data creation

We now create the dysfunctional CPU. In this case it is dysfunctional when it has no way to create an air flow, so we set the value of fan.mass_flow_scalar to 0.

In [None]:
# Broken-fan scenario: same CPU usage profile as before, but the fan
# mass-flow scalar starts at 0.
time_driver.set_scenario(
    init={
        "T_cpu": 30,
        "fan.mass_flow_scalar": 0.0,
    },
    values={
        "cpu.usage": "100 if time <20 else 0.",
    },
)

In [None]:
# Re-run the drivers attached to cpu, this time with the broken-fan scenario.
cpu.run_drivers()

In [None]:
# Recorded DoE results for the dysfunctional CPU (fan.mass_flow_scalar initialised to 0).
# NOTE(review): assumes the recorder only holds the cases of the latest run,
# i.e. that the working-CPU records are not still in there - confirm.
dfD = doe.recorder.data

In [None]:
# One `False` label per recorded dysfunctional-CPU case.
working = [False] * len(dfD)

In [None]:
# Add the `working` label column to the dysfunctional-CPU results.
dfD = dfD.assign(working=working)

# Creation of the final datasets

## Creation of the training set

We drop columns that are useless in our study and create datasets to keep the data and be able to access them later without issue.

In [None]:
# Remove the columns that are not used in the study from the working-CPU data.
dfWclean = dfW.drop(
    columns=[
        'Section',
        'Status',
        'Error code',
        'Reference',
        'cpu.usage',
        'exchanger.surface',
    ]
)
# Same clean-up for the dysfunctional-CPU data.
dfDclean = dfD.drop(
    columns=[
        'Section',
        'Status',
        'Error code',
        'Reference',
        'cpu.usage',
        'exchanger.surface',
    ]
)

We create a training set of the size and percentage of broken CPU wanted.   
We first randomly take cases in each of the classes while respecting those parameters.

In [None]:
# Build the training set: the first `datasetSize * percentageBroken / 100`
# cases are drawn from the dysfunctional CPU data, the rest from the working
# CPU data. Each row is drawn at random, without replacement.
#
# Fixes compared with the previous version:
# - the frames used are `dfDclean` / `dfWclean`, which are the ones built just
#   above; `df2clean` / `dfclean` are not defined yet at this point;
# - rows already drawn are tracked with lists of remaining positions instead
#   of rebuilding a DataFrame with `drop` at every iteration. The same
#   `rd.randint` calls are made in the same order, so (assuming the frames
#   have a unique index) the selected rows are unchanged, and the cleaned
#   frames are left untouched.
remainingBroken = list(range(len(dfDclean)))
remainingWorking = list(range(len(dfWclean)))

dataset = []
for k in range(datasetSize):
    if k < datasetSize * percentageBroken / 100:
        source, remaining = dfDclean, remainingBroken
    else:
        source, remaining = dfWclean, remainingWorking
    # Pick one of the positions that are still available and remove it so the
    # same case cannot be drawn twice.
    position = remaining.pop(rd.randint(0, len(remaining) - 1))
    row = source.iloc[position]
    # The Series name becomes the row label in the final DataFrame.
    row.name = k
    dataset.append(row)

Then we create a pandas.DataFrame and save it in the data folder.

In [None]:
# Assemble the sampled rows into a DataFrame restricted to the features and
# the `working` label.
cols = ['T_cpu', 'fan.T_air', 'fan.tension', 'working']
dfFinal = pd.DataFrame(dataset, columns=cols)

# Save the training set; the row labels are not written to the file.
dfFinal.to_csv(
    f"./data/dataset_{datasetSize}_cases_{percentageBroken}_percent_broken{end}.csv",
    index=False,
)

## Creation of the test set

We drop columns that are useless in our study.

In [None]:
# Columns that are not used in the study.
droppedColumns = ['Section', 'Status', 'Error code', 'Reference', 'cpu.usage', 'exchanger.surface']

# Fix: the recorded frames are named `dfW` (working CPU) and `dfD`
# (dysfunctional CPU); the previous `df` / `df2` are never defined in this
# notebook.
dfclean = dfW.drop(columns=droppedColumns)
df2clean = dfD.drop(columns=droppedColumns)

We take the first `datasetSize` cases of each class to create the test set, alternating broken and working cases, but we could also use new data.

In [None]:
# Interleave the first `datasetSize` rows of each class: broken cases get the
# even row labels, working cases the odd ones.
dataset = []
for k in range(datasetSize):
    for offset, frame in enumerate((df2clean, dfclean)):
        row = frame.iloc[k]
        row.name = 2 * k + offset
        dataset.append(row)

We create a pandas.DataFrame and save it in the data folder.

In [None]:
# Assemble the test rows into a DataFrame restricted to the features and the
# `working` label.
cols = ['T_cpu', 'fan.T_air', 'fan.tension', 'working']
testfinal = pd.DataFrame(dataset, columns=cols)

# Save the test set; the row labels are not written to the file.
testfinal.to_csv(
    f"./data/test_set_{datasetSize}_cases_{percentageBroken}_percent_broken{end}.csv",
    index=False,
)