# Simulation of Common Data Platform
## Bronze Analysis - Conversation Tracking example
## THKA, SET, August 2021

In [1]:
# Import needed libraries
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import seaborn as sns
import logsim.datapool as cdp
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook session.
pd.set_option('mode.chained_assignment', None)

## Fitting SW data

In [2]:
# Reload data
# Create the CDP object, load it through loadAsCSV with ver='03', then keep the
# frame exposed by the daily Fsw accessor as `df` for the cells that follow.
bronze = cdp.CDP()
bronze.loadAsCSV(ver='03')
fsw_daily = bronze.getFswDaily()
df = fsw_daily.df
print('Shape of BRONZE Fsw Daily data frame is ' + str(df.shape))

Shape of BRONZE Fsw Daily data frame is (2220, 15)


In [3]:
print('Head of BRONZE Fsw Daily data frame')
# First five rows whose 'id' column equals 0.
df.loc[df['id'].eq(0)].head(n=5)

Head of BRONZE Fsw Daily data frame


Unnamed: 0,id,power_cycle,charge,usage,ovd,speech,noise,snr-low,snr-med,snr-high,ovd-snr-low,ovd-snr-med,ovd-snr-high,vcUp,vcDwn
0,0,0,26122,31228,2040,6120,4440,3120,6240,21420,480,1440,1440,4,4
1,0,0,26122,31228,2040,6120,4440,3120,6240,21420,480,1440,1440,4,4
2,0,1,90427,53517,3480,10440,7620,5340,10680,37380,840,2520,2520,7,7
3,0,2,146068,84557,5640,16560,12000,8460,16920,58800,1380,4140,4140,11,11
4,0,3,200438,116452,7680,23040,16620,11700,23280,81480,1920,5580,5580,15,15


In [4]:
# Check for duplicates
# Shape of the rows that repeat an earlier (id, power_cycle) pair.
df.loc[df.duplicated(subset=['id', 'power_cycle'])].shape

(200, 15)

In [7]:
# Display duplicates
# keep=False flags every row of a repeated (id, power_cycle) pair, so the
# repeated rows sit next to each other once sorted by those two keys.
# NOTE(review): the saved output of this cell is empty and it is numbered
# In [7] while the removal cell after it is In [6] - it looks like it was last
# run after the duplicates had already been dropped.
(
    df.loc[df.duplicated(subset=['id', 'power_cycle'], keep=False)]
    .sort_values(by=['id', 'power_cycle'])
    .head(n=20)
)

Unnamed: 0,id,power_cycle,charge,usage,ovd,speech,noise,snr-low,snr-med,snr-high,ovd-snr-low,ovd-snr-med,ovd-snr-high,vcUp,vcDwn


In [6]:
# Remove any duplicates
key_cols = ['id', 'power_cycle']
if df.duplicated(subset=key_cols).any():
    # Edits df in place; the first row of each (id, power_cycle) pair is kept.
    df.drop_duplicates(subset=key_cols, inplace=True)
# Number of elements still sitting in duplicated rows - 0 once the pair is unique.
df.loc[df.duplicated(subset=key_cols)].size

0

In [8]:
# Check for NaN
# One boolean per column: True if that column holds at least one missing value.
df.isna().any(axis=0)

id              False
power_cycle     False
charge          False
usage           False
ovd             False
speech          False
noise           False
snr-low         False
snr-med         False
snr-high        False
ovd-snr-low     False
ovd-snr-med     False
ovd-snr-high    False
vcUp            False
vcDwn           False
dtype: bool

In [None]:
# Convert to integer
# Casts every column of df to int and rebinds the name to the result.
# NOTE(review): this cell is saved as In [None], i.e. without an execution
# count - confirm whether the cast is meant to be part of the run.
df = df.astype(dtype=int)

In [9]:
# Preview the first five rows of the frame after the cleaning steps above.
df.head(n=5)

Unnamed: 0,id,power_cycle,charge,usage,ovd,speech,noise,snr-low,snr-med,snr-high,ovd-snr-low,ovd-snr-med,ovd-snr-high,vcUp,vcDwn
0,0,0,26122,31228,2040,6120,4440,3120,6240,21420,480,1440,1440,4,4
2,0,1,90427,53517,3480,10440,7620,5340,10680,37380,840,2520,2520,7,7
3,0,2,146068,84557,5640,16560,12000,8460,16920,58800,1380,4140,4140,11,11
4,0,3,200438,116452,7680,23040,16620,11700,23280,81480,1920,5580,5580,15,15
5,0,4,258287,144781,9600,28800,20640,14520,29040,101640,2340,7020,7020,19,18
