# Analysis of call data

In [1]:
import avro.schema
from avro.datafile import DataFileReader
from avro.io import DatumReader
import csv
from flatten_dict import flatten
import re
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

### Get the input data and extract AVRO schema

In [2]:
inputFile = 'CT_Sample.avro'
reader = DataFileReader(open(inputFile,"rb"),DatumReader())
schema = reader.meta
input_data_schema = schema['avro.schema'].decode('latin1')

In [3]:
input_data_schema

'{"type":"record","name":"topLevelRecord","fields":[{"name":"DusTimestamp","type":["string","null"]},{"name":"Version","type":["string","null"]},{"name":"CtId","type":["string","null"]},{"name":"TimeInfo","type":[{"type":"record","name":"TimeInfo","fields":[{"name":"TimestampTableau","type":["string","null"]},{"name":"TimestampDateTime","type":["string","null"]},{"name":"TimestampOffset","type":["double","null"]},{"name":"TimestampMillis","type":["long","null"]},{"name":"IsSynced","type":["boolean","null"]},{"name":"TimeSource","type":["string","null"]},{"name":"MillisSinceLastSync","type":["long","null"]},{"name":"DeviceDriftMillis","type":["long","null"]}]},"null"]},{"name":"TestTimestamp","type":["string","null"]},{"name":"ScreenState","type":["string","null"]},{"name":"ServerFilename","type":["string","null"]},{"name":"DurationTcpConnect","type":["long","null"]},{"name":"DurationHttpReceive","type":["long","null"]},{"name":"BytesRead","type":["long","null"]},{"name":"HeaderBytesRea

### Flatten the data using '_' separator, compute average and max RxLevel

In [4]:
def underscore_partitioner(key1, key2):
    if key1 is None:
        return key2
    else:
        return key1 + "_" + key2

In [5]:
csv_columns = set()
flattenedData = list()
i = 0
sum = 0
max_RxLevel = None
average_RxLevel = None

for topLevelRecord in reader:
    csv_columns = csv_columns.union(set(flatten(topLevelRecord, reducer=underscore_partitioner).keys()))
    flattenedData.append(flatten(topLevelRecord, reducer=underscore_partitioner))
    if (topLevelRecord['Simoperator'] == '26202'):
        sum = topLevelRecord['RxLevel']
        if (i == 0):
            max_RxLevel = topLevelRecord['RxLevel']
        elif (topLevelRecord['RxLevel'] > max_RxLevel):
            max_RxLevel = topLevelRecord['RxLevel']        
        i+=1
        
if (sum != 0 and i != 0):
    average_RxLevel = sum / i

In [6]:
max_RxLevel

2147483647

In [7]:
average_RxLevel

-0.002645169416803124

### Write the content into a csv file

In [8]:
outputCSVFile = re.sub(r'\..+','',inputFile) + '.csv'
try:
    with open(outputCSVFile, 'w') as file:
        csv_writer = csv.DictWriter(file, fieldnames=list(csv_columns))
        csv_writer.writeheader()
        csv_writer.writerows(flattenedData)
except IOError:
    print("I/O error") 

### Write the content into parquet file with partition as first 3 characters of 'Simoperator' field

In [9]:
outputParquetFile = re.sub(r'\..+','',inputFile)# + '.parquet'
df_call_data = pd.read_csv(outputCSVFile)
df_call_data['SimOperator_key'] = df_call_data['Simoperator'].apply(lambda x : str(x)[:3])
df_call_data.head()

Unnamed: 0,BatteryInfo_BatteryChargePlug,TestTimestamp,BatteryInfo_MissingPermission,TimeInfo_TimestampOffset,DurationTcpConnect,TimeInfo_TimestampTableau,DurationOverallNoSleep,RxLevel,BytesRead,IdleStateOnStart,...,BatteryInfo_BatteryLevel,HTTPStatus,StorageInfo_StorageInternalVideo,DurationDNS,TimeInfo_IsSynced,HeaderBytesRead,BatteryInfo_BatteryVoltage,DurationSSL,StorageInfo_StorageInternalSize,SimOperator_key
0,AC,2019-05-19 15:41:11.827 +0200,False,2.0,-1,2019-05-19 15:41:11.827 +0200,30466,-88,-1,NonIdle,...,32.0,-1,0,101,False,-1,3988,-1,25320071168,262
1,Unknown,2019-05-19 16:12:04.347 +0200,False,2.0,-1,2019-05-19 16:12:04.347 +0200,30209,-91,-1,NonIdle,...,30.000002,-1,0,65,False,-1,3734,-1,25320071168,262
2,Unknown,2019-05-19 16:37:18.106 +0200,False,2.0,-1,2019-05-19 16:37:18.106 +0200,30255,-95,-1,NonIdle,...,25.0,-1,0,108,False,-1,3714,-1,25320071168,262
3,Unknown,2019-05-20 07:03:39.292 +0200,False,2.0,-1,2019-05-20 07:03:39.292 +0200,30488,-99,-1,NonIdle,...,99.0,-1,0,98,False,-1,4006,-1,25320071168,262
4,Unknown,2019-05-19 10:52:38.392 +0200,False,2.0,33,2019-05-19 10:52:38.392 +0200,859,-91,1032,NonIdle,...,88.0,200,0,5,True,436,4043,157,26497818624,262


In [10]:
df_call_data_table = pa.Table.from_pandas(df_call_data, preserve_index=False)
#pq.write_table(df_call_data_table, outputParquetFile)
pq.write_to_dataset(df_call_data_table, root_path=outputParquetFile, partition_cols=['SimOperator_key'])

### Read from the saved parquet file into pandas dataframe

In [11]:
pq.read_table(outputParquetFile).to_pandas().drop(columns='SimOperator_key')

Unnamed: 0,BatteryInfo_BatteryChargePlug,TestTimestamp,BatteryInfo_MissingPermission,TimeInfo_TimestampOffset,DurationTcpConnect,TimeInfo_TimestampTableau,DurationOverallNoSleep,RxLevel,BytesRead,IdleStateOnStart,...,BatteryInfo_BatteryCapacity,BatteryInfo_BatteryLevel,HTTPStatus,StorageInfo_StorageInternalVideo,DurationDNS,TimeInfo_IsSynced,HeaderBytesRead,BatteryInfo_BatteryVoltage,DurationSSL,StorageInfo_StorageInternalSize
0,Unknown,2019-05-17 11:18:49.640 +0200,False,2.0,69291,2019-05-17 11:18:49.640 +0200,16602,-89,1032,LightIdle,...,3220425,97.000000,200,0,152,True,436,4058,109,56421154816
1,Unknown,2019-05-17 14:36:10.052 +0200,False,2.0,-1,2019-05-17 14:36:10.052 +0200,30366,-83,-1,NonIdle,...,2817459,85.000000,-1,0,69,False,-1,4060,-1,56421154816
2,Unknown,2019-05-17 19:34:42.761 +0200,False,2.0,-1,2019-05-17 19:34:42.761 +0200,13357,-89,-1,LightIdle,...,1714257,51.000000,-1,0,-1,False,-1,3796,-1,56421154816
3,Unknown,2019-05-18 11:11:12.443 +0200,False,2.0,-1,2019-05-18 11:11:12.443 +0200,28269,-90,-1,NonIdle,...,2371554,71.000000,-1,0,76,True,-1,3978,-1,56421154816
4,Unknown,2019-05-19 12:04:23.145 +0200,False,2.0,-1,2019-05-19 12:04:23.145 +0200,30250,-91,-1,NonIdle,...,0,47.000000,-1,0,101,True,-1,3708,-1,57981337600
5,Unknown,2019-05-19 12:38:25.612 +0200,False,2.0,-1,2019-05-19 12:38:25.612 +0200,30144,-91,-1,NonIdle,...,0,43.000000,-1,0,4,True,-1,3799,-1,57981337600
6,Unknown,2019-05-19 13:04:22.712 +0200,False,2.0,-1,2019-05-19 13:04:22.712 +0200,30202,-83,-1,NonIdle,...,0,37.000000,-1,0,37,True,-1,3759,-1,57981337600
7,Unknown,2019-05-19 13:35:00.909 +0200,False,2.0,-1,2019-05-19 13:35:00.909 +0200,35308,-97,-1,NonIdle,...,0,29.000000,-1,0,41,True,-1,3680,-1,57981337600
8,USB,2019-05-19 14:04:00.131 +0200,False,2.0,251,2019-05-19 14:04:00.131 +0200,2347,-89,1033,NonIdle,...,0,39.000000,200,0,6,True,437,3902,1382,57981337600
9,Unknown,2019-05-19 14:18:02.146 +0200,False,2.0,-1,2019-05-19 14:18:02.146 +0200,34023,-69,-1,LightIdle,...,0,42.000000,-1,0,633,True,-1,3793,-1,57981337600
