In [None]:
import numpy as np
from numpy import nan
import pandas as pd

import pickle
import collections
from collections import defaultdict 

from matplotlib import pyplot as plt
from os import listdir
from os.path import isfile, join

import re
%matplotlib inline

In [None]:
# Build a lookup keyed by row position in fields.csv. Each entry holds the
# signal name and the list of "<name>_<suffix>" column names derived from the
# non-empty suffix cells of that row.
# (This mapping could also be hard-coded.)
fields_path = '../../input/training_validation_2/fields.csv'
fields_df = pd.read_csv(fields_path)
fields_df.columns = ['name', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6']

fields_dict = {}

for idx in range(fields_df.shape[0]):
    name = fields_df.loc[idx, 'name']

    # Empty cells are read as NaN, whose string form is 'nan'; skip those.
    suffixes = [str(fields_df.loc[idx, col]) for col in fields_df.columns[1:]]

    fields_dict[idx] = {
        'name': name,
        'fields': [name + "_" + suffix for suffix in suffixes if suffix != 'nan'],
    }

fields_dict

In [None]:
# Features that have more than 10% missing values
# (one entry per line, grouped by signal; order unchanged).
features_with_10p_na = [
    # DurationRobotFromTestBenchToFeeder
    'DurationRobotFromTestBenchToFeeder_vTrend',
    # FeederBackgroundIlluminationIntensity
    'FeederBackgroundIlluminationIntensity_vMax',
    'FeederBackgroundIlluminationIntensity_vMin',
    'FeederBackgroundIlluminationIntensity_vStd',
    'FeederBackgroundIlluminationIntensity_vTrend',
    'FeederBackgroundIlluminationIntensity_value',
    # FuseHeatSlope
    'FuseHeatSlope_vTrend',
    # FuseHeatSlopeNOK
    'FuseHeatSlopeNOK_vMax',
    'FuseHeatSlopeNOK_vMin',
    'FuseHeatSlopeNOK_vStd',
    'FuseHeatSlopeNOK_vTrend',
    'FuseHeatSlopeNOK_value',
    # FuseHeatSlopeOK
    'FuseHeatSlopeOK_vMax',
    'FuseHeatSlopeOK_vMin',
    'FuseHeatSlopeOK_vStd',
    'FuseHeatSlopeOK_vTrend',
    'FuseHeatSlopeOK_value',
    # IntensityTotalImage
    'IntensityTotalImage_vMax',
    'IntensityTotalImage_vMin',
    'IntensityTotalImage_vStd',
    'IntensityTotalImage_vTrend',
    'IntensityTotalImage_value',
    # IntensityTotalThermoImage
    'IntensityTotalThermoImage_vTrend',
    # NumberFuseDetected
    'NumberFuseDetected_vMax',
    'NumberFuseDetected_vMin',
    'NumberFuseDetected_vStd',
    'NumberFuseDetected_vTrend',
    'NumberFuseDetected_value',
    # NumberFuseEstimated
    'NumberFuseEstimated_vMax',
    'NumberFuseEstimated_vMin',
    'NumberFuseEstimated_vStd',
    'NumberFuseEstimated_vTrend',
    'NumberFuseEstimated_value',
    # SharpnessImage
    'SharpnessImage_vMax',
    'SharpnessImage_vMin',
    'SharpnessImage_vStd',
    'SharpnessImage_vTrend',
    'SharpnessImage_value',
    # TemperatureThermoCam
    'TemperatureThermoCam_vTrend',
]

len(features_with_10p_na)

In [None]:
# Features that have more than 20% missing values
# (one entry per line, grouped by signal; order unchanged).
features_with_20p_na = [
    # FeederBackgroundIlluminationIntensity
    'FeederBackgroundIlluminationIntensity_vMax',
    'FeederBackgroundIlluminationIntensity_vMin',
    'FeederBackgroundIlluminationIntensity_vStd',
    'FeederBackgroundIlluminationIntensity_vTrend',
    'FeederBackgroundIlluminationIntensity_value',
    # FuseHeatSlope / FuseHeatSlopeNOK
    'FuseHeatSlope_vTrend',
    'FuseHeatSlopeNOK_vTrend',
    # FuseHeatSlopeOK
    'FuseHeatSlopeOK_vMax',
    'FuseHeatSlopeOK_vMin',
    'FuseHeatSlopeOK_vStd',
    'FuseHeatSlopeOK_vTrend',
    'FuseHeatSlopeOK_value',
    # IntensityTotalImage
    'IntensityTotalImage_vMax',
    'IntensityTotalImage_vMin',
    'IntensityTotalImage_vStd',
    'IntensityTotalImage_vTrend',
    'IntensityTotalImage_value',
    # IntensityTotalThermoImage
    'IntensityTotalThermoImage_vTrend',
    # NumberFuseDetected
    'NumberFuseDetected_vMax',
    'NumberFuseDetected_vMin',
    'NumberFuseDetected_vStd',
    'NumberFuseDetected_vTrend',
    'NumberFuseDetected_value',
    # NumberFuseEstimated
    'NumberFuseEstimated_vMax',
    'NumberFuseEstimated_vMin',
    'NumberFuseEstimated_vStd',
    'NumberFuseEstimated_vTrend',
    'NumberFuseEstimated_value',
    # SharpnessImage
    'SharpnessImage_vMax',
    'SharpnessImage_vMin',
    'SharpnessImage_vStd',
    'SharpnessImage_vTrend',
    'SharpnessImage_value',
    # TemperatureThermoCam
    'TemperatureThermoCam_vTrend',
]

len(features_with_20p_na)

In [None]:
# Features listed in features_with_10p_na but not in features_with_20p_na.
set(features_with_10p_na) - set(features_with_20p_na)

In [None]:
# Get class id and run id from filename.
# Pattern: "class", any run of non-digits, <class id>, "_", <run id>, anything, ".csv".
# A raw string is used so that "\d" is a regex escape rather than an (invalid)
# Python string escape, and the dot before "csv" is escaped to match literally.
_CLASS_FILE_RE = re.compile(r"^class[^\d]*(\d+)_(\d+).*\.csv")


def parse_class_name(fname):
    """Extract the class id and run id embedded in a data file name.

    Parameters
    ----------
    fname : str
        File name (no directory part), e.g. ``"class_0_101_data.csv"``.

    Returns
    -------
    tuple of (str, str)
        ``(class_id, run_id)`` as the digit strings captured from the name;
        callers convert them to ``int`` themselves.

    Raises
    ------
    ValueError
        If ``fname`` does not match the expected ``class<id>_<run>...csv``
        pattern.
    """
    m = _CLASS_FILE_RE.match(fname)
    if m is None:
        # Fail with a descriptive message instead of an AttributeError on None.
        raise ValueError(
            "File name %r does not match 'class<id>_<run>*.csv'" % (fname,)
        )

    return m.groups()


In [None]:
def impute_df(df):
    """Fill missing values in ``df`` by interpolation.

    Uses ``DataFrame.interpolate`` with its default method and
    ``limit_direction='both'``, so gaps at the start and at the end of a
    column are filled as well as interior gaps. The result of that call is
    returned; ``df`` is not reassigned in the caller.

    Note: an alternative strategy was tried here and disabled — replacing
    each column in ``features_with_20p_na`` by an integer ``"<col>_na"``
    missing-value indicator and dropping the original column.
    """
    return df.interpolate(limit_direction='both')

In [None]:
# Load one data file and return it as a data frame
def load_data_file(path, fname):
    """Read one run file and return it as a single wide, imputed DataFrame.

    The CSV is read with two columns (renamed to ``name`` and ``data``). For
    every entry in the module-level ``fields_dict``, the ``data`` cell of the
    row with that key is evaluated into a Python object, turned into a
    DataFrame, and labelled with that entry's ``fields`` column names. All
    per-signal frames are concatenated column-wise, passed through
    ``impute_df``, and tagged with ``class`` and ``run`` columns parsed from
    ``fname``.

    Parameters
    ----------
    path : str
        Directory containing the file.
    fname : str
        File name; must be parseable by ``parse_class_name``.

    Returns
    -------
    pandas.DataFrame
        Merged, imputed signals plus integer ``class`` and ``run`` columns.
    """
    raw_df = pd.read_csv(join(path, fname))
    raw_df.columns = ['name', 'data']

    signal_frames = []

    for row_idx, spec in fields_dict.items():
        # SECURITY NOTE: eval() executes whatever text the file contains, so
        # only use this on trusted input. It is kept (rather than
        # ast.literal_eval) because the cell is evaluated with the notebook's
        # globals in scope — presumably so a bare `nan` token resolves via
        # `from numpy import nan`; verify against the data files.
        values = eval(raw_df.loc[row_idx, 'data'])  # convert data to array

        signal_df = pd.DataFrame(values)
        if (row_idx == 33) and (signal_df.shape[1] == 6):
            # NumberFuseDetected has a special case (per the original note):
            # when only 6 columns are present, move the last one to position 6
            # and leave position 5 empty so the frame lines up with the
            # expected column names.
            signal_df[6] = signal_df[5]
            signal_df[5] = np.nan  # np.NaN alias was removed in NumPy 2.0

        signal_df.columns = spec['fields']

        signal_frames.append(signal_df)

    merged_df = pd.concat(signal_frames, axis=1)  # Merge columns

    # Do some imputation on the data file
    merged_df = impute_df(merged_df)

    c, r = parse_class_name(fname)  # Get class id and run id

    # Add class labels and run id
    merged_df['class'] = int(c)
    merged_df['run'] = int(r)

    return merged_df

In [None]:
# Load data files from a directory and return merged data frame
def load_data_files(path):
    """Load every ``class*`` file found directly in ``path`` and stack them.

    Only regular files whose name starts with ``"class"`` are loaded; each is
    read with ``load_data_file`` and the resulting frames are concatenated
    row-wise (each file's own row index is kept as-is).

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded files, in sorted file-name order.

    Raises
    ------
    ValueError
        If ``path`` contains no matching files.
    """
    print ("In", path)

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the merged frame is reproducible across runs and machines.
    files = sorted(
        entry for entry in listdir(path)
        if entry.startswith("class") and isfile(join(path, entry))
    )

    if not files:
        # pd.concat([]) would also raise ValueError, just less helpfully.
        raise ValueError("No 'class*' data files found in %r" % (path,))

    data_df_list = []
    for fname in files:
        print ("Loading:", fname)
        data_df_list.append(load_data_file(path, fname))

    return pd.concat(data_df_list, axis=0)  # Merge data frames

In [None]:
%%time
data_df_1 = load_data_files("../../input/training_validation_1/")
data_df_2 = load_data_files("../../input/training_validation_2/")
data_df_3 = load_data_files("../../input/ModelRefinement/")

In [None]:
# Store the interpolated data frames as CSV files (row index not written).
# NOTE: this runs every time the cell is executed; comment the loop out to
# skip writing.
for frame, out_path in (
    (data_df_1, "../../data/interpolated_training_validation_1.csv"),
    (data_df_2, "../../data/interpolated_training_validation_2.csv"),
    (data_df_3, "../../data/interpolated_model_refinement.csv"),
):
    frame.to_csv(out_path, index=False)
