# Feature engineering
Our goal with this notebook is to load the raw datasets that we explored in the previous step and build a single new dataset from them. This new dataset will be very close to the format expected by the ML classifiers (the only step we leave out here is scaling, since we expect it to happen inside the training pipeline).

#### Steps:
- [x] Load tappy dataset and drop columns that we won't use
- [x] Same for the users dataset
- [x] Merge both datasets and select the meaningful observations
- [x] Drop outliers and useless data
- [x] Group by user, generating stats from the raw data, and creating new features
- [x] Dump the new dataset into a separate file

#### Prerequisites for this notebook:
- Go through `02_loading_and_exploring.ipynb` so the raw parsed datasets are available

In [1]:
# First we must mount google drive 
from google.colab import drive
GDRIVE_BASE_PATH = '/content/gdrive'
drive.mount(GDRIVE_BASE_PATH)

# Loading all updates from GitHub and our project setup
HOME_DIR = f'{GDRIVE_BASE_PATH}/My Drive/tappy_parkinsons'
% cd '$HOME_DIR'
! git fetch origin && git reset --hard origin/master
from util.project_setup import ProjectSetup

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/tappy_parkinsons
HEAD is now at 64a0ed0 Updating notebook #2


## ====================================================================================
## 1. Merging and cleaning the dataset

In [0]:
# We decided to declare this class here (instead of inside the util package)
# so it's easier to understand and visualize the feature engineering steps.
# In production code, this would be part of the library so we can automate
# the ETL process in a simple way. 
from util.tappy_loader import TappyLoader
from util.users_loader import UsersLoader

class RawFeaturesCleaner:
  """Loads the merged tappy/users dataframe and removes data we do not model.

  Every `fill_*` / `discard_*` method mutates the dataframe it receives IN
  PLACE and returns the cleaner itself, so callers do not need to reassign
  the dataframe and may chain calls.

  The defaults below are what we observed and recommended in the exploration
  steps detailed in `02_loading_and_exploring.ipynb`. Each one can be
  overridden through the `options` dict, using the lower-case constant name
  as the key (e.g. `{'minimum_observations_per_user': 100}`).
  """
  # Columns dropped before feature engineering
  UNWANTED_COLUMNS = ['date','timestamp','flight_time']
  # Rows whose hold_time is above this value are treated as outliers
  HOLD_TIME_OUTLIER_THRESHOLD = 1000
  # Users with fewer counted keystrokes than this are removed entirely
  MINIMUM_OBSERVATIONS_PER_USER = 1000
  # Parkinson's users reporting one of these `impact` values are removed
  EXCLUDE_HIGH_IMPACT_CLASSES = ['Medium', 'Severe']
  # Boolean medication columns; Parkinson's users flagged True are removed
  EXCLUDE_USERS_TAKING = []

  def __init__(self, options=None):
    """Builds a cleaner, optionally overriding any default via `options`.

    Args:
      options: optional mapping with any of the keys `unwanted_columns`,
        `hold_time_outlier_threshold`, `minimum_observations_per_user`,
        `exclude_high_impact_classes`, `exclude_users_taking`.
    """
    options = {} if options is None else options

    # The list defaults are copied so that mutating an instance attribute
    # never changes the class-level constants shared by other instances.
    self.unwanted_columns = options.get(
      'unwanted_columns', list(self.UNWANTED_COLUMNS))
    self.hold_time_outlier_threshold = options.get(
      'hold_time_outlier_threshold', self.HOLD_TIME_OUTLIER_THRESHOLD)
    self.minimum_observations_per_user = options.get(
      'minimum_observations_per_user', self.MINIMUM_OBSERVATIONS_PER_USER)
    self.exclude_high_impact_classes = options.get(
      'exclude_high_impact_classes', list(self.EXCLUDE_HIGH_IMPACT_CLASSES))
    self.exclude_users_taking = options.get(
      'exclude_users_taking', list(self.EXCLUDE_USERS_TAKING))

  def load_parsed_dataframe(self):
    """Returns the tappy keystrokes inner-joined with the users data on `userkey`."""
    tappy_raw_df = TappyLoader().load_dataframe()
    users_raw_df = UsersLoader().load_dataframe()
    return tappy_raw_df.merge(users_raw_df, how='inner', on='userkey')

  def fill_impact_nan_with_mode(self, df):
    """Fills missing `impact` values with the most common impact among Parkinson's users.

    Does nothing when no Parkinson's user has a reported impact, since there
    is no mode to fill with in that case.
    """
    modes = df.loc[df.parkinsons == True, 'impact'].mode()
    if modes.empty:
      return self

    df.fillna(value={'impact': modes.iloc[0]}, inplace=True)
    return self

  def discard_unwanted_columns(self, df):
    """Drops the configured unwanted columns.

    Columns that are already absent are ignored, so the method can safely be
    called more than once on the same dataframe.
    """
    df.drop(self.unwanted_columns, axis=1, inplace=True, errors='ignore')
    return self

  def discard_hold_time_outliers(self, df):
    """Drops every row whose `hold_time` is above the configured threshold."""
    indexes = df[df.hold_time > self.hold_time_outlier_threshold].index
    df.drop(indexes, inplace=True)
    return self

  def discard_space_keystrokes(self, df):
    """Drops every row whose `hand` is `'S'`."""
    indexes = df[df.hand == 'S'].index
    df.drop(indexes, inplace=True)
    return self

  def discard_users_with_few_observations(self, df):
    """Drops all rows of users with fewer than the configured minimum of observations."""
    # The per-user counts are enough to pick the users to drop; there is no
    # need to merge them back onto the (very large) keystrokes dataframe.
    counts = self.__observations_count_by_user(df)
    userkeys_to_drop = counts[counts < self.minimum_observations_per_user].index

    indexes = df[df.userkey.isin(userkeys_to_drop)].index
    df.drop(indexes, inplace=True)
    return self

  def discard_users_with_high_impact(self, df):
    """Drops rows of Parkinson's users whose `impact` is in the excluded classes."""
    indexes = df[(df.parkinsons == True) & df.impact.isin(self.exclude_high_impact_classes)].index
    df.drop(indexes, inplace=True)
    return self

  def discard_users_taking_medicine(self, df):
    """Drops rows of Parkinson's users flagged True in any excluded medication column."""
    for column in self.exclude_users_taking:
      indexes = df[(df.parkinsons == True) & (df[column] == True)].index
      df.drop(indexes, inplace=True)
    return self

  ###################
  # Private Methods #
  ###################

  def __observations_count_by_user(self, df):
    """Returns a Series named `observations`, indexed by userkey, sorted ascending.

    An observation is a row with a non-null `hand` value.
    """
    counts = df.groupby('userkey')['hand'].count()
    return counts.rename('observations').sort_values()

In [3]:
# Let's get the parsed datasets merged and start cleaning up the data.
df = RawFeaturesCleaner().load_parsed_dataframe()
df.head()

  mask |= (ar1 == a)


Unnamed: 0,userkey,date,timestamp,hand,hold_time,direction,latency_time,flight_time,gender,birthyear,parkinsons,tremors,diagnosisyear,sided,updrs,impact,levadopa,da,maob,other
0,0EA27ICBLF,160722,18:41:04.336,L,101.6,LL,234.4,156.3,Female,1952.0,True,True,2000,Left,Don't know,Severe,True,True,False,False
1,0EA27ICBLF,160722,18:42:14.070,L,85.9,LL,437.5,359.4,Female,1952.0,True,True,2000,Left,Don't know,Severe,True,True,False,False
2,0EA27ICBLF,160722,18:42:14.273,L,78.1,LL,210.9,125.0,Female,1952.0,True,True,2000,Left,Don't know,Severe,True,True,False,False
3,0EA27ICBLF,160722,18:42:14.617,L,62.5,LL,359.4,281.3,Female,1952.0,True,True,2000,Left,Don't know,Severe,True,True,False,False
4,0EA27ICBLF,160722,18:42:15.586,S,125.0,LS,187.5,93.8,Female,1952.0,True,True,2000,Left,Don't know,Severe,True,True,False,False


In [4]:
# Clean the merged dataframe step by step. Every call below mutates `df` in place.
cleaner = RawFeaturesCleaner()

# During the exploration we found 4 users with parkinsons that did not report
# their `impact`. We fill them with the mode because we might use the impact
# to select different datasets (e.g. exclude users with `Severe` impact, since
# we want to detect early stages of the disease).
cleaner.fill_impact_nan_with_mode(df)

# We want to discard date and time since we don't plan to use them.
# We'll also discard flight time since we noticed discrepancies in the data there,
# and flight time is present inside hold time, so we don't need both
cleaner.discard_unwanted_columns(df)

# We don't want to deal with hold times higher than 1 second
cleaner.discard_hold_time_outliers(df)

# All `S` observations should also be discarded since we just want to detect
# left and right patterns
cleaner.discard_space_keystrokes(df)

# We should also discard users with very few observations so their "weak"
# stats don't affect the whole dataset, producing noise in the data
cleaner.discard_users_with_few_observations(df)

# We should focus on those with mild severity because we want to detect
# early stages of the disease
cleaner.discard_users_with_high_impact(df)

# After some round trips between training models and feature engineering, we noticed
# that users taking medicine might affect the results of the research because they
# will have PD but not display the symptoms
cleaner.discard_users_taking_medicine(df)

# At this point we have a clean raw dataset to start the feature engineering steps
# by applying some stats on the data of ~100 users (depending on the params
# for the initial cleanup)
print(f'Number of users: {df.userkey.nunique()}')
df.head()

Number of users: 84


Unnamed: 0,userkey,hand,hold_time,direction,latency_time,gender,birthyear,parkinsons,tremors,diagnosisyear,sided,updrs,impact,levadopa,da,maob,other
158430,0QAZFRHQHW,L,78.1,LL,312.5,Female,1959.0,False,False,------,,Don't know,------,False,False,False,False
158431,0QAZFRHQHW,L,78.1,LL,453.1,Female,1959.0,False,False,------,,Don't know,------,False,False,False,False
158432,0QAZFRHQHW,L,62.5,LL,414.1,Female,1959.0,False,False,------,,Don't know,------,False,False,False,False
158433,0QAZFRHQHW,L,93.8,LL,742.2,Female,1959.0,False,False,------,,Don't know,------,False,False,False,False
158434,0QAZFRHQHW,R,101.6,LR,523.4,Female,1959.0,False,False,------,,Don't know,------,False,False,False,False


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3900754 entries, 158430 to 8791783
Data columns (total 17 columns):
userkey          object
hand             object
hold_time        float64
direction        object
latency_time     float64
gender           object
birthyear        float64
parkinsons       bool
tremors          bool
diagnosisyear    object
sided            object
updrs            object
impact           object
levadopa         bool
da               bool
maob             bool
other            bool
dtypes: bool(6), float64(3), object(8)
memory usage: 379.4+ MB


In [6]:
# The stats seem consistent with the cleanup we expected
df.describe().round(2)

Unnamed: 0,hold_time,latency_time,birthyear
count,3900754.0,3900754.0,3326052.0
mean,110.22,271.84,1949.38
std,52.95,140.35,6.58
min,0.0,1.0,1928.0
25%,78.1,171.9,1943.0
50%,101.6,242.2,1950.0
75%,136.7,343.8,1952.0
max,968.8,799.9,1986.0


## ====================================================================================
## 2. Generating features per user (2 datasets, one for hold time and another for latency)

In [0]:
# Same idea. Let's encapsulate the feature engineering process in a concise class
import pandas as pd

class FeaturesConverter:
  """Turns the cleaned keystroke dataframe into one row of features per user.

  For each user it computes the configured aggregate functions of
  `hold_time` per hand and of `latency_time` per direction, adds left/right
  mean-difference features, and attaches the `parkinsons` target.

  Note: the `apply_*_mean_diff` methods read the `*_mean` columns, so `'mean'`
  must be among the aggregate functions when they are used.
  """
  DEFAULT_AGGREGATE_FUNCTIONS = ['mean', 'std', 'skew', 'kurtosis']
  DIRECTIONS = ['LL', 'RR', 'LR', 'RL']
  HANDS = ['L', 'R']

  def __init__(self, raw_df=None, aggregate_functions=None):
    """
    Args:
      raw_df: cleaned keystroke dataframe; when None, the raw data is loaded
        and cleaned with a default `RawFeaturesCleaner`.
      aggregate_functions: names of groupby aggregations to compute; when
        None, a copy of `DEFAULT_AGGREGATE_FUNCTIONS` is used.
    """
    self.raw_df = raw_df
    if raw_df is None:
      self.raw_df = self.__load_raw_df()

    self.basic_features_dataframe_file = f'{ProjectSetup.data_dir}/basic_features_dataframe_file.csv'
    # Copy the default so no two converters ever share one mutable list
    if aggregate_functions is None:
      aggregate_functions = list(self.DEFAULT_AGGREGATE_FUNCTIONS)
    self.aggregate_functions = aggregate_functions
    self.target_df = self.__create_target_df_from(self.raw_df)

  def convert_and_dump(self):
    """Runs the whole conversion, writes the result as CSV and returns it.

    Users missing from either the hold time or the latency stats, and rows
    containing any NaN feature, are left out of the final dataframe.
    """
    hold_time_df = self.group_by_user_producing_hold_time_basic_stats()
    hold_time_df = self.apply_hold_time_mean_diff(hold_time_df)

    latency_time_df = self.group_by_user_producing_latency_time_basic_stats()
    latency_time_df = self.apply_latency_time_mean_diff(latency_time_df)

    final_df = hold_time_df.merge(latency_time_df, how='inner', on='userkey')
    final_df.dropna(inplace=True)
    final_df = self.apply_target_variable(final_df)

    # `userkey` is intentionally kept in the exported file
    final_df.to_csv(self.basic_features_dataframe_file, index=True, header=True)
    print(f'Features dataframe exported to {self.basic_features_dataframe_file}')
    return final_df

  def group_by_user_producing_hold_time_basic_stats(self):
    """Returns per-user `hold_time` stats, one column per (hand, aggregate function)."""
    return self.__basic_stats_by(
      field_name = 'hand',
      field_values = self.HANDS,
      aggregate_by = 'hold_time'
    )

  def group_by_user_producing_latency_time_basic_stats(self):
    """Returns per-user `latency_time` stats, one column per (direction, aggregate function)."""
    return self.__basic_stats_by(
      field_name = 'direction',
      field_values = self.DIRECTIONS,
      aggregate_by = 'latency_time'
    )

  def apply_hold_time_mean_diff(self, stats_df):
    """Returns a copy of `stats_df` with the signed and absolute left-right hold time mean difference."""
    temp_df = stats_df.copy()
    temp_df['hold_time_diff'] = temp_df['hold_time_l_mean'] - temp_df['hold_time_r_mean']
    temp_df['hold_time_diff_abs'] = temp_df['hold_time_diff'].abs()
    return temp_df

  def apply_latency_time_mean_diff(self, stats_df):
    """Returns a copy of `stats_df` with signed and absolute LL-RR and LR-RL latency mean differences."""
    temp_df = stats_df.copy()
    temp_df['latency_time_ll_rr_diff'] = temp_df['latency_time_ll_mean'] - temp_df['latency_time_rr_mean']
    temp_df['latency_time_lr_rl_diff'] = temp_df['latency_time_lr_mean'] - temp_df['latency_time_rl_mean']
    temp_df['latency_time_ll_rr_diff_abs'] = temp_df['latency_time_ll_rr_diff'].abs()
    temp_df['latency_time_lr_rl_diff_abs'] = temp_df['latency_time_lr_rl_diff'].abs()
    return temp_df

  def apply_target_variable(self, stats_df):
    """Returns `stats_df` inner-joined with the per-user `parkinsons` target."""
    return stats_df.merge(self.target_df, how='inner', on='userkey')


  ###################
  # Private Methods #
  ###################

  def __basic_stats_by(self, field_name, field_values, aggregate_by):
    """Builds one stats column per (aggregate function, field value) and inner-joins them by user.

    Returns None when there is nothing to aggregate (no functions or no values).
    """
    df_to_return = None

    for aggregate_function in self.aggregate_functions:
      for field_value in field_values:
        temp_df = self.__create_aggregated_field_by(
          field_name = field_name,
          field_value = field_value,
          aggregate_by = aggregate_by,
          aggregate_function = aggregate_function
        )

        if df_to_return is None:
          df_to_return = temp_df
        else:
          # Inner join: a user must have data for every field value to be kept
          df_to_return = df_to_return.merge(temp_df, how='inner', on='userkey')

    return df_to_return

  def __create_target_df_from(self, df):
    """Returns one row per user with `userkey` and a boolean `parkinsons` column."""
    # A user is positive when any of their rows is flagged
    return df[['userkey', 'parkinsons']].groupby('userkey')['parkinsons'].any().reset_index()

  def __create_aggregated_field_by(self, field_name, field_value, aggregate_by, aggregate_function):
    """Aggregates `aggregate_by` per user over the rows where `field_name == field_value`.

    Returns a single-column dataframe indexed by userkey, with the column named
    `<aggregate_by>_<field_value>_<aggregate_function>` in lower case.
    """
    temp_df = self.raw_df[self.raw_df[field_name] == field_value]
    group_by = temp_df.groupby('userkey')[aggregate_by]

    if aggregate_function in ('kurtosis', 'kurt'):
      # Kurtosis is not available as a groupby method on every pandas
      # version, so compute it per group with the Series method instead.
      stats = group_by.apply(pd.Series.kurt)
    else:
      # Look the aggregation up by name instead of building code for eval()
      stats = getattr(group_by, aggregate_function)()

    new_field_name = f'{aggregate_by}_{field_value}_{aggregate_function}'.lower()
    return stats.to_frame(name=new_field_name)

  def __load_raw_df(self):
    """Loads the merged raw data and applies every default cleanup step."""
    cleaner = RawFeaturesCleaner()
    raw_df = cleaner.load_parsed_dataframe()
    cleaner.fill_impact_nan_with_mode(raw_df)
    cleaner.discard_unwanted_columns(raw_df)
    cleaner.discard_hold_time_outliers(raw_df)
    cleaner.discard_space_keystrokes(raw_df)
    cleaner.discard_users_with_few_observations(raw_df)
    cleaner.discard_users_with_high_impact(raw_df)
    cleaner.discard_users_taking_medicine(raw_df)

    return raw_df

In [8]:
converter = FeaturesConverter(df)

# Let's create now the hold time dataset. This one will focus on stats related to the hold
# time only. The idea is to compute mean, standard deviation and other stats for each hand.
hold_time_df = converter.group_by_user_producing_hold_time_basic_stats()

# Another important feature is the difference between the hold time of the left and
# the right side. This can indicate that the user demonstrates signs of sidedness
hold_time_df = converter.apply_hold_time_mean_diff(hold_time_df)


hold_time_df.head()

Unnamed: 0_level_0,hold_time_l_mean,hold_time_r_mean,hold_time_l_std,hold_time_r_std,hold_time_l_skew,hold_time_r_skew,hold_time_l_kurtosis,hold_time_r_kurtosis,hold_time_diff,hold_time_diff_abs
userkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0QAZFRHQHW,98.931818,101.595749,23.869914,37.219557,0.236835,7.495779,0.859236,110.960777,-2.66393,2.66393
1XNJCXS3EY,153.702407,105.622423,53.213749,27.036622,0.26498,2.797976,0.625664,24.14231,48.079984,48.079984
3DIXPRIOSW,147.626087,167.039039,47.259923,56.045862,0.666778,0.261659,0.606065,0.502078,-19.412952,19.412952
48DZPAJ5NS,125.093648,125.944823,21.344768,20.440962,0.433463,0.122179,5.122482,5.701887,-0.851175,0.851175
4XPHKKBXS6,186.434423,141.827793,61.740463,43.035236,0.42877,0.50179,0.447854,1.400579,44.60663,44.60663


In [9]:
# Same thing for the latency dataset
converter = FeaturesConverter(df)

latency_time_df = converter.group_by_user_producing_latency_time_basic_stats()
latency_time_df = converter.apply_latency_time_mean_diff(latency_time_df)

latency_time_df.head()


Unnamed: 0_level_0,latency_time_ll_mean,latency_time_rr_mean,latency_time_lr_mean,latency_time_rl_mean,latency_time_ll_std,latency_time_rr_std,latency_time_lr_std,latency_time_rl_std,latency_time_ll_skew,latency_time_rr_skew,latency_time_lr_skew,latency_time_rl_skew,latency_time_ll_kurtosis,latency_time_rr_kurtosis,latency_time_lr_kurtosis,latency_time_rl_kurtosis,latency_time_ll_rr_diff,latency_time_lr_rl_diff,latency_time_ll_rr_diff_abs,latency_time_lr_rl_diff_abs
userkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0QAZFRHQHW,406.716242,365.736471,411.718182,430.258974,162.606085,155.345904,196.200749,178.154385,0.377702,0.575523,0.249532,0.283994,-0.511024,-0.246378,-1.100377,-0.917784,40.979771,-18.540793,40.979771,18.540793
1XNJCXS3EY,347.882547,322.170833,313.541489,310.799454,101.977747,82.901327,97.680669,98.038127,-1.3423,-0.286635,0.008292,0.037789,2.022551,0.765983,-0.706664,-0.709928,25.711714,2.742036,25.711714,2.742036
3DIXPRIOSW,528.670445,493.77963,575.478761,501.274093,137.542548,163.597901,148.448891,143.404457,-0.193251,-0.370759,-0.263042,0.360416,-0.085711,0.234405,-1.16132,-0.844957,34.890816,74.204668,34.890816,74.204668
48DZPAJ5NS,300.190663,332.504906,335.274126,321.053098,81.809939,90.989458,81.450453,88.512695,0.36177,-0.052088,0.020689,0.143768,-0.564167,-1.0419,-0.704254,-0.929236,-32.314242,14.221029,32.314242,14.221029
4XPHKKBXS6,498.789123,503.428615,546.809064,622.629688,195.853948,182.659566,137.089428,133.27584,-0.330001,-0.40793,-0.242735,-1.110013,-0.890976,-0.645718,-0.32144,1.710421,-4.639493,-75.820623,4.639493,75.820623


## ====================================================================================
## 3. Verifying the intersection (we should expect that most of the users are present in both)

In [10]:
print(f'Are the any null values? {hold_time_df.isnull().values.any()}\n')
hold_time_df.info()

Are the any null values? False

<class 'pandas.core.frame.DataFrame'>
Index: 84 entries, 0QAZFRHQHW to Z2UPVHHGBE
Data columns (total 10 columns):
hold_time_l_mean        84 non-null float64
hold_time_r_mean        84 non-null float64
hold_time_l_std         84 non-null float64
hold_time_r_std         84 non-null float64
hold_time_l_skew        84 non-null float64
hold_time_r_skew        84 non-null float64
hold_time_l_kurtosis    84 non-null float64
hold_time_r_kurtosis    84 non-null float64
hold_time_diff          84 non-null float64
hold_time_diff_abs      84 non-null float64
dtypes: float64(10)
memory usage: 7.2+ KB


In [11]:
print(f'Are the any null values? {latency_time_df.isnull().values.any()}\n')
latency_time_df.info()

Are the any null values? False

<class 'pandas.core.frame.DataFrame'>
Index: 84 entries, 0QAZFRHQHW to Z2UPVHHGBE
Data columns (total 20 columns):
latency_time_ll_mean           84 non-null float64
latency_time_rr_mean           84 non-null float64
latency_time_lr_mean           84 non-null float64
latency_time_rl_mean           84 non-null float64
latency_time_ll_std            84 non-null float64
latency_time_rr_std            84 non-null float64
latency_time_lr_std            84 non-null float64
latency_time_rl_std            84 non-null float64
latency_time_ll_skew           84 non-null float64
latency_time_rr_skew           84 non-null float64
latency_time_lr_skew           84 non-null float64
latency_time_rl_skew           84 non-null float64
latency_time_ll_kurtosis       84 non-null float64
latency_time_rr_kurtosis       84 non-null float64
latency_time_lr_kurtosis       84 non-null float64
latency_time_rl_kurtosis       84 non-null float64
latency_time_ll_rr_diff        84 non

In [12]:
# Let's check the null values
pd.set_option('display.max_columns', 100)
latency_time_df[latency_time_df.isnull().any(axis=1)]

Unnamed: 0_level_0,latency_time_ll_mean,latency_time_rr_mean,latency_time_lr_mean,latency_time_rl_mean,latency_time_ll_std,latency_time_rr_std,latency_time_lr_std,latency_time_rl_std,latency_time_ll_skew,latency_time_rr_skew,latency_time_lr_skew,latency_time_rl_skew,latency_time_ll_kurtosis,latency_time_rr_kurtosis,latency_time_lr_kurtosis,latency_time_rl_kurtosis,latency_time_ll_rr_diff,latency_time_lr_rl_diff,latency_time_ll_rr_diff_abs,latency_time_lr_rl_diff_abs
userkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1


In [0]:
#####################################################################################################
### We commented this out because after switching from a minimum of 100 to 1000 keystrokes per
### user, all the NaN's went away. But we are keeping this code for future validation if required
#####################################################################################################

# Most likely this is happening because user 2X17VCRRQA does not have
# enough observations on direction RL in order to calculate the stats.
# Let's double check
#df[(df.userkey == '2X17VCRRQA') & (df.direction == 'RL')]

# Yup. Only 2. According to the docs, these stats functions require at
# least 4 observations, so they are returning NaN

In [14]:
# Now what is the intersection of both datasets?
final_df = hold_time_df.merge(latency_time_df, how='inner', on='userkey')
print(f'Found {len(final_df)} matching users in both datasets')

Found 84 matching users in both datasets


In [0]:
#####################################################################################################
### We commented this out because after switching from a minimum of 100 to 1000 keystrokes per
### user, all the NaN's went away. But we are keeping this code for future validation if required
#####################################################################################################


# Great news, all 191 users are here.
# We should only have the user 2X17VCRRQA with null values
#final_df[final_df.isnull().any(axis=1)]

In [0]:
# Let's drop any rows with null values (none were found above with the current
# 1000-keystroke minimum, so this is just a safety net)
final_df.dropna(inplace=True)

In [0]:
# Adding the target variable
final_df = converter.apply_target_variable(final_df)

In [18]:
final_df.describe().round(2)

Unnamed: 0,hold_time_l_mean,hold_time_r_mean,hold_time_l_std,hold_time_r_std,hold_time_l_skew,hold_time_r_skew,hold_time_l_kurtosis,hold_time_r_kurtosis,hold_time_diff,hold_time_diff_abs,latency_time_ll_mean,latency_time_rr_mean,latency_time_lr_mean,latency_time_rl_mean,latency_time_ll_std,latency_time_rr_std,latency_time_lr_std,latency_time_rl_std,latency_time_ll_skew,latency_time_rr_skew,latency_time_lr_skew,latency_time_rl_skew,latency_time_ll_kurtosis,latency_time_rr_kurtosis,latency_time_lr_kurtosis,latency_time_rl_kurtosis,latency_time_ll_rr_diff,latency_time_lr_rl_diff,latency_time_ll_rr_diff_abs,latency_time_lr_rl_diff_abs
count,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0
mean,118.33,110.41,43.04,36.37,0.45,0.86,3.19,10.81,7.92,11.72,304.94,309.15,304.45,309.9,121.16,120.16,129.47,121.19,0.76,0.77,0.88,0.86,1.32,1.57,1.32,1.59,-4.21,-5.45,22.46,26.75
std,32.26,26.56,19.34,14.28,0.69,1.61,4.79,40.77,14.06,11.06,91.68,90.4,110.67,126.27,32.68,32.78,34.72,31.78,0.75,0.81,0.9,0.96,2.43,2.66,2.83,3.08,28.93,37.12,18.56,26.15
min,54.78,56.62,12.99,10.06,-0.61,-0.71,-0.88,0.05,-27.61,0.28,174.84,163.61,132.96,140.59,67.29,61.87,57.56,57.62,-1.34,-0.97,-1.29,-1.13,-1.1,-1.04,-1.25,-1.19,-72.75,-137.51,0.05,0.1
25%,98.76,96.63,28.62,27.67,-0.05,0.08,0.45,1.12,-0.91,2.66,239.05,245.46,224.39,206.39,96.22,92.26,95.97,95.45,0.25,0.13,0.24,0.22,-0.4,-0.27,-0.55,-0.52,-20.55,-22.71,7.85,8.84
50%,115.63,107.98,39.68,35.97,0.29,0.49,1.17,2.27,6.22,8.25,285.93,291.79,279.07,278.71,120.61,120.72,132.22,116.17,0.66,0.7,0.72,0.65,0.17,0.73,0.31,0.27,-2.69,1.88,18.49,20.74
75%,141.75,124.58,57.64,45.65,0.81,1.09,3.51,5.5,14.91,18.2,359.07,354.7,358.55,372.54,144.87,146.84,153.29,143.48,1.26,1.28,1.52,1.59,2.04,2.43,2.49,2.86,14.43,15.76,33.77,32.34
max,193.27,181.18,99.56,86.65,3.01,8.44,27.49,345.77,48.08,48.08,560.42,594.2,575.48,683.47,222.0,216.42,217.01,217.47,2.54,2.8,3.13,2.9,9.96,12.64,14.46,11.62,73.44,74.2,73.44,137.51


In [19]:
final_df['parkinsons'].value_counts()

True     43
False    41
Name: parkinsons, dtype: int64

## ===============================================================================
## 4. Feature engineering in a single step

In [20]:
# Here we'll do all the conversion steps exposed above, but in a single method call
# simplifying and allowing us to automate this whole process
final_df = FeaturesConverter().convert_and_dump()
final_df.describe().round(2)

  mask |= (ar1 == a)


Features dataframe exported to /content/gdrive/My Drive/tappy_parkinsons/data/basic_features_dataframe_file.csv


Unnamed: 0,hold_time_l_mean,hold_time_r_mean,hold_time_l_std,hold_time_r_std,hold_time_l_skew,hold_time_r_skew,hold_time_l_kurtosis,hold_time_r_kurtosis,hold_time_diff,hold_time_diff_abs,latency_time_ll_mean,latency_time_rr_mean,latency_time_lr_mean,latency_time_rl_mean,latency_time_ll_std,latency_time_rr_std,latency_time_lr_std,latency_time_rl_std,latency_time_ll_skew,latency_time_rr_skew,latency_time_lr_skew,latency_time_rl_skew,latency_time_ll_kurtosis,latency_time_rr_kurtosis,latency_time_lr_kurtosis,latency_time_rl_kurtosis,latency_time_ll_rr_diff,latency_time_lr_rl_diff,latency_time_ll_rr_diff_abs,latency_time_lr_rl_diff_abs
count,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0
mean,118.33,110.41,43.04,36.37,0.45,0.86,3.19,10.81,7.92,11.72,304.94,309.15,304.45,309.9,121.16,120.16,129.47,121.19,0.76,0.77,0.88,0.86,1.32,1.57,1.32,1.59,-4.21,-5.45,22.46,26.75
std,32.26,26.56,19.34,14.28,0.69,1.61,4.79,40.77,14.06,11.06,91.68,90.4,110.67,126.27,32.68,32.78,34.72,31.78,0.75,0.81,0.9,0.96,2.43,2.66,2.83,3.08,28.93,37.12,18.56,26.15
min,54.78,56.62,12.99,10.06,-0.61,-0.71,-0.88,0.05,-27.61,0.28,174.84,163.61,132.96,140.59,67.29,61.87,57.56,57.62,-1.34,-0.97,-1.29,-1.13,-1.1,-1.04,-1.25,-1.19,-72.75,-137.51,0.05,0.1
25%,98.76,96.63,28.62,27.67,-0.05,0.08,0.45,1.12,-0.91,2.66,239.05,245.46,224.39,206.39,96.22,92.26,95.97,95.45,0.25,0.13,0.24,0.22,-0.4,-0.27,-0.55,-0.52,-20.55,-22.71,7.85,8.84
50%,115.63,107.98,39.68,35.97,0.29,0.49,1.17,2.27,6.22,8.25,285.93,291.79,279.07,278.71,120.61,120.72,132.22,116.17,0.66,0.7,0.72,0.65,0.17,0.73,0.31,0.27,-2.69,1.88,18.49,20.74
75%,141.75,124.58,57.64,45.65,0.81,1.09,3.51,5.5,14.91,18.2,359.07,354.7,358.55,372.54,144.87,146.84,153.29,143.48,1.26,1.28,1.52,1.59,2.04,2.43,2.49,2.86,14.43,15.76,33.77,32.34
max,193.27,181.18,99.56,86.65,3.01,8.44,27.49,345.77,48.08,48.08,560.42,594.2,575.48,683.47,222.0,216.42,217.01,217.47,2.54,2.8,3.13,2.9,9.96,12.64,14.46,11.62,73.44,74.2,73.44,137.51


In [21]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 0 to 83
Data columns (total 32 columns):
userkey                        84 non-null object
hold_time_l_mean               84 non-null float64
hold_time_r_mean               84 non-null float64
hold_time_l_std                84 non-null float64
hold_time_r_std                84 non-null float64
hold_time_l_skew               84 non-null float64
hold_time_r_skew               84 non-null float64
hold_time_l_kurtosis           84 non-null float64
hold_time_r_kurtosis           84 non-null float64
hold_time_diff                 84 non-null float64
hold_time_diff_abs             84 non-null float64
latency_time_ll_mean           84 non-null float64
latency_time_rr_mean           84 non-null float64
latency_time_lr_mean           84 non-null float64
latency_time_rl_mean           84 non-null float64
latency_time_ll_std            84 non-null float64
latency_time_rr_std            84 non-null float64
latency_time_lr_std            

In [22]:
print(final_df.shape)
print(final_df.columns)

(84, 32)
Index(['userkey', 'hold_time_l_mean', 'hold_time_r_mean', 'hold_time_l_std',
       'hold_time_r_std', 'hold_time_l_skew', 'hold_time_r_skew',
       'hold_time_l_kurtosis', 'hold_time_r_kurtosis', 'hold_time_diff',
       'hold_time_diff_abs', 'latency_time_ll_mean', 'latency_time_rr_mean',
       'latency_time_lr_mean', 'latency_time_rl_mean', 'latency_time_ll_std',
       'latency_time_rr_std', 'latency_time_lr_std', 'latency_time_rl_std',
       'latency_time_ll_skew', 'latency_time_rr_skew', 'latency_time_lr_skew',
       'latency_time_rl_skew', 'latency_time_ll_kurtosis',
       'latency_time_rr_kurtosis', 'latency_time_lr_kurtosis',
       'latency_time_rl_kurtosis', 'latency_time_ll_rr_diff',
       'latency_time_lr_rl_diff', 'latency_time_ll_rr_diff_abs',
       'latency_time_lr_rl_diff_abs', 'parkinsons'],
      dtype='object')


In [23]:
# Let's check the correlation matrix with our new features
final_df.corr().parkinsons.sort_values(ascending=False)

parkinsons                     1.000000
hold_time_l_std                0.218492
hold_time_r_std                0.215538
hold_time_r_mean               0.145270
hold_time_l_mean               0.137871
latency_time_lr_rl_diff        0.095022
hold_time_diff_abs             0.070685
hold_time_diff                 0.041838
hold_time_l_kurtosis           0.012186
latency_time_lr_kurtosis       0.010037
latency_time_rl_skew           0.004255
latency_time_rl_std           -0.019534
latency_time_ll_std           -0.024688
latency_time_lr_skew          -0.034809
latency_time_lr_rl_diff_abs   -0.056644
latency_time_rl_kurtosis      -0.063084
latency_time_rr_kurtosis      -0.081218
latency_time_rr_mean          -0.084355
latency_time_ll_skew          -0.088284
latency_time_lr_std           -0.097480
latency_time_rr_skew          -0.108098
latency_time_lr_mean          -0.109339
hold_time_l_skew              -0.109536
latency_time_rr_std           -0.110562
latency_time_ll_rr_diff_abs   -0.111198
