In [None]:
%matplotlib inline

In [3]:
# Switch read by the loader cells below:
#   True  -> rebuild the dataframes from the raw files and write the results to CSV
#   False -> skip the processing and read the previously saved CSV files instead
process_data_files = True  # if set to False, data will be loaded from saved files

In [5]:
from KaggleDataLoader import *

if not process_data_files:
    # Load path: read the previously saved CSV files instead of re-processing
    kaggle_taps = pd.read_csv(KAGGLE_TAPS_INPUT)
    print(kaggle_taps.head())
    kaggle_users = pd.read_csv(KAGGLE_USERS_INPUT)
    print(kaggle_users.head())

else:
    # Processing path: build both dataframes from the raw files.
    # The users frame comes first because the taps cleaning below takes it as input.
    kaggle_users = create_merged_users_details_file()
    print(kaggle_users.head())

    # Merge the taps files, drop bad values, then drop taps with incompatible user ids
    kaggle_taps = clean_incompatible_user_ids(
        clean_bad_values(create_merged_taps_dataframe()),
        kaggle_users,
    )
    print(kaggle_taps.head())
    
    
    # Filter outliers
    def filter_column_by_quantile(df, column, threshold):
        """Return the rows of ``df`` whose ``column`` value is below a percentile cutoff.

        :param df: dataframe to filter; it is not modified.
        :param column: name of the column the cutoff is computed on.
        :param threshold: percentile (0-100) passed to ``np.percentile``.
        :return: a new dataframe holding only the rows strictly below the cutoff
                 (rows equal to the cutoff are removed as well).
        """
        cutoff = np.percentile(df[column], threshold)
        below_cutoff = df[column] < cutoff
        kept = df[below_cutoff]
        removed_count = len(df) - len(kept)
        print("Filtered out {} rows with outliers in column '{}'".format(removed_count, column))
        return kept
    
    
    def plot_percentiles_of_column(df, col, start, end, bins):
        """Plot the value of ``df[col]`` at evenly spaced percentiles.

        :param df: dataframe holding the column.
        :param col: column name; also used as the prefix of the plot title.
        :param start: first percentile (0-100) on the x axis.
        :param end: last percentile (0-100) on the x axis.
        :param bins: number of evenly spaced percentiles between ``start`` and ``end``.
        """
        percents = np.linspace(start, end, bins)
        # np.percentile evaluates the whole array of percents in a single call
        percentile_values = np.percentile(df[col], percents)
        plt.plot(percents, percentile_values)
        plt.title(col + " Percentiles")
        plt.xlabel("Percent")
        plt.ylabel("Percentile Value")
        plt.show()
    
    
    # Filter out outliers of HoldTime:
    # look only at the extreme upper tail of the distribution to find where it breaks away
    plot_percentiles_of_column(kaggle_taps, 'HoldTime', start=99.96, end=99.9999, bins=20)
    # After the percentile 99.993 we see significantly higher values, which are definitely outliers.
    hold_time_cutoff_percentile = 99.993
    kaggle_taps = filter_column_by_quantile(kaggle_taps, 'HoldTime', hold_time_cutoff_percentile)

    # Add parsed date and time column + calculate cumulative time
    kaggle_taps = add_cumulative_timestamps_column(kaggle_taps)
    
    
    # Group to bin indexes by the cumulative timestamps
    def build_bins(df, bin_size_seconds):
        """Return a copy of ``df`` with rescaled press times and a ``binIndex`` column.

        ``PressTimeCumulative`` is divided by 1000 (presumably milliseconds to seconds,
        given the ``bin_size_seconds`` parameter - confirm against the loader) and every
        row is assigned to a fixed-width interval of that rescaled value.

        The input frame is left untouched. Rescaling it in place would make the function
        non-idempotent: a second call on the same frame would divide by 1000 again.

        :param df: dataframe with a numeric ``PressTimeCumulative`` column.
        :param bin_size_seconds: integer width of each bin, in the rescaled unit.
        :return: new dataframe with ``PressTimeCumulative`` rescaled and ``binIndex`` added.
        """
        binned = df.assign(PressTimeCumulative=df["PressTimeCumulative"] / 1000)

        # Series.max() is vectorised and skips NaN, unlike the builtin max()
        latest_press = binned["PressTimeCumulative"].max()
        # Round up to the next multiple of the bin size so the latest press is always covered
        last_edge = (int(latest_press / bin_size_seconds) + 1) * bin_size_seconds
        # +1 so that range() includes last_edge itself
        bin_edges = list(range(0, last_edge + 1, bin_size_seconds))

        # NOTE(review): pd.cut intervals are right-closed by default, so a value of exactly 0
        # falls outside the first bin and gets NaN - confirm whether that can occur here.
        binned["binIndex"] = pd.cut(binned["PressTimeCumulative"], bin_edges)
        return binned
    
    
    # Assign every tap to a 90-second bin
    kaggle_taps = build_bins(kaggle_taps, 90)

    # Keep only necessary columns and save to file
    kaggle_taps = kaggle_taps[TAPS_FINAL_COLUMNS + ['binIndex']]
    print(kaggle_taps.head())

    # Save under the same bare KAGGLE_TAPS_INPUT name that the load branch of this cell reads,
    # so the file written here is the one read back when process_data_files is False.
    # The bare name matches how the other loader constants are referenced; this does not
    # depend on a `constants` module object being in scope.
    kaggle_taps.to_csv(KAGGLE_TAPS_INPUT, index=False)

Loading taps files: 100/622


Loading taps files: 200/622


Loading taps files: 300/622


Loading taps files: 400/622


Loading taps files: 500/622


KeyboardInterrupt: 

In [4]:
from MITDataLoader import *

if not process_data_files:
    # Load path: read the previously saved CSV files instead of re-processing
    mit_taps = pd.read_csv(MIT_TAPS_INPUT)
    print(mit_taps.head())
    mit_users = pd.read_csv(MIT_USERS_INPUT)
    print(mit_users.head())

else:
    # Users: read only the columns listed here from the raw users file
    mit_user_columns = ["pID", "gt", "updrs108", "file_1", "file_2"]
    # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
    # `on_bad_lines` - check the pandas version in use before upgrading.
    mit_users = pd.read_csv(USERS, delimiter=',', header=0, error_bad_lines=False,
                            low_memory=False, usecols=mit_user_columns)

    # Taps: merge the taps files, then clean errors and bad values
    mit_taps = clean_errors_and_bad_values(create_merged_taps_dataframe())
    
    # Group to bin indexes by pressTime and add as a new column
    bin_size_seconds = 90
    # Number of bins needed so that the latest press falls inside the last one
    bins_needed = int(max(mit_taps["pressTime"]) / bin_size_seconds) + 1
    # +1 so that range() below includes the final edge
    max_press = bins_needed * bin_size_seconds + 1
    user_bins = list(range(0, max_press, bin_size_seconds))
    mit_taps["binIndex"] = pd.cut(mit_taps["pressTime"], user_bins)
    print(mit_taps.head())
    
    
    # Filter outliers
    
    def plot_percentile(df, column, start, end, bins):
        """Plot the value of ``df[column]`` at evenly spaced percentiles.

        :param df: dataframe holding the column.
        :param column: column name; also used as the prefix of the plot title.
        :param start: first percentile (0-100) on the x axis.
        :param end: last percentile (0-100) on the x axis.
        :param bins: number of evenly spaced percentiles between ``start`` and ``end``.
        """
        percents = np.linspace(start, end, bins)
        # np.percentile evaluates the whole array of percents in a single call
        percentile_values = np.percentile(df[column], percents)
        plt.plot(percents, percentile_values)
        plt.title(column + " Percentiles")
        plt.xlabel("Percent")
        plt.ylabel("Percentile Value")
        plt.show()
    
    
    def filter_column_by_quantile(df, column, threshold):
        """Return the rows of ``df`` whose ``column`` value is below a percentile cutoff.

        The filtered frame is returned rather than applied in place: rebinding ``df``
        inside the function does not affect the caller's variable, so the caller must
        use the return value.

        :param df: dataframe to filter; it is not modified.
        :param column: name of the column the cutoff is computed on.
        :param threshold: percentile (0-100) passed to ``np.percentile``.
        :return: a new dataframe holding only the rows strictly below the cutoff.
        """
        cutoff = np.percentile(df[column], threshold)
        filtered = df[df[column] < cutoff]
        print("Filtered out {} rows with outliers in column '{}'".format(len(df) - len(filtered), column))
        return filtered


    if SHOW_PLOTS:
        # dict.fromkeys de-duplicates while keeping the declared order, so the plots
        # appear in the same order on every run (set iteration order is not stable).
        for col in [c for c in dict.fromkeys(FLOAT_COLUMNS) if c != "pressTime"]:
            plot_percentile(mit_taps, col, 98, 99.9999, 40)

    # Filter according to the results in the plots.
    # Each result is assigned back; otherwise the outliers stay in the saved file.
    mit_taps = filter_column_by_quantile(mit_taps, "HoldTime", 99.99)
    mit_taps = filter_column_by_quantile(mit_taps, "LatencyTime", 99.4)
    mit_taps = filter_column_by_quantile(mit_taps, "FlightTime", 99.95)
    
    # Save to file - Taps file
    timing_columns = ["HoldTime", "LatencyTime", "FlightTime"]
    mit_taps[timing_columns] = 1000 * mit_taps[timing_columns]  # to milliseconds
    print(mit_taps.head())

    mit_taps.to_csv(MIT_TAPS_INPUT, index=False)
    
    # Save to file - Users file
    # NOTE(review): 'UDPRS' may be a transposition of 'UPDRS' (the source column is
    # 'updrs108'); kept as is because the name is written to the saved file.
    user_column_names = {'pID': 'ID', 'gt': 'Parkinsons', 'updrs108': 'UDPRS'}
    mit_users = mit_users.rename(columns=user_column_names)[['ID', 'Parkinsons', 'UDPRS']]
    print(mit_users.head())

    mit_users.to_csv(MIT_USERS_INPUT, index=False)


Filtered out 1 rows with bad values in column 'HoldTime'
Filtered out 3504 rows with bad values in column 'LatencyTime'


Filtered out 12987 rows with bad values in column 'FlightTime'
Filtered out 0 rows with bad values in column 'pressTime'


Filtered out 10664 rows with bad values in column 'Hand'
Filtered out 16481 rows with bad values in column 'Direction'


Filtered out 17 rows with outliers in column 'HoldTime'


Filtered out 966 rows with outliers in column 'LatencyTime'


Filtered out 81 rows with outliers in column 'FlightTime'
