# **Part 1: Extracting Features from Raw Hand Movement, Mouse Clicking, Keyboard Tapping and Memory Game Data**

Part 1 of this study focuses on transforming raw interaction data from a web-based Parkinson’s assessment platform into meaningful features suitable for machine learning. Participants engage in a series of interactive tasks, including mouse-tracing games to assess motor control, mouse clicking games to capture both reaction time and motor precision, timed keyboard tapping tests to evaluate reaction time and coordination, and memory games to probe cognitive function.

From these activities, detailed features are engineered—capturing spatial deviation, timing precision, error rates, and response consistency—forming a structured dataset. This processed dataset serves as the foundation for model training and evaluation, as presented in Part 2 and Part 3.

### **Mounting Google Colab (If Running on Colab)**

In [None]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB = True
    print("Note: using Google CoLab")
    %cd '/content/drive/MyDrive/Parkinson Paper'
    PROJECT_URL = './'
except:
    COLAB = False
    print("Note: using Jupyter Notebook")
    PROJECT_URL = './'

Mounted at /content/drive
Note: using Google CoLab
/content/drive/MyDrive/Parkinson Paper


### **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np

import json
import os
import argparse
import ast

### **Extracting Features from Web Data**

In [None]:
class DataCleaner:
    """Prepare the raw export for feature extraction.

    All steps operate on a private copy of the frame given to the
    constructor. The steps are independent methods so the caller decides the
    order: ``clean_data`` -> ``preprocess_columns`` ->
    ``transform_next_columns`` -> ``get_cleaned_data``.
    """

    def __init__(self, raw_data: pd.DataFrame):
        # Copy so none of the cleaning steps touch the caller's frame.
        self.data = raw_data.copy()

    def clean_data(self):
        """Drop every row with a missing value in any game-data column."""
        required_columns = [
            'boxclicktimes', 'correctboxClicks', 'expectedKeys.round1',
            'expectedKeys.round2', 'expectedKeys.round3', 'falseClicks',
            'gameData.gameData', 'gameData.user', 'keyPresses.round1',
            'keyPresses.round2', 'keyPresses.round3', 'r1points', 'pDominantTimes',
            'pNonDominantTimes', 'qDominantTimes', 'qNonDominantTimes', 'r2points',
            'r3points', 'rightClicks', 'timestamps.round1', 'timestamps.round2',
            'timestamps.round3', 'wrongClicks', 'wrongboxClicks',
        ]
        self.data = self.data.dropna(subset=required_columns)

    def safe_literal_eval(self, val):
        """Parse ``val`` as a Python literal when it is a string.

        Non-string values, and strings that are not valid literals, are
        returned unchanged.
        """
        if not isinstance(val, str):
            return val
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError):
            return val
        return parsed

    def preprocess_columns(self):
        """Convert the text cells of the list-like columns to Python objects."""
        literal_columns = [
            'boxclicktimes', 'expectedKeys.round1', 'expectedKeys.round2', 'expectedKeys.round3',
            'falseClicks', 'keyPresses.round1', 'keyPresses.round2',
            'keyPresses.round3', 'timestamps.round1', 'timestamps.round2',
            'timestamps.round3', 'rightClicks', 'wrongClicks', 'therapies', 'medications',
        ]
        parse = self.safe_literal_eval
        for name in literal_columns:
            self.data[name] = self.data[name].apply(parse)

    def split_by_next(self, s: str):
        """Split ``s`` on the ``NEXT`` delimiter, dropping one trailing empty piece."""
        pieces = s.split('NEXT')
        # A string ending in the delimiter yields a final '' that is not data.
        if pieces[-1] == '':
            pieces.pop()
        return pieces

    def transform_next_columns(self):
        """Turn every ``NEXT``-delimited column into a column of string lists."""
        delimited_columns = [
            'pDominantTimes', 'pNonDominantTimes', 'qDominantTimes',
            'qNonDominantTimes', 'r1points', 'r2points', 'r3points',
        ]
        for name in delimited_columns:
            self.data[name] = self.data[name].apply(self.split_by_next)

    def get_cleaned_data(self) -> pd.DataFrame:
        """Return the working frame (the same object, not a copy)."""
        return self.data

class FeatureExtractor:
    """Compute one feature vector per row of the cleaned data and save them as CSV.

    ``feature_names`` lists the output columns in exactly the order in which
    ``extract_features`` fills each vector; the two must be kept in sync.
    """

    def __init__(self, cleaned_data: pd.DataFrame, output_path: str):
        """
        Args:
            cleaned_data: Frame whose list-like cells are already Python
                sequences (see the columns read in ``extract_features``).
                It is read, never modified.
            output_path: Destination CSV path used by ``save_dataset``.
        """
        self.data = cleaned_data
        self.output_path = output_path
        self.features = []  # one list of feature values per row
        self.labels = []    # 1 when status == 'pd', else 0 (not written to the CSV)
        self.feature_names = [
            "Participant ID", "Parkinson's Disease status", "Age", "Gender", "Race",
            "1. Mean deviation from centerline when tracing straight line (fraction of screen height)",
            "2. Amount of time taken to trace straight line (ms)",
            "2.5. Amount of time taken to trace straight line with respect to window width (miliseconds)",
            "3. Percentage of points traced inside straight line",
            "3.5. Number of points traced inside straight line with no regard to time taken",
            "4. Amount of time taken to trace sine wave (miliseconds)",
            "4.5. Amount of time taken to trace sine wave with respect to window width (miliseconds)",
            "5. Percentage of points traced inside sine wave",
            "5.5. Number of points traced inside sine wave with no regard to time taken",
            "6. Amount of time taken to trace spiral (miliseconds)",
            "6.5. Amount of time taken to trace spiral with respect to window width (miliseconds)",
            "7. Percentage of points traced inside spiral",
            # NOTE: despite the name, 7.5 holds the raw count of inside points
            # (same as 3.5 / 5.5). The name is kept because it is a CSV column key.
            "7.5 Percentage of points traced inside spiral with no regard to time taken",
            "8. False presses when prompted with a constant key",
            "9. False presses when prompted with a semi-random(random between 2 letters) key",
            "10. False presses when prompted with a random key",
            "10.5 Total false presses from all tests",
            "10.5X Average false presses from all tests",
            "11. Average response time when prompted with a constant key (miliseconds)",
            "11.5 Sum of response times when prompted with a constant key (miliseconds)",
            "12. Average response time when prompted with a semi-random(random between 2 letters) key (miliseconds)",
            "12.5 Sum of response times when prompted with a semi-random(random between 2 letters) key (miliseconds)",
            "13. Average response time when prompted with a random key (miliseconds)",
            "13.5 Sum of response times when prompted with a random key (miliseconds)",
            "14. Number of correctly pressed keys when prompted with a constant key",
            "15. Number of correctly pressed keys when prompted with a semi-random(random between 2 letters) key",
            "16. Number of correctly pressed keys when prompted with a random key",
            "16.5. Number of correctly pressed keys when prompted with a random key with respect to average response time",
            "17. Maximum deviation from centerline when tracing straight line (percentage of screen height)",
            "17.5 Maximum deviation from centerline when tracing straight line without regard to window height (pixels)",
            "18. Net accumulated deviation from centerline when tracing straight line (percentage of screen height)",
            "19. Total accumulated deviation from centerline when tracing straight line (percentage of screen height)",
            "20. Avg of absolute values of deviation from centerline when tracing straight line (percentage of screen height)",
            "21.1 Average Time from Dominant Hands for pressing p",
            "21.2 Average Time from Non-Dominant Hands for pressing p",
            "21.3 Ratio of Average Time from Dominant Hand and Non-Dominant Hand for pressing p",
            "21.4 Total Time from Dominant Hands for pressing p",
            "21.5 Total Time from Non-Dominant Hands for pressing p",
            "22.1 Average Time from Dominant Hands for pressing q",
            "22.2 Average Time from Non-Dominant Hands for pressing q",
            "22.3 Ratio of Average Time from Dominant Hand and Non-Dominant Hand for pressing q",
            "22.4 Total Time from Dominant Hands for pressing q",
            "22.5 Total Time from Non-Dominant Hands for pressing q",
            "23.1 Average Time taken for clicking Box",
            "23.2 Total Time taken for clicking Box",
            "23.3 Standard Deviation of Time taken for clicking Box",
            "24.1 Average Reaction time for Data collected from Game",
            "24.2 Total Time for Data collected from Game",
            "24.3 Standard Deviation of Time for Data collected from Game",
            "24.4 Difference of Maximum and Minimum Reaction time for Data Collected from Game",
            "25.1 Mean Right Clicks",
            "25.2 Total Right Clicks",
            "25.3 Standard Deviation Right Clicks",
            "26.1 Mean Wrong clicks",
            "26.2 Total Wrong clicks",
            "26.3 Standard Deviation Wrong Clicks",
            "27 Participant Height",
            "28 Dominant Hand",
            "29 Device Type",
            "30 Correct number of Box Clicks"
        ]

    def analyze_point(self, points):
        """Summarise the traced points of one drawing round.

        Each point is a string of ``'s'``-separated fields whose fourth field
        is a signed deviation; a point containing ``'true'`` counts as traced
        inside the shape. (Presumably the first fields are coordinates and the
        third is the inside flag — TODO confirm against the web client.)

        Args:
            points: Sequence of point strings.

        Returns:
            Tuple ``(inside_count, net_deviation, total_abs_deviation,
            mean_abs_deviation, max_deviation)``. ``mean_abs_deviation`` is
            averaged over *all* points, including ones with fewer than four
            fields, and is ``0.0`` for an empty sequence. ``max_deviation`` is
            the signed value with the largest magnitude.
        """
        counter = 0
        sum_deviation = 0
        abs_deviation = 0
        max_deviation = 0

        for point in points:
            if 'true' in point:
                counter += 1

            # 'false' itself contains the separator 's'; shorten it first so
            # the split yields the same field layout for true and false points.
            fields = point.replace('false', 'fal').split('s')
            if len(fields) < 4:
                continue

            deviation = float(fields[3])
            sum_deviation += deviation
            abs_deviation += abs(deviation)

            if abs(deviation) > abs(max_deviation):
                max_deviation = deviation

        n_points = len(points)
        # Guard the empty case instead of raising ZeroDivisionError.
        mean_abs_deviation = abs_deviation / n_points if n_points else 0.0

        return counter, sum_deviation, abs_deviation, mean_abs_deviation, max_deviation

    def analyze_keyboard_data(self, data, expected, real):
        """Count correct key presses and sum response times for one round.

        Args:
            data: Response times, one per prompt; ``''`` entries are skipped
                in the sum.
            expected: Expected key per prompt (indexed in step with ``data``).
            real: Pressed key per prompt (indexed in step with ``data``).

        Returns:
            Tuple ``(correct_count, total_time)``.
        """
        correct = 0
        total = 0

        for i, point in enumerate(data):
            if expected[i] == real[i]:
                correct += 1
            if point == '':
                continue
            total += float(point)

        return correct, total

    def extract_features(self):
        """Fill ``self.features`` and ``self.labels`` from ``self.data``.

        The results are rebuilt from scratch on every call, so calling this
        more than once does not duplicate rows. ``self.data`` is not modified.
        """
        self.features = []
        self.labels = []

        for _, row in self.data.iterrows():
            port_height = float(row['height'])
            port_width = float(row['width'])
            label = 1 if row['status'] == 'pd' else 0

            # --- tracing rounds (r1 = straight line, r2 = sine wave, r3 = spiral)
            r1_points, r2_points, r3_points = row['r1points'], row['r2points'], row['r3points']
            (r1_counter, r1_sum_deviation, r1_abs_deviation,
             r1_mean_abs_deviation, r1_max_deviation) = self.analyze_point(r1_points)
            r2_counter = self.analyze_point(r2_points)[0]
            r3_counter = self.analyze_point(r3_points)[0]

            # --- keyboard rounds (1 = constant, 2 = semi-random, 3 = random key)
            ts1, ts2, ts3 = row['timestamps.round1'], row['timestamps.round2'], row['timestamps.round3']
            k1_correct, k1_total = self.analyze_keyboard_data(ts1, row['expectedKeys.round1'], row['keyPresses.round1'])
            k2_correct, k2_total = self.analyze_keyboard_data(ts2, row['expectedKeys.round2'], row['keyPresses.round2'])
            k3_correct, k3_total = self.analyze_keyboard_data(ts3, row['expectedKeys.round3'], row['keyPresses.round3'])
            k1_mean = k1_total / len(ts1)
            k2_mean = k2_total / len(ts2)
            k3_mean = k3_total / len(ts3)

            # First three entries are the false-press counts of rounds 1-3.
            # Read per row rather than adding helper columns to the shared frame.
            false1, false2, false3 = (int(v) for v in list(row['falseClicks'])[:3])
            false_presses = [false1, false2, false3]

            p_dominant = [int(num) for num in row['pDominantTimes']]
            p_non_dominant = [int(num) for num in row['pNonDominantTimes']]
            q_dominant = [int(num) for num in row['qDominantTimes']]
            q_non_dominant = [int(num) for num in row['qNonDominantTimes']]
            p_dominant_mean, p_non_dominant_mean = np.mean(p_dominant), np.mean(p_non_dominant)
            q_dominant_mean, q_non_dominant_mean = np.mean(q_dominant), np.mean(q_non_dominant)

            game_data = json.loads(row['gameData.gameData'])
            reaction_times = [entry["reactionTime"] for entry in game_data]

            box_click_times = row['boxclicktimes']
            right_clicks = row['rightClicks']
            wrong_clicks = row['wrongClicks']

            # Order must match self.feature_names exactly.
            feature_vector = [
                row['user'],
                row['status'],
                row['age'],
                row['gender'],
                row['race'],
                (r1_sum_deviation / len(r1_points)) / port_height,  # feature 1
                len(r1_points) * 500,                               # feature 2 (500 per recorded point)
                (len(r1_points) / port_width) * 500,                # feature 2.5
                r1_counter / len(r1_points),                        # feature 3
                r1_counter,                                         # feature 3.5
                len(r2_points) * 500,                               # feature 4
                (len(r2_points) / port_width) * 500,                # feature 4.5
                r2_counter / len(r2_points),                        # feature 5
                r2_counter,                                         # feature 5.5
                len(r3_points) * 500,                               # feature 6
                (len(r3_points) / port_width) * 500,                # feature 6.5
                r3_counter / len(r3_points),                        # feature 7
                r3_counter,                                         # feature 7.5
                false1,                                             # feature 8
                false2,                                             # feature 9
                false3,                                             # feature 10
                false1 + false2 + false3,                           # feature 10.5
                np.mean(false_presses),                             # feature 10.5X
                k1_mean,                                            # feature 11
                k1_total,                                           # feature 11.5
                k2_mean,                                            # feature 12
                k2_total,                                           # feature 12.5
                k3_mean,                                            # feature 13
                k3_total,                                           # feature 13.5
                k1_correct,                                         # feature 14
                k2_correct,                                         # feature 15
                k3_correct,                                         # feature 16
                k3_correct / k3_mean,                               # feature 16.5
                r1_max_deviation / port_height,                     # feature 17
                r1_max_deviation,                                   # feature 17.5
                r1_sum_deviation / port_height,                     # feature 18
                r1_abs_deviation / port_height,                     # feature 19
                r1_mean_abs_deviation / port_height,                # feature 20
                p_dominant_mean,                                    # feature 21.1
                p_non_dominant_mean,                                # feature 21.2
                p_dominant_mean / p_non_dominant_mean,              # feature 21.3
                np.sum(p_dominant),                                 # feature 21.4
                np.sum(p_non_dominant),                             # feature 21.5
                q_dominant_mean,                                    # feature 22.1
                q_non_dominant_mean,                                # feature 22.2
                q_dominant_mean / q_non_dominant_mean,              # feature 22.3
                np.sum(q_dominant),                                 # feature 22.4
                np.sum(q_non_dominant),                             # feature 22.5
                np.mean(box_click_times),                           # feature 23.1
                np.sum(box_click_times),                            # feature 23.2
                np.std(box_click_times),                            # feature 23.3
                np.mean(reaction_times),                            # feature 24.1
                np.sum(reaction_times),                             # feature 24.2
                np.std(reaction_times),                             # feature 24.3
                np.max(reaction_times) - np.min(reaction_times),    # feature 24.4
                np.mean(right_clicks),                              # feature 25.1
                np.sum(right_clicks),                               # feature 25.2
                np.std(right_clicks),                               # feature 25.3
                np.mean(wrong_clicks),                              # feature 26.1
                np.sum(wrong_clicks),                               # feature 26.2
                np.std(wrong_clicks),                               # feature 26.3
                row['Participant_height'],                          # feature 27
                row['dominantHand'],                                # feature 28
                row['deviceType'],                                  # feature 29
                row['correctboxClicks'],                            # feature 30
            ]

            self.features.append(feature_vector)
            self.labels.append(label)

    def save_dataset(self):
        """Write the extracted features to ``self.output_path`` as CSV (no index)."""
        df = pd.DataFrame(self.features, columns=self.feature_names)
        df.to_csv(self.output_path, index=False)
        print(f"Dataset saved at {self.output_path}")

    def run_pipeline(self):
        """Extract features from ``self.data`` and save them."""
        self.extract_features()
        self.save_dataset()

In [None]:
# Maps the verbose column names written by FeatureExtractor to the short
# identifiers that Dataset.apply_feature_engineering reads (e.g. 'False_pck',
# 'A_tcl'). It is passed to DataFrame.rename, so a column without an entry
# keeps its original name. "Participant ID" has no entry because
# Dataset.apply_feature_name_shortening drops that column before renaming.
# The first four entries map to themselves (demographics/status stay readable).
# NOTE(review): the value "Paricipant_height" looks like a misspelling of
# "Participant_height"; it is a CSV column name, so confirm that no downstream
# consumer depends on the current spelling before correcting it.
feature_name_mapping = {
    "Parkinson's Disease status": "Parkinson's Disease status",
    "Age": "Age",
    "Gender": "Gender",
    "Race": "Race",
    "1. Mean deviation from centerline when tracing straight line (fraction of screen height)": "M_deviation",
    "2. Amount of time taken to trace straight line (ms)": "A_tcl",
    "2.5. Amount of time taken to trace straight line with respect to window width (miliseconds)": "A_tslww",
    "3. Percentage of points traced inside straight line": "P_isl",
    "3.5. Number of points traced inside straight line with no regard to time taken": "N_tislnt",
    "4. Amount of time taken to trace sine wave (miliseconds)": "A_tsw",
    "4.5. Amount of time taken to trace sine wave with respect to window width (miliseconds)": "A_tww",
    "5. Percentage of points traced inside sine wave": "P_ptsw",
    "5.5. Number of points traced inside sine wave with no regard to time taken": "Num_iswtnt",
    "6. Amount of time taken to trace spiral (miliseconds)": "A_tts",
    "6.5. Amount of time taken to trace spiral with respect to window width (miliseconds)": "A_spww",
    "7. Percentage of points traced inside spiral": "P_insp",
    "7.5 Percentage of points traced inside spiral with no regard to time taken": "P_spnt",
    "8. False presses when prompted with a constant key": "False_pck",
    "9. False presses when prompted with a semi-random(random between 2 letters) key": "False_psr",
    "10. False presses when prompted with a random key": "False_prk",
    "10.5 Total false presses from all tests": "Tof_pa",
    "10.5X Average false presses from all tests": "Avg_fpa",
    "11. Average response time when prompted with a constant key (miliseconds)": "Avg_pck",
    "11.5 Sum of response times when prompted with a constant key (miliseconds)": "Sum_pck",
    "12. Average response time when prompted with a semi-random(random between 2 letters) key (miliseconds)": "Avg_psr",
    "12.5 Sum of response times when prompted with a semi-random(random between 2 letters) key (miliseconds)": "Sum_psr",
    "13. Average response time when prompted with a random key (miliseconds)": "Avg_prk",
    "13.5 Sum of response times when prompted with a random key (miliseconds)": "Sum_prk",
    "14. Number of correctly pressed keys when prompted with a constant key": "Num_crpck",
    "15. Number of correctly pressed keys when prompted with a semi-random(random between 2 letters) key": "Num_crpsr",
    "16. Number of correctly pressed keys when prompted with a random key": "Num_crprk",
    "16.5. Number of correctly pressed keys when prompted with a random key with respect to average response time": "Num_crprk_av",
    "17. Maximum deviation from centerline when tracing straight line (percentage of screen height)": "Maxdev_cltsl",
    "17.5 Maximum deviation from centerline when tracing straight line without regard to window height (pixels)": "Maxdev_cltslw",
    "18. Net accumulated deviation from centerline when tracing straight line (percentage of screen height)": "Net_devsl",
    "19. Total accumulated deviation from centerline when tracing straight line (percentage of screen height)": "T_devsl",
    "20. Avg of absolute values of deviation from centerline when tracing straight line (percentage of screen height)": "Avg_sl",
    "21.1 Average Time from Dominant Hands for pressing p": "Avg_dhpp",
    "21.2 Average Time from Non-Dominant Hands for pressing p": "Avg_ndhpp",
    "21.3 Ratio of Average Time from Dominant Hand and Non-Dominant Hand for pressing p": "Ratio_dndhpp",
    "21.4 Total Time from Dominant Hands for pressing p": "Total_dhpp",
    "21.5 Total Time from Non-Dominant Hands for pressing p": "Total_ndhpp",
    "22.1 Average Time from Dominant Hands for pressing q": "Avg_dhpq",
    "22.2 Average Time from Non-Dominant Hands for pressing q": "Avg_ndhpq",
    "22.3 Ratio of Average Time from Dominant Hand and Non-Dominant Hand for pressing q": "Ratio_dndhpq",
    "22.4 Total Time from Dominant Hands for pressing q": "Total_dhpq",
    "22.5 Total Time from Non-Dominant Hands for pressing q": "Total_ndhpq",
    "23.1 Average Time taken for clicking Box": "Avg_ttcb",
    "23.2 Total Time taken for clicking Box": "Total_ttcb",
    "23.3 Standard Deviation of Time taken for clicking Box": "Std_ttcb",
    "24.1 Average Reaction time for Data collected from Game": "Avg_reaction",
    "24.2 Total Time for Data collected from Game": "Totaltime_game",
    "24.3 Standard Deviation of Time for Data collected from Game": "Std_game",
    "24.4 Difference of Maximum and Minimum Reaction time for Data Collected from Game": "Maxmin_reaction",
    "25.1 Mean Right Clicks": "Mean_rc",
    "25.2 Total Right Clicks": "Total_rc",
    "25.3 Standard Deviation Right Clicks": "Std_rc",
    "26.1 Mean Wrong clicks": "Mean_wc",
    "26.2 Total Wrong clicks": "Total_wc",
    "26.3 Standard Deviation Wrong Clicks": "Std_wc",
    "27 Participant Height": "Paricipant_height",
    "28 Dominant Hand": "Dominant_hand",
    "29 Device Type": "Device_type",
    "30 Correct number of Box Clicks": "Correct_boxclick"
}

### **Dataset Cleaning, Preparation and Engineering**

In [None]:
class Dataset:
    """Drive the pipeline: raw CSV -> cleaned frame -> feature CSVs.

    Intended call order after construction: ``process_and_save`` ->
    ``apply_feature_name_shortening`` -> ``apply_feature_engineering``.
    Each later step reads the CSV written by the previous one.
    """

    def __init__(self, input_path: str, output_path: str):
        """Load ``input_path``, clean it, and prepare the feature extractor.

        Args:
            input_path: CSV file with the raw data.
            output_path: CSV path the extracted features are written to.
        """
        self.output_path = output_path
        self.data_cleaner = DataCleaner(pd.read_csv(input_path))
        self.feature_extractor = FeatureExtractor(self.clean_data(), output_path)

    def clean_data(self):
        """Run every cleaning step in order and return the cleaned frame."""
        cleaner = self.data_cleaner
        cleaner.clean_data()
        cleaner.preprocess_columns()
        cleaner.transform_next_columns()
        return cleaner.get_cleaned_data()

    def process_and_save(self):
        """Extract the features and write them to ``self.output_path``."""
        self.feature_extractor.run_pipeline()

    def get_extracted_dataset(self) -> pd.DataFrame:
        """Return the extracted feature vectors as a frame (default integer column labels)."""
        return pd.DataFrame(self.feature_extractor.features)

    def apply_feature_name_shortening(self, feature_name_mapping):
        """Drop the participant id, rename columns, and save next to the input.

        Reads ``self.output_path`` and writes ``<name>_shorten<ext>``; the new
        path is stored in ``self.shorten_output_path``.
        """
        table = pd.read_csv(self.output_path)
        table = table.drop(columns=['Participant ID']).rename(columns=feature_name_mapping)

        stem, suffix = os.path.splitext(self.output_path)
        self.shorten_output_path = f"{stem}_shorten{suffix}"
        table.to_csv(self.shorten_output_path, index=False)

    def apply_feature_engineering(self, final_output_path:str):
        """Add derived columns to the shortened dataset and save the result.

        Requires ``apply_feature_name_shortening`` to have run first, because
        it reads ``self.shorten_output_path``. New columns are appended in the
        order they are computed below.
        """
        df = pd.read_csv(self.shorten_output_path)

        status_column = "Parkinson's Disease status"
        df[status_column] = df[status_column].replace('suspectedpd', 'pd')

        def stacked(column_names):
            # One Series per column, stacked so numpy reduces across columns.
            return [df[name] for name in column_names]

        # False presses as a share of all presses, per keyboard round.
        false_press_ratios = (
            ('false_presses_over_total_key_presses_constant_key', 'False_pck', 'Num_crpck'),
            ('false_presses_over_total_key_presses_semi-random_key', 'False_psr', 'Num_crpsr'),
            ('false_presses_over_total_key_presses_random_key', 'False_prk', 'Num_crprk'),
        )
        for ratio_column, false_column, correct_column in false_press_ratios:
            false_count = df[false_column]
            df[ratio_column] = false_count / (false_count + df[correct_column])

        ratio_columns = [spec[0] for spec in false_press_ratios]
        df['mean_false_presses_over_total_key_presses'] = np.mean(stacked(ratio_columns), axis=0)
        df['std of the pfalse_presses_over_total_key_presses'] = np.std(stacked(ratio_columns), axis=0)

        # Row-wise means over the three tracing shapes.
        shape_means = (
            ('mean_of_total_amount_of_time_taken_to_trace_lines',
             ('A_tcl', 'A_tsw', 'A_tts')),
            ('mean_of_total_amount_of_time_taken_to_trace_lines_with_respect_to_window_width',
             ('A_tslww', 'A_tww', 'A_spww')),
            ('mean_of_total_Number_of_points_to_traced_inside_line_with_no_regard_to_time_taken',
             ('N_tislnt', 'Num_iswtnt', 'P_spnt')),
        )
        for mean_column, source_columns in shape_means:
            df[mean_column] = np.mean(stacked(source_columns), axis=0)

        # Tracing time x constant-key false presses (all three use False_pck).
        constant_key_false = df['False_pck']
        df['Interaction_Time_Straight_False_Presses'] = df['A_tcl'] * constant_key_false
        df['Interaction_Time_Sine_Wave_False_presses'] = df['A_tsw'] * constant_key_false
        df['Interaction_Time_Spiral_False_presses'] = df['A_tts'] * constant_key_false

        df['Number of correctly pressed keys when prompted with a constant key with respect to average response time'] = (
            df['Num_crpck'] / df['Avg_pck']
        )
        df['Number of correctly pressed keys when prompted with a semi-random key with respect to average response time'] = (
            df['Num_crpsr'] / df['Avg_psr']
        )

        # Pairwise ratios of the inside-point percentages.
        straight, sine, spiral = df['P_isl'], df['P_ptsw'], df['P_insp']
        df['Ratio_Points_Sine_Straight'] = sine / straight
        df['Ratio_Points_Spiral_Straight'] = spiral / straight
        df['Ratio_Points_Sine_Spiral'] = sine / spiral

        df.to_csv(final_output_path, index=False)
        print(f"Feature Engineered Dataset saved at {final_output_path}")

In [None]:
# Run the whole pipeline: raw CSV -> interim feature CSV -> shortened column
# names -> final feature-engineered CSV.
# Every path is built from PROJECT_URL with os.path.join. PROJECT_URL already
# ends in '/', so plain string concatenation produced './/data/...', and the
# final output path previously bypassed PROJECT_URL altogether.
raw_dataset = Dataset(
    os.path.join(PROJECT_URL, 'data', 'raw', 'dataset.csv'),
    os.path.join(PROJECT_URL, 'data', 'interim', 'dataset.csv'),
)
raw_dataset.process_and_save()
raw_dataset.apply_feature_name_shortening(feature_name_mapping)
raw_dataset.apply_feature_engineering(os.path.join(PROJECT_URL, 'data', 'final', 'dataset.csv'))

Dataset saved at .//data/interim/dataset.csv
Feature Engineered Dataset saved at ./data/final/dataset.csv
