# Encode dataset with categorical variables to be used in a tree model

## Import necessary packages

In [1]:
from datetime import datetime
import pandas as pd
import os

## Define working directory and file input names

In [2]:
def build_output_path_string():
    """Return the output CSV file name, stamped with today's date.

    The current local date is formatted as MMDDYYYY and placed between the
    fixed prefix ``preprocessed_tree_model_data_`` and the ``.csv`` suffix.
    """
    stamp = datetime.today().strftime('%m%d%Y')
    return f'preprocessed_tree_model_data_{stamp}.csv'

In [3]:
# Notebook configuration: where the input CSV lives and what the output is called.
# The raw string keeps the Windows backslashes literal.
working_directory = r"C:\Users\nick_simmons\Mitel"
input_file_name = 'ModelingTrainingSet.csv'
# Date-stamped output file name (no directory part); built once and reused
# by the later cells that print it and write the CSV.
out_path = build_output_path_string()
# Full path to the input CSV: working directory joined with the file name.
full_input_path = os.path.join(working_directory, input_file_name)

In [4]:
# Switch into the working directory so the relative output file name
# (out_path) resolves against it, then echo the settings as a sanity check.
os.chdir(working_directory)
print(f'Your working directory is: {os.getcwd()}\n')
print(f'Input file name: {input_file_name}\n')
print(f'Binarized output file name: {out_path}')

Your working directory is: C:\Users\nick_simmons\Mitel

Input file name: ModelingTrainingSet.csv

Binarized output file name: preprocessed_tree_model_data_06242019.csv


# Preprocessing Steps
1. Import data as Pandas DataFrame
2. Remove the target variable, selected variables, and continuous variables from the df. 
3. Binarize the remaining fields in the dataframe
4. Rebuild the excluded continuous and target fields
5. Stitch the target, binarized and continuous dataframes into one single dataframe
6. Write the output dataframe as a csv file using the path string defined in the function: build_output_path_string()

## Declare dataset specific variables

In [5]:
# NOTE(review): full_input_path was already built from working_directory and
# input_file_name in an earlier cell; this hard-coded copy overrides it and
# has to be kept in sync by hand.
full_input_path = r'C:\Users\nick_simmons\Mitel\ModelingTrainingSet.csv'
# Columns dropped before binarizing and never re-attached to the output.
fields_to_drop =  ['ttc_buckets', 'Total Pipeline']
# Columns excluded from binarizing and re-attached unchanged afterwards.
continuous_fields = ['Opportunity: Total # of Products', 'TTC']
# Target column: removed before binarizing and re-attached afterwards.
target = 'Opportunity: Won'

## Import data and remove target, continuous, and user selected fields

In [6]:
def import_df(full_input_path):
    """Load the CSV at ``full_input_path`` into a pandas DataFrame.

    The file is decoded as latin-1.
    """
    loaded_df = pd.read_csv(full_input_path, encoding='latin-1')
    return loaded_df

In [7]:
def remove_target(df, target):
    """Return a new DataFrame equal to ``df`` without the ``target`` column.

    ``df`` itself is left unchanged.
    """
    without_target = df.drop(labels=target, axis=1)
    return without_target

In [8]:
def remove_selected_fields(df, fields_to_drop):
    """Return a new DataFrame equal to ``df`` without the user-selected columns.

    ``fields_to_drop`` names the columns to remove; ``df`` itself is left
    unchanged.
    """
    trimmed_df = df.drop(fields_to_drop, axis='columns')
    return trimmed_df

In [9]:
def remove_continuous_fields(df, continuous_fields):
    """Return a new DataFrame equal to ``df`` without the continuous columns.

    ``continuous_fields`` names the columns to remove; ``df`` itself is left
    unchanged.
    """
    non_continuous_df = df.drop(labels=continuous_fields, axis=1)
    return non_continuous_df

## Binarize the remaining fields after running all of the remove functions

In [10]:
def binarize_ready_df(df):
    """Binarize ``df`` with ``pandas.get_dummies`` and return the result.

    ``get_dummies`` is called with its default arguments.
    """
    encoded_df = pd.get_dummies(data=df)
    return encoded_df

In [11]:
def get_binarized_df(full_input_path, target, fields_to_drop, continuous_fields):
    """Load the input CSV, strip the excluded columns, and binarize the rest.

    The target column, the user-selected columns and the continuous columns
    are removed, in that order, before the remaining columns are binarized.
    """
    remaining = import_df(full_input_path)
    remaining = remove_target(remaining, target)
    remaining = remove_selected_fields(remaining, fields_to_drop)
    remaining = remove_continuous_fields(remaining, continuous_fields)
    return binarize_ready_df(remaining)
# Build the binarized frame at notebook level from the configured input;
# main() rebuilds its own copy when it runs.
binarized_df = get_binarized_df(full_input_path, target, fields_to_drop, continuous_fields)

## Build DataFrames for the continuous and target variables

In [12]:
def create_target_df(target):
    """Re-read the input CSV and return only its ``target`` column.

    The file location comes from the module-level ``full_input_path`` rather
    than from a parameter.
    """
    source_df = import_df(full_input_path)
    target_column = source_df[target]
    return target_column

# Module-level target column; main() reads this global when stitching the output.
target_df = create_target_df(target)

In [13]:
def create_continuous_df(continuous_fields):
    """Re-read the input CSV and return only the continuous columns.

    The file location comes from the module-level ``full_input_path`` rather
    than from a parameter.
    """
    return import_df(full_input_path)[continuous_fields]

# Module-level continuous columns; main() reads this global when stitching the output.
continuous_fields_df = create_continuous_df(continuous_fields)

## Concatenate target, binarized, and continuous DataFrames into one output DataFrame

In [14]:
def stitch_dfs_together(target_df, binarized_df, continuous_fields_df):
    """Join the target, binarized and continuous pieces into one DataFrame.

    The three inputs are concatenated column-wise in argument order, so the
    target comes first, then the binarized columns, then the continuous ones.
    """
    pieces = [target_df, binarized_df, continuous_fields_df]
    return pd.concat(pieces, axis='columns')

# Keyword arguments keep each frame in its matching parameter, so the columns
# come out in the intended order: target, binarized, continuous.
stiched_df = stitch_dfs_together(
    target_df=target_df,
    binarized_df=binarized_df,
    continuous_fields_df=continuous_fields_df,
)

In [15]:
def write_csv(out_df):
    """Write ``out_df`` to the module-level ``out_path`` unless a file is already there.

    If a file already exists at ``out_path`` a warning is printed and nothing
    is written. Otherwise the frame is saved as CSV without its index, using
    exclusive-creation mode ('x').
    """
    already_there = os.path.isfile(out_path)
    if already_there:
        print('That file already exists, please change the output file name')
        return
    print(f"File does not exist yet, writing new csv named: {out_path}")
    out_df.to_csv(out_path, mode='x', index=False)

## Define the main function and run the entire script from end to end

In [16]:
def main():
    """Run the preprocessing end to end and write the result to CSV.

    Rebuilds the binarized frame from the configured input, joins it with the
    module-level ``target_df`` and ``continuous_fields_df``, and passes the
    combined frame to ``write_csv``.
    """
    binarized_df = get_binarized_df(full_input_path, target, fields_to_drop, continuous_fields)
    # Keyword arguments keep each frame in its matching parameter, so the
    # output columns are ordered target, binarized, continuous.
    out_df = stitch_dfs_together(
        target_df=target_df,
        binarized_df=binarized_df,
        continuous_fields_df=continuous_fields_df,
    )
    write_csv(out_df)

In [17]:
# Run the full pipeline only when this module is the entry point (__main__).
if __name__ == '__main__':
    main()

File does not exist yet, writing new csv named: preprocessed_tree_model_data_06242019.csv
