<h1>Preprocessing the Data</h1>

In [11]:
# drop award, first place vote columns
# one hot encode position, team
# train test development split

import pandas as pd

In this notebook I clean up the datasets that I have created, one-hot encode my categorical variables (position and team), split the data into train, validation (dev), and test sets, and separate my x values (features) from my y value (the award points a player received).
<br><br>
This notebook should be run after running the 'get_full_dataset' notebook.

In [12]:
# Read the five per-award CSV files, one DataFrame per award, in the order
# MVP, DPOY, ROY, MIP, SMOY.
mvp_data, dpoy_data, roy_data, mip_data, smoy_data = map(
    pd.read_csv,
    (
        'data/mvp_data.csv',
        'data/dpoy_data.csv',
        'data/roy_data.csv',
        'data/mip_data.csv',
        'data/smoy_data.csv',
    ),
)

In [13]:
def cleanup_multiple_teams(team):
    """Collapse a multi-team value into the single label 'Multiple'.

    A value longer than three characters is taken to mean the player played
    with multiple teams; anything of three characters or fewer is returned
    unchanged.

    Sharing one label for all multi-team players reduces the number of
    columns after one-hot encoding from 1000+ to 103.

    Parameters
    ----------
    team : str
        Team value from the 'team' column (must support ``len``).

    Returns
    -------
    str
        'Multiple' when ``len(team) > 3``, otherwise ``team`` itself.
    """
    return 'Multiple' if len(team) > 3 else team

# Rewrite the 'team' column of every award DataFrame through
# cleanup_multiple_teams. The column is assigned on the same DataFrame objects,
# so the five module-level names keep pointing at the updated frames.
for _award_frame in (mvp_data, dpoy_data, roy_data, mip_data, smoy_data):
    _award_frame['team'] = _award_frame['team'].apply(cleanup_multiple_teams)

In [14]:
def cleanup_cols(award_data):
    """Drop the columns that are not needed and one-hot encode the categoricals.

    Parameters
    ----------
    award_data : pd.DataFrame
        Must contain the columns 'Unnamed: 0', 'award', 'first_place_votes',
        'position' and 'team'; a missing one raises ``KeyError``.

    Returns
    -------
    pd.DataFrame
        A new DataFrame without the three dropped columns, in which 'position'
        and then 'team' have been replaced by their dummy (indicator) columns.
        The input DataFrame is not modified.
    """
    unneeded = ['Unnamed: 0', 'award', 'first_place_votes']
    encoded = award_data.drop(columns=unneeded)
    # Encode one categorical column at a time, position first, so the
    # position_* columns come before the team_* columns in the result.
    for categorical in ('position', 'team'):
        encoded = pd.get_dummies(encoded, columns=[categorical])
    return encoded
    
# Run cleanup_cols on each award DataFrame and rebind every name to its
# cleaned, one-hot-encoded replacement.
mvp_data, dpoy_data, roy_data, mip_data, smoy_data = [
    cleanup_cols(award_frame)
    for award_frame in (mvp_data, dpoy_data, roy_data, mip_data, smoy_data)
]

In [15]:
# split data into train, dev, and test data
def train_test_split(award_data, dev_start=2011, test_start=2016):
    """Split a DataFrame chronologically into train, dev and test sets.

    Rows are assigned by their 'season' value using half-open ranges, so the
    three sets can never overlap and no season between the boundaries can be
    left out:

    * train: ``season < dev_start``
    * dev:   ``dev_start <= season < test_start``
    * test:  ``season >= test_start``

    With the defaults this is train <= 2010, dev 2011-2015, test >= 2016 for
    integer seasons. Rows whose season is missing (NaN) fail every comparison
    and therefore appear in none of the sets.

    Parameters
    ----------
    award_data : pd.DataFrame
        Must contain a numeric 'season' column.
    dev_start : int, default 2011
        First season that belongs to the dev set.
    test_start : int, default 2016
        First season that belongs to the test set.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train_data, dev_data, test_data)``, each keeping the original row
        order and carrying a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``dev_start`` is greater than ``test_start``.
    """
    if dev_start > test_start:
        raise ValueError(
            f'dev_start ({dev_start}) must not be greater than test_start ({test_start})'
        )

    season = award_data['season']
    train_mask = season < dev_start
    dev_mask = (season >= dev_start) & (season < test_start)
    test_mask = season >= test_start

    # Drop the old index so each split is numbered from zero.
    train_data = award_data[train_mask].reset_index(drop=True)
    dev_data = award_data[dev_mask].reset_index(drop=True)
    test_data = award_data[test_mask].reset_index(drop=True)
    return train_data, dev_data, test_data

In [16]:
def x_y_split(award_data):
    """Separate the feature columns from the 'award_pts_won' target column.

    Parameters
    ----------
    award_data : pd.DataFrame
        Must contain an 'award_pts_won' column.

    Returns
    -------
    tuple of pd.DataFrame
        ``(x_data, y_data)``: ``x_data`` holds every column except
        'award_pts_won'; ``y_data`` is a one-column DataFrame (not a Series)
        holding only 'award_pts_won'. The input is left unmodified.
    """
    target = 'award_pts_won'
    # Selecting with a list keeps the target as a DataFrame.
    labels = award_data.loc[:, [target]]
    features = award_data.drop(columns=target)
    return features, labels

In [17]:
# For every award: split its DataFrame into train/dev/test, separate features
# from the target, and write the six resulting frames to CSV under data/.
# to_csv is called with default settings, so each file also contains the row
# index as its first column.
award_datasets = {
    'mvp': mvp_data,
    'dpoy': dpoy_data,
    'roy': roy_data,
    'mip': mip_data,
    'smoy': smoy_data,
}
for award_name, dataset in award_datasets.items():
    train, dev, test = train_test_split(dataset)
    (x_train, y_train), (x_dev, y_dev), (x_test, y_test) = (
        x_y_split(part) for part in (train, dev, test)
    )
    # Written in the order train, dev, test, with x before y.
    outputs = [
        (x_train, f'data/train_x_{award_name}.csv'),
        (y_train, f'data/train_y_{award_name}.csv'),
        (x_dev, f'data/dev_x_{award_name}.csv'),
        (y_dev, f'data/dev_y_{award_name}.csv'),
        (x_test, f'data/test_x_{award_name}.csv'),
        (y_test, f'data/test_y_{award_name}.csv'),
    ]
    for split_frame, csv_path in outputs:
        split_frame.to_csv(csv_path)