In [1]:
import numpy as np
import pandas as pd

In [2]:
def drop_features(data):
    """Drop features not related to turnip prices with
    the exception of In-Game Name and Island

    Columns are selected by position (column indices 2 through 17 of the
    raw sheet) and the ``FC`` column inside that window is then removed.
    The input dataframe is not modified; a new dataframe is returned.

    Args:
        data (dataframe): original data from Maddox Knight's Turnip Mafia

    Returns:
        data (dataframe): contains In-Game Name, Island along with
                          the buying and selling prices of turnips

    Raises:
        KeyError: if no ``FC`` column exists among the selected columns
    """

    # Positional window: assumes the raw sheet keeps its column order,
    # with the name/island/price columns at indices 2..17.
    selected = data.iloc[:, 2:18]

    # Use the non-mutating form of drop(): calling drop(inplace=True) on a
    # frame that was just sliced out of another one mutates an intermediate
    # object and can trigger SettingWithCopyWarning.
    return selected.drop(columns=['FC'])

def mask_invalid_names(data):
    """Keep only the rows that have both an In-Game Name and an Island

    Args:
        data (dataframe): may contain rows whose In-Game Name or
                          Island is missing (NaN/None)

    Returns:
        data (dataframe): new dataframe holding only the rows where
                          both In-Game Name and Island are present,
                          re-indexed from 0
    """
    # A row is discarded as soon as either identifying field is missing.
    missing_identifier = data['In-Game Name'].isna() | data['Island'].isna()

    # Boolean selection yields a new frame; reset_index(drop=True) renumbers
    # the surviving rows without keeping the old index as a column.
    return data.loc[~missing_identifier].reset_index(drop=True)

def convert_entry_to_float(entry):
    """Convert a dataframe entry to float

    Args:
        entry (str/float):
            entry to be converted

    Returns:
        converted_entry (float):
            entry as a float, or np.nan when ``float(entry)`` cannot
            convert it
    """

    try:
        return float(entry)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures become NaN. A bare ``except:``
        # would also swallow KeyboardInterrupt/SystemExit, making a long
        # element-wise conversion impossible to interrupt.
        return np.nan

def preprocess(data):
    """Reduce the raw data to clean name, island and turnip price columns

    Runs three steps in order: drop the columns unrelated to turnip
    prices (``drop_features``), discard rows without both an In-Game Name
    and an Island (``mask_invalid_names``), and convert every column after
    the first two to float, with unconvertible entries becoming NaN
    (``convert_entry_to_float``).

    Args:
        data (dataframe): contains in-game names and turnip prices

    Returns:
        preprocessed_data (dataframe): In-Game Name, Island and float
                                       price columns for the valid rows
    """

    preprocessed_data = drop_features(data)
    preprocessed_data = mask_invalid_names(preprocessed_data)

    # Everything after the first two columns (In-Game Name, Island) is
    # treated as a price column.
    price_columns = preprocessed_data.columns[2:]

    # Element-wise conversion done column by column with Series.map.
    # DataFrame.applymap is deprecated (pandas >= 2.1); this form gives the
    # same result and works on both older and newer pandas versions.
    preprocessed_data[price_columns] = preprocessed_data[price_columns].apply(
        lambda column: column.map(convert_entry_to_float)
    )

    return preprocessed_data

In [3]:
# Skip the first line of the CSV (row 0) so that the following line is
# read as the header row.
week1_data = pd.read_csv('data/week1.csv', skiprows=[0])

Since the goal of this exploration is to model the buying and selling prices of turnips in Animal Crossing: New Horizons, preprocessing involves removing columns that do not relate to turnip prices. The In-Game Name and Island columns will be retained and used as primary keys within a MySQL database. Data without both a valid In-Game Name and a valid Island will be dropped.

Invalid turnip prices will be replaced with NaNs, and turnip prices in string format will be cast into floats, using try and except in conjunction with applymap. It is important to note that this element-wise approach would be inefficient for larger datasets and that it may be more efficient to only target columns with issues as opposed to the entire dataframe.

In [4]:
# Drop non-price columns, discard rows lacking an In-Game Name or Island,
# and convert the price columns to float.
preprocessed_week1_data = preprocess(week1_data)

In [5]:
# Preview the first three preprocessed rows.
display(preprocessed_week1_data.head(3))

Unnamed: 0,In-Game Name,Island,Buy Price,Mon AM,Mon PM,Tue AM,Tue PM,Wed AM,Wed PM,Thu AM,Thu PM,Fri AM,Fri PM,Sat AM,Sat PM
0,Maddox,Knight,102.0,43.0,40.0,36.0,32.0,28.0,139.0,118.0,146.0,148.0,142.0,61.0,57.0
1,Dev,Sootopolis,93.0,54.0,51.0,46.0,135.0,121.0,146.0,135.0,142.0,45.0,38.0,,
2,Levii,Montecki,108.0,63.0,60.0,55.0,51.0,138.0,115.0,154.0,202.0,,,,
