In [1]:
import numpy as np
import pandas as pd

In [2]:
def split_data(data):
    """Partition rows by whether an in-game name was recorded.

    Args:
        data (dataframe): contains in-game names and turnip prices;
            must have an 'In-Game Name' column

    Returns:
        tuple of (dataframe, dataframe):
            rows with an in-game name (all columns kept), followed by
            rows without one (with the 'In-Game Name' column removed).
            Both are new frames with a fresh 0..n-1 index; the input
            is left untouched and all-empty rows are NOT dropped.
    """
    has_name = data['In-Game Name'].notna()

    named_rows = data.loc[has_name].reset_index(drop=True)
    unnamed_rows = (
        data.loc[~has_name]
        .reset_index(drop=True)
        # the name column is entirely missing for these rows
        .drop(columns='In-Game Name')
    )

    return named_rows, unnamed_rows

def convert_entry_to_float(entry):
    """Convert an entry to a float, mapping unconvertible values to NaN.

    Args:
        entry (str/float):
            entry to be converted

    Returns:
        converted_entry (float):
            entry as a float, or np.nan if float() rejects the value
    """
    try:
        converted_entry = float(entry)
    except (TypeError, ValueError, OverflowError):
        # Only the errors float() raises for bad input count as "missing".
        # A bare except would also swallow KeyboardInterrupt/SystemExit.
        converted_entry = np.nan
    return converted_entry

In [3]:
# Load the week 1 data. skiprows=[0] discards the file's first line, so the
# column header is taken from the second line of the CSV.
week1_data = pd.read_csv('data/week1.csv', skiprows=[0])
# Week 2 load is currently disabled:
# week2_data = pd.read_csv('data/week2.csv', skiprows=[0, 1])

The goal of this exploration is to model the buying and selling prices of turnips in Animal Crossing: New Horizons. Keeping this in mind, I begin by removing columns which do not relate to turnip prices, while retaining In-Game Name and Island since I plan to use them as primary keys within a MySQL database.

In [5]:
# Keep the columns at positions 2-17, then discard 'FC'.
# NOTE: this reassigns week1_data, so re-running the cell without reloading
# the CSV would slice the already-trimmed frame again.
week1_data = week1_data.iloc[:, 2:18].drop(columns=['FC'])

I also split the data according to whether or not the player's in-game name was provided, as rows without names will require different keys within the database.

In [6]:
# First frame: rows with an in-game name; second: rows without one
# (split_data removes the 'In-Game Name' column from the latter).
valid_name_data, invalid_name_data = split_data(week1_data)

In [8]:
# Preview the first three named rows to check the column layout and values.
display(valid_name_data.head(3))

Unnamed: 0,In-Game Name,Island,Buy Price,Mon AM,Mon PM,Tue AM,Tue PM,Wed AM,Wed PM,Thu AM,Thu PM,Fri AM,Fri PM,Sat AM,Sat PM
0,Maddox,Knight,102,43.0,40.0,36.0,32.0,28,139,118,146.0,148.0,142.0,61.0,57.0
1,Dev,Sootopolis,93,54.0,51.0,46.0,135.0,121,146,135,142.0,45.0,38.0,,
2,Levii,Montecki,108,63.0,60.0,55.0,51.0,138,115,154,202.0,,,,


Using a function with `try`/`except` alongside `applymap` to convert entries into floats and replace non-numeric strings with NaNs solves the problem. It is important to note that this would be inefficient for larger datasets, where it may be more efficient to target only the columns with issues rather than the entire dataframe.

In [10]:
# Column labels of the named frame, used to select columns by label
# (index 0-1 are 'In-Game Name' and 'Island'; index 2 onward are prices).
column_names = valid_name_data.columns

In [12]:
# Coerce every price column (everything from 'Buy Price' onward) to float in
# both frames; entries that cannot be parsed become NaN. Selection is by
# label, so it also works for the frame without the 'In-Game Name' column.
price_columns = column_names[2:]
for frame in (valid_name_data, invalid_name_data):
    frame[price_columns] = frame[price_columns].applymap(convert_entry_to_float)

In [14]:
# Date stamped on every buy-price observation for this week's data.
week_start = '2020-04-12'

# Build the buy-price table: name, island and buy price only, with
# incomplete rows removed, a constant observation date added, and the
# columns renamed to user_id / island_id / price.
buy_prices = (
    valid_name_data[column_names[0:3]]
    .dropna()
    .assign(timestamp_observed=week_start)
    .rename(
        columns={
            'In-Game Name': 'user_id',
            'Island': 'island_id',
            'Buy Price': 'price',
        }
    )
)
buy_prices.head()

Unnamed: 0,user_id,island_id,price,timestamp_observed
0,Maddox,Knight,102.0,2020-04-12
1,Dev,Sootopolis,93.0,2020-04-12
2,Levii,Montecki,108.0,2020-04-12
3,Xia,Xiaoog,107.0,2020-04-12
4,Ardy,Ferngully,93.0,2020-04-12
5,Empress,Nuvali,95.0,2020-04-12
6,Jacob,Brink,90.0,2020-04-12
7,El,Leilani,107.0,2020-04-12
8,Bread,Brioche,110.0,2020-04-12
9,Tim,Owendia,106.0,2020-04-12
