In [1]:
import numpy as np
import pandas as pd
import sqlalchemy

from database_credentials import get_database_url
from datetime import datetime, timedelta
from preprocessing import preprocess

In [2]:
def get_buy_date():
    """Prompt until the user enters a valid turnip buy date.

    The prompt is repeated until the entered text parses with the
    %Y-%m-%d format. Leading and trailing whitespace is ignored.

    Args:
        None
    
    Returns:
        buy_dt (datetime): turnip buy date parsed from %Y-%m-%d input

    Raises:
        EOFError: propagated from input() when no more input is available
        KeyboardInterrupt: propagated when the user interrupts the prompt
    """
    
    while True:
        buy_date = input("Buy Date of Turnips (yyyy-mm-dd): ")
        try:
            return datetime.strptime(buy_date.strip(), '%Y-%m-%d')
        except ValueError:
            # strptime signals a malformed or impossible date with ValueError;
            # anything else (EOF, interrupt) must escape so the loop can end.
            print("Invalid date format.")

def get_price_data(data, price_column, date):
    """Extract one price column as a tidy dataframe.

    The first two columns of ``data`` are kept alongside ``price_column``,
    rows with a missing value in any of those three columns are dropped,
    and every remaining row is stamped with ``date``.

    Args:
        data (dataframe): preprocessed data
        price_column (str): column containing prices
        date (datetime): date prices were observed

    Returns:
        prices (dataframe):
            columns user_id, island_id, price and datetime_observed
            (renamed from In-Game Name, Island and price_column)
    """

    # The first two columns are selected by position, then the price column.
    selected_columns = list(data.columns[:2]) + [price_column]

    renamed_columns = {
        'In-Game Name': 'user_id',
        'Island': 'island_id',
        price_column: 'price',
    }

    return (
        pd.DataFrame(data[selected_columns])
        .dropna()
        .assign(datetime_observed=date)
        .rename(columns=renamed_columns)
    )

def get_buy_sell_price_data(data, buy_date):
    """Split preprocessed data into buy-price and sell-price dataframes.

    Args:
        data (dataframe): preprocessed data
        buy_date (datetime): buy date of turnips in yyyy-mm-dd format

    Returns:
        buy_prices (dataframe): buying prices of turnips
        sell_prices (dataframe): selling prices of turnips
    """

    buy_prices = get_price_data(data, 'Buy Price', buy_date)

    # Columns from the fourth onward are the sell slots
    # (Mon AM, Mon PM, ... Sat AM, Sat PM). The first slot falls on the day
    # after the buy date and each later slot is 12 hours after the previous
    # one: 00:00:00 for AM and 12:00:00 for PM.
    first_slot_time = buy_date + timedelta(days=1)
    half_day = timedelta(hours=12)

    sell_frames = [
        get_price_data(data, slot_column, first_slot_time + slot_index * half_day)
        for slot_index, slot_column in enumerate(data.columns[3:])
    ]
    sell_prices = pd.concat(sell_frames)

    return buy_prices, sell_prices

In [3]:
# data = pd.read_csv('data/week1.csv', skiprows=[0])
# week 1: 2020-04-12

# week 2: 2020-04-19
# week 3: 2020-04-26
# week 4: 2020-05-03
# week 5: 2020-05-10

# unicode error https://stackoverflow.com/questions/18171739/
# week 6: 2020-05-17 

# week 7: 2020-05-24

In [4]:
# Week to ingest; selects the CSV at data/week<week_num>.csv.
week_num = 7
file_path = 'data/week{0}.csv'.format(week_num)

# skiprows=[0, 1] skips the first two lines of the file before parsing,
# and engine='python' selects pandas' Python parsing engine.
data = pd.read_csv(
    file_path,
    engine='python',
    skiprows=[0, 1],
)

Preprocessing removes columns that do not relate to turnip prices and drops rows with an invalid In-Game Name or Island entry. Invalid turnip prices are replaced with NaN, while turnip prices stored as strings are cast to floats using `try`/`except` in conjunction with `applymap`. Note that applying this to the entire dataframe would be inefficient for larger datasets; a more efficient approach would be to target only specific columns.

In [5]:
# Clean the raw CSV frame with the project-local preprocess() helper; the
# cleaned result is bound to its own name for the cells that follow.
preprocessed_data = preprocess(data)

In [6]:
# Preview the first three rows of the preprocessed frame.
display(preprocessed_data.head(3))

Unnamed: 0,In-Game Name,Island,Buy Price,Mon AM,Mon PM,Tue AM,Tue PM,Wed AM,Wed PM,Thu AM,Thu PM,Fri AM,Fri PM,Sat AM,Sat PM
0,Maddox,Knight,96.0,90.0,70.0,63.0,111.0,,,,,,,,
1,Bridie,Nevinova,102.0,,,141.0,154.0,167.0,162.0,75.0,72.0,68.0,64.0,60.0,
2,Izzy,Gilligan's,97.0,91.0,113.0,,,,,,,,,,


In [7]:
# Prompt interactively for the buy date, then split the preprocessed frame
# into a buy-price table and a sell-price table.
# NOTE(review): input() blocks an unattended "Run All" — confirm that an
# interactive prompt is intended here.
buy_date = get_buy_date()
buy_data, sell_data = get_buy_sell_price_data(preprocessed_data, buy_date)

Buy Date of Turnips (yyyy-mm-dd): 2020-05-24


In [8]:
# Inspect the results: the first three buy rows, then the sell_data frame.
display(buy_data.head(3))
display(sell_data)

Unnamed: 0,user_id,island_id,price,datetime_observed
0,Maddox,Knight,96.0,2020-05-24
1,Bridie,Nevinova,102.0,2020-05-24
2,Izzy,Gilligan's,97.0,2020-05-24


Unnamed: 0,user_id,island_id,price,datetime_observed
0,Maddox,Knight,90.0,2020-05-25 00:00:00
2,Izzy,Gilligan's,91.0,2020-05-25 00:00:00
3,Eggy,Pearadise,132.0,2020-05-25 00:00:00
4,jakob,relax land,142.0,2020-05-25 00:00:00
5,Leah,IslaNublar,126.0,2020-05-25 00:00:00
...,...,...,...,...
3,Eggy,Pearadise,146.0,2020-05-30 12:00:00
4,jakob,relax land,60.0,2020-05-30 12:00:00
15,Aika,Erijima,45.0,2020-05-30 12:00:00
18,Radirific,The Matrix,127.0,2020-05-30 12:00:00


In [9]:
# Build a SQLAlchemy engine from the URL returned by the project-local
# get_database_url() helper and open a connection for the database writes.
database_url = get_database_url()
engine = sqlalchemy.create_engine(database_url)
connection = engine.connect()

In [10]:
# Append both frames inside one explicit transaction. connection.begin()
# commits when the block exits cleanly and rolls back if either write raises,
# so a failed sell_data insert cannot leave buy_data loaded on its own (which
# would duplicate the buy rows when the cell is re-run).
with connection.begin():
    buy_data.to_sql('buy_data', connection, if_exists='append', index=False)
    sell_data.to_sql('sell_data', connection, if_exists='append', index=False)

In [11]:
# Release database resources: close the connection, then dispose of the engine.
connection.close()
engine.dispose()