# Data Preparation

Prepare historical stock price datasets for the demo analysis.

## Social Media Stock Prices

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, date


# Load the raw social media stock price file. The first column is parsed as a
# date and malformed CSV lines are skipped instead of aborting the load.
social_media_stock_filepath = r"./data/social_media_stocks_2012-2022.csv"
df = pd.read_csv(
    social_media_stock_filepath,
    header=0,
    parse_dates=[0],
    date_format='%Y-%m-%d',
    on_bad_lines='skip',
)

# Normalise every column name to lower snake_case in one pass, then expose the
# `symbol` column as `ticker` (a no-op when there is no `symbol` column).
df = df.rename(columns=lambda col_name: str(col_name).lower().replace(' ', '_'))
df = df.rename(columns={'symbol': 'ticker'}, errors='ignore')

# Round all price columns to 6 decimal places (vectorized instead of a
# per-element Python lambda).
price_columns = ['adj_close', 'close', 'high', 'low', 'open']
df[price_columns] = df[price_columns].round(6)

# Store volume in the smallest unsigned integer dtype that can hold it.
df['volume'] = pd.to_numeric(df['volume'], downcast='unsigned')

# Add a compact `year` column directly after the date column.
df.insert(1, 'year', pd.to_numeric(df['date'].dt.year, downcast='unsigned'))

print(df.shape)
print(df.dtypes)
# Preview at most 100 rows so the cell also works on inputs with fewer rows;
# the fixed random_state keeps the preview stable across re-runs.
display(df.sample(n=min(100, len(df)), random_state=0))

# Keep an untouched copy for the later cells and export a small 2022 sample.
init_df = df.copy(deep=True)
init_df[init_df['year'] == 2022].iloc[:100].to_csv('./data/sample_social_media.csv', index=False)

In [None]:
# Simulate a trading history for each ticker individually, then derive the
# running cash reserve and the portfolio value in a second, vectorized pass.

df = init_df.copy(deep=True)

# Simulation parameters
start_shares = {'FB': 100, 'TWTR': 200, 'PINS': 100, 'SNAP': 150, 'ETSY': 50}
initial_cash_reserve = 500000.000000  # Starting cash reserve for the portfolio
max_trade_shares = 5                  # every simulated trade moves 1..5 shares (inclusive)
random_seed = 42                      # fixed seed so a full re-run reproduces the same data

rng = np.random.default_rng(random_seed)


def simulate_ticker_trades(ticker_data, initial_shares, rng, max_trade_shares=5):
    """Simulate one random buy or sell per row for a single ticker.

    The rows are processed in ascending ``date`` order. The first row only
    records the opening position (no trade). On every later row a price is
    drawn uniformly between that row's ``low`` and ``high``, and a buy or a
    sell of 1..``max_trade_shares`` shares is chosen with equal probability.
    A sell never moves more shares than are currently held, so the recorded
    trade size and trade value always match the change in the position.

    Parameters
    ----------
    ticker_data : pandas.DataFrame
        Rows of one ticker; must contain ``date``, ``open``, ``low``, ``high``
        and ``adj_close``.
    initial_shares : int
        Number of shares held before the first row.
    rng : numpy.random.Generator
        Source of randomness.
    max_trade_shares : int, default 5
        Largest number of shares moved by a single trade (inclusive).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``ticker_data`` (sorted by date) with the columns
        ``starting_shares``, ``current_shares``, ``daily_trades`` (positive =
        buy, negative = sell, 0 = no trade), ``trade_price``, ``trade_value``,
        ``portfolio_share_value`` and ``last_purchase_value``.
    """
    ticker_data = ticker_data.sort_values(by='date')
    n_rows = len(ticker_data)

    low = ticker_data['low'].to_numpy(dtype=float)
    high = ticker_data['high'].to_numpy(dtype=float)
    adj_close = ticker_data['adj_close'].to_numpy(dtype=float)

    # Draw all random inputs up front; only the share bookkeeping below is
    # inherently sequential.
    trade_price = np.round(low + (high - low) * rng.random(n_rows), 4)
    trade_action = rng.choice([-1, 1], size=n_rows)  # -1 = sell, 1 = buy
    trade_amount = rng.integers(1, max_trade_shares, size=n_rows, endpoint=True)

    current_shares = np.zeros(n_rows, dtype=np.int64)
    daily_trades = np.zeros(n_rows, dtype=np.int64)
    last_purchase_value = np.zeros(n_rows, dtype=float)

    if n_rows:
        shares_held = int(initial_shares)
        # Until the first simulated buy, the reference purchase price is the
        # opening price of the first row.
        last_buy_price = float(ticker_data['open'].iloc[0])

        trade_price[0] = 0.0  # the first row is the opening position, not a trade
        current_shares[0] = shares_held
        last_purchase_value[0] = last_buy_price

        for i in range(1, n_rows):
            if trade_action[i] == 1:  # Buy
                executed = int(trade_amount[i])
                last_buy_price = trade_price[i]
            else:  # Sell, limited to the shares actually held (may be 0)
                executed = -min(int(trade_amount[i]), shares_held)
            # Carry the running position forward from the previous row.
            shares_held += executed
            daily_trades[i] = executed
            current_shares[i] = shares_held
            last_purchase_value[i] = last_buy_price

    # Trade value = executed shares * trade price; rows without a trade get 0.
    trade_value = np.where(
        daily_trades != 0,
        np.round(np.abs(daily_trades) * trade_price, 4),
        0.0,
    )

    return pd.DataFrame(
        {
            'starting_shares': initial_shares,
            'current_shares': current_shares,
            'daily_trades': daily_trades,
            'trade_price': trade_price,
            'trade_value': trade_value,
            # Portfolio share value = current shares * adj_close
            'portfolio_share_value': current_shares * adj_close,
            'last_purchase_value': last_purchase_value,
        },
        index=ticker_data.index,
    )


tickers = list(df['ticker'].unique())

# Fail early, with a readable message, if the data contains a ticker for which
# no starting position is configured.
missing_tickers = [ticker for ticker in tickers if ticker not in start_shares]
if missing_tickers:
    raise KeyError(f"no starting share count configured for ticker(s): {missing_tickers}")

# First pass: simulate the trades of each ticker independently.
simulated_parts = []
for ticker in tickers:
    ticker_rows = df[df['ticker'] == ticker]
    simulated_parts.append(
        simulate_ticker_trades(ticker_rows, start_shares[ticker], rng, max_trade_shares)
    )
    print(f"{ticker}: {len(ticker_rows)} rows simulated")

# With no tickers at all, simulating the (empty) frame still yields the
# expected columns, so the rest of the cell keeps working.
simulated = pd.concat(simulated_parts) if simulated_parts else simulate_ticker_trades(df, 0, rng)
df = df.join(simulated)

# Second pass: cash reserves and portfolio value over all rows in
# (date, ticker) order.
df = df.sort_values(by=['date', 'ticker']).reset_index(drop=True)

# A buy (daily_trades > 0) reduces cash by the trade value, a sell increases
# it, and a row without a trade leaves it unchanged.
cash_flow = -np.sign(df['daily_trades']) * df['trade_value']
df['cash_reserves'] = initial_cash_reserve + cash_flow.cumsum()

# Portfolio value of a row = that row's portfolio share value + cash reserves.
# NOTE(review): this uses only the row's own ticker, not the holdings of all
# tickers on that date - confirm this is the intended definition.
df['portfolio_value'] = df['portfolio_share_value'] + df['cash_reserves']

# Round the newly added columns to 6 decimals and downcast them.
# NOTE(review): float32 keeps only about 7 significant digits - confirm that
# is acceptable for cash_reserves / portfolio_value.
new_columns = {"starting_shares", "current_shares", "daily_trades", "trade_price",
               "trade_value", "portfolio_share_value", "last_purchase_value",
               "cash_reserves", "portfolio_value"}
for col_name in new_columns:
    df[col_name] = pd.to_numeric(df[col_name].round(6), downcast='float')

display(df[df['year'] == 2018].iloc[:100])

# starting_shares is constant per ticker, so it is left out of the export.
# The frame is already sorted by (date, ticker) with a fresh index.
df = df.drop(columns=['starting_shares'])
df.to_csv('./data/social_media_stocks_plus_2012-2022.csv', index=False)

In [None]:
# Optional diagnostics, left disabled:
# display(df['ticker'].value_counts())
# display(df[['year', 'ticker', 'volume']].groupby(['year', 'ticker']).agg(['count']))
# display(df[df['year'].isin(list(range(2017, 2023)))]['ticker'].value_counts())
# display(df[df['year'].isin(list(range(2017, 2023)))].shape)

# Which tickers are present in the data?
ticker_symbols = df['ticker'].unique()
display(ticker_symbols)

# Earliest date on which each ticker is reported.
first_reported = df.groupby('ticker')[['date']].agg(['min'])
print(first_reported.to_string())

Write the first 100 rows of the 2022 data to a sample CSV file.

In [24]:
# Export the first 100 rows of 2022 as a small sample file.
rows_2022 = df.loc[df['year'] == 2022]
sample_2022 = rows_2022.head(100)
sample_2022.to_csv(r'./data/sample_social_media.csv', index=False)