# Data Preparation

Prepare historical stock price datasets for demo analysis.

## Social Media Stock Prices

Historical stock prices (2012-2022) for major social media companies: Meta, Twitter, Snap, Pinterest, and Etsy.

**Source**: [Kaggle: Major social media historical stock prices](https://www.kaggle.com/datasets/prasertk/social-media-stock-prices)

Sample:
```csv
Date,Symbol,Adj Close,Close,High,Low,Open,Volume
2012-05-18,FB,38.22999954223633,38.22999954223633,45.0,38.0,42.04999923706055,573576400.0
2012-05-21,FB,34.029998779296875,34.029998779296875,36.65999984741211,33.0,36.529998779296875,168192700.0
2012-05-22,FB,31.0,31.0,33.59000015258789,30.940000534057617,32.61000061035156,101786600.0
2012-05-23,FB,32.0,32.0,32.5,31.360000610351562,31.3700008392334,73600000.0
2012-05-24,FB,33.029998779296875,33.029998779296875,33.209999084472656,31.770000457763672,32.95000076293945,50237200.0
2012-05-25,FB,31.90999984741211,31.90999984741211,32.95000076293945,31.110000610351562,32.900001525878906,37149800.0
```

### Clean up Source Dataset

Load and clean up the original dataset:
- Massage data types
- Rename columns
- Add additional metadata columns

Save the result back to a new `csv` file: 

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, date


# Load the raw social media stock prices file; the first column holds the
# trading date and malformed lines are skipped rather than aborting the load.
orig_social_media_stock_prices = r"./data/social_media_stocks_2012-2022.csv"
df = pd.read_csv(
    orig_social_media_stock_prices,
    header=0,
    parse_dates=[0],
    date_format='%Y-%m-%d',
    on_bad_lines='skip',
)

# Normalize every header to snake_case in a single pass
# (e.g. "Adj Close" -> "adj_close"), then give the symbol column its final name.
df = df.rename(columns=lambda col_name: str(col_name).lower().replace(' ', '_'))
df = df.rename(columns={'symbol': 'ticker'}, errors='ignore')

# Trim float noise from the price columns (the raw values carry ~15 digits).
price_columns = ['adj_close', 'close', 'high', 'low', 'open']
df[price_columns] = df[price_columns].round(6)

# Store volume in the smallest unsigned integer type that can hold it.
df['volume'] = pd.to_numeric(df['volume'], downcast='unsigned')

# Add a compact year column right after the date for easy filtering/grouping.
df.insert(1, 'year', pd.to_numeric(df['date'].dt.year, downcast='unsigned'))

print(f"df shape: {df.shape}")
display(df.sample(n=10))

# Save the cleaned dataset to a new csv file.
output_file = r"./data/social_media_stocks_2012-2022.clean.csv"
df.to_csv(output_file, index=False)

Plot and visualize the original stock values over time

In [None]:
import plotly.graph_objs as go
import plotly.io as pio

# Draw the adjusted close of every ticker as its own line on one dark-themed figure
pio.templates.default = 'plotly_dark'
fig = go.Figure()

# One trace per ticker, in order of first appearance in the dataframe
tickers = list(df['ticker'].unique())
for symbol in tickers:
    # Keep only this ticker's rows and the columns needed for the chart
    symbol_rows = df.loc[df['ticker'] == symbol, ['date', 'ticker', 'adj_close']]
    price_line = go.Scatter(
        x=symbol_rows['date'],
        y=symbol_rows['adj_close'],
        mode='lines',
        name=symbol,
    )
    fig.add_trace(price_line)

fig.show()

### Simulate Portfolio

This cell simulates a stock trading app portfolio:
- Starts with a set amount of cash reserves in the bank
- Trades stocks daily based on the ticker value
- Adds daily trade columns: trade_price, trade_value, current_shares
- Adjusts the total portfolio value and bank cash balance

In [None]:
import pandas as pd
import numpy as np
import math
from utils import generate_varying_amplitude_wave

# Load the cleaned price history produced by the clean-up cell.
clean_stock_prices_filepath = r"./data/social_media_stocks_2012-2022.clean.csv"
df = pd.read_csv(
    clean_stock_prices_filepath,
    header=0,
    parse_dates=[0],
    date_format='%Y-%m-%d',
    on_bad_lines='skip',
)

print(f"read source file. records: {len(df)}")
display(df.head())

# Initialize starting parameters
start_shares = {'FB': 100, 'TWTR': 200, 'PINS': 100, 'SNAP': 150, 'ETSY': 50}
initial_cash_reserve = 50000.00  # Starting cash reserve for the portfolio
scale = 1.0

dfs = []
# For each ticker:
#   - generate a wave describing the shares held in the portfolio over time
#   - derive daily trades from the day-to-day change in shares held
#   - price each trade somewhere inside that day's low/high range
for ticker in df['ticker'].unique():
    # drop=True keeps the old row labels from being added as an 'index' column
    xdf = df[df['ticker'] == ticker].copy().reset_index(drop=True)
    print(f"ticker: {ticker} (len: {len(xdf)})")
    starting_shares = start_shares[ticker]
    # max amplitude is a random 1.5-2.0x multiple of the starting shares
    wave = generate_varying_amplitude_wave(
        length=len(xdf),
        max_amp=int(starting_shares * (1 + np.random.uniform(0.5, 1))),
        frequency=2,
        periods=3,
    )
    # shift the wave up by the number of starting shares
    current_shares = pd.Series(wave + starting_shares).round(4) * scale
    xdf['current_shares'] = current_shares
    # daily trades are the change in shares held; the first day has no trade
    xdf['daily_trades'] = current_shares.diff().fillna(0).round(4)
    # pick a random trade price inside each day's [low, high] range
    xdf['trade_price'] = np.random.uniform(
        low=xdf['low'].to_numpy(), high=xdf['high'].to_numpy()
    ).round(4)
    # cash effect of the trade in $: buying shares (positive trades) costs cash
    xdf['trade_value'] = -1 * (xdf['daily_trades'] * xdf['trade_price']).round(4)
    dfs.append(xdf)

# concatenate tickers dataframes together and sort chronologically
df = pd.concat(dfs, ignore_index=True)
df = df.sort_values(by=['date', 'ticker'], ignore_index=True)

# Cash reserves are a running balance: the starting cash plus every trade's
# cash effect up to and including the current row.
df['cash_reserves'] = initial_cash_reserve + df['trade_value'].cumsum()

# Portfolio value at each row is the sum of the most recent market value
# (shares held * adjusted close) seen so far for every ticker.
position_values = (df['current_shares'] * df['adj_close']).round(4)
portfolio = {}
portfolio_values = []
total_rows = len(df)
current_progress_percentage = 0
print("Processing cash reserves & portfolio values", end='')
for row_number, (ticker, position_value) in enumerate(zip(df['ticker'], position_values), start=1):
    portfolio[ticker] = position_value
    portfolio_values.append(round(sum(portfolio.values()), ndigits=4))
    # print one dot per completed tenth of the rows
    tmp_percentage = math.floor(row_number / total_rows * 10)
    if current_progress_percentage != tmp_percentage:
        print('.', end='', flush=True)
        current_progress_percentage = tmp_percentage
print()
df['portfolio_value'] = pd.Series(portfolio_values, index=df.index, dtype='float64')

# round cash reserves and portfolio values to cents
df['cash_reserves'] = df['cash_reserves'].round(2)
df['portfolio_value'] = df['portfolio_value'].round(2)

# output to file
output_file = r"./data/social_media_stocks_2012-2022.final.csv"
df.to_csv(output_file, index=False)
print("Generation complete: ")
display(df.sample(n=20))

# print("\nchecking null values:")
# df.isna().sum()


Visualize daily shares using plotly

In [None]:
import plotly.graph_objs as go
import plotly.io as pio

# Chart the market value of each holding (shares held * adjusted close) per ticker
pio.templates.default = 'plotly_dark'
fig = go.Figure()

# One trace per ticker, in order of first appearance in the dataframe
tickers = list(df['ticker'].unique())
for symbol in tickers:
    # Keep only this ticker's rows and the columns needed for the chart
    symbol_rows = df.loc[df['ticker'] == symbol, ['date', 'ticker', 'current_shares', 'adj_close']]
    holding_value = symbol_rows['current_shares'] * symbol_rows['adj_close']
    fig.add_trace(go.Scatter(x=symbol_rows['date'], y=holding_value, mode='lines', name=symbol))

fig.show()

# Second figure: the combined portfolio value over time
fig2 = go.Figure()
total_value_line = go.Scatter(
    x=df['date'],
    y=df['portfolio_value'],
    mode='lines',
    name='Portfolio Value',
)
fig2.add_trace(total_value_line)
fig2.show()

## Sandbox

A sandbox for testing values

In [None]:
# display(df['ticker'].value_counts())

# display(df[['year', 'ticker', 'volume']].groupby(['year', 'ticker']).agg(['count']))

# display(df[df['year'].isin(list(range(2017, 2023)))]['ticker'].value_counts())
# display(df[df['year'].isin(list(range(2017, 2023)))].shape)


# Show which tickers are present in the dataframe
known_tickers = df['ticker'].unique()
display(known_tickers)
# Print the earliest reported date for each ticker
first_reported = df[['ticker', 'date']].groupby('ticker').agg(['min'])
print(first_reported.to_string())

Create a short preview of the csv:

In [14]:
# Write the first 50 rows from 2020 as a small preview file
preview_rows = df.loc[df['year'] == 2020].head(50)
preview_rows.to_csv(r"./data/sample_social_media.csv", index=False)

## Large Historical Data File Prep

In [14]:
import pandas as pd
import numpy as np


# List of top performing tech companies stock symbols
TECH_SYMBOLS = [
    "AAPL",   # Apple Inc.
    "MSFT",   # Microsoft Corporation
    "GOOGL",  # Alphabet Inc. (Class A)
    "GOOG",   # Alphabet Inc. (Class C)
    "AMZN",   # Amazon.com Inc.
    "FB",     # Meta Platforms, Inc. (formerly Facebook)
    "NFLX",   # Netflix, Inc.
    "NVDA",   # NVIDIA Corporation
    "TSLA",   # Tesla, Inc.
    "INTC",   # Intel Corporation
    "CSCO",   # Cisco Systems, Inc.
    "ADBE",   # Adobe Inc.
    "ORCL",   # Oracle Corporation
    "IBM",    # International Business Machines Corporation
    "CRM",    # Salesforce, Inc.
    "PYPL",   # PayPal Holdings, Inc.
    "AMD",    # Advanced Micro Devices, Inc.
    "TXN",    # Texas Instruments Incorporated
    "QCOM",   # Qualcomm Incorporated
    "AVGO",   # Broadcom Inc.
    "SHOP",   # Shopify Inc.
    "SNAP",   # Snap Inc.
    "TWTR",   # Twitter, Inc.
    "SQ",     # Block, Inc. (formerly Square)
    "DOCU"    # DocuSign, Inc.
]

# Columns written to the output file, in output order. Selecting them by name
# keeps header and data aligned even if the source file orders them differently.
OUTPUT_COLUMNS = ["ticker", "open", "close", "adj_close", "low", "high", "volume", "date"]

data_file = r"./data/historical_stock_prices.csv"
output_file = r"data/historical_tech_stock_prices.csv"

# Stream the large source file in chunks and keep only the tech tickers from
# 2012 onwards; only the (much smaller) filtered rows are held in memory.
filtered_chunks = []
for chunk in pd.read_csv(data_file, chunksize=10000):
    # dates are ISO formatted strings, so string comparison orders them correctly
    wanted = chunk['ticker'].isin(TECH_SYMBOLS) & (chunk['date'] >= '2012-01-01')
    filtered_chunks.append(chunk.loc[wanted, OUTPUT_COLUMNS])

# An empty source file yields no chunks at all; still emit a header-only file.
if filtered_chunks:
    df = pd.concat(filtered_chunks, ignore_index=True)
else:
    df = pd.DataFrame(columns=OUTPUT_COLUMNS)

# sort by date and ticker, then write the result once
df.sort_values(by=['date', 'ticker'], ignore_index=True, inplace=True)
df.to_csv(output_file, index=False, header=True)


In [None]:
# Re-load the filtered tech prices and check which requested tickers made it in.
df = pd.read_csv(output_file)
# Build the set of tickers once so each membership test below is O(1)
present_tickers = set(df['ticker'].unique())
print(f"unique tickers: {len(present_tickers)} all: {len(TECH_SYMBOLS)}")
# requested symbols that have no rows in the output, in TECH_SYMBOLS order
print(f"missing tickers: {[x for x in TECH_SYMBOLS if x not in present_tickers]}")
# first and last date available per ticker (rendered as the cell's output)
df[['ticker', 'date']].groupby(by='ticker').agg(['min', 'max'])

## Simulate Daily Stock Trades 

Based on the cleaned up tech historical prices (above), generate a series of daily trades performed by different brokers.

- Trade sizes follow a **logarithmic** daily distribution, so some stocks are traded in larger quantities than others

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta, datetime

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Generate brokers
broker_names = [
    "Slick Sam", "Trading Tina", "Money Mike", "Clever Cathy", "Profit Pete",
    "Risky Rachel", "Big Bucks Bob", "Smart Susan", "Lucky Luke"
]
brokers = broker_names
# brokers = [f"broker {i+1}: {name}" for i, name in enumerate(broker_names)]

# Parameters
min_trades_per_day, max_trades_per_day = 50, 200
# fraction of a ticker's (weighted) daily volume moved by a single trade
share_prct_range = (0.00001, 0.0001)

# Read historical stock price data
data = pd.read_csv("data/historical_tech_stock_prices.csv", parse_dates=['date'])

# Create a list to hold generated trades
trades = []

# Get min and max dates from the data
min_date = data['date'].min()
max_date = data['date'].max()
print(f"generating between: {min_date} - {max_date}")

# Walk the data one trading day at a time. Grouping by date visits only days
# that have rows, in chronological order, without re-scanning the whole frame
# for every calendar day.
for current_date, day_data in data.groupby('date', sort=True):
    # Give each row of the day a weight on a smooth logarithmic curve from 1.0
    # down to 0.1, in row order, so earlier rows get larger trades. Sizing by
    # row count keeps this valid even if a ticker appears twice on a day.
    log_weights = np.logspace(0, -1, num=len(day_data))
    day_data = day_data.assign(weight=log_weights)

    # For each ticker row in the current day, generate a series of trades
    for row in day_data.itertuples(index=False):
        # Get number of trades for the ticker on this day
        num_trades = random.randint(min_trades_per_day, max_trades_per_day)

        # Generate trades for the ticker
        for _ in range(num_trades):
            # Random broker
            broker = random.choice(brokers)

            # Random timestamp within the current date
            trade_timestamp = current_date + timedelta(seconds=random.randint(0, 86399))

            # Bid anywhere in the day's range; ask between the bid and the high.
            # Drawing the ask directly from [bid, high] gives the same
            # distribution as re-drawing until ask > bid, but cannot loop
            # forever when low == high.
            bid_price = np.random.uniform(row.low, row.high)
            ask_price = np.random.uniform(bid_price, row.high)

            # Trade price between bid and ask
            trade_price = np.random.uniform(bid_price, ask_price)

            # Bid-ask spread
            bid_spread = ask_price - bid_price

            # Determine number of shares based on the logarithmic weight and the day's volume
            shares = int(row.volume * row.weight * np.random.uniform(*share_prct_range))

            # Calculate trade value
            trade_value = round(trade_price * shares, ndigits=6)

            # Create the trade record
            trade = {
                'trade_timestamp': trade_timestamp,
                'ticker': row.ticker,
                'broker': broker,
                'bid_price': round(bid_price, 4),
                'ask_price': round(ask_price, 4),
                'trade_price': round(trade_price, 4),
                'bid_spread': round(bid_spread, 4),
                'shares': shares,
                'trade_value': round(trade_value, 4),
                # meta columns
                'open': round(row.open, 6),
                # NOTE(review): 'close' is filled from adj_close, not close —
                # confirm this is intended.
                'close': round(row.adj_close, 6),
                'date': row.date,
            }

            # Append the trade record to the trades list
            trades.append(trade)
    # finished current date
    print(f"finished date: {current_date}")

# Convert the list of trades into a pandas DataFrame
trades_df = pd.DataFrame(trades)
# sort values
# trades_df.sort_values(by=['trade_timestamp', 'ticker', 'broker'], ignore_index=True, inplace=True)

# Output the generated trades DataFrame to a CSV file
trades_df.to_csv("data/tech_trades.csv", index=False)

print("Trades simulation complete. Output saved to data/tech_trades.csv")



In [5]:
import pandas as pd

# Order the generated trades chronologically (ties broken by ticker, then broker)
df = pd.read_csv("data/tech_trades.csv").sort_values(
    by=['trade_timestamp', 'ticker', 'broker'],
    ignore_index=True,
)
# Discard ids left over from an earlier run so the numbering is regenerated
if 'trade_id' in df.columns:
    df = df.drop(columns=['trade_id'])
# Number trades sequentially starting at 100000, as the first column
df.insert(0, 'trade_id', df.index + 100000)
df.to_csv("data/tech_trades.csv", index=False)

In [3]:
"""
Rewrite the entire final data file
    - remove the "Broker #: " prefix
"""
import pandas as pd

# Load every row of the trades file: it is written back to the same path below,
# so reading only a sample here would truncate the dataset to that sample.
df = pd.read_csv("data/tech_trades.csv")
# Keep only the text after the first ': ' separator; names that carry no
# prefix are left unchanged instead of raising.
df['broker'] = df['broker'].str.split(': ', n=1).str[-1]
df.to_csv("data/tech_trades.csv", index=False)
# display(df)

In [None]:
import pandas as pd
import numpy as np

# Sample the first 10,000 trades and map each column to a simplified type name
df = pd.read_csv("data/tech_trades.csv", nrows=10000)

# pandas dtype name -> simplified name; any other dtype keeps its own name
dtype_aliases = {
    'O': 'string',
    'object': 'string',
    'float64': 'float',
    'int64': 'integer',
}
schema = {}
for column, dtype in df.dtypes.items():
    dtype_name = str(dtype)
    schema[column] = dtype_aliases.get(dtype_name, dtype_name)

print(schema)