# Data Preparation

Prepare historical stock prices datasets for demo analysis.

## Social Media Stock Prices

Major social media historical stock prices from 2012-2022 for meta, twitter, snap, pinterest, etsy.

**Source**: [Kaggle: Major social media historical stock prices](https://www.kaggle.com/datasets/prasertk/social-media-stock-prices)

Sample:
```csv
Date,Symbol,Adj Close,Close,High,Low,Open,Volume
2012-05-18,FB,38.22999954223633,38.22999954223633,45.0,38.0,42.04999923706055,573576400.0
2012-05-21,FB,34.029998779296875,34.029998779296875,36.65999984741211,33.0,36.529998779296875,168192700.0
2012-05-22,FB,31.0,31.0,33.59000015258789,30.940000534057617,32.61000061035156,101786600.0
2012-05-23,FB,32.0,32.0,32.5,31.360000610351562,31.3700008392334,73600000.0
2012-05-24,FB,33.029998779296875,33.029998779296875,33.209999084472656,31.770000457763672,32.95000076293945,50237200.0
2012-05-25,FB,31.90999984741211,31.90999984741211,32.95000076293945,31.110000610351562,32.900001525878906,37149800.0
```

### Clean up Source Dataset

Load and cleanup the original dataset:
- Massage data types
- Rename columns
- Add additional metadata columns

Save the result back to a new `csv` file: 

In [8]:
import pandas as pd
import numpy as np
from datetime import datetime, date


# load social media stock prices data file
orig_social_media_stock_prices = r"./data/social_media_stocks_2012-2022.csv"
df = pd.read_csv(
    orig_social_media_stock_prices,
    header=0,
    parse_dates=[0],
    date_format='%Y-%m-%d',
    on_bad_lines='skip',
)

# rename columns
[df.rename(columns={col_name: str(col_name).lower().replace(' ', '_')}, inplace=True) for col_name in list(df.columns)]
df.rename(columns={'symbol': 'ticker'}, inplace=True, errors='ignore')
# optimize datatypes and columns for performance
for col_name in ('adj_close', 'close', 'high', 'low', 'open'):
    # df[col_name] = pd.to_numeric(df[col_name].map(lambda x: round(x, ndigits=6)), downcast='float')
    df[col_name] = df[col_name].map(lambda x: round(x, ndigits=6))
# downcast volume
df['volume'] = pd.to_numeric(df['volume'], downcast='unsigned')
# add a year column
df.insert(1, 'year', pd.to_numeric(df['date'].map(lambda x: x.year), downcast='unsigned'))

print(f"df shape: {df.shape}")
# print(df.dtypes)
display(df.sample(n=10))

# save back to a csv file
output_file = r"./data/social_media_stocks_2012-2022.clean.csv"
df.to_csv(output_file, index=False)

df shape: (8193, 9)


Unnamed: 0,date,year,ticker,adj_close,close,high,low,open,volume
1050,2015-03-18,2015,FB,80.910004,80.910004,81.239998,79.169998,79.25,36912400
720,2014-07-22,2014,FB,69.269997,69.269997,69.769997,68.610001,69.760002,40398000
7672,2021-09-09,2021,ETSY,216.630005,216.630005,218.949997,214.050003,214.330002,1346100
7807,2021-10-18,2021,ETSY,228.0,228.0,230.843002,219.289993,220.940002,2519300
2874,2017-07-12,2017,TWTR,19.25,19.25,19.540001,18.91,18.92,30008100
67,2012-08-23,2012,FB,19.440001,19.440001,19.73,19.360001,19.5,32813700
3181,2017-10-30,2017,FB,179.869995,179.869995,180.690002,177.610001,179.259995,24353200
7733,2021-09-28,2021,FB,340.649994,340.649994,349.600006,338.920013,347.970001,21710300
7852,2021-10-29,2021,ETSY,250.690002,250.690002,252.764008,238.139999,239.470001,1817900
7515,2021-07-27,2021,PINS,74.290001,74.290001,77.290001,73.315002,75.699997,7160900


Plot and visualize the original stock values over time

In [10]:
import plotly.graph_objs as go
import plotly.io as pio

# Plotting the wave and amps using Plotly
fig = go.Figure()

# iterate through tickers and graphs each with line charts
tickers = list(df['ticker'].unique())
for ticker in tickers:      
    # Add the wave trace
    xdf = df[df['ticker'] == ticker][['date', 'ticker', 'adj_close']]
    fig.add_trace(go.Scatter(x=xdf['date'], y=xdf['adj_close'], mode='lines', name=ticker))

fig.show()

In [None]:
# Adjusting the code to first process each ticker individually, generate the trade data,
# and then calculate cash reserves and overall portfolio value in a second pass.

df = init_df.copy(deep=True)

# Initialize starting parameters
start_shares = {'FB': 100, 'TWTR': 200, 'PINS': 100, 'SNAP': 150, 'ETSY': 50}
initial_cash_reserve = 500000.000000  # Starting cash reserve for the portfolio

# Add the new columns to the dataframe without filling values yet
df['starting_shares'] = 0
df['current_shares'] = 0
df['daily_trades'] = 0
df['trade_price'] = 0.0
df['trade_value'] = 0.0
df['portfolio_share_value'] = 0.0
df['last_purchase_value'] = 0.0

tickers = list(df['ticker'].unique())

# Iterate through each ticker and simulate trades
for ticker in tickers:
    print(ticker, end='')
    row_num = 0
    last_progress_percentage = 0
    ticker_data = df[df['ticker'] == ticker].sort_values(by='date')
    
    # Set initial values for each ticker
    initial_shares = start_shares[ticker]
    last_purchase_value = ticker_data.iloc[0]['open']  # Start with the opening price for the first trade
    
    df.loc[ticker_data.index, 'starting_shares'] = initial_shares
    df.loc[ticker_data.index, 'current_shares'] = initial_shares
    df.loc[ticker_data.index, 'last_purchase_value'] = last_purchase_value
    
    # Simulate daily trades for this ticker
    for i in range(1, len(ticker_data)):
        previous_row = ticker_data.iloc[i - 1]
        current_row = ticker_data.iloc[i]
        
        # Generate a random trade price between the low and high for that day
        trade_price = round(np.random.uniform(current_row['low'], current_row['high']), 4)
        
        # Randomly decide whether to buy or sell
        trade_action = np.random.choice([-1, 1], p=[0.5, 0.5])  # Randomly decide to buy (-1) or sell (1)
        trade_amount = np.random.randint(1, 5)  # Trade between 1 to 5 shares
        
        # Calculate trade value = trade_amount * trade_price
        trade_value = round(trade_amount * trade_price, 4)
        
        # Update shares and trade info based on trade action
        if trade_action == -1 and previous_row['current_shares'] > 0:  # Sell
            df.loc[ticker_data.index[i], 'daily_trades'] = -trade_amount
            df.loc[ticker_data.index[i], 'current_shares'] = max(0, previous_row['current_shares'] - trade_amount)
        elif trade_action == 1:  # Buy
            df.loc[ticker_data.index[i], 'daily_trades'] = trade_amount
            df.loc[ticker_data.index[i], 'current_shares'] = previous_row['current_shares'] + trade_amount
            last_purchase_value = trade_price  # Update last purchase price for buying action
        else:
            df.loc[ticker_data.index[i], 'current_shares'] = previous_row['current_shares']
        
        # Update the trade details
        df.loc[ticker_data.index[i], 'trade_price'] = trade_price
        df.loc[ticker_data.index[i], 'trade_value'] = trade_value
        df.loc[ticker_data.index[i], 'last_purchase_value'] = last_purchase_value
        
        # Calculate portfolio share value = current shares * adj_close
        df.loc[ticker_data.index[i], 'portfolio_share_value'] = df.loc[ticker_data.index[i], 'current_shares'] * current_row['adj_close']

        # updating progress count
        row_num += 1
        current_progress_percentage = round((row_num / len(ticker_data)) * 10, 0)
        if current_progress_percentage != last_progress_percentage:
            print('.', end='')
            last_progress_percentage = current_progress_percentage

    # print a new line!
    print()

# Now process the cash reserves and total portfolio values by iterating through all rows
df = df.sort_values(by=['date', 'ticker']).reset_index(drop=True)

df['cash_reserves'] = initial_cash_reserve
df['portfolio_value'] = 0.000000

for i in range(1, len(df)):
    previous_row = df.iloc[i - 1]
    current_row = df.iloc[i]
    
    # Update cash reserves based on trade value
    if df.loc[i, 'daily_trades'] > 0:  # Buying
        df.loc[i, 'cash_reserves'] = previous_row['cash_reserves'] - df.loc[i, 'trade_value']
    elif df.loc[i, 'daily_trades'] < 0:  # Selling
        df.loc[i, 'cash_reserves'] = previous_row['cash_reserves'] + df.loc[i, 'trade_value']
    else:
        df.loc[i, 'cash_reserves'] = previous_row['cash_reserves']
    
    # Calculate total portfolio value = sum of portfolio share value + cash reserves
    df.loc[i, 'portfolio_value'] = df.loc[i, 'portfolio_share_value'] + df.loc[i, 'cash_reserves']


# adjust the column data types 
# Creating a set of newly added columns
new_columns = {"starting_shares", "current_shares", "daily_trades", "trade_price", 
               "trade_value", "portfolio_share_value", "last_purchase_value", 
               "cash_reserves", "portfolio_value"}
for col_name in new_columns:
    df[col_name] = pd.to_numeric(df[col_name].map(lambda x: round(x, ndigits=6)), downcast='float')

display(df[df['year'] == 2018].iloc[:100])
# drop some columns
df = df.drop(columns=['starting_shares'])
df = df.sort_values(by=['date', 'ticker']).reset_index(drop=True)
df.to_csv('./data/social_media_stocks_plus_2012-2022.csv', index=False)

In [None]:
# display(df['ticker'].value_counts())

# display(df[['year', 'ticker', 'volume']].groupby(['year', 'ticker']).agg(['count']))

# display(df[df['year'].isin(list(range(2017, 2023)))]['ticker'].value_counts())
# display(df[df['year'].isin(list(range(2017, 2023)))].shape)


display(df['ticker'].unique())
# display the first date where each ticker is being reported
print(df[['ticker', 'date']].groupby('ticker').agg(['min']).to_string())

## Plotly Example

Testing plotly functionality

In [3]:
import plotly.express as px
import plotly.io as pio

# Set the theme to 'plotly_dark' for a modern dark background theme
pio.templates.default = "plotly_dark"

# Sample data
df = pd.DataFrame({
    'x': range(10),
    'y': [2, 1, 4, 3, 5, 7, 6, 8, 9, 10],
    'category': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B']
})

# Create a line chart with vibrant colors
fig = px.line(df, x='x', y='y', color='category', title="Vibrant Line Chart Example")

# Show the figure in the notebook
fig.show()


## Sinusoidal Wave

A python function that generates a smooth sinusoidal wave with varying amplitude and frequency. This is used to simulate market movement OR our trading algorithm buying and selling actions over a stock. The idea is that this type of action would graph well with smooth visible changes over time:

In [78]:
import numpy as np
import plotly.graph_objs as go
import plotly.io as pio
import math

seed = 30
length = 1000
frequency = 2
phase = 0.0
min_amp = -3.0
max_amp = 5.0
periods = 3
# frequency = (1 / periods)

np.random.seed(seed)
    
# Time vector
time = np.linspace(0, periods * np.pi, 1000)

# Generate the base sinusoidal wave with uniform frequency and phase
wave = np.sin(frequency * time + phase)


def get_period_indices(time, frequency):
    """
    Function to return the start, mid, and end indices of each period in the time array.
    
    Parameters:
    - time (np.ndarray): The time array (linspace) generated for the wave.
    - frequency (float): The frequency of the wave.
    
    Returns:
    - list of tuples: A list of tuples, where each tuple contains the start, mid, and end indices for each period.
    """
    # Calculate the number of points per period
    points_per_period = len(time) // frequency

    # Initialize the list of tuples to store start, mid, and end indices
    period_indices = []

    # Loop over each period and calculate the start, mid, and end indices
    for i in range(int(frequency)):
        start = int(i * points_per_period)
        end = int((i + 1) * points_per_period) - 1
        mid = (start + end) // 2
        period_indices.append((start, mid, end))
    
    return period_indices

# Call the function and get the indices
period_indices = get_period_indices(time, periods)

# Output the list of tuples (starting, midpoint, and ending indices)
# display(period_indices)

wave2 = wave.copy()

for period in period_indices:
    start, mid, end = period
    high = np.random.randint(1, math.floor(max_amp))
    low = np.random.randint(1, math.floor(max_amp))
    period_length = end - start + 1
    print(f"({period}): ({(high, low)})")
    wave2[start:(mid +1)] *= high
    wave2[(mid +1):(end +1)] *= low
    # Modify wave values by multiplying element by element
    # ts = np.linspace(0, 2 * np.pi, period_length) * high
    # wave2[start:(end +1)] *= ts


# wave2 = wave * amps

# Plotting the wave and amps using Plotly
fig = go.Figure()

# Add the wave trace
fig.add_trace(go.Scatter(x=time, y=wave, mode='lines', name='Wave'))

# # Add the amps trace (as a step plot to visualize the amplitude changes over time)
# fig.add_trace(go.Scatter(x=time, y=np.interp(time, np.linspace(0, 2 * np.pi, num_cycles * 2), amps), 
#                          mode='lines', name='Amplitudes'))

# Add the amps trace (as a step plot to visualize the amplitude changes over time)
fig.add_trace(go.Scatter(x=time, y=wave2, mode='lines', name='Amplitudes'))

# Add title and labels
fig.update_layout(
    title="Wave and Varying Amplitudes",
    xaxis_title="Time",
    yaxis_title="Amplitude",
    template="plotly_dark"
)

# Show the figure
fig.show()


((0, 166, 332)): ((2, 2))
((333, 499, 665)): ((2, 2))
((666, 832, 998)): ((1, 1))
