In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re
import glob
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, RidgeClassifierCV, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [2]:
insiders = pd.read_csv('qq_beta-live-insiders.csv')

In [34]:
prices = pd.read_csv('../data/av_query_ZWS.csv', skiprows=5)
prices['Date'] = pd.to_datetime(prices['5. Time Zone'])
prices.drop(columns=['US/Eastern', '5. Time Zone'], inplace=True)
prices.head()

Unnamed: 0,Unnamed: 2,Date
0,"{'1. open': '29.9500', '2. high': '30.0700', '...",2024-01-26
1,"{'1. open': '29.0200', '2. high': '29.8200', '...",2024-01-25
2,"{'1. open': '29.4300', '2. high': '29.4300', '...",2024-01-24
3,"{'1. open': '29.5500', '2. high': '29.6400', '...",2024-01-23
4,"{'1. open': '29.1500', '2. high': '29.5700', '...",2024-01-22


In [35]:
import json
prices['Open'] = prices['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['1. open'].replace(',', '')))
prices['High'] = prices['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['2. high'].replace(',', '')))
prices['Low'] = prices['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['3. low'].replace(',', '')))
prices['Close'] = prices['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['4. close'].replace(',', '')))
prices['Volume'] = prices['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['5. volume'].replace(',', '')))
prices.drop(columns=['Unnamed: 2'], inplace=True)
prices.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2024-01-26,29.95,30.07,29.48,29.68,1026479.0
1,2024-01-25,29.02,29.82,28.96,29.81,1190017.0
2,2024-01-24,29.43,29.43,28.53,28.6,1082118.0
3,2024-01-23,29.55,29.64,28.94,29.17,903778.0
4,2024-01-22,29.15,29.57,29.03,29.42,860426.0


In [38]:
insiders.dropna(axis=0, inplace=True)
insiders['Date'] = pd.to_datetime(insiders['Date'])
insiders['Ticker'] = insiders['Ticker'].astype(str)
insiders['Name'] = insiders['Name'].astype(str).str.lower()
insiders['fileDate'] = pd.to_datetime(insiders['fileDate'])
insiders['total_value'] = insiders['Shares'] * insiders['PricePerShare']
insiders.drop(insiders[insiders['Shares'] == 0].index, inplace=True)
insiders.drop(insiders[insiders['PricePerShare'] == 0].index, inplace=True)
insiders['log_total_value'] = np.log(insiders['total_value'])
insiders['TraderFrequency'] = insiders.groupby('Name')['Name'].transform('count')
insiders.head()

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Name,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,log_total_value,TraderFrequency
0,0,NTRA,2023-12-29,brophy michael burkes,D,S,1377.0,62.29,64810.0,2023-12-30 02:45:08,85773.33,11.359463,2
1,1,NTRA,2023-12-28,brophy michael burkes,D,S,915.0,63.2923,66187.0,2023-12-30 02:45:08,57912.4545,10.966688,2
2,2,EAF,2023-12-29,bcp gp ltd,D,S,34111.0,2.2164,27370523.0,2023-12-30 02:38:57,75603.6204,11.233259,150
3,3,EAF,2023-12-29,bcp gp ltd,D,S,2.0,2.2164,27275612.0,2023-12-30 02:38:57,4.4328,1.489031,150
4,4,EAF,2023-12-29,bcp gp ltd,D,S,3.0,2.2164,27275614.0,2023-12-30 02:38:57,6.6492,1.894497,150


In [39]:
insiders.drop(columns=['Unnamed: 0'], inplace=True)

In [97]:
insiders['change_in_holdings'] = (insiders['Shares'] / insiders['SharesOwnedFollowing']) * 100 
insiders.head()

Unnamed: 0,Ticker,Date,Name,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,log_total_value,TraderFrequency,...,price_3_week,price_4_week,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week
0,NTRA,2023-12-29,brophy michael burkes,1377.0,62.29,64810.0,2023-12-30 02:45:08,85773.33,11.359463,2.0,...,66.63,65.44,,,,,,,,
1,NTRA,2023-12-28,brophy michael burkes,915.0,63.2923,66187.0,2023-12-30 02:45:08,57912.4545,10.966688,2.0,...,67.56,64.62,,,,,,,,
2,EAF,2023-12-29,bcp gp ltd,34111.0,2.2164,27370523.0,2023-12-30 02:38:57,75603.6204,11.233259,150.0,...,1.5,1.53,,,,,,,,
3,EAF,2023-12-29,bcp gp ltd,2.0,2.2164,27275612.0,2023-12-30 02:38:57,4.4328,1.489031,150.0,...,1.5,1.53,,,,,,,,
4,EAF,2023-12-29,bcp gp ltd,3.0,2.2164,27275614.0,2023-12-30 02:38:57,6.6492,1.894497,150.0,...,1.5,1.53,,,,,,,,


In [42]:

grpuped = insiders.groupby(['Name', 'Ticker', 'Date'])#.agg({'Shares': 'sum', 'PricePerShare': 'mean', 'SharesOwnedFollowing': 'mean', 'total_value': 'sum', 'TraderFrequency': 'mean', 'change_in_holdings': 'mean'})
grpuped.head()

Unnamed: 0,Ticker,Date,Name,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,log_total_value,TraderFrequency,change_in_holdings
0,NTRA,2023-12-29,brophy michael burkes,D,S,1377.0,62.2900,64810.0,2023-12-30 02:45:08,8.577333e+04,11.359463,2,2.124672
1,NTRA,2023-12-28,brophy michael burkes,D,S,915.0,63.2923,66187.0,2023-12-30 02:45:08,5.791245e+04,10.966688,2,1.382447
2,EAF,2023-12-29,bcp gp ltd,D,S,34111.0,2.2164,27370523.0,2023-12-30 02:38:57,7.560362e+04,11.233259,150,0.124627
3,EAF,2023-12-29,bcp gp ltd,D,S,2.0,2.2164,27275612.0,2023-12-30 02:38:57,4.432800e+00,1.489031,150,0.000007
4,EAF,2023-12-29,bcp gp ltd,D,S,3.0,2.2164,27275614.0,2023-12-30 02:38:57,6.649200e+00,1.894497,150,0.000011
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,SFBC,2023-10-27,stilwell joseph,A,P,4470.0,35.7758,198342.0,2023-10-30 16:30:00,1.599178e+05,11.982415,35,2.253683
19996,NVR,2023-10-26,malzahn daniel david,D,S,356.0,5453.1000,10135.0,2023-10-30 16:03:15,1.941304e+06,14.478870,9,3.512580
19997,BOTJ,2023-10-26,alford john r jr,A,P,400.0,9.8500,21190.0,2023-10-30 15:48:47,3.940000e+03,8.278936,1,1.887683
19998,BUKS,2023-10-27,daly joseph patrick,A,P,5000.0,0.6800,3605000.0,2023-10-30 15:45:50,3.400000e+03,8.131531,45,0.138696


In [44]:
insiders['individual_transactions_per_trade'] = insiders.groupby(['Name', 'Ticker'])['Name'].transform('count')
insiders['investors_per_trade'] = insiders.groupby(['TransactionCode', 'Ticker'])['Name'].transform('nunique')
insiders.head()

Unnamed: 0,Ticker,Date,Name,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,log_total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade
0,NTRA,2023-12-29,brophy michael burkes,D,S,1377.0,62.29,64810.0,2023-12-30 02:45:08,85773.33,11.359463,2,2.124672,2,7
1,NTRA,2023-12-28,brophy michael burkes,D,S,915.0,63.2923,66187.0,2023-12-30 02:45:08,57912.4545,10.966688,2,1.382447,2,7
2,EAF,2023-12-29,bcp gp ltd,D,S,34111.0,2.2164,27370523.0,2023-12-30 02:38:57,75603.6204,11.233259,150,0.124627,150,1
3,EAF,2023-12-29,bcp gp ltd,D,S,2.0,2.2164,27275612.0,2023-12-30 02:38:57,4.4328,1.489031,150,7e-06,150,1
4,EAF,2023-12-29,bcp gp ltd,D,S,3.0,2.2164,27275614.0,2023-12-30 02:38:57,6.6492,1.894497,150,1.1e-05,150,1


In [47]:
insiders.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19815 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Ticker                             19815 non-null  object        
 1   Date                               19815 non-null  datetime64[ns]
 2   Name                               19815 non-null  object        
 3   AcquiredDisposedCode               19815 non-null  object        
 4   TransactionCode                    19815 non-null  object        
 5   Shares                             19815 non-null  float64       
 6   PricePerShare                      19815 non-null  float64       
 7   SharesOwnedFollowing               19815 non-null  float64       
 8   fileDate                           19815 non-null  datetime64[ns]
 9   total_value                        19815 non-null  float64       
 10  log_total_value                    1981

#### No need for an Imputer

In [None]:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy='median')

In [60]:
insiders.columns

Index([                           'Ticker',
                                    'Date',
                                    'Name',
                    'AcquiredDisposedCode',
                         'TransactionCode',
                                  'Shares',
                           'PricePerShare',
                    'SharesOwnedFollowing',
                                'fileDate',
                             'total_value',
                         'log_total_value',
                         'TraderFrequency',
                      'change_in_holdings',
       'individual_transactions_per_trade',
                     'investors_per_trade',
                                       'P',
                                       'S',
                                       'P',
                                       'S',
                                       nan,
                                       'P',
                                       'S',
                                

In [62]:
insiders = insiders.iloc[:, :-1]
insiders.head()

Unnamed: 0,Ticker,Date,Name,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,log_total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,P
0,NTRA,2023-12-29,brophy michael burkes,D,S,1377.0,62.29,64810.0,2023-12-30 02:45:08,85773.33,11.359463,2.0,2.124672,2.0,7.0,0.0
1,NTRA,2023-12-28,brophy michael burkes,D,S,915.0,63.2923,66187.0,2023-12-30 02:45:08,57912.4545,10.966688,2.0,1.382447,2.0,7.0,0.0
2,EAF,2023-12-29,bcp gp ltd,D,S,34111.0,2.2164,27370523.0,2023-12-30 02:38:57,75603.6204,11.233259,150.0,0.124627,150.0,1.0,0.0
3,EAF,2023-12-29,bcp gp ltd,D,S,2.0,2.2164,27275612.0,2023-12-30 02:38:57,4.4328,1.489031,150.0,7e-06,150.0,1.0,0.0
4,EAF,2023-12-29,bcp gp ltd,D,S,3.0,2.2164,27275614.0,2023-12-30 02:38:57,6.6492,1.894497,150.0,1.1e-05,150.0,1.0,0.0


In [51]:
onehot = OneHotEncoder(sparse=False)
transaction_onehot = onehot.fit_transform(insiders[['TransactionCode']])
transaction_onehot = pd.DataFrame(transaction_onehot, columns=onehot.categories_[0])
insiders = pd.concat([insiders, transaction_onehot], axis=1)
insiders.head()



Unnamed: 0,Ticker,Date,Name,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,...,individual_transactions_per_trade,investors_per_trade,P,S,P.1,S.1,NaN,P.2,S.2,NaN.1
0,NTRA,2023-12-29,brophy michael burkes,D,S,1377.0,62.29,64810.0,2023-12-30 02:45:08,85773.33,...,2.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,NTRA,2023-12-28,brophy michael burkes,D,S,915.0,63.2923,66187.0,2023-12-30 02:45:08,57912.4545,...,2.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,EAF,2023-12-29,bcp gp ltd,D,S,34111.0,2.2164,27370523.0,2023-12-30 02:38:57,75603.6204,...,150.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,EAF,2023-12-29,bcp gp ltd,D,S,2.0,2.2164,27275612.0,2023-12-30 02:38:57,4.4328,...,150.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,EAF,2023-12-29,bcp gp ltd,D,S,3.0,2.2164,27275614.0,2023-12-30 02:38:57,6.6492,...,150.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [64]:
insiders['Purchase?'] = insiders['P']
insiders.drop(columns=['P'], inplace=True)
insiders.drop(columns=['TransactionCode', 'AcquiredDisposedCode'], inplace=True)
insiders.head()

Unnamed: 0,Ticker,Date,Name,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,log_total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,Purchase?
0,NTRA,2023-12-29,brophy michael burkes,1377.0,62.29,64810.0,2023-12-30 02:45:08,85773.33,11.359463,2.0,2.124672,2.0,7.0,0.0
1,NTRA,2023-12-28,brophy michael burkes,915.0,63.2923,66187.0,2023-12-30 02:45:08,57912.4545,10.966688,2.0,1.382447,2.0,7.0,0.0
2,EAF,2023-12-29,bcp gp ltd,34111.0,2.2164,27370523.0,2023-12-30 02:38:57,75603.6204,11.233259,150.0,0.124627,150.0,1.0,0.0
3,EAF,2023-12-29,bcp gp ltd,2.0,2.2164,27275612.0,2023-12-30 02:38:57,4.4328,1.489031,150.0,7e-06,150.0,1.0,0.0
4,EAF,2023-12-29,bcp gp ltd,3.0,2.2164,27275614.0,2023-12-30 02:38:57,6.6492,1.894497,150.0,1.1e-05,150.0,1.0,0.0


In [None]:
insiders.to_csv('insiders_cleaned_unprocessed.csv', index=False)

In [69]:
data_directory = '../data/'
combined_df = pd.DataFrame()

for filename in os.listdir(data_directory):
    if filename.startswith('av_query_') and filename.endswith('.csv'):
        file_path = os.path.join(data_directory, filename)
        print(f'Reading {file_path}')
        ticker_info = pd.read_csv(file_path, header=None, nrows=3, skiprows=1)
        ticker = ticker_info.iloc[1, 1]  # 2nd column of the 3rd row
        
        if ticker != filename[len('av_query_'):-len('.csv')]:
          print(f'Warning: filename {filename} does not match ticker {ticker}')
          
        df = pd.read_csv(file_path, skiprows=5)
        
        # Unearth from the metadata layer
        df['Date'] = pd.to_datetime(df['5. Time Zone'])
        df.drop(columns=['US/Eastern', '5. Time Zone'], inplace=True)
        df['Open'] = df['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['1. open'].replace(',', '')))
        df['High'] = df['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['2. high'].replace(',', '')))
        df['Low'] = df['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['3. low'].replace(',', '')))
        df['Close'] = df['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['4. close'].replace(',', '')))
        df['Volume'] = df['Unnamed: 2'].apply(lambda x: float(json.loads(x.replace("'", '"'))['5. volume'].replace(',', '')))
        df.drop(columns=['Unnamed: 2'], inplace=True)

        # Ticker symbol column
        df['Ticker'] = ticker

        combined_df = pd.concat([combined_df, df], ignore_index=True)
      
combined_df.to_csv('aggregated_daily_prices.csv', index=False)


Reading ../data/av_query_VNOM.csv
Reading ../data/av_query_HD.csv
Reading ../data/av_query_FOLD.csv
Reading ../data/av_query_MKTX.csv
Reading ../data/av_query_DINO.csv
Reading ../data/av_query_G.csv
Reading ../data/av_query_CINF.csv
Reading ../data/av_query_FFIV.csv
Reading ../data/av_query_TCRX.csv
Reading ../data/av_query_BMEZ.csv
Reading ../data/av_query_RELY.csv
Reading ../data/av_query_PGR.csv
Reading ../data/av_query_CRWD.csv
Reading ../data/av_query_AAON.csv
Reading ../data/av_query_NNN.csv
Reading ../data/av_query_LTHM.csv
Reading ../data/av_query_MICS.csv
Reading ../data/av_query_ET.csv
Reading ../data/av_query_PB.csv
Reading ../data/av_query_BLK.csv
Reading ../data/av_query_PPL.csv
Reading ../data/av_query_TTEK.csv
Reading ../data/av_query_CRAI.csv
Reading ../data/av_query_NXT.csv
Reading ../data/av_query_NWBI.csv
Reading ../data/av_query_TDOC.csv
Reading ../data/av_query_BBW.csv
Reading ../data/av_query_NUS.csv
Reading ../data/av_query_NWL.csv
Reading ../data/av_query_HNST.c

In [73]:
def lookup_price(ticker, date, df=combined_df):
    matched = df[(df['Ticker'] == ticker) & (df['Date'] == pd.to_datetime(date))]
    if not matched.empty:
        return matched['Close'].iloc[0]
    else:
        return np.nan

In [74]:
insiders['price_1_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=7)), axis=1)
insiders['price_2_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=14)), axis=1)
insiders['price_3_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=21)), axis=1)
insiders['price_4_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=28)), axis=1)
insiders['price_5_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=35)), axis=1)
insiders['price_6_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=42)), axis=1)
insiders['price_7_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=49)), axis=1)
insiders['price_8_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=56)), axis=1)
insiders['price_9_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=63)), axis=1)
insiders['price_10_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=70)), axis=1)
insiders['price_11_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=77)), axis=1)
insiders['price_12_week'] = insiders.apply(lambda x: lookup_price(x['Ticker'], x['Date'] + pd.Timedelta(days=84)), axis=1)

In [75]:
insiders.to_csv('insiders_with_prices.csv', index=False)

In [78]:
insiders.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 0 to 19848
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Ticker                             19815 non-null  object        
 1   Date                               19815 non-null  datetime64[ns]
 2   Name                               19815 non-null  object        
 3   Shares                             19815 non-null  float64       
 4   PricePerShare                      19815 non-null  float64       
 5   SharesOwnedFollowing               19815 non-null  float64       
 6   fileDate                           19815 non-null  datetime64[ns]
 7   total_value                        19815 non-null  float64       
 8   log_total_value                    19815 non-null  float64       
 9   TraderFrequency                    19815 non-null  float64       
 10  change_in_holdings                 1981

## Create Several Diffferent Supervised Sets to Play With:

In [91]:
insiders.drop(insiders[insiders['Shares'] == 0].index, inplace=True)
insiders.drop(insiders[insiders['PricePerShare'] == 0].index, inplace=True)

In [98]:
baseframe = insiders.copy()
baseframe.drop(columns=['Name', 'Ticker', 'Date', 'fileDate'], inplace=True)

In [120]:
whalesframe = insiders.copy()
whalesframe.drop(columns=['Name', 'Ticker', 'Date', 'fileDate'], inplace=True)

In [100]:
baseframe.info()


<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 0 to 19848
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Shares                             19815 non-null  float64
 1   PricePerShare                      19815 non-null  float64
 2   SharesOwnedFollowing               19815 non-null  float64
 3   total_value                        19815 non-null  float64
 4   log_total_value                    19815 non-null  float64
 5   TraderFrequency                    19815 non-null  float64
 6   change_in_holdings                 19815 non-null  float64
 7   individual_transactions_per_trade  19815 non-null  float64
 8   investors_per_trade                19815 non-null  float64
 9   Purchase?                          19815 non-null  float64
 10  price_1_week                       14348 non-null  float64
 11  price_2_week                       14096 non-null  float64


In [102]:
whalesframe.describe()

Unnamed: 0,Shares,PricePerShare,SharesOwnedFollowing,total_value,log_total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,Purchase?,...,price_3_week,price_4_week,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week
count,19815.0,19815.0,19815.0,19815.0,19815.0,19815.0,19815.0,19815.0,19815.0,19815.0,...,13925.0,13990.0,13142.0,11646.0,9533.0,7609.0,6148.0,4862.0,2603.0,1348.0
mean,246554.7,143.236114,7569150.0,11590470.0,11.198161,71.730356,inf,59.553621,4.018673,0.258895,...,155.898166,160.169853,163.897869,168.733423,163.509757,171.168822,181.732985,197.843278,200.71458,249.276171
std,2436453.0,1908.242196,38886350.0,953524700.0,2.729482,160.394469,,147.062185,2.777499,0.438039,...,371.451008,382.923315,395.096102,412.529364,413.483232,440.322296,471.010844,484.248222,434.059469,587.318204
min,0.0022,0.0001,0.0,0.242,-1.418818,1.0,3.104933e-06,1.0,1.0,0.0,...,0.0019,0.0018,0.0023,0.0019,0.0194,0.0194,0.16,0.1246,0.1164,0.2456
25%,700.0,8.09,29457.0,14338.81,9.570725,2.0,0.3416676,2.0,1.0,0.0,...,16.78,16.43,17.78,18.56,18.56,19.16,18.68,18.93,23.04,26.78
50%,3194.0,28.2301,162056.0,100958.2,11.522461,7.0,2.016811,6.0,4.0,0.0,...,52.24,54.625,57.84,60.83,59.73,61.62,66.14,73.155,81.35,89.395
75%,14712.5,120.3727,962409.0,450000.0,13.017003,32.0,11.58424,25.0,6.0,1.0,...,187.85,195.55,202.0,213.7525,208.2,222.82,227.2225,239.42,253.07,262.47
max,151000000.0,178781.84,442852700.0,94889090000.0,25.275975,730.0,inf,730.0,15.0,1.0,...,6939.98,6981.71,7024.82,7242.24,7184.96,7242.24,7416.01,7242.24,7242.24,7184.96


In [121]:
whalesframe = whalesframe[whalesframe['total_value'] > whalesframe['total_value'].median()]
whalesframe['log_shares'] = np.log(whalesframe['Shares'])
whalesframe['log_shares_owned_following'] = np.log(whalesframe['SharesOwnedFollowing']) if whalesframe['SharesOwnedFollowing'].all() > 0 else 0
whalesframe['log_shares_owned_following'] = whalesframe['SharesOwnedFollowing'].apply(lambda x: np.log(x) if x > 0 else 0)
whalesframe['change_in_holdings'] = whalesframe.apply(lambda x: (x['Shares'] / (x['Shares'] + x['SharesOwnedFollowing'])) if x['SharesOwnedFollowing'] > 0 else 0, axis=1)
whalesframe.describe()

Unnamed: 0,Shares,PricePerShare,SharesOwnedFollowing,total_value,log_total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,Purchase?,...,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week,log_shares,log_shares_owned_following
count,9907.0,9907.0,9907.0,9907.0,9907.0,9907.0,9907.0,9907.0,9907.0,9844.0,...,8542.0,7584.0,6041.0,4923.0,3914.0,3193.0,1725.0,893.0,9907.0,9907.0
mean,194491.7,218.141049,4353259.0,23156250.0,13.306034,81.683658,0.119109,66.383971,4.517008,0.252946,...,188.71509,195.268334,194.441952,201.756118,221.188903,232.776052,228.29468,290.201253,9.162305,11.945566
std,2281263.0,2684.362172,22123410.0,1348456000.0,1.377448,187.824954,0.189958,173.605069,2.80441,0.434722,...,445.094315,470.53788,483.320992,512.115711,555.375332,556.07055,496.242429,682.736812,1.920345,3.359937
min,1.0,0.001,0.0,101000.0,11.522876,1.0,0.0,1.0,1.0,0.0,...,0.0193,0.0019,0.0209,0.1384,0.16,0.1246,0.1164,0.2456,0.0,0.0
25%,2500.0,21.9,35838.0,215563.5,12.281011,2.0,0.007658,2.0,2.0,0.0,...,23.545,24.3175,23.98,24.01,25.9725,24.72,30.06,34.22,7.824046,10.486764
50%,8363.0,71.23,177891.0,450000.0,13.017003,6.0,0.035746,6.0,4.0,0.0,...,76.905,78.81,79.29,80.97,84.82,87.65,106.76,113.91,9.031572,12.088926
75%,28527.0,202.835,1220169.0,1225570.0,14.018917,25.0,0.14187,22.0,7.0,1.0,...,235.59,236.97,237.0,242.96,245.59,261.34,279.94,280.3,10.258606,14.0145
max,151000000.0,178781.84,396156400.0,94889090000.0,25.275975,730.0,0.999996,730.0,15.0,1.0,...,7024.82,7242.24,7184.96,7242.24,7416.01,7242.24,7242.24,7184.96,18.83279,19.79732


In [123]:
whales_1_week_classifier = whalesframe.copy().drop(columns=['price_2_week', 'price_3_week', 'price_4_week', 'price_5_week', 'price_6_week', 'price_7_week', 'price_8_week', 'price_9_week', 'price_10_week', 'price_11_week', 'price_12_week'])
whales_1_week_classifier.drop(columns=['total_value', 'Shares', 'SharesOwnedFollowing'], inplace=True)
whales_1_week_classifier['target'] = whalesframe['price_1_week'] > whalesframe['PricePerShare']
whales_1_week_classifier.head()

Unnamed: 0,PricePerShare,log_total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,Purchase?,price_1_week,log_shares,log_shares_owned_following,target
7,2.2323,12.904533,150.0,0.00653,150.0,1.0,0.0,1.97,12.101501,17.126223,False
8,2.2323,12.329368,150.0,0.003661,150.0,1.0,0.0,1.97,11.526335,17.132775,False
9,2.2323,12.32763,150.0,0.003641,150.0,1.0,0.0,1.97,11.524597,17.136443,False
10,2.2164,11.81016,150.0,0.002222,150.0,1.0,0.0,1.99,11.014275,17.121504,False
12,63.2923,12.012568,2.0,0.014884,2.0,7.0,0.0,59.69,7.864804,12.057271,False


In [124]:
whales_1_week_classifier.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9907 entries, 7 to 19996
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   PricePerShare                      9907 non-null   float64
 1   log_total_value                    9907 non-null   float64
 2   TraderFrequency                    9907 non-null   float64
 3   change_in_holdings                 9907 non-null   float64
 4   individual_transactions_per_trade  9907 non-null   float64
 5   investors_per_trade                9907 non-null   float64
 6   Purchase?                          9844 non-null   float64
 7   price_1_week                       9296 non-null   float64
 8   log_shares                         9907 non-null   float64
 9   log_shares_owned_following         9907 non-null   float64
 10  target                             9907 non-null   bool   
dtypes: bool(1), float64(10)
memory usage: 861.1 KB


In [129]:
x = whales_1_week_classifier.drop(columns=['target', 'price_1_week'])
y = whales_1_week_classifier['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True)


minmax = MinMaxScaler()
minmax.fit(x_train.drop(columns=['Purchase?']))
x_train_num = minmax.transform(x_train.drop(columns=['Purchase?']))
x_test_num = minmax.transform(x_test.drop(columns=['Purchase?']))
x_train = pd.DataFrame(x_train_num, columns=x_train.drop(columns=['Purchase?']).columns)
x_test = pd.DataFrame(x_test_num, columns=x_test.drop(columns=['Purchase?']).columns)


In [130]:
import sklearn.decomposition
import sklearn.preprocessing
import sklearn.metrics
import sklearn.svm
import sklearn.tree
import sklearn.neighbors
import sklearn.model_selection

dtr = sklearn.tree.DecisionTreeRegressor()
knnr = sklearn.neighbors.KNeighborsRegressor()
svr = sklearn.svm.SVR(kernel='poly', C=5)

svr.fit(x_train, y_train)
dtr.fit(x_train, y_train)
knnr.fit(x_train, y_train)

y_svr = svr.predict(x_test)
y_dtr = dtr.predict(x_test)
y_knnr = knnr.predict(x_test)

#fig, ax = plt.subplots() 

mae_svr = sklearn.metrics.mean_absolute_error(y_test, y_svr)
mse_svr = sklearn.metrics.mean_squared_error(y_test, y_svr)
rmse_svr = np.sqrt(mse_svr)
r2_svr = sklearn.metrics.r2_score(y_test, y_svr)

mae_dtr = sklearn.metrics.mean_absolute_error(y_test, y_dtr)
mse_dtr = sklearn.metrics.mean_squared_error(y_test, y_dtr)
rmse_dtr = np.sqrt(mse_dtr)
r2_dtr = sklearn.metrics.r2_score(y_test, y_dtr)

mae = sklearn.metrics.mean_absolute_error(y_test, y_knnr)
mse = sklearn.metrics.mean_squared_error(y_test, y_knnr)
rmse = np.sqrt(mse)
r2 = sklearn.metrics.r2_score(y_test, y_knnr)

print(f"SVR Mean Absolute Error (MAE): {mae_svr:.9f}")
print(f"SVR Mean Squared Error (MSE): {mse_svr:.9f}")
print(f"SVR Root Mean Squared Error (RMSE): {rmse_svr:.9f}")
print(f"SVR R-squared (R2): {r2_svr:.9f}")
print('\n')
print(f"DTR Mean Absolute Error (MAE): {mae_dtr:.9f}")
print(f"DTR Mean Squared Error (MSE): {mse_dtr:.9f}")
print(f"DTR Root Mean Squared Error (RMSE): {rmse_dtr:.9f}")
print(f"DTR R-squared (R2): {r2_dtr:.9f}")
print('\n')
print(f"KNNR Mean Absolute Error (MAE): {mae:.9f}")
print(f"KNNR Mean Squared Error (MSE): {mse:.9f}")
print(f"KNNR Root Mean Squared Error (RMSE): {rmse:.9f}")
print(f"KNNR R-squared (R2): {r2:.9f}")

SVR Mean Absolute Error (MAE): 0.418538945
SVR Mean Squared Error (MSE): 0.318279914
SVR Root Mean Squared Error (RMSE): 0.564163021
SVR R-squared (R2): -0.323311048


DTR Mean Absolute Error (MAE): 0.306256307
DTR Mean Squared Error (MSE): 0.306256307
DTR Root Mean Squared Error (RMSE): 0.553404289
DTR R-squared (R2): -0.273320548


KNNR Mean Absolute Error (MAE): 0.415237134
KNNR Mean Squared Error (MSE): 0.248092836
KNNR Root Mean Squared Error (RMSE): 0.498089184
KNNR R-squared (R2): -0.031494530


In [103]:
baseframe.drop(baseframe[baseframe['Shares'] == 0].index, inplace=True)
baseframe.drop(baseframe[baseframe['SharesOwnedFollowing'] == 0].index, inplace=True)
baseframe['log_shares'] = np.log(baseframe['Shares'])
baseframe['log_shares_owned_following'] = np.log(baseframe['SharesOwnedFollowing'])
baseframe.head()


Unnamed: 0,Shares,PricePerShare,SharesOwnedFollowing,total_value,log_total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,Purchase?,...,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week,log_shares,log_shares_owned_following
0,1377.0,62.29,64810.0,85773.33,11.359463,2.0,2.124672,2.0,7.0,0.0,...,,,,,,,,,7.227662,11.079215
1,915.0,63.2923,66187.0,57912.4545,10.966688,2.0,1.382447,2.0,7.0,0.0,...,,,,,,,,,6.818924,11.100239
2,34111.0,2.2164,27370523.0,75603.6204,11.233259,150.0,0.124627,150.0,1.0,0.0,...,,,,,,,,,10.437375,17.124977
3,2.0,2.2164,27275612.0,4.4328,1.489031,150.0,7e-06,150.0,1.0,0.0,...,,,,,,,,,0.693147,17.121504
4,3.0,2.2164,27275614.0,6.6492,1.894497,150.0,1.1e-05,150.0,1.0,0.0,...,,,,,,,,,1.098612,17.121504


In [105]:
whalesframe['log_shares_owned_following'].plot(kind='hist', bins=500)

ValueError: autodetected range of [-inf, 19.908747841590685] is not finite

In [88]:
one_week_classifier = baseframe.copy().iloc[:, :-11]
one_week_classifier.drop(columns=['total_value', 'Shares', 'SharesOwnedFollowing'], inplace=True)
one_week_classifier['target'] = baseframe['price_1_week'] > baseframe['PricePerShare']
one_week_classifier.head()

Unnamed: 0,PricePerShare,log_total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,Purchase?,price_1_week,price_2_week,price_3_week,target
0,62.29,11.359463,2.0,2.124672,2.0,7.0,0.0,62.23,60.89,66.63,False
1,63.2923,10.966688,2.0,1.382447,2.0,7.0,0.0,59.69,63.01,67.56,False
2,2.2164,11.233259,150.0,0.124627,150.0,1.0,0.0,1.99,1.7,1.5,False
3,2.2164,1.489031,150.0,7e-06,150.0,1.0,0.0,1.99,1.7,1.5,False
4,2.2164,1.894497,150.0,1.1e-05,150.0,1.0,0.0,1.99,1.7,1.5,False


In [25]:
one_week_classifier = baseframe.copy().iloc[:, :-11]
one_week_classifier['target'] = baseframe['price_1_week'] > baseframe['PricePerShare']
one_week_classifier.head()

29.68
<class 'float'>


Unnamed: 0,5. Time Zone,Unnamed: 2
0,2024-01-26,"{'1. open': '29.9500', '2. high': '30.0700', '..."
1,2024-01-25,"{'1. open': '29.0200', '2. high': '29.8200', '..."
2,2024-01-24,"{'1. open': '29.4300', '2. high': '29.4300', '..."
3,2024-01-23,"{'1. open': '29.5500', '2. high': '29.6400', '..."
4,2024-01-22,"{'1. open': '29.1500', '2. high': '29.5700', '..."
