In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re
import glob
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, RidgeClassifierCV, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [3]:
df = pd.read_csv('full_insiders_with_prices.csv')
df.head()

Unnamed: 0,Ticker,Date,Name,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,...,price_3_week,price_4_week,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week
0,NTRA,2023-12-29 00:00:00,brophy michael burkes,D,S,1377.0,62.29,64810.0,2023-12-30 02:45:08,85773.33,...,66.63,65.44,,,,,,,,
1,NTRA,2023-12-28 00:00:00,brophy michael burkes,D,S,915.0,63.2923,66187.0,2023-12-30 02:45:08,57912.4545,...,67.56,64.62,,,,,,,,
2,EAF,2023-12-29 00:00:00,bcp gp ltd,D,S,34111.0,2.2164,27370523.0,2023-12-30 02:38:57,75603.6204,...,1.5,1.53,,,,,,,,
3,EAF,2023-12-29 00:00:00,bcp gp ltd,D,S,2.0,2.2164,27275612.0,2023-12-30 02:38:57,4.4328,...,1.5,1.53,,,,,,,,
4,EAF,2023-12-29 00:00:00,bcp gp ltd,D,S,3.0,2.2164,27275614.0,2023-12-30 02:38:57,6.6492,...,1.5,1.53,,,,,,,,


In [4]:
df.columns

Index(['Ticker', 'Date', 'Name', 'AcquiredDisposedCode', 'TransactionCode',
       'Shares', 'PricePerShare', 'SharesOwnedFollowing', 'fileDate',
       'total_value', 'TraderFrequency', 'change_in_holdings',
       'individual_transactions_per_trade', 'investors_per_trade',
       'price_2_week', 'price_3_week', 'price_4_week', 'price_5_week',
       'price_6_week', 'price_7_week', 'price_8_week', 'price_9_week',
       'price_10_week', 'price_11_week', 'price_12_week'],
      dtype='object')

In [20]:
gdf = df.drop('fileDate', axis=1)
gdf = df.groupby(['Ticker', 'Date', 'Name', 'TransactionCode', 'AcquiredDisposedCode']).agg({'Shares': 'sum', 'PricePerShare': 'mean', 'SharesOwnedFollowing': 'min', 'total_value': 'sum', 'TraderFrequency': 'mean', 'change_in_holdings': 'sum', 'individual_transactions_per_trade': 'first', 'investors_per_trade': 'first', 'price_2_week': 'mean', 'price_3_week': 'mean', 'price_4_week': 'mean', 'price_5_week': 'mean', 'price_6_week': 'mean', 'price_7_week': 'mean', 'price_8_week': 'mean', 'price_9_week': 'mean', 'price_10_week': 'mean', 'price_11_week': 'mean', 'price_12_week': 'mean'}) 
gdf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Shares,PricePerShare,SharesOwnedFollowing,total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,price_2_week,price_3_week,price_4_week,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week
Ticker,Date,Name,TransactionCode,AcquiredDisposedCode,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
(CALX),2023-10-27 00:00:00,collins john matthew,S,D,60000.0,33.14000,0.000000e+00,1988400.00,1.0,inf,1,1,,,,,,,,,,,
A,2023-12-07 00:00:00,mcmullen michael r.,S,D,62467.0,130.00000,3.062055e+05,8120710.00,2.0,20.400349,2,4,138.9400,139.7700,131.0000,129.6800,130.4600,129.8800,,,,,
A,2023-12-13 00:00:00,mcmullen michael r.,S,D,2751.0,130.00000,3.062055e+05,357630.00,2.0,0.898416,2,4,139.8200,131.1600,131.0900,128.1200,129.7800,,,,,,
A,2023-12-15 00:00:00,ancher-jensen henrik,S,D,26331.0,138.36000,8.250398e+04,3643157.16,1.0,31.914823,1,4,139.0300,130.5600,130.5400,131.2200,130.9900,,,,,,
A,2023-12-18 00:00:00,gonsalves rodney,S,D,1500.0,137.69200,2.613298e+04,206538.00,1.0,5.739873,1,4,,133.3800,,132.5500,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vplm,2023-11-15 00:00:00,williams kevin bryan,S,D,300000.0,0.01690,1.707281e+06,5070.00,4.0,17.571800,3,3,0.0225,0.0180,0.0188,0.0245,0.0200,0.0180,0.0170,0.0170,0.0175,,
vplm,2023-11-16 00:00:00,williams kevin bryan,S,D,390000.0,0.02525,1.317281e+06,9890.00,4.0,27.705190,3,3,0.0203,0.0175,0.0218,0.0250,0.0209,0.0175,0.0173,0.0182,0.0165,,
vplm,2023-12-04 00:00:00,baggio barbara,S,D,250000.0,0.01900,1.549418e+08,4750.00,48.0,0.161351,2,3,0.0234,,,0.0175,,0.0178,,,,,
vplm,2023-12-05 00:00:00,baggio barbara,S,D,290000.0,0.01750,1.546518e+08,5075.00,48.0,0.187518,2,3,0.0230,0.0210,0.0195,0.0168,0.0183,0.0172,,,,,


In [21]:
gdf.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 12498 entries, ('(CALX)', '2023-10-27 00:00:00', 'collins john matthew', 'S', 'D') to ('wi3kpu$w', '2023-11-17 00:00:00', 'devilliers david h jr', 'S', 'A')
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Shares                             12498 non-null  float64
 1   PricePerShare                      12498 non-null  float64
 2   SharesOwnedFollowing               12498 non-null  float64
 3   total_value                        12498 non-null  float64
 4   TraderFrequency                    12498 non-null  float64
 5   change_in_holdings                 12498 non-null  float64
 6   individual_transactions_per_trade  12498 non-null  int64  
 7   investors_per_trade                12498 non-null  int64  
 8   price_2_week                       11294 non-null  float64
 9   price_3_week                       11198 non-null  

In [23]:
gdf = gdf.iloc[:-1,:].reset_index()
gdf.set_index(['Ticker', 'Date'], inplace=True)

In [43]:
gdf['value_cat'] = pd.qcut(gdf['total_value'], q=10, labels=False, duplicates='drop')

X_train, X_test = train_test_split(df, test_size=0.2, stratify=df['value_cat'])

X_test['value_cat'].value_counts()/len(X_test)

value_cat
3    0.100151
8    0.100151
6    0.100151
0    0.100151
5    0.099899
1    0.099899
2    0.099899
7    0.099899
9    0.099899
4    0.099899
Name: count, dtype: float64

In [26]:
correlation = gdf.corr()

correlation['price_4_week'].sort_values(ascending=False)

ValueError: could not convert string to float: 'collins john matthew'

In [30]:
gdf[gdf['TransactionCode'] == 'P']['AcquiredDisposedCode'].value_counts()

AcquiredDisposedCode
A    3789
D      18
Name: count, dtype: int64

In [31]:
gdf[gdf['TransactionCode'] == 'S']['AcquiredDisposedCode'].value_counts()

AcquiredDisposedCode
D    8674
A      16
Name: count, dtype: int64

In [33]:
gdf.drop(['Name'], axis=1, inplace=True)


In [45]:
X_train.reset_index(inplace=True)
X_train = X_train.drop(['Name', 'Ticker', 'fileDate'], axis=1)
X_train['Date'] = pd.to_datetime(X_train['Date'])
X_train


Unnamed: 0,Date,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,...,price_4_week,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week,value_cat
0,2023-12-05,D,S,5350.0,188.5000,82964.0,1.008475e+06,1,6.448580,1,...,195.49,191.4200,190.940,205.48,,,,,,8
1,2023-11-21,A,P,49202.0,72.6800,18471876.0,3.576001e+06,29,0.266362,29,...,83.30,85.1300,84.710,83.65,80.74,81.88,,,,9
2,2023-12-20,D,S,177.0,2.0367,705819.0,3.604959e+02,1,0.025077,1,...,3.14,2.7477,,,,,,,,0
3,2023-12-01,D,S,3489.0,24.7800,122508.0,8.645742e+04,4,2.847977,4,...,25.32,24.1600,24.225,24.57,25.73,,,,,4
4,2023-11-20,D,S,2109.0,56.7157,17245.0,1.196134e+05,1,12.229632,1,...,59.82,,,63.26,,59.74,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15888,2023-12-06,D,S,43228.0,11.5500,3500468.0,4.992834e+05,13,1.234921,13,...,12.68,12.4900,11.850,11.70,,,,,,7
15889,2023-11-21,D,S,144003.0,20.0885,6453539.0,2.892804e+06,6,2.231380,6,...,17.95,17.6800,16.580,16.39,16.53,17.33,,,,9
15890,2023-11-16,A,P,1000.0,12.1500,12000.0,1.215000e+04,8,8.333333,8,...,13.10,12.5800,14.270,12.61,14.35,13.91,16.72,,,2
15891,2023-10-31,D,S,100000.0,39.4163,1278124.0,3.941630e+06,4,7.823967,4,...,44.36,48.3200,49.100,57.00,58.02,55.75,52.03,51.29,52.95,9


In [47]:

onehot = OneHotEncoder()
transaction_codes = onehot.fit_transform(X_train[['TransactionCode', 'AcquiredDisposedCode']])
X_train = pd.concat([X_train, pd.DataFrame(transaction_codes.toarray())], axis=1)

X_train

Unnamed: 0,Date,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,...,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week,value_cat,0,1,2,3
0,2023-12-05,D,S,5350.0,188.5000,82964.0,1.008475e+06,1,6.448580,1,...,,,,,,8,0.0,1.0,0.0,1.0
1,2023-11-21,A,P,49202.0,72.6800,18471876.0,3.576001e+06,29,0.266362,29,...,80.74,81.88,,,,9,1.0,0.0,1.0,0.0
2,2023-12-20,D,S,177.0,2.0367,705819.0,3.604959e+02,1,0.025077,1,...,,,,,,0,0.0,1.0,0.0,1.0
3,2023-12-01,D,S,3489.0,24.7800,122508.0,8.645742e+04,4,2.847977,4,...,25.73,,,,,4,0.0,1.0,0.0,1.0
4,2023-11-20,D,S,2109.0,56.7157,17245.0,1.196134e+05,1,12.229632,1,...,,59.74,,,,5,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15888,2023-12-06,D,S,43228.0,11.5500,3500468.0,4.992834e+05,13,1.234921,13,...,,,,,,7,0.0,1.0,0.0,1.0
15889,2023-11-21,D,S,144003.0,20.0885,6453539.0,2.892804e+06,6,2.231380,6,...,16.53,17.33,,,,9,0.0,1.0,0.0,1.0
15890,2023-11-16,A,P,1000.0,12.1500,12000.0,1.215000e+04,8,8.333333,8,...,14.35,13.91,16.72,,,2,1.0,0.0,1.0,0.0
15891,2023-10-31,D,S,100000.0,39.4163,1278124.0,3.941630e+06,4,7.823967,4,...,58.02,55.75,52.03,51.29,52.95,9,0.0,1.0,0.0,1.0


In [49]:
X_train.columns

Index([                             'Date',
                    'AcquiredDisposedCode',
                         'TransactionCode',
                                  'Shares',
                           'PricePerShare',
                    'SharesOwnedFollowing',
                             'total_value',
                         'TraderFrequency',
                      'change_in_holdings',
       'individual_transactions_per_trade',
                     'investors_per_trade',
                            'price_2_week',
                            'price_3_week',
                            'price_4_week',
                            'price_5_week',
                            'price_6_week',
                            'price_7_week',
                            'price_8_week',
                            'price_9_week',
                           'price_10_week',
                           'price_11_week',
                           'price_12_week',
                               '

In [51]:
X_train.loc[:, 3]

0        1.0
1        0.0
2        1.0
3        1.0
4        1.0
        ... 
15888    1.0
15889    1.0
15890    0.0
15891    1.0
15892    1.0
Name: 3, Length: 15893, dtype: float64

In [52]:
X_train['Acquired'] = X_train.loc[:, 0]
X_train['Disposed'] = X_train.loc[:, 1]
X_train['Purchase'] = X_train.loc[:, 2]
X_train['Sale'] = X_train.loc[:, 3]

X_train.drop(['TransactionCode', 'AcquiredDisposedCode'], axis=1, inplace=True)

X_train

Unnamed: 0,Date,Shares,PricePerShare,SharesOwnedFollowing,total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,price_2_week,...,price_12_week,value_cat,0,1,2,3,Acquired,Disposed,Purchase,Sale
0,2023-12-05,5350.0,188.5000,82964.0,1.008475e+06,1,6.448580,1,2,198.8600,...,,8,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,2023-11-21,49202.0,72.6800,18471876.0,3.576001e+06,29,0.266362,29,1,75.4500,...,,9,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,2023-12-20,177.0,2.0367,705819.0,3.604959e+02,1,0.025077,1,2,2.3188,...,,0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
3,2023-12-01,3489.0,24.7800,122508.0,8.645742e+04,4,2.847977,4,2,24.8600,...,,4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,2023-11-20,2109.0,56.7157,17245.0,1.196134e+05,1,12.229632,1,5,59.5800,...,,5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15888,2023-12-06,43228.0,11.5500,3500468.0,4.992834e+05,13,1.234921,13,5,12.6400,...,,7,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
15889,2023-11-21,144003.0,20.0885,6453539.0,2.892804e+06,6,2.231380,6,9,18.3000,...,,9,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
15890,2023-11-16,1000.0,12.1500,12000.0,1.215000e+04,8,8.333333,8,1,10.6600,...,,2,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
15891,2023-10-31,100000.0,39.4163,1278124.0,3.941630e+06,4,7.823967,4,5,44.2300,...,52.95,9,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


In [57]:
X_train.drop(X_train.iloc[:, 21:25], axis=1, inplace=True)

In [59]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15893 entries, 0 to 15892
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Date                               15893 non-null  datetime64[ns]
 1   Shares                             15893 non-null  float64       
 2   PricePerShare                      15893 non-null  float64       
 3   SharesOwnedFollowing               15893 non-null  float64       
 4   total_value                        15893 non-null  float64       
 5   TraderFrequency                    15893 non-null  int64         
 6   change_in_holdings                 15892 non-null  float64       
 7   individual_transactions_per_trade  15893 non-null  int64         
 8   investors_per_trade                15893 non-null  int64         
 9   price_2_week                       14263 non-null  float64       
 10  price_3_week                      

In [60]:
correlation = X_train.drop('Date', axis=1).corr()
correlation['price_4_week'].sort_values(ascending=False)

price_4_week                         1.000000
price_5_week                         0.999589
price_3_week                         0.999217
price_6_week                         0.999166
price_7_week                         0.998643
price_8_week                         0.998451
price_9_week                         0.998385
price_2_week                         0.998289
price_12_week                        0.997980
price_10_week                        0.997204
price_11_week                        0.996310
PricePerShare                        0.188564
value_cat                            0.168990
investors_per_trade                  0.137422
individual_transactions_per_trade    0.130751
Sale                                 0.124771
Disposed                             0.124458
TraderFrequency                      0.110183
change_in_holdings                  -0.002781
total_value                         -0.004601
Shares                              -0.018881
SharesOwnedFollowing              

In [32]:
df[df['TransactionCode'] == 'S']['AcquiredDisposedCode'].value_counts()

AcquiredDisposedCode
D    14695
A       18
Name: count, dtype: int64

In [34]:
df['value_cat'] = pd.qcut(df['total_value'], q=10, labels=False, duplicates='drop')

Train, Test = train_test_split(df, test_size=0.2, stratify=df['value_cat'])

Train['value_cat'].value_counts()/len(Train)


value_cat
6    0.100044
0    0.100044
8    0.100044
1    0.099981
2    0.099981
5    0.099981
3    0.099981
4    0.099981
7    0.099981
9    0.099981
Name: count, dtype: float64

In [72]:
corr = correlation.iloc[:, 8:19].drop(correlation.index[8:19])
corr.to_csv('correlation.csv')
corr

Unnamed: 0,price_2_week,price_3_week,price_4_week,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week
Shares,-0.021765,-0.023093,-0.018881,-0.023028,-0.016578,-0.020688,-0.019591,-0.021859,-0.023594,-0.026169,-0.057999
PricePerShare,0.189563,0.186358,0.188564,0.958628,0.979815,0.156541,0.971505,0.137573,0.989965,0.985996,0.989302
SharesOwnedFollowing,-0.029928,-0.031645,-0.033331,-0.033299,-0.029013,-0.025062,-0.024556,-0.023979,-0.035536,-0.039782,-0.039515
total_value,-0.004461,-0.004594,-0.004601,-0.003856,-0.001189,-0.005053,-0.000832,-0.006225,-0.003456,-0.002148,0.096771
TraderFrequency,0.1097,0.114844,0.110183,0.108152,0.089852,0.092225,0.087704,0.073227,0.076914,0.0911,0.053141
change_in_holdings,-0.00273,-0.002746,-0.002781,-0.002755,-0.00565,-0.006032,-0.002844,-0.005,-0.004048,-0.007458,0.012941
individual_transactions_per_trade,0.128532,0.134186,0.130751,0.133028,0.126736,0.129766,0.124627,0.107047,0.112774,0.141149,0.089281
investors_per_trade,0.128966,0.138119,0.137422,0.130454,0.126996,0.11207,0.110781,0.108256,0.097055,0.144255,0.037326
value_cat,0.169567,0.173202,0.16899,0.175644,0.176351,0.19891,0.195263,0.209373,0.208016,0.216972,0.239304
Acquired,-0.117434,-0.12163,-0.124458,-0.131369,-0.146443,-0.146451,-0.15026,-0.152649,-0.174039,-0.205402,-0.184763


In [75]:
corr

Unnamed: 0,Shares,PricePerShare,SharesOwnedFollowing,total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade,value_cat,Acquired,Disposed,Purchase,Sale
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,-0.024841,0.612165,-0.031422,0.005487,0.091549,-0.002646,0.123443,0.115609,0.193781,-0.150445,0.150445,-0.15082,0.15082
std,0.011296,0.422044,0.005573,0.030321,0.018811,0.005424,0.014839,0.029823,0.023112,0.02784,0.02784,0.027832,0.027832
min,-0.057999,0.137573,-0.039782,-0.006225,0.053141,-0.007458,0.089281,0.037326,0.16899,-0.205402,0.117434,-0.205667,0.117779
25%,-0.023343,0.187461,-0.034434,-0.004597,0.082309,-0.005325,0.1187,0.109518,0.174423,-0.163344,0.127914,-0.163726,0.128274
50%,-0.021859,0.958628,-0.031645,-0.003856,0.0911,-0.002844,0.128532,0.126996,0.195263,-0.146451,0.146451,-0.146886,0.146886
75%,-0.020139,0.982905,-0.027038,-0.001668,0.108926,-0.00275,0.131889,0.133938,0.208694,-0.127914,0.163344,-0.128274,0.163726
max,-0.016578,0.989965,-0.023979,0.096771,0.114844,0.012941,0.141149,0.144255,0.239304,-0.117434,0.205402,-0.117779,0.205667


In [76]:
attributes = ['Shares', 'TraderFrequency', 'value_cat', 'investors_per_trade', 'individual_transactions_per_trade', 'investors_per_trade', 'Purchase', 'Sale']
X_train1 = X_train[attributes]
X_train1

Unnamed: 0,Shares,TraderFrequency,value_cat,investors_per_trade,individual_transactions_per_trade,investors_per_trade.1,Purchase,Sale
0,5350.0,1,8,2,1,2,0.0,1.0
1,49202.0,29,9,1,29,1,1.0,0.0
2,177.0,1,0,2,1,2,0.0,1.0
3,3489.0,4,4,2,4,2,0.0,1.0
4,2109.0,1,5,5,1,5,0.0,1.0
...,...,...,...,...,...,...,...,...
15888,43228.0,13,7,5,13,5,0.0,1.0
15889,144003.0,6,9,9,6,9,0.0,1.0
15890,1000.0,8,2,1,8,1,1.0,0.0
15891,100000.0,4,9,5,4,5,0.0,1.0


In [77]:
X_train1.describe()

Unnamed: 0,Shares,TraderFrequency,value_cat,investors_per_trade,individual_transactions_per_trade,investors_per_trade.1,Purchase,Sale
count,15893.0,15893.0,15893.0,15893.0,15893.0,15893.0,15893.0,15893.0
mean,231777.8,71.453848,4.499969,4.019757,59.404077,4.019757,0.260429,0.739571
std,2431668.0,159.712883,2.872547,2.772113,146.478684,2.772113,0.438883,0.438883
min,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
25%,690.0,2.0,2.0,1.0,2.0,1.0,0.0,0.0
50%,3167.0,7.0,4.0,4.0,6.0,4.0,0.0,1.0
75%,14510.0,32.0,7.0,6.0,25.0,6.0,1.0,1.0
max,151000000.0,730.0,9.0,15.0,730.0,15.0,1.0,1.0


In [79]:
num_atts = ['Shares', 'TraderFrequency', 'value_cat', 'investors_per_trade', 'individual_transactions_per_trade', 'investors_per_trade']
scaled = MinMaxScaler().fit_transform(X_train1[num_atts])
X_train1 = pd.concat([X_train1.drop(num_atts, axis=1), pd.DataFrame(scaled, columns=attributes)], axis=1)
X_train1

Unnamed: 0,Purchase,Sale,Shares,TraderFrequency,value_cat,investors_per_trade,individual_transactions_per_trade,investors_per_trade.1,Purchase.1,Sale.1
0,0.0,1.0,0.000035,0.000000,0.888889,0.071429,0.071429,0.000000,0.071429,0.071429
1,1.0,0.0,0.000326,0.038409,1.000000,0.000000,0.000000,0.038409,0.000000,0.000000
2,0.0,1.0,0.000001,0.000000,0.000000,0.071429,0.071429,0.000000,0.071429,0.071429
3,0.0,1.0,0.000023,0.004115,0.444444,0.071429,0.071429,0.004115,0.071429,0.071429
4,0.0,1.0,0.000014,0.000000,0.555556,0.285714,0.285714,0.000000,0.285714,0.285714
...,...,...,...,...,...,...,...,...,...,...
15888,0.0,1.0,0.000286,0.016461,0.777778,0.285714,0.285714,0.016461,0.285714,0.285714
15889,0.0,1.0,0.000954,0.006859,1.000000,0.571429,0.571429,0.006859,0.571429,0.571429
15890,1.0,0.0,0.000007,0.009602,0.222222,0.000000,0.000000,0.009602,0.000000,0.000000
15891,0.0,1.0,0.000662,0.004115,1.000000,0.285714,0.285714,0.004115,0.285714,0.285714


In [81]:
X_train1 = X_train1.iloc[:, :-2]
X_train1

Unnamed: 0,Purchase,Sale,Shares,TraderFrequency,value_cat,investors_per_trade,individual_transactions_per_trade,investors_per_trade.1
0,0.0,1.0,0.000035,0.000000,0.888889,0.071429,0.071429,0.000000
1,1.0,0.0,0.000326,0.038409,1.000000,0.000000,0.000000,0.038409
2,0.0,1.0,0.000001,0.000000,0.000000,0.071429,0.071429,0.000000
3,0.0,1.0,0.000023,0.004115,0.444444,0.071429,0.071429,0.004115
4,0.0,1.0,0.000014,0.000000,0.555556,0.285714,0.285714,0.000000
...,...,...,...,...,...,...,...,...
15888,0.0,1.0,0.000286,0.016461,0.777778,0.285714,0.285714,0.016461
15889,0.0,1.0,0.000954,0.006859,1.000000,0.571429,0.571429,0.006859
15890,1.0,0.0,0.000007,0.009602,0.222222,0.000000,0.000000,0.009602
15891,0.0,1.0,0.000662,0.004115,1.000000,0.285714,0.285714,0.004115


In [83]:
tree = DecisionTreeRegressor()
tree.fit(X_train1, X_train['price_8_week'].fillna(X_train['price_8_week'].mean()))

In [84]:
from sklearn.metrics import mean_squared_error
pred = tree.predict(X_train1)
tree_mse = mean_squared_error(X_train['price_8_week'].fillna(X_train['price_8_week'].mean()), pred)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

74.89730173996966


In [85]:
import sklearn.metrics
r2tree = sklearn.metrics.r2_score(X_train['price_8_week'].fillna(X_train['price_8_week'].mean()), pred)
print(r2tree)

0.9194118600063523


In [86]:
X_train['price_8_week'].describe()

count    7829.000000
mean      133.493997
std       375.930999
min         0.004000
25%        10.830000
50%        33.510000
75%       142.150000
max      7242.240000
Name: price_8_week, dtype: float64

In [None]:
def get_price(ticker, date, df=df):
  try:
    return df.loc[(ticker, pd.to_datetime(date)), 'price_6_week']
  except KeyError:
    return np.nan

In [16]:
gdf

Unnamed: 0_level_0,Ticker,Name,TransactionCode,AcquiredDisposedCode,Shares,PricePerShare,SharesOwnedFollowing,total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2023-10-27 00:00:00,(CALX),collins john matthew,S,D,60000.0,33.14000,0.000000e+00,1988400.00,1.0,inf,1,1
2023-12-07 00:00:00,A,mcmullen michael r.,S,D,62467.0,130.00000,3.062055e+05,8120710.00,2.0,20.400349,2,4
2023-12-13 00:00:00,A,mcmullen michael r.,S,D,2751.0,130.00000,3.062055e+05,357630.00,2.0,0.898416,2,4
2023-12-15 00:00:00,A,ancher-jensen henrik,S,D,26331.0,138.36000,8.250398e+04,3643157.16,1.0,31.914823,1,4
2023-12-18 00:00:00,A,gonsalves rodney,S,D,1500.0,137.69200,2.613298e+04,206538.00,1.0,5.739873,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-03 00:00:00,vplm,chang dennis,S,D,130000.0,0.01590,3.375358e+06,2067.00,25.0,3.851443,2,3
2023-11-15 00:00:00,vplm,williams kevin bryan,S,D,300000.0,0.01690,1.707281e+06,5070.00,4.0,17.571800,3,3
2023-11-16 00:00:00,vplm,williams kevin bryan,S,D,390000.0,0.02525,1.317281e+06,9890.00,4.0,27.705190,3,3
2023-12-04 00:00:00,vplm,baggio barbara,S,D,250000.0,0.01900,1.549418e+08,4750.00,48.0,0.161351,2,3


In [19]:
gdf6 = gdf.merge

<class 'pandas.core.frame.DataFrame'>
Index: 12497 entries, 2023-10-27 00:00:00 to 2023-12-05 00:00:00
Data columns (total 12 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Ticker                             12497 non-null  object 
 1   Name                               12497 non-null  object 
 2   TransactionCode                    12497 non-null  object 
 3   AcquiredDisposedCode               12497 non-null  object 
 4   Shares                             12497 non-null  float64
 5   PricePerShare                      12497 non-null  float64
 6   SharesOwnedFollowing               12497 non-null  float64
 7   total_value                        12497 non-null  float64
 8   TraderFrequency                    12497 non-null  float64
 9   change_in_holdings                 12497 non-null  float64
 10  individual_transactions_per_trade  12497 non-null  int64  
 11  investors_per_trade        

In [18]:
df.sort_values(by=['Ticker'], inplace=True)
df.head(50)

Unnamed: 0_level_0,Ticker,Name,AcquiredDisposedCode,TransactionCode,Shares,PricePerShare,SharesOwnedFollowing,fileDate,total_value,TraderFrequency,...,price_3_week,price_4_week,price_5_week,price_6_week,price_7_week,price_8_week,price_9_week,price_10_week,price_11_week,price_12_week
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10-27 00:00:00,(CALX),collins john matthew,D,S,60000.0,33.14,0.0,2023-10-30 21:02:28,1988400.0,1,...,,,,,,,,,,
2023-12-18 00:00:00,A,gonsalves rodney,D,S,1500.0,137.692,26132.98,2023-12-19 19:34:28,206538.0,1,...,133.38,,132.55,,,,,,,
2023-12-27 00:00:00,A,binns philip,D,S,2880.0,139.78,12824.0,2023-12-28 18:46:59,402566.4,1,...,128.12,129.78,,,,,,,,
2023-12-15 00:00:00,A,ancher-jensen henrik,D,S,26331.0,138.36,82503.98,2023-12-19 19:33:07,3643157.0,1,...,130.56,130.54,131.22,130.99,,,,,,
2023-12-13 00:00:00,A,mcmullen michael r.,D,S,2751.0,130.0,306205.5,2023-12-15 22:08:59,357630.0,2,...,131.16,131.09,128.12,129.78,,,,,,
2023-12-07 00:00:00,A,mcmullen michael r.,D,S,62467.0,130.0,306205.5,2023-12-08 19:59:53,8120710.0,2,...,139.77,131.0,129.68,130.46,129.88,,,,,
2023-12-05 00:00:00,AADI,desai neil,D,S,15740.0,5.0153,1544580.0,2023-12-05 23:40:25,78940.82,6,...,2.01,2.05,1.77,1.64,1.74,,,,,
2023-11-01 00:00:00,AADI,desai neil,D,S,10338.0,4.3185,1611205.0,2023-11-02 22:45:44,44644.65,6,...,4.75,5.35,4.71,5.36,2.06,2.08,1.95,1.77,1.59,1.71
2023-11-02 00:00:00,AADI,desai neil,D,S,31662.0,4.4566,1579543.0,2023-11-02 22:45:44,141104.9,6,...,,5.26,4.78,5.37,1.99,2.1,1.84,1.71,1.68,1.73
2023-12-04 00:00:00,AADI,desai neil,D,S,10876.0,5.1477,1560320.0,2023-12-05 23:40:25,55986.39,6,...,,,1.86,,1.68,,,,,


In [None]:
df.set_index('Date', inplace=True)
df

In [11]:
gdf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Shares,PricePerShare,SharesOwnedFollowing,AcquiredDisposedCode,total_value,TraderFrequency,change_in_holdings,individual_transactions_per_trade,investors_per_trade
Ticker,Date,Name,TransactionCode,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
(CALX),2023-10-27 00:00:00,collins john matthew,S,60000.0,33.14000,0.000000e+00,D,1988400.00,1.0,inf,1,1
A,2023-12-07 00:00:00,mcmullen michael r.,S,62467.0,130.00000,3.062055e+05,D,8120710.00,2.0,20.400349,2,4
A,2023-12-13 00:00:00,mcmullen michael r.,S,2751.0,130.00000,3.062055e+05,D,357630.00,2.0,0.898416,2,4
A,2023-12-15 00:00:00,ancher-jensen henrik,S,26331.0,138.36000,8.250398e+04,D,3643157.16,1.0,31.914823,1,4
A,2023-12-18 00:00:00,gonsalves rodney,S,1500.0,137.69200,2.613298e+04,D,206538.00,1.0,5.739873,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...
vplm,2023-11-15 00:00:00,williams kevin bryan,S,300000.0,0.01690,1.707281e+06,D,5070.00,4.0,17.571800,3,3
vplm,2023-11-16 00:00:00,williams kevin bryan,S,390000.0,0.02525,1.317281e+06,D,9890.00,4.0,27.705190,3,3
vplm,2023-12-04 00:00:00,baggio barbara,S,250000.0,0.01900,1.549418e+08,D,4750.00,48.0,0.161351,2,3
vplm,2023-12-05 00:00:00,baggio barbara,S,290000.0,0.01750,1.546518e+08,D,5075.00,48.0,0.187518,2,3


In [None]:
gdf.reset_index(inplace=True)
gdf.set_index('Date', inplace=True)
gdf

In [None]:
onehot = OneHotEncoder(sparse=False)
transaction_onehot = onehot.fit_transform(insiders[['TransactionCode']])
transaction_onehot = pd.DataFrame(transaction_onehot, columns=onehot.categories_[0])
insiders = pd.concat([insiders, transaction_onehot], axis=1)
insiders.head()
insiders['Purchase?'] = insiders['P']
insiders.drop(columns=['P'], inplace=True)
insiders.drop(columns=['TransactionCode', 'AcquiredDisposedCode'], inplace=True)
insiders.head()