In [28]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [29]:
# Load the quarterly stock data and build the training feature matrix X and target y.

# How many columns, ranked by fraction of missing values, are discarded.
N_SPARSEST_COLS = 45

# Columns removed in addition to the sparsest ones.
# NOTE(review): presumably identifiers, dates and price/outcome fields that must
# not be used as model inputs — confirm against the data dictionary.
NON_FEATURE_COLS = ['fiscalDateEnding', 'reportedDate', 'price', 'nasd_price',
                    'next_year_date', 'next_year_price', 'nasd_ny_price', 'symbol',
                    'Nasdaq_Performance', 'Stock_Performance']

df = pd.read_csv('../data/stocks_quarterly.csv')

# The new-data CSV is intentionally not read here: the cell that prepares the
# new data loads it itself, so an extra read in this cell was dead I/O.

# Rank columns by their share of missing values (highest first) and collect the
# names of the sparsest ones together with the fixed non-feature columns.
# `cols_to_drop` is reused when the new data is prepared.
null_fraction = df.isnull().mean().sort_values(ascending=False)
cols_to_drop = list(null_fraction.head(N_SPARSEST_COLS).index) + NON_FEATURE_COLS

# Chained, non-inplace drop followed by row de-duplication.
df = df.drop(columns=cols_to_drop).drop_duplicates()

X = df.drop(columns='Label')
y = df.Label

# Median imputation of the remaining missing values. `pipline` (name kept as-is)
# is reused later to transform the new data.
# NOTE(review): the medians are learned from every row, and the imputed X is
# later handed to cross_validate, so each fold's imputation has already seen
# its held-out rows. Putting the imputer and the classifier in one pipeline
# would avoid that; left unchanged here because later cells depend on X being
# the imputed array.
pipline = make_pipeline(SimpleImputer(strategy='median'))
X = pipline.fit_transform(X)

In [None]:
# Tuned hyperparameter values kept for reference; the classifier cell below uses
# these numbers (with max_bin, max_leaves and min_child_weight as whole numbers).
# NOTE(review): presumably the result of the bayes_opt search — confirm.
# The original literal ended with an unmatched '}' and could not be executed.
best_params = {
    'alpha': 4.920429154892943,
    'colsample_bytree': 0.50523041456644,
    'eta': 0.019902707539573634,
    'gamma': 1.5855410310434896,
    'max_bin': 31.096242241920844,
    'max_leaves': 162.13418326428646,
    'min_child_weight': 7.282409845599227,
}
best_params

In [93]:
# Fixed hyperparameters for the gradient-boosted classifier, gathered in one
# mapping so they can be read at a glance.
xgb_params = dict(
    min_child_weight=7,
    max_bin=31,
    max_leaves=162,
    eta=0.019902707539573634,
    gamma=1.5855410310434896,
    colsample_bytree=0.50523041456644,
    alpha=4.920429154892943,
    eval_metric='mlogloss',
    use_label_encoder=False,
)
estimator = XGBClassifier(**xgb_params)

# Train on the full imputed feature matrix; fit() is left as the last
# expression so the fitted model's repr is displayed as the cell output.
estimator.fit(X, y)

XGBClassifier(alpha=4.920429154892943, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.50523041456644, enable_categorical=False,
              eta=0.019902707539573634, eval_metric='mlogloss',
              gamma=1.5855410310434896, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.019902708, max_bin=31,
              max_delta_step=0, max_depth=6, max_leaves=162, min_child_weight=7,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=8, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=4.92042923, reg_lambda=1, scale_pos_weight=1,
              subsample=1, ...)

In [94]:
# 5-fold cross-validation of the classifier, collecting four metrics per fold.
# The result dict exposes each metric under the key 'test_<metric>'.
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']
cv = cross_validate(estimator, X, y, cv=5, scoring=scoring_metrics)

### Scores

In [95]:
# Print the mean across folds for each metric, one value per line, in the
# order: accuracy, precision, recall, f1.
for score_key in ('test_accuracy', 'test_precision', 'test_recall', 'test_f1'):
    print(cv[score_key].mean())

0.6728929384965833
0.5920447892260252
0.08214497424534546
0.14408915221426447


## New Data

In [96]:
# Load the new data, remove the same columns that were removed from the
# training frame, then discard duplicate rows.
df2 = (
    pd.read_csv('../data/new_data_test.csv')
    .drop(columns=cols_to_drop)
    .drop_duplicates()
)

y_new = df2.Label

# Reuse the already-fitted imputer: transform only, nothing is refitted here.
X_new = pipline.transform(df2.drop(columns='Label'))

#### Confusion Matrix

In [97]:
# Predict labels for the new data and tabulate them against the actual labels.
y_pred = estimator.predict(X_new)

# labels=[1, 0] puts class 1 first on both axes, so matrix[0][0] counts rows
# that are actually 1 and predicted 1 (rows = actual, columns = predicted).
matrix = confusion_matrix(y_true=y_new, y_pred=y_pred, labels=[1, 0])
matrix

array([[ 30, 296],
       [ 24, 590]])

### Precision and Recall

In [98]:
# Unpack the 2x2 matrix: first row is (matrix[0][0], matrix[0][1]), second row
# is (matrix[1][0], matrix[1][1]). With the class-1-first ordering used when the
# matrix was built, these are the counts named below.
(true_pos, false_neg), (false_pos, true_neg) = matrix

# precision = hits / first column total; recall = hits / first row total.
print(f'precision: {(true_pos/(true_pos+false_pos)):.2f}')
print(f'recall: {(true_pos/(true_pos+false_neg)):.2f}')

precision: 0.56
recall: 0.09


In [100]:
# Weighted mean of two hard-coded values, with 0.65 counted twice.
# NOTE(review): 0.65 and 0.18 do not match any figure printed in this notebook
# (precision 0.56 / recall 0.09 on the new data) — confirm where they come from.
(0.18 + 2 * 0.65) / 3

0.49333333333333335

In [87]:
# Pair each importance with its feature name and list them from most to least
# important. The names are taken by removing 'Label' explicitly rather than by
# slicing off the last column, so the labelling stays correct even if 'Label'
# is not the final column of df. A length mismatch between names and
# importances raises instead of silently mislabelling.
feature_names = df.columns.drop('Label')
pd.Series(estimator.feature_importances_, index=feature_names).sort_values(ascending=False)

surprisePercentage                       0.063326
otherCurrentAssets                       0.049891
reportedEPS                              0.032553
ebitda                                   0.028170
otherNonCurrrentAssets                   0.027230
surprise                                 0.026727
estimatedEPS                             0.026071
incomeBeforeTax                          0.023140
interestExpense                          0.022210
sellingGeneralAndAdministrative          0.022159
totalRevenue                             0.021925
propertyPlantEquipment                   0.021824
grossProfit                              0.021594
totalShareholderEquity                   0.021470
retainedEarnings                         0.021324
otherCurrentLiabilities                  0.021301
ebit                                     0.021012
totalLiabilities                         0.020945
totalAssets                              0.020743
totalCurrentLiabilities                  0.020605


In [84]:
# Display the dimensions of the fitted model's importance vector.
importance_values = estimator.feature_importances_
importance_values.shape

(46,)