## Import Library

In [1]:
import pandas as pd
import numpy as np 
import sklearn
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score

## Data Preprocessing

In [2]:
# Load the stock panel and report how many distinct companies the raw file holds.
df_Stock = pd.read_csv("Desktop/finalproject_training.csv")
comp_list = df_Stock.comp_id.unique()
print(len(comp_list))

### Create next month return

# Target: the m_ret value of the following row of the same company.
# NOTE(review): shift(-1) depends on row order -- this assumes rows are already
# chronological within each comp_id; confirm, or sort by the date column first.
df_Stock["m_next_ret"] = df_Stock.groupby('comp_id')["m_ret"].shift(-1)
# The last row of every company has no following return and is dropped.
# The explicit copy makes the column assignments below write to an independent
# frame instead of a slice of the previous one (avoids SettingWithCopyWarning).
df_Stock = df_Stock.dropna(subset = ["m_next_ret"]).copy()

### Construct self-defined feature

# NOTE(review): both ratios divide by a price column; a zero denominator would
# yield inf and break the scaler downstream -- verify the data has none.
df_Stock["h/l"] = df_Stock["m_high_adj"] / df_Stock["m_low_adj"] - 1
df_Stock["d/p"] = df_Stock["m_divs"] / df_Stock["close_adj"]
# log(x + 1) keeps zero values finite.
df_Stock["log_m_volume_adj"] = np.log(df_Stock["m_volume_adj"] + 1)
df_Stock["log_SP500"] = np.log(df_Stock["SP500WeeklyClose"] + 1)


feature_names = ["m_ret", "d/p", "h/l", "log_m_volume_adj", "log_SP500", 
                 "Bullish", "Bearish", "Bullish8WeekMovAvg", 
                 "epsfxq",  "mkvaltq", "gsector"]

### Select company based on data availability of feature
# For every feature: keep only companies with at least one non-missing value,
# then fill the remaining gaps inside each company (forward first, then backward).
comp_selection = comp_list
for feature in feature_names:
    # count() ignores NaN, so a positive count means the company has usable data.
    # The mask is indexed by comp_id, so the selection is label-based and cannot
    # misalign with comp_list when dropna() above removed a company entirely.
    has_data = df_Stock.groupby("comp_id", sort = False)[feature].count() > 0
    comp_selection = has_data[has_data].index.to_numpy()
    df_Stock = df_Stock[df_Stock.comp_id.isin(comp_selection)].copy()
    # transform() always returns a result aligned to the original row index;
    # apply() prepends the group key to the index on pandas >= 2.0, which makes
    # the column assignment fail.
    df_Stock[feature] = df_Stock.groupby("comp_id", sort = False)[feature]\
        .transform(lambda x: x.ffill().bfill())

### Create Dummy Variables for feature gsector

non_dummy_cols = df_Stock.columns
df_Stock = pd.get_dummies(df_Stock, columns = ["gsector"])
# Keep the dummy columns in frame order; a set difference would give a
# different column order from run to run and so a different feature matrix.
dummy_cols = [col for col in df_Stock.columns if col not in non_dummy_cols]

# Same features as above, with the raw gsector code replaced by its dummies.
feature_names = [name for name in feature_names if name != "gsector"] + dummy_cols

### Construct training dataset

# Regression task: predict the next-month return itself.
X_ret = df_Stock.loc[:, feature_names]
y_ret = df_Stock.loc[:, "m_next_ret"]

# Classification task: same features, label is 1 when the next-month return is
# strictly positive and 0 otherwise (a return of exactly 0 maps to 0).
X_gof = df_Stock.loc[:, feature_names]
y_gof = (y_ret > 0).astype("int")

2847


## Construct an iterable of training-test splits for Cross Validation via ShuffleSplit

In [3]:
from sklearn.model_selection import ShuffleSplit

# Ten random 75/25 train/test partitions, used as the `cv` argument of the
# grid searches in this notebook.
# random_state pins the partitions so the reported CV scores are reproducible
# and every parameter combination is evaluated on the same splits across runs.
# NOTE(review): shuffling rows of a company/month panel puts observations from
# the same period in both train and test; consider a time-ordered split if
# look-ahead leakage is a concern.
cross_validation_shufflesplit = ShuffleSplit(n_splits = 10, test_size = 0.25, random_state = 42)

## Multi-layer Perceptron Regressor

In [5]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neural_network import MLPRegressor

# Bagged ensemble of small MLP regressors on standardized features, tuned by
# grid search over network shape and ensemble size (scored by negative MSE).
base_MLP_regressor = MLPRegressor(max_iter = 1000)
# The wrapped model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
# random_state makes the bootstrap resampling, and the seeds bagging hands to
# each member network, repeatable.
MLP_regressor_bagging = BaggingRegressor(base_MLP_regressor, random_state = 42)
MLP_regressor_bagging_pipeline = make_pipeline(StandardScaler(), MLP_regressor_bagging)

# Use whichever parameter name the installed scikit-learn exposes so the grid
# keys below are valid on both old and new versions.
regressor_estimator_key = ("estimator"
                           if "estimator" in MLP_regressor_bagging.get_params(deep = False)
                           else "base_estimator")

MLP_regressor_bagging_param_grid = {
    "baggingregressor__" + regressor_estimator_key + "__hidden_layer_sizes": [(5, 2), (6, 3, 2), (8, 4, 2)],
    "baggingregressor__n_estimators": [5, 10, 15, 20],
}

# n_jobs = -1 matches the classifier grid search: fits run on all cores.
MLP_regressor_bagging_grid_search = GridSearchCV(MLP_regressor_bagging_pipeline,
                                                 param_grid = MLP_regressor_bagging_param_grid,
                                                 cv = cross_validation_shufflesplit,
                                                 scoring = "neg_mean_squared_error",
                                                 n_jobs = -1)
                                                
MLP_regressor_bagging_grid_search.fit(X_ret, y_ret)

# One entry per parameter combination; scores are negative MSE (closer to 0 is better).
means = MLP_regressor_bagging_grid_search.cv_results_["mean_test_score"]
stds = MLP_regressor_bagging_grid_search.cv_results_["std_test_score"]


print("Mean Test Score:", means)
print("Standard Deviation of Test Score:",  stds)
print('Optimal Model Parameters: ' + str(MLP_regressor_bagging_grid_search.best_params_))

Mean Test Score: [-0.59574363 -0.59146958 -0.61635034 -0.6230525  -0.57982949 -0.55932197
 -0.56718896 -0.63691172 -0.58772993 -0.58080423 -0.60125917 -0.5596763 ]
Standard Deviation of Test Score: [0.14592403 0.21250379 0.14788211 0.18761797 0.16872052 0.1694852
 0.17916905 0.18372441 0.16693642 0.18245085 0.19069797 0.17654757]
Optimal Model Parameters: {'baggingregressor__base_estimator__hidden_layer_sizes': (6, 3, 2), 'baggingregressor__n_estimators': 10}


## Multi-layer Perceptron Classifier

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier

# Bagged ensemble of small MLP classifiers on standardized features, tuned by
# grid search over network shape and ensemble size (scored by accuracy).
base_MLP_classifier = MLPClassifier(max_iter = 1000)
# The wrapped model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
# random_state makes the bootstrap resampling, and the seeds bagging hands to
# each member network, repeatable.
MLP_classifier_bagging = BaggingClassifier(base_MLP_classifier, random_state = 42)
MLP_classifier_bagging_pipeline = make_pipeline(StandardScaler(), MLP_classifier_bagging)

# Use whichever parameter name the installed scikit-learn exposes so the grid
# keys below are valid on both old and new versions.
classifier_estimator_key = ("estimator"
                            if "estimator" in MLP_classifier_bagging.get_params(deep = False)
                            else "base_estimator")

MLP_classifier_bagging_param_grid = {
    "baggingclassifier__" + classifier_estimator_key + "__hidden_layer_sizes": [(5, 2), (6, 3, 2), (8, 4, 2)],
    "baggingclassifier__n_estimators": [5, 10, 15, 20],
}

MLP_classifier_bagging_grid_search = GridSearchCV(MLP_classifier_bagging_pipeline,
                                                 param_grid = MLP_classifier_bagging_param_grid,
                                                 cv = cross_validation_shufflesplit,
                                                 scoring = "accuracy",
                                                 n_jobs = -1)
                                                
MLP_classifier_bagging_grid_search.fit(X_gof, y_gof)

# One entry per parameter combination; scores are mean accuracy over the CV splits.
means = MLP_classifier_bagging_grid_search.cv_results_["mean_test_score"]
stds = MLP_classifier_bagging_grid_search.cv_results_["std_test_score"]


print("Mean Test Score:", means)
print("Standard Deviation of Test Score:",  stds)
print('Optimal Model Parameters: ' + str(MLP_classifier_bagging_grid_search.best_params_))

Mean Test Score: [0.63219946 0.63383072 0.63567695 0.63584977 0.63912915 0.64155707
 0.64267409 0.6423158  0.64383325 0.64732339 0.64837717 0.64990727]
Standard Deviation of Test Score: [0.00364989 0.0041399  0.00428267 0.00314623 0.00459667 0.00450393
 0.00317144 0.00227266 0.0039586  0.00436569 0.00251905 0.00337549]
Optimal Model Parameters: {'baggingclassifier__base_estimator__hidden_layer_sizes': (8, 4, 2), 'baggingclassifier__n_estimators': 20}
