# In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle
##### Feature scaling options ######
##### MinMaxScaler: scales and translates each feature individually into a fixed range ([0, 1] by default) #######
##### StandardScaler: standardizes each feature by removing the mean and scaling to unit variance #####
##### RobustScaler: scales each feature using statistics that are robust to outliers ######
# Full list of candidates, kept for reference:
#scaler = [MinMaxScaler(),StandardScaler(), RobustScaler()]
# Candidate scalers currently enabled (only StandardScaler).
# NOTE(review): no active statement in the visible code reads `scaler` -- confirm whether it is still needed.
scaler = [StandardScaler()]
from sklearn.utils import resample
# Load the feature table, discard the 'Unnamed: 0' column stored in the CSV,
# and keep only the rows that have no missing values.
df = (
    pd.read_csv("df_feat_small.csv")
    .drop(columns='Unnamed: 0')
    .dropna()
    .copy()
)
# Optional: exclude the price-to-book feature.
#df=df.drop('P/B_RATIO_Q_0',axis=1)
list_etf = []
# One label column per ETF; every remaining column is treated as a feature.
df_etf = df.loc[:, [
    'DWAS', 'EWSC', 'FNDA', 'IUSS', 'OMFS', 'PBSM', 'PSCD', 'PSCF',
    'PSCH', 'PSCI', 'PSCT', 'PSCU', 'RWJ', 'RZG', 'RZV', 'SLY',
    'SLYG', 'SLYV', 'SMLV', 'SPMD', 'SPSM', 'VB', 'VBK', 'VBR',
    'VIOG', 'VIOO', 'VIOV', 'VSS', 'XSHD', 'XSHQ', 'XSMO', 'XSVM',
]].copy()
###### ETF label names, feature names, and the number of ETFs
ETF_list = df_etf.columns
df_features = df.drop(columns=ETF_list)
feature_list = df_features.columns
n = len(ETF_list)
###### Define a function that builds a class-balanced train/test split for every ETF label
def upSample(df):
    """Build one class-balanced train/test split per ETF label column.

    For every column named in the module-level ``ETF_list`` the rows of
    ``df`` are separated by label value (0 or 1), the smaller class is
    resampled with replacement until it is as large as the bigger one, and
    the balanced frame is split 70/30 into train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``ETF_list``. Rows whose
        label is neither 0 nor 1 are ignored for that ETF.

    Returns
    -------
    tuple of four lists
        ``(X_train, X_test, y_train, y_test)``; each list has one entry per
        ETF, in ``ETF_list`` order. The ``X`` entries hold every column of
        ``df`` except the current ETF's own label column (the other ETF
        label columns are still present); the ``y`` entries hold that label
        column.
    """
    import warnings  # local import: only needed for the single-class warning

    X_train, X_test, y_train, y_test = [], [], [], []
    for etf in ETF_list:
        df_zeros = df[df[etf] == 0]
        df_ones = df[df[etf] == 1]

        # Decide by size which class is the minority. Assuming that class 1
        # is always the rarer one leaves the data imbalanced whenever the
        # ones outnumber the zeros.
        if len(df_zeros) >= len(df_ones):
            df_majority, df_minority = df_zeros, df_ones
        else:
            df_majority, df_minority = df_ones, df_zeros

        if df_minority.empty:
            # Nothing to draw from: resampling an empty frame cannot succeed.
            warnings.warn(
                "Label column %r holds a single class; it cannot be balanced."
                % (etf,)
            )
            df_upsampled = df_majority
        else:
            df_minority_upsampled = resample(
                df_minority,
                replace=True,                   # sample with replacement
                n_samples=len(df_majority),     # match the majority class
                random_state=123,               # reproducible results
            )
            df_upsampled = pd.concat([df_majority, df_minority_upsampled])

        # Separate the target variable (y) from everything else (X).
        y = df_upsampled[etf]
        X = df_upsampled.drop(etf, axis=1)

        # NOTE(review): the split happens after upsampling, so copies of the
        # same minority row can land in both the train and the test part.
        # Kept as-is to preserve the shape and balance of the returned data.
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=0.3, random_state=1
        )
        X_train.append(X_tr)
        X_test.append(X_te)
        y_train.append(y_tr)
        y_test.append(y_te)
    return X_train, X_test, y_train, y_test
###### Balanced train/test splits, one per ETF ######
X_train, X_test, y_train, y_test = upSample(df)

###### Base classifier: random forest with a fixed seed ######
clf = RandomForestClassifier(random_state=0)

###### Cross validation: 3 stratified, shuffled folds ######
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

###### Pipeline: scale the features, then classify ######
# Other scalers that could be tried: MinMaxScaler(), RobustScaler()
steps = [('scaler', StandardScaler()), ('rf', clf)]
pipeline = Pipeline(steps=steps)

###### Hyperparameter grid for the 'rf' step ######
# Tuned: max_depth, min_samples_leaf and max_features.
# Not searched at the moment: the split criterion ("gini" / "entropy") and
# the choice of scaler.
parameters = dict(
    rf__max_depth=[3, 4],
    rf__min_samples_leaf=[3, 4],
    rf__max_features=[5, 6, 7],
)

###### Grid search scored by ROC AUC, using all available cores ######
mdl = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='roc_auc',
    cv=kfold,
    n_jobs=-1,
)
##### Fit one grid-searched model per ETF and store it on disk #####
##### Predict_M collects the in-sample predictions: one vector per ETF classifier (n in total) ######
Predict_M=[]
for i in range(n):
    cur_df=X_train[i]
    # Select the features by name. X_train[i] already lacks one ETF column,
    # so slicing "the last (number of columns - n)" columns by position keeps
    # one column too few and silently depends on the column order.
    X=cur_df[feature_list]
    Y=y_train[i]
    X=X.drop('TICKER',axis=1)
    mdl.fit(X,Y)
    result=mdl.predict(X)
    Predict_M.append(result)
    # NOTE(review): the name has no '.' before 'sav' (e.g. 'model1sav');
    # left unchanged because other code may rely on these file names.
    filename = 'model'+str(i+1)+'sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(mdl, model_file)
###### Test the models ######
###### Reload each model with pickle and predict on its held-out split #####
Test_M=[]
for i in range(n):
    cur_df=X_test[i]
    # Same by-name feature selection as in training, so the columns match.
    X_cur_test=cur_df[feature_list]
    y_cur_test = y_test[i]
    filename = 'model'+str(i+1)+'sav'
    # SECURITY: pickle.load can run arbitrary code. Only load files written
    # by the training loop above, never files from an untrusted source.
    with open(filename, 'rb') as model_file:
        mdl = pickle.load(model_file)
    X_cur_test=X_cur_test.drop('TICKER',axis=1)
    result= mdl.predict(X_cur_test)
    Test_M.append(result)
    
##### Accuracy, pooled over the test splits of all ETFs
# total: number of test labels; diff: summed absolute difference between the
# true labels and the predictions (the number of misclassified rows when both
# only take the values 0 and 1).
total = sum(len(labels) for labels in y_test)
diff = sum(
    np.sum(abs(labels * 1 - predicted.T * 1))
    for labels, predicted in zip(y_test, Test_M)
)
Accuracy = 1 - diff / total