In [9]:
from sklearn.model_selection import GridSearchCV
from stratmanager import StrategyManager

# Data Management
import pandas as pd
import numpy as np
from ta.momentum import RSIIndicator

from ta import add_all_ta_features
import ta

# Statistics
from statsmodels.tsa.stattools import adfuller

# Unsupervised Machine Learning
from sklearn.decomposition import PCA

# Supervised Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score
import pickle 
from datetime import datetime


# Reporting
import matplotlib.pyplot as plt
import yfinance
from sklearn.tree import plot_tree
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [8]:
stock_name = "TATACONSUM.NS"
df = yfinance.download (tickers = "{}".format(stock_name),start="2017-01-01",
                              interval = "1d", group_by = 'ticker', auto_adjust = True)


[*********************100%***********************]  1 of 1 completed


In [3]:
#df = add_all_ta_features(df, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True)

In [16]:
df.describe()
df["Returns"] = df["Close"].pct_change()

df["RSI"] =  RSIIndicator(close=df["Close"], window=14).rsi()
df["Range"] = df["High"] / df["Low"] - 1
df["MA_50"] = df["Close"].rolling(window=50).mean()
df["MA_20"] = df["Close"].rolling(window=20).mean()
df["MA_200"] = df["Close"].rolling(window=200).mean()
df["Avg_Range"] = df["Range"].rolling(window=30).mean()
df['OBV'] = ta.volume.on_balance_volume(df['Close'], df['Volume'])


df['date'] = pd.to_datetime(df.index)
df.set_index("date", inplace=True)
df["TARGET"] = -1
df.loc[df["Close"].shift(-1) > df["Close"], "TARGET"] = 1
df.dropna(inplace=True)

In [17]:
# Identify non-stationary columns
df = df.dropna()
non_stationaries = []
for col in df.columns:
    if col != "volatility_bbli" and col !="Date":
        #print(col)
        dftest = adfuller(df[col].values)
        p_value = dftest[1]
        t_test = dftest[0] < dftest[4]["1%"]
        if p_value > 0.05 or not t_test:
            non_stationaries.append(col)
            
print(f"Non-Stationary Features Found: {len(non_stationaries)}")

df_stationary = df.copy()
df_stationary[non_stationaries] = df_stationary[non_stationaries].pct_change()
df_stationary = df_stationary.iloc[1:]


# Find NaN Rows
na_list = df_stationary.columns[df_stationary.isna().any().tolist()]
df_stationary.drop(columns=na_list, inplace=True)


# Handle inf values
df_stationary.replace([np.inf, -np.inf], 0, inplace=True)
df_stationary.head()

df_stationary = df_stationary.reset_index()


df_stationary = df_stationary.drop("date",axis=1)


Non-Stationary Features Found: 8


In [18]:
# Split Target from Featureset
X = df_stationary.iloc[:, :-1]
y = df_stationary.iloc[:, -1]

# Feature Scaling
df_sc = df_stationary.copy()
X_fs = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_fs, y, test_size=0.2, random_state=42)

In [19]:
gb = RandomForestClassifier()
estimators = [
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier())]
sc = StackingClassifier(estimators=estimators)
sc.get_params()


{'cv': None,
 'estimators': [('rf', RandomForestClassifier()),
  ('gb', GradientBoostingClassifier())],
 'final_estimator': None,
 'n_jobs': None,
 'passthrough': False,
 'stack_method': 'auto',
 'verbose': 0,
 'rf': RandomForestClassifier(),
 'gb': GradientBoostingClassifier(),
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': None,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 100,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': None,
 'rf__verbose': 0,
 'rf__warm_start': False,
 'gb__ccp_alpha': 0.0,
 'gb__criterion': 'friedman_mse',
 'gb__init': None,
 'gb__learning_rate': 0.1,
 'gb__loss': 'log_loss',
 'gb__max_depth': 3,
 'gb__max_features': None,
 'gb__max_leaf_nodes': None,
 'gb__min_impurity_decrease': 0.0

In [20]:
parameters = {
    'gb__n_estimators': [250],
    'rf__n_estimators': [50],
    'final_estimator': [ LogisticRegression(C=10)],
    'passthrough': [True, False]
}
cv = GridSearchCV(sc, parameters, cv=5)
cv.fit(X_train,y_train)

print_results(cv)

BEST PARAMS: {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 50}

0.517 (+/-0.033) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 50}
0.53 (+/-0.019) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 50}


In [22]:
parameters = {
    'n_estimators': [5, 12, 250, 500],
    'max_depth': [1,3],
    "random_state": [45]
}
#cv = GridSearchCV(gb, parameters, cv=5)
#cv.fit(X_train,y_train)
#print_results(cv)


In [24]:
classifier = cv.best_estimator_
y_prob = classifier.predict_proba(X_test)
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
high = []
for prob in y_prob:
    if(prob[0] > prob[1]):
        high.append(prob[0])
    else:
        high.append(prob[1])
analysis = pd.DataFrame({ "predic" : y_pred, "y_test":y_test,"prob":high})
correct = analysis[analysis["predic"] == analysis["y_test"]]
error = analysis[analysis["predic"] != analysis["y_test"]]

print("out of ",analysis.shape[0]," :","Correct :",correct.shape,"Error",error.shape)
limit = 0.51
filterd_full = analysis[analysis["prob"] > limit ]

correct_filter = filterd_full[filterd_full["predic"] == filterd_full["y_test"]]
error_filter = filterd_full[filterd_full["predic"] != filterd_full["y_test"]]

print("Filter of ",filterd_full.shape[0],"Correct :",correct_filter.shape,"Error",error_filter.shape)

Test Accuracy: 0.506993006993007
Test Precision: 0.5174418604651163
out of  286  : Correct : (145, 3) Error (141, 3)
Filter of  218 Correct : (113, 3) Error (105, 3)


In [11]:
# Plot Feature Importances
fig = plt.figure(figsize=(22, 5))
importance_labels = X.columns
importance_features = classifier.feature_importances_
plt.bar(importance_labels, importance_features)
plt.show()

AttributeError: 'StackingClassifier' object has no attribute 'feature_importances_'

<Figure size 2200x500 with 0 Axes>

In [None]:
# Select Best Features
recommended_feature_labels = importance_features.mean()
i = 0
recommended_feature_labels = []
recommended_feature_score = []
for fi in importance_features:
    if fi > mean_feature_importance:
        recommended_feature_labels.append(importance_labels[i])
        recommended_feature_score.append(fi)
    i += 1

In [None]:
fig, ax = plt.subplots(figsize=(35, 6))

ax.bar(recommended_feature_labels, recommended_feature_score)
fig.show()

In [None]:
individual_tree = classifier.estimators_[0]

In [None]:
individual_tree

In [None]:
plt.figure(figsize=(12, 8))
plot_tree(individual_tree, filled=True,feature_names= list(X.columns),class_names=list('Target'))
plt.show()