In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
import sqlite3
import json
import keras


In [2]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [4]:
db_path = '/content/drive/MyDrive/ISA_proj-20240208T060848Z-001/ISA_proj/output_derived.db'
conn = sqlite3.connect(db_path)

In [5]:
cursor = conn.cursor()
cursor.execute("select distinct industry from Clustered_output")
res = cursor.fetchall()

In [6]:
res = [ele[0] for ele in res]
print(res)

['Banks—Regional', 'Software—Application', 'Software—Infrastructure', 'Information Technology Services', 'Capital Markets', 'Internet Content & Information', 'Electronic Components', 'Consumer Electronics', 'Entertainment', 'Medical Devices']


In [None]:
# def preprocessing(df, cat_col):
#   label_encoder = LabelEncoder()
#   df[cat_col] = label_encoder.fit_transform(df[cat_col])
#   df.drop(columns=['Stock', 'industry'], axis=1, inplace=True)

#   return df

In [7]:
def preprocessing(df, cat_col):
    # Apply one-hot encoding

    df.drop(columns=['Stock', 'Stock_Splits', 'industry'], axis=1, inplace=True)
    df_encoded  = pd.get_dummies(df, columns=[cat_col], drop_first=False)
    encoded_columns = [col for col in df_encoded.columns if cat_col in col]

    return df_encoded, encoded_columns


In [8]:
json_mappings_path = '/content/drive/MyDrive/ISA_proj-20240208T060848Z-001/ISA_proj/json_mappings'

In [9]:
cat_col = 'Risk_Label'

In [10]:
industry_to_df_map = {}

In [11]:
for industry in res:
  qry = f"select * from Clustered_output where industry = '{industry}'"
  df = pd.read_sql(qry, conn)

  #print(df.columns)
  df, mapping_cols = preprocessing(df, cat_col)
  with open(f'{json_mappings_path}/{industry}.json', 'w') as fp:
    json.dump(mapping_cols, fp)
  #print(df.columns)
  df.dropna(inplace=True)
  #break
  industry_to_df_map[industry] = df



In [12]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np

In [13]:
# @keras.saving.register_keras_serializable(package="MyLayers")
# class FuzzificationLayer(layers.Layer):
#     def __init__(self, num_variables, num_sets):
#         super(FuzzificationLayer, self).__init__(name="fuzzification_layer")
#         self.num_variables = num_variables
#         self.num_sets = num_sets

#         self.means = self.add_weight(shape=(num_variables, num_sets),
#                                      initializer='random_normal',
#                                      trainable=True)
#         self.sigmas = self.add_weight(shape=(num_variables, num_sets),
#                                       initializer='random_normal',
#                                       trainable=True)

#     def get_membership_function_parameters(self):
#         return self.means.numpy(), self.sigmas.numpy()

#     def plot_random_membership_functions(self, num_to_plot=5):
#         import matplotlib.pyplot as plt

#         means, sigmas = self.get_membership_function_parameters()
#         x = np.linspace(-1, 1, 100)  # Change this range according to your input data range

#         fig, axs = plt.subplots(num_to_plot, figsize=(10, num_to_plot * 2))

#         for i in range(num_to_plot):
#             var_idx = np.random.randint(self.num_variables)
#             set_idx = np.random.randint(self.num_sets)
#             mean = means[var_idx, set_idx]
#             sigma = sigmas[var_idx, set_idx]

#             y = np.exp(-0.5 * ((x - mean) ** 2) / sigma ** 2)
#             axs[i].plot(x, y, label=f'Variable {var_idx} Set {set_idx}')
#             axs[i].set_title(f'Variable {var_idx} Set {set_idx}')
#             axs[i].legend()

#         plt.tight_layout()
#         plt.show()

#     def call(self, inputs):
#         # Apply a Gaussian membership function
#         tmp = tf.exp(-0.5 * tf.square((tf.expand_dims(inputs, -1) - self.means) / self.sigmas))
#         #print("fuzzification layer shape", tmp.shape)
#         return tmp

@keras.saving.register_keras_serializable(package="MyLayers")
class FuzzificationLayer(layers.Layer):
    def __init__(self, num_variables, num_sets, **kwargs):
        super(FuzzificationLayer, self).__init__(**kwargs)
        self.num_variables = num_variables
        self.num_sets = num_sets
        self.means = self.add_weight(name='means',
                                     shape=(num_variables, num_sets),
                                     initializer='random_normal',
                                     trainable=True)
        self.sigmas = self.add_weight(name='sigmas',
                                      shape=(num_variables, num_sets),
                                      initializer='random_normal',
                                      trainable=True)

    def call(self, inputs):
        tmp = tf.exp(-0.5 * tf.square((tf.expand_dims(inputs, -1) - self.means) / self.sigmas))
        return tmp

    def get_config(self):
        config = super(FuzzificationLayer, self).get_config()
        config.update({
            'num_variables': self.num_variables,
            'num_sets': self.num_sets
        })
        return config

In [14]:
@keras.saving.register_keras_serializable(package="MyLayers")
class RuleApplicationLayer(layers.Layer):
    def __init__(self, num_classes, num_variables, num_rules, **kwargs):
        super(RuleApplicationLayer, self).__init__(**kwargs)
        self.num_classes = num_classes
        self.num_variables = num_variables
        self.num_rules = num_rules
        self.rule_combination_layer = layers.Dense(num_rules, use_bias=False, activation='sigmoid')

    def call(self, fuzzified_inputs):
        num_features = self.num_variables * self.num_classes
        reshaped_inputs = tf.reshape(fuzzified_inputs, [-1, num_features])
        rule_firing_strengths = self.rule_combination_layer(reshaped_inputs)
        return rule_firing_strengths

    def get_config(self):
        config = super(RuleApplicationLayer, self).get_config()
        config.update({
            'num_classes': self.num_classes,
            'num_variables': self.num_variables,
            'num_rules': self.num_rules
        })
        return config




In [15]:
@keras.saving.register_keras_serializable(package="MyLayers")
class DefuzzificationLayer(layers.Layer):
    def __init__(self, num_rules, num_classes, **kwargs):
        super(DefuzzificationLayer, self).__init__(**kwargs)
        self.num_rules = num_rules
        self.num_classes = num_classes
        self.rule_weights = self.add_weight(name='rule_weights',
                                            shape=(num_rules, num_classes),
                                            initializer='random_normal',
                                            trainable=True)

    def call(self, rule_outputs):
        weighted_sum = tf.matmul(rule_outputs, self.rule_weights)
        return weighted_sum

    def get_config(self):
        config = super(DefuzzificationLayer, self).get_config()
        config.update({
            'num_rules': self.num_rules,
            'num_classes': self.num_classes
        })
        return config


In [23]:
@keras.saving.register_keras_serializable(package="MyLayers")
class ANFISModel(models.Model):
    def __init__(self, num_variables, num_sets, num_rules, num_classes, **kwargs):
        super(ANFISModel, self).__init__(name="anfis_model", **kwargs)
        self.num_variables = num_variables
        self.num_sets = num_sets
        self.num_rules = num_rules
        self.num_classes = num_classes
        self.fuzzification_layer = FuzzificationLayer(num_variables, num_sets)
        self.rule_application_layer = RuleApplicationLayer(num_classes, num_variables, num_rules)
        self.defuzzification_layer = DefuzzificationLayer(num_rules, num_classes)

    def call(self, inputs):
        fuzzified = self.fuzzification_layer(inputs)
        #print("fuzzified shape ", fuzzified.shape)
        rule_applied = self.rule_application_layer(fuzzified)
        #print("rule_applied shape ", rule_applied.shape)
        logits = self.defuzzification_layer(rule_applied)
        #print("softmax shape ", tf.nn.softmax(logits).shape)
        return tf.nn.softmax(logits)  # Use softmax to convert logits to probabilities

    def get_config(self):
        # Implement get_config to enable model saving and loading
        config = super(ANFISModel, self).get_config()
        config.update({
            'num_variables': self.num_variables,
            'num_sets': self.num_sets,
            'num_rules': self.num_rules,
            'num_classes': self.num_classes,
        })
        return config

    @classmethod
    def from_config(cls, config):
        # Implement from_config to enable model loading
        return cls(**config)


In [17]:
df = industry_to_df_map['Entertainment']

In [18]:
'Risk_Label' in df.columns

False

In [19]:
X = df.drop('Risk_Label', axis=1)
y = df['Risk_Label']

KeyError: "['Risk_Label'] not found in axis"

In [None]:
X.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock_Splits',
       'Price_Range', 'Pct_Change', 'SMA_7', 'EMA_7', 'Volatility', 'RSI',
       'MACD', 'MACD_Signal', 'VWAP', 'RiskAdjustedReturn', 'IndustryRankRSI',
       'CloseToVWAPRatio', 'MACDSignalDiff', 'Percentage_Bandwidth'],
      dtype='object')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
y_train.shape

(3573,)

In [None]:
model = ANFISModel(num_variables=X_train.shape[1], num_sets=3, num_rules=100, num_classes=3)

In [None]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Use this if labels are integers
              metrics=['accuracy'])

In [21]:
models_dir = '/content/drive/MyDrive/ISA_proj-20240208T060848Z-001/ISA_proj/models'
os.makedirs(models_dir, exist_ok=True)

In [24]:
industry_to_hist_map = {}

In [25]:
for industry, df in industry_to_df_map.items():
  risk_label_cols = [col for col in df.columns if 'Risk_Label' in col]
  X = df.drop(columns=risk_label_cols, axis=1)
  y = df[risk_label_cols].values
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  model = ANFISModel(num_variables=X_train.shape[1], num_sets=3, num_rules=100, num_classes=3)
  model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # Use this if labels are integers
              metrics=['accuracy'])
  s = time.time()
  industry_to_hist_map[industry] = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

  model.save(f'{models_dir}/{industry}.keras')
  print(f'Time elapsed for {industry} is {time.time() - s}')

Epoch 1/100
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8345 - loss: 0.4418 - val_accuracy: 0.9753 - val_loss: 0.0902
Epoch 2/100
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9718 - loss: 0.0875 - val_accuracy: 0.9775 - val_loss: 0.0775
Epoch 3/100
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9733 - loss: 0.0814 - val_accuracy: 0.9671 - val_loss: 0.0780
Epoch 4/100
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9719 - loss: 0.0791 - val_accuracy: 0.9753 - val_loss: 0.0733
Epoch 5/100
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9737 - loss: 0.0777 - val_accuracy: 0.9688 - val_loss: 0.0758
Epoch 6/100
[1m1331/1331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9746 - loss: 0.0754 - val_accuracy: 0.9772 - val_loss: 0.0715
Epoch 7/10

In [3]:
!pip install --upgrade tensorflow




In [None]:
s = time.time()
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))
print(f'Time elapsed is {time.time() - s}')

In [26]:
plots_dir = '/content/drive/MyDrive/ISA_proj-20240208T060848Z-001/ISA_proj/plots'
os.makedirs(plots_dir, exist_ok=True)

In [27]:
for industry, history in industry_to_hist_map.items():
  train_loss = history.history['loss']
  train_accuracy = history.history['accuracy']
  val_loss = history.history['val_loss']
  val_accuracy = history.history['val_accuracy']

  # Plotting both training and validation loss
  plt.figure(figsize=(14, 6))

  plt.subplot(1, 2, 1)
  plt.plot(train_loss, label='Training Loss')
  plt.plot(val_loss, label='Validation Loss')
  plt.title('Training and Validation Loss')
  plt.xlabel('Epochs')
  plt.ylabel('Loss')
  plt.legend()

  # Plotting both training and validation accuracy
  plt.subplot(1, 2, 2)
  plt.plot(train_accuracy, label='Training Accuracy')
  plt.plot(val_accuracy, label='Validation Accuracy')
  plt.title('Training and Validation Accuracy')
  plt.xlabel('Epochs')
  plt.ylabel('Accuracy')
  plt.legend()

  plt.tight_layout()
  plt.savefig(f'{plots_dir}/{industry}.png')
  plt.close()

In [None]:
y

array([[0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       ...,
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0]], dtype=uint8)

In [None]:
industry_to_df_map.keys()

dict_keys(['Banks—Regional', 'Software—Application', 'Software—Infrastructure', 'Information Technology Services', 'Capital Markets', 'Internet Content & Information', 'Electronic Components', 'Consumer Electronics', 'Entertainment', 'Medical Devices'])

In [None]:
df = industry_to_df_map['Entertainment']
correlation_matrix = df.corr()
target_correlation = correlation_matrix['Risk_Label'].sort_values(key=np.abs, ascending=False)
print(target_correlation)

Risk_Label      1.000000
Low            -0.641563
EMA_7          -0.638991
Open           -0.638287
SMA_7          -0.637989
Close          -0.637904
High           -0.634603
BB_Middle      -0.632745
BB_Upper       -0.614209
BB_Lower       -0.611125
Price_Range     0.267352
VWAP           -0.184558
MACD_Signal    -0.104874
MACD           -0.096867
Volume          0.043926
Volatility      0.029520
RSI             0.028988
Dividends       0.011150
Pct_Change      0.011149
Stock_Splits         NaN
Name: Risk_Label, dtype: float64


In [None]:
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock_Splits',
       'Risk_Label', 'Price_Range', 'Pct_Change', 'SMA_7', 'EMA_7',
       'Volatility', 'RSI', 'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Middle',
       'BB_Lower', 'VWAP'],
      dtype='object')

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Getting feature importances
importances = model.feature_importances_
feature_names = X.columns
feature_importance_dict = dict(zip(feature_names, importances))
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")

VWAP: 0.20489554368334523
Low: 0.15450880300070635
Volume: 0.09252436225510673
BB_Lower: 0.08941974519641618
Open: 0.0700501844648665
BB_Middle: 0.06800163911093533
EMA_7: 0.06607322677705339
SMA_7: 0.06367969485632376
Close: 0.05640712830890985
BB_Upper: 0.0433212289852434
High: 0.04302852200951276
Price_Range: 0.03164095765781694
MACD_Signal: 0.004654971438699895
MACD: 0.0043053327820677155
Volatility: 0.00428060557731239
RSI: 0.0021641700233969953
Pct_Change: 0.001043883872286533
Dividends: 0.0
Stock_Splits: 0.0
