<a href="https://colab.research.google.com/github/the-ml-bull/Hello_World/blob/main/5_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd 
from datetime import datetime 

def load_data():
  url = 'https://raw.githubusercontent.com/the-ml-bull/Hello_World/main/Fx60.csv'
  dateparse = lambda x: datetime.strptime(x, '%d/%m/%Y %H:%M')

  df = pd.read_csv(url, parse_dates=['date'], date_parser=dateparse)

  return df 

In [8]:
 def create_x_values(df, feature_names):

  x_values_df = pd.DataFrame()

  # loop thorugh feature name and "back periods" to go back 
  x_feature_names = []
  for feature in feature_names:
    for period in [1,2,3,4]:
      # create the name (eg 'x_audusd_close_t-1')
      feature_name = 'x_' + feature + '_t-' + str(period)
      x_feature_names.append(feature_name)
      x_values_df[feature_name] = df[feature].shift(period)

  # Add "starting" values when used in normalization 
  x_values_df['x_audusd_open'] = df['audusd_open'].shift(4)
  x_values_df['x_eurusd_open'] = df['eurusd_open'].shift(4)
  x_values_df['audusd_open'] = df['audusd_open']
  x_values_df['eurusd_open'] = df['eurusd_open']
  
  # add all future y values for future periods
  for period in [0,1,2,3]:
    name = 'y_t-' + str(period)
    x_values_df[name] = df['audusd_close'].shift(-period)

  # y is points 4 periods into the future - the open price now (not close)
  x_values_df['y_future'] = df['audusd_close'].shift(-3)
  x_values_df['y_change_price'] = x_values_df['y_future'] - df['audusd_open']
  x_values_df['y_change_points'] = x_values_df['y_change_price'] * 100000 
  x_values_df['y'] = np.where(x_values_df['y_change_points'] >= 200, 1, 0)

  # and reset df and done 
  x_values_df = x_values_df.copy()
  return x_values_df, x_feature_names

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def normalize_data(df, x_fields, method):
  
  norm_df = df.copy()
  y_fields = ['y_t-0', 'y_t-1', 'y_t-2', 'y_t-3']

  if method == 'price':
    for field in x_fields:
      norm_df[field + '_norm'] = df[field] 
      
    for field in y_fields:
      norm_df[field + '_norm'] = df[field] 
    
  if method == 'points': 
    for field in x_fields:
      if 'volume' in field:
        norm_df[field + '_norm'] = df[field] / 100
      elif 'audusd' in field:
        norm_df[field + '_norm'] = (df[field] - df['x_audusd_open']) * 100000 
      elif 'eurusd' in field:
        norm_df[field + '_norm'] = (df[field] - df['x_eurusd_open']) * 100000

    for field in y_fields:
      norm_df[field + '_norm'] = (df[field] -  df['audusd_open']) * 100000 

  if method == 'percentage':
    for field in x_fields:
      if 'volume' in field:
        norm_df[field + '_norm'] = df[field] / 10000
      elif 'audusd' in field:
        norm_df[field + '_norm'] = (df[field] - df['x_audusd_open']) / df[field] * 100 
      elif 'eurusd' in field:
        norm_df[field + '_norm'] = (df[field] - df['x_eurusd_open']) / df[field] * 100
      
    for field in y_fields:
      norm_df[field + '_norm'] = (df[field] - df['audusd_open']) / df[field] * 100

  if method == 'minmax':
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[x_fields + y_fields])
    norm_field_names = [x + '_norm' for x in x_fields + y_fields]
    norm_df[norm_field_names] = scaled

  if method == 'stddev':
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[x_fields + y_fields])
    norm_field_names = [x + '_norm' for x in x_fields + y_fields]
    norm_df[norm_field_names] = scaled

  x_feature_names_norm = [x + '_norm' for x in x_fields]
  return norm_df, x_feature_names_norm

In [32]:
def get_train_val(df, x_feature_names_norm):
  #
  # Create Train and Val datasets 
  # 

  x = df[x_feature_names_norm] 
  y = df['y']
  y_points = df['y_change_points']

  # Note Fx "follows" (time series) so randomization is NOT a good idea
  # create train and val datasets. 
  no_train_samples = int(len(x) * 0.7)
  x_train = x[4:no_train_samples]
  y_train = y[4:no_train_samples]

  x_val = x[no_train_samples:-3]
  y_val = y[no_train_samples:-3]
  y_val_change_points = y_points[no_train_samples:-3]

  return x_train, y_train, x_val, y_val, y_val_change_points

In [21]:
def get_class_weights(y_train, display=True):
  
  #
  # Create class weights 
  #
  from sklearn.utils.class_weight import compute_class_weight

  num_ones = np.sum(y_train)
  num_zeros = len(y_train) - num_ones 
  
  classes = np.unique(y_train)
  class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
  class_weights = dict(zip(classes, class_weights))

  if display:
    print('In the training set we have 0s {} ({:.2f}%), 1s {} ({:.2f}%)'.format(num_zeros, num_zeros/len(y_train)*100, num_ones, num_ones/len(y_train)*100))
    print('class weights {}'.format(class_weights))

  return class_weights

In [36]:
from sklearn.metrics import log_loss, confusion_matrix, precision_score, recall_score, f1_score

def show_metrics(lr, x, y_true, y_change_points, display=True):
  
  # predict from teh val set meas we have predictions and true values as binaries  
  y_pred = lr.predict(x)

  #basic error types 
  log_loss_error = log_loss(y_true, y_pred)
  score = lr.score(x, y_true)
  
  #
  # Customized metrics  
  #
  tp = np.where((y_pred == 1) & (y_change_points >= 0), 1, 0).sum()
  fp = np.where((y_pred == 1) & (y_change_points < 0), 1, 0).sum()
  tn = np.where((y_pred == 0) & (y_change_points < 0), 1, 0).sum()
  fn = np.where((y_pred == 0) & (y_change_points >= 0), 1, 0).sum()

  precision = 0
  if (tp + fp) > 0:
    precision = tp / (tp + fp)

  recall = 0
  if (tp + fn) > 0:
    recall = tp / (tp + fn)

  f1 = 0
  if (precision + recall) > 0:
    f1 = 2 * precision * recall / (precision + recall)

  # output the errors 
  if display:
    print('Errors Loss: {:.4f}'.format(log_loss_error))
    print('Errors Score: {:.2f}%'.format(score*100))
    print('Errors tp: {} ({:.2f}%)'.format(tp, tp/len(y_val)*100))
    print('Errors fp: {} ({:.2f}%)'.format(fp, fp/len(y_val)*100))
    print('Errors tn: {} ({:.2f}%)'.format(tn, tn/len(y_val)*100))
    print('Errors fn: {} ({:.2f}%)'.format(fn, fn/len(y_val)*100))
    print('Errors Precision: {:.2f}%'.format(precision*100))
    print('Errors Recall: {:.2f}%'.format(recall*100))
    print('Errors F1: {:.2f}'.format(f1*100))

  errors = {
      'loss': log_loss_error,
      'score': score, 
      'tp': tp,
      'fp': fp,
      'tn': tn,
      'fn': fn,
      'precision': precision,
      'recall': recall,
      'f1': f1
      }

  return errors

In [38]:

for norm_method in ['price', 'points', 'percentage', 'minmax', 'stddev']:
  df = load_data()

  feature_names =['audusd_open', 'audusd_close', 'audusd_high', 'audusd_low', 'audusd_volume', \
              'eurusd_open', 'eurusd_close', 'eurusd_high', 'eurusd_low', 'eurusd_volume']
  df, x_feature_names = create_x_values(df, feature_names)

  norm_df, x_feature_names_norm = normalize_data(df, x_feature_names, method=norm_method)
  x_train, y_train, x_val, y_val, y_val_change_points = get_train_val(norm_df, x_feature_names_norm)
  class_weights = get_class_weights(y_train, display=False)
  
  lr = LogisticRegression(class_weight=class_weights)
  lr.fit(x_train, y_train)

  print('Errrors for method {}'.format(norm_method))
  errors = show_metrics(lr, x_val, y_val, y_val_change_points, display=True)


Errrors for method price
Errors Loss: 12.1699
Errors Score: 66.24%
Errors tp: 2453 (16.18%)
Errors fp: 2422 (15.98%)
Errors tn: 4984 (32.88%)
Errors fn: 5299 (34.96%)
Errors Precision: 50.32%
Errors Recall: 31.64%
Errors F1: 38.85


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Errrors for method points
Errors Loss: 28.0255
Errors Score: 22.25%
Errors tp: 6799 (44.85%)
Errors fp: 6426 (42.39%)
Errors tn: 980 (6.47%)
Errors fn: 953 (6.29%)
Errors Precision: 51.41%
Errors Recall: 87.71%
Errors F1: 64.82


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Errrors for method percentage
Errors Loss: 14.1055
Errors Score: 60.87%
Errors tp: 3000 (19.79%)
Errors fp: 2983 (19.68%)
Errors tn: 4423 (29.18%)
Errors fn: 4752 (31.35%)
Errors Precision: 50.14%
Errors Recall: 38.70%
Errors F1: 43.68


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Errrors for method minmax
Errors Loss: 12.9094
Errors Score: 64.18%
Errors tp: 2712 (17.89%)
Errors fp: 2658 (17.54%)
Errors tn: 4748 (31.32%)
Errors fn: 5040 (33.25%)
Errors Precision: 50.50%
Errors Recall: 34.98%
Errors F1: 41.34
Errrors for method stddev
Errors Loss: 13.7131
Errors Score: 61.95%
Errors tp: 2959 (19.52%)
Errors fp: 2905 (19.16%)
Errors tn: 4501 (29.69%)
Errors fn: 4793 (31.62%)
Errors Precision: 50.46%
Errors Recall: 38.17%
Errors F1: 43.46


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
