# Machine Learning in Forex Trading
### Data: USDJPY historical exchange rate, 3/31/2017 to 4/14/2017, 15-minute chart

In [2]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split

In [3]:
# Load the 15-minute USDJPY bars; the path is relative to the working directory.
raw = pd.read_csv('USDJPY.csv')
# Preview the first rows (in the displayed output the newest bar comes first).
raw.head()

Unnamed: 0,Date,Open,High,Low,Close,Change (Pips),Change (%)
0,4/14/2017 23:45,108.631,108.657,108.589,108.616,-1.5,-0.01%
1,4/14/2017 23:30,108.625,108.653,108.613,108.629,0.4,0.00%
2,4/14/2017 23:15,108.613,108.628,108.603,108.623,1.0,0.01%
3,4/14/2017 23:00,108.605,108.637,108.573,108.616,1.1,0.01%
4,4/14/2017 22:45,108.578,108.63,108.578,108.609,3.1,0.03%


##### Data Cleaning

In [4]:
# Flip the bars into chronological order, renumber the rows, and normalise
# the column labels (lower-case, then short names for the two change columns).
df = (
    raw.iloc[::-1]
    .reset_index(drop=True)
    .rename(columns=str.lower)
    .rename(columns={'change (pips)': 'pip_change', 'change (%)': 'pct_change'})
)
df.head()

Unnamed: 0,date,open,high,low,close,pip_change,pct_change
0,3/31/2017 13:45,111.822,111.836,111.802,111.808,-1.4,-0.01%
1,3/31/2017 14:00,111.806,111.857,111.806,111.844,3.8,0.03%
2,3/31/2017 14:15,111.845,111.871,111.778,111.798,-4.7,-0.04%
3,3/31/2017 14:30,111.796,111.81,111.758,111.789,-0.7,-0.01%
4,3/31/2017 14:45,111.793,111.833,111.774,111.777,-1.6,-0.01%


##### Generate feature columns from raw prices

In [5]:
def build_feature_rsi(df, rsi_n):
    '''
    Generate RSI feature column inplace.

    The first RSI value (row rsi_n-1) uses the simple average of the gains
    and losses of the first rsi_n rows; every later row updates the averages
    recursively: avg = (previous_avg * (rsi_n - 1) + current) / rsi_n.
    Rows before the first RSI value are backfilled from it.

    Input:
        - df: Dataframe with a numeric 'pip_change' column. Rows are
              processed in positional order, so any index works.
        - rsi_n: Number of periods used in RSI calculation
                 (1 <= rsi_n <= number of rows)
    Output:
        - None; the 'rsi' column is added to (or overwritten in) df.
          Only the 'rsi' column is backfilled; other columns are untouched.
    Raises:
        - ValueError: if rsi_n is outside [1, number of rows]
    '''
    n_rows = df.shape[0]
    if not 1 <= rsi_n <= n_rows:
        raise ValueError(
            'rsi_n must be between 1 and the number of rows (%d), got %r'
            % (n_rows, rsi_n)
        )

    nan = float('nan')

    def _rsi(avg_gain, avg_loss):
        # No losses in the window: RSI saturates at 100. With no movement at
        # all (both averages zero) the value is undefined and left as NaN.
        if avg_loss == 0:
            return 100.0 if avg_gain > 0 else nan
        return 100 - (100 / (1 + avg_gain / avg_loss))

    # Split each change into its gain part and its loss part. A NaN change
    # fails the `>= 0` test and therefore propagates through the loss side.
    pips = df['pip_change'].astype(float).tolist()
    gains = [pip if pip >= 0 else 0.0 for pip in pips]
    losses = [0.0 if pip >= 0 else abs(pip) for pip in pips]

    rsi = [nan] * n_rows

    # Seed: simple averages over the first rsi_n rows.
    avg_gain = sum(gains[:rsi_n]) / rsi_n
    avg_loss = sum(losses[:rsi_n]) / rsi_n
    rsi[rsi_n - 1] = _rsi(avg_gain, avg_loss)

    # Recursive smoothing for all subsequent rows.
    for i in range(rsi_n, n_rows):
        avg_gain = (avg_gain * (rsi_n - 1) + gains[i]) / rsi_n
        avg_loss = (avg_loss * (rsi_n - 1) + losses[i]) / rsi_n
        rsi[i] = _rsi(avg_gain, avg_loss)

    # Backfill the warm-up rows; assign by position so the frame's index
    # labels do not matter.
    df['rsi'] = pd.Series(rsi, dtype='float64').bfill().to_numpy()


def build_feature_ewma(df, ewma_n1, ewma_n2):
    '''
    Generate EWMA feature column inplace.

    The feature is the difference between a fast and a slow exponentially
    weighted moving average of the 'close' column (fast minus slow).

    Input:
        - df: Dataframe with a numeric 'close' column
        - ewma_n1: Span (number of periods) of the first, faster EWMA;
                   this number should be smaller than ewma_n2.
        - ewma_n2: Span (number of periods) of the second, slower EWMA
    Output:
        - None; the 'ewma' column is added to (or overwritten in) df.
          No other column is created, modified or backfilled.
    '''
    close = df['close']

    # Each average is undefined until its own span is filled (min_periods);
    # those warm-up rows are backfilled from the first available value.
    ewma_fast = close.ewm(span=ewma_n1, min_periods=ewma_n1).mean().bfill()
    ewma_slow = close.ewm(span=ewma_n2, min_periods=ewma_n2).mean().bfill()

    # cross value of the two EWMAs
    df['ewma'] = ewma_fast - ewma_slow

    
def build_feature_bollinger(df, bollinger_n):
    '''
    Generate Bollinger Band feature column inplace.

    The feature is the Bollinger Percent: the position of 'close' between
    the lower band (rolling mean - 2 * rolling std) and the upper band
    (rolling mean + 2 * rolling std), where 0 is the lower band and 1 is
    the upper band.

    Input:
        - df: Dataframe with a numeric 'close' column
        - bollinger_n: Number of periods used in calculation
    Output:
        - None; the 'bollinger' column is added to (or overwritten in) df.
          No other column is created, modified or filled.
    '''
    close = df['close']
    window = close.rolling(bollinger_n, min_periods=bollinger_n)

    # calculate Upper Band and Lower Band
    middle = window.mean()
    spread = window.std() * 2

    # The bands are undefined until the first full window; backfill those
    # warm-up rows from the first available band values.
    upper_band = (middle + spread).bfill()
    lower_band = (middle - spread).bfill()

    # calculate Bollinger Percent
    bollinger = (close - lower_band) / (upper_band - lower_band)

    # NaN results when the upper and lower band coincide (zero width) or
    # when no band is available at all; treat it as the band midpoint, 50%.
    df['bollinger'] = bollinger.fillna(0.5)

In [6]:
# below parameters performs best by manual Grid Search
build_feature_rsi(df, 4)
build_feature_ewma(df, 2, 4)
build_feature_bollinger(df, 4)
df.head(15)

Unnamed: 0,date,open,high,low,close,pip_change,pct_change,rsi,ewma,bollinger
0,3/31/2017 13:45,111.822,111.836,111.802,111.808,-1.4,-0.01%,35.849057,0.032533,0.481859
1,3/31/2017 14:00,111.806,111.857,111.806,111.844,3.8,0.03%,35.849057,0.032533,0.855054
2,3/31/2017 14:15,111.845,111.871,111.778,111.798,-4.7,-0.04%,35.849057,0.006918,0.378193
3,3/31/2017 14:30,111.796,111.81,111.758,111.789,-0.7,-0.01%,35.849057,-0.006842,0.284894
4,3/31/2017 14:45,111.793,111.833,111.774,111.777,-1.6,-0.01%,29.842932,-0.008264,0.286629
5,3/31/2017 15:00,111.773,111.872,111.773,111.823,5.0,0.04%,58.684481,0.005085,0.836538
6,3/31/2017 15:15,111.824,111.901,111.824,111.875,5.1,0.05%,73.50033,0.019659,0.836038
7,3/31/2017 15:30,111.869,111.968,111.84,111.9,3.1,0.03%,79.467681,0.023807,0.756364
8,3/31/2017 15:45,111.901,111.965,111.866,111.881,-2.0,-0.02%,66.572112,0.013287,0.585388
9,3/31/2017 16:00,111.879,111.93,111.691,111.728,-15.1,-0.14%,25.278353,-0.032792,0.128394


##### Generate action column from raw prices (only long action is considered, using 0/1 to represent no action and long order respectively)

In [7]:
def action_long(df, tp=0.08, sl=0.12):
    '''
    Generate long action column inplace.

    For every bar, a hypothetical long order is opened at that bar's close
    and the following bars of the same calendar day are scanned in order.
    The bar is labelled 1 if the take-profit level is reached before the
    stop-loss level and before the day ends, otherwise 0. When a single bar
    reaches both levels, the stop loss takes precedence.

    Input:
        - df: Dataframe with 'date' (text starting with the calendar date,
              followed by whitespace and the time), 'close', 'high' and
              'low' columns, in chronological order. Rows are processed in
              positional order, so any index works.
        - tp: take profit distance in price units
              (the default 0.08 is 8 pips when 1 pip is equal to 0.01)
        - sl: stop loss distance in price units
              (the default 0.12 is 12 pips when 1 pip is equal to 0.01)
    Output:
        - None; the integer 'action_long' column is added to
          (or overwritten in) df.
    '''
    # Pull the columns out once; plain lists make the nested scan much
    # cheaper than per-cell DataFrame lookups.
    closes = df['close'].tolist()
    highs = df['high'].tolist()
    lows = df['low'].tolist()
    days = [str(stamp).split()[0] for stamp in df['date']]

    n_rows = len(closes)
    # initialize every action to 0 (no action)
    labels = [0] * n_rows

    # determine action value for each row
    for i in range(n_rows):
        entry = closes[i]
        for j in range(i + 1, n_rows):
            # a quiet and dull day has passed
            if days[j] != days[i]:
                break
            # price hit stop loss level
            if (entry - lows[j]) >= sl:
                break
            # price hit take profit level
            if (highs[j] - entry) >= tp:
                labels[i] = 1
                break

    # assign by position so the frame's index labels do not matter
    df['action_long'] = pd.Series(labels, dtype='int64').to_numpy()

In [8]:
# Label every bar using the default take-profit / stop-loss levels;
# this adds the 'action_long' column to df in place.
action_long(df)
df.head(15)

Unnamed: 0,date,open,high,low,close,pip_change,pct_change,rsi,ewma,bollinger,action_long
0,3/31/2017 13:45,111.822,111.836,111.802,111.808,-1.4,-0.01%,35.849057,0.032533,0.481859,1
1,3/31/2017 14:00,111.806,111.857,111.806,111.844,3.8,0.03%,35.849057,0.032533,0.855054,1
2,3/31/2017 14:15,111.845,111.871,111.778,111.798,-4.7,-0.04%,35.849057,0.006918,0.378193,1
3,3/31/2017 14:30,111.796,111.81,111.758,111.789,-0.7,-0.01%,35.849057,-0.006842,0.284894,1
4,3/31/2017 14:45,111.793,111.833,111.774,111.777,-1.6,-0.01%,29.842932,-0.008264,0.286629,1
5,3/31/2017 15:00,111.773,111.872,111.773,111.823,5.0,0.04%,58.684481,0.005085,0.836538,1
6,3/31/2017 15:15,111.824,111.901,111.824,111.875,5.1,0.05%,73.50033,0.019659,0.836038,1
7,3/31/2017 15:30,111.869,111.968,111.84,111.9,3.1,0.03%,79.467681,0.023807,0.756364,0
8,3/31/2017 15:45,111.901,111.965,111.866,111.881,-2.0,-0.02%,66.572112,0.013287,0.585388,0
9,3/31/2017 16:00,111.879,111.93,111.691,111.728,-15.1,-0.14%,25.278353,-0.032792,0.128394,0


##### Apply Machine Learning to Forex Trading

In [9]:
# Target: the 0/1 long label; features: the three indicator columns.
y = df['action_long'].values
x = df[['rsi', 'ewma', 'bollinger']].values
# Hold out 20% of the rows for evaluation; random_state fixes the split.
# NOTE(review): train_test_split shuffles rows by default, so test bars are
# interleaved in time with training bars while the labels look ahead to later
# bars — confirm this is intended for time-series data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [10]:
# Baseline model: logistic regression on the three indicator features.
LR = LogisticRegression(n_jobs=-1, random_state=1)
LR.fit(x_train,y_train)
# Keep the test-set predictions for the confusion matrix in the next cell.
LR_result = LR.predict(x_test)
# Score of the fitted model on the held-out rows.
LR.score(x_test, y_test)

0.48756218905472637

In [11]:
# Rows of the matrix are the true classes and columns the predicted classes,
# so flattening it row by row yields TN, FP, FN, TP.
LR_CM = confusion_matrix(y_test, LR_result)
LR_TN, LR_FP, LR_FN, LR_TP = LR_CM.ravel()

print(LR_CM)
print(LR_TP, LR_FP)
# Precision = TP / (TP + FP)
precision_score(y_test, LR_result)

[[63 33]
 [70 35]]
35 33


0.51470588235294112

In [12]:
# Second model: AdaBoost with its default base estimator, same features and split.
Ada = AdaBoostClassifier(random_state=1)
Ada.fit(x_train,y_train)
# Keep the test-set predictions for the confusion matrix in the next cell.
Ada_result = Ada.predict(x_test)
# Score of the fitted model on the held-out rows.
Ada.score(x_test, y_test)

0.57213930348258701

In [13]:
# Rows of the matrix are the true classes and columns the predicted classes,
# so flattening it row by row yields TN, FP, FN, TP.
Ada_CM = confusion_matrix(y_test, Ada_result)
Ada_TN, Ada_FP, Ada_FN, Ada_TP = Ada_CM.ravel()

print(Ada_CM)
print(Ada_TP, Ada_FP)
# Precision = TP / (TP + FP)
precision_score(y_test, Ada_result)

[[61 35]
 [51 54]]
54 35


0.6067415730337079

### Epilogue

With the booming trend in algorithmic trading, the next iteration for the industry will incorporate more advanced techniques in trading, such as machine learning. This project merely explores the possibilities of this new field by utilizing machine learning in forex trading.

Among the various options for obtaining training data, I chose Oanda - a third-party broker - as the source of historical data. Raw data from Oanda only contains bid time and bid price, therefore developing indicators to perform further analysis is necessary. Ideally, one would build a wide range of indicators, evaluate them with a **Random Forest**, and then use the selected indicators to build the feature matrix. Due to the complexity and laboriousness of that procedure, I chose only three features that have been empirically proven to correspond to a short-term strategy. After generating the features **RSI (Relative Strength Index), EWMA (Exponentially Weighted Moving Average) and Bollinger Percent**, we need to label what action to take on each bid, i.e. a long or no action.

Since only two actions are considered, what we want to predict is actually a binary result. Thus, because it is a binary result, we know this problem is best solved with a **classification** algorithm. Though many traders often start with more complicated algorithms for prediction, a simple **Logistic Regression** actually is an appropriate start for our first model to train. Then, a more targeted algorithm, **AdaBoost Decision Tree** is applied. However to be thorough, I used other classification models such as SVM, Random Forest Classifier and Gradient Boosting Classifier to test their results.

The next step in developing a powerful model is to tune the appropriate parameters. To perform better, tuning not only means tuning the model but also the parameters used to instantiate features. Normally this process should be done by **Grid Search**, but since I wrote the functions myself, I performed a manual grid search of the feature parameters.

So how do we gauge the prediction? Do we use R-squared or mean accuracy? A more appropriate metric is **Precision**, taken from the confusion matrix. When we make a negative prediction, we won't take any action; hence we have no profit or loss. However, in a positive predictive scenario, if the prediction is a false positive, our order would be "stopped out," i.e. the order hits the stop-loss level. Thus, the ratio of true positives to predicted positives, i.e. **Precision**, is the benchmark for our trading performance. In this algorithm, because the default "take profit" level is 8 pips and the stop-loss level is 12 pips, breaking even requires 8p = 12(1 - p) for a precision p, so a precision score of at least _**60%**_ is the threshold we use to sustain an account balance. As demonstrated above in the model output, by implementing an **AdaBoost Decision Tree** and briefly tuning certain feature parameters, the model achieved a precision score of _**60.67%**_. Although to some a precision score of 60.67% may appear only slightly better than chance, it is not a bad start for an amateur algorithmic trader. In future iterations of this project, I should aim to not only increase Precision but also enhance **Sensitivity**.

To improve trading performance, the following measures could be considered:

+ Optimize the rule of action labeling
+ Build more comprehensive features
+ Utilize Random Forest to extract important features
+ Use charts of different time frames
+ Grid-search the feature parameters