In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# FUNCS

In [32]:
def BINPLOT_VA1_parted_by_categorical_VA2(
        df, 
        va1, 
        va2, 
        ls = None,

        whether_plot = True,
        fig_size = (16,4)
    ): 
    """
    Box-plot and summarise column `va1`, partitioned by the categories of `va2`.

    Params:
        df: DataFrame holding both columns. It is not modified.
        va1: name of the column whose values are summarised / plotted.
        va2: name of the categorical column used to partition `va1`.
        ls: iterable of category values of `va2` to keep. None (default)
            keeps every category present in `df[va2]`.
        whether_plot: draw one box per category when True.
        fig_size: figure size forwarded to the pandas plot call.

    Returns: 
        DataFrame of means, stds & counts: one row per kept category
        (in groupby sort order), columns ['mean', 'std', 'count'], all float.
    Note:
        1. Check va2 is categorical
        2. The per-category frame is assembled column by column with a
           fresh positional index, so a repeating index in `df` is fine
           and a single surviving category still yields a one-row result.
            
    TODO: extend to multiple categorical va2 values
    TODO: allow multiple category values to take one bin!!!!!!
    """

    # `is None` (not `== None`) so that an array-like `ls` is not compared elementwise
    if ls is None:
        ls = list(df[va2].unique())

    # groupby / partition
    df_temp = df[df[va2].isin(list(ls))]
    # one column per category; shorter columns are padded with NaN
    per_category = {
        category: group[va1].reset_index(drop = True)
        for category, group in df_temp.groupby(va2)
    }
    unstacked = pd.DataFrame(per_category)

    # plot (nothing to draw when no row matched)
    if whether_plot and not unstacked.empty:
        ax = unstacked.plot(kind='box',figsize = fig_size)
        _ = ax.set_xlabel("CATEGORY of {" + va2 + "}")
        _ = ax.set_ylabel(va1)

    # return
    df_ret = pd.DataFrame({
        'mean': unstacked.mean(axis = 0, skipna = True),
        'std': unstacked.std(axis = 0, skipna = True),
        # NaN cells are only padding, so non-NaN cells are the observations
        'count': unstacked.notna().sum(axis = 0).astype(float),
    })
    df_ret = df_ret.rename_axis("CATEGORY of {" + va2 + "}")
    return df_ret

In [33]:
def BINPLOT_VA1_parted_by_list_of_pairs_of_VA2(
        df, 
        va1, 
        va2, 
        ls,
        va2_is_index = False, 

        whether_plot = True,
        fig_size = (16,4),
        show_x_ticks = True
    ): 
    """
    Box-plot and summarise column `va1`, binned by ranges of `va2`.

    Params:
        df: DataFrame to read from. It is not modified (no helper columns
            are added to it, not even temporarily).
        va1: name of the column whose values are summarised / plotted.
        va2: name of the column that is binned. When `va2_is_index` is
            True the index of `df` is binned instead and `va2` is just
            the name shown on the x axis.
        ls: list of [LEFT, RIGHT] pairs. A row belongs to a pair when
            LEFT <= x < RIGHT. None on either side means "unbounded on
            that side"; [None, None] takes every row.
        va2_is_index: bin `df.index` instead of `df[va2]`.
        whether_plot: draw one box per non-empty range when True.
        fig_size: figure size forwarded to the pandas plot call.
        show_x_ticks: label the boxes with their range when True,
            hide the x tick labels when False.
    Returns: 
        DataFrame of means, stds & counts: one row per pair of `ls`, in
        the order given, labelled "[LEFT, RIGHT)". A range that matches
        no row is reported with count 0 and NaN mean / std.
    Note:
        1. Check no overlapping region! Region is LEFT <= x < RIGHT.
           When regions do overlap, a row is counted in the LAST pair
           that contains it.
        2. A repeating index is fine: rows are addressed by position.
    """

    # setup
    key = df.index if va2_is_index else df[va2]
    pairs = [(pair[0], pair[1]) for pair in ls]
    labels = ["[" + str(left) + ", " + str(right) + ")" for left, right in pairs]

    # groupby / partition: positional bin id per row, -1 = in no range
    bin_ids = np.full(len(df), -1)
    for counter, (left, right) in enumerate(pairs):
        in_range = np.ones(len(df), dtype = bool)
        if left is not None:
            in_range &= np.asarray(key >= left, dtype = bool)
        if right is not None:
            in_range &= np.asarray(key < right, dtype = bool)
        bin_ids[in_range] = counter

    # one column per range (empty ranges included); shorter columns are NaN padded
    values = df[va1]
    unstacked = pd.DataFrame({
        counter: values[bin_ids == counter].reset_index(drop = True)
        for counter in range(len(pairs))
    })

    # plot
    if whether_plot:
        # only ranges that actually hold observations get a box
        plotted = unstacked.dropna(axis = 1, how = 'all')
        if not plotted.empty:
            ax = plotted.plot(kind='box',figsize = fig_size)
            if show_x_ticks:
                _ = ax.set_xticklabels([labels[counter] for counter in plotted.columns])
            else:
                ax.tick_params(axis = 'x', labelbottom = False)
            _ = ax.set_xlabel("RANGE of {" + va2 + "}")
            _ = ax.set_ylabel(va1)

    # return
    df_ret = pd.DataFrame(
        {
            'mean': unstacked.mean(axis = 0, skipna = True).to_numpy(),
            'std': unstacked.std(axis = 0, skipna = True).to_numpy(),
            # NaN cells are only padding, so non-NaN cells are the observations
            'count': unstacked.notna().sum(axis = 0).to_numpy(dtype = float),
        },
        index = labels,
    )
    df_ret = df_ret.rename_axis("RANGE of {" + va2 + "}")
    return df_ret

# Clean Data

In [34]:
# Load the training table and leave out the 'Name' column right away.
df = pd.read_csv('train.csv').drop('Name', axis = 1)

def Convert_sex(x):
    """Encode a sex label as an integer: 1 for 'male', 0 for anything else."""
    return 1 if x == 'male' else 0
# Replace the textual 'Sex' column by its 0/1 encoding.
df['Sex'] = df['Sex'].apply(Convert_sex)

# Drop the 'PassengerId' and 'Ticket' columns in one call.
df.drop(['PassengerId', 'Ticket'], axis = 1, inplace = True)

def ConvertCabin(x):
    """Reduce a cabin value to its first character; missing values become 'NAN'."""
    return 'NAN' if pd.isna(x) else x[0]
df['Cabin'] = df['Cabin'].apply(ConvertCabin)  # keep only the first character ('NAN' when missing)

def Create_DUMMY(df, va1):
    """
    Replace column `va1` of `df` by 0/1 indicator columns, in place.

    One column named '<va1>___<value>' is added for every distinct value
    of `va1` except the last one in sorted order (that value is the row
    where all indicators are 0). The original column is then dropped.
    Nothing is returned.
    """
    levels = sorted(df[va1].unique())
    for level in levels[:-1]:
        indicator_name = va1 + '___' + str(level)
        df[indicator_name] = (df[va1] == level).astype('int64')
    df.drop(va1, axis = 1, inplace = True)
    
# Fill missing ages with the mean age and missing ports with the most frequent port.
# NOTE(review): both statistics are computed on the full table, i.e. before the
# train/test split further down.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().iloc[0])

# Turn the three categorical columns into indicator columns.
Create_DUMMY(df, 'Pclass')
Create_DUMMY(df, 'Cabin')
Create_DUMMY(df, 'Embarked')

# Move the target to the end of the frame under the name 'y'.
df['y'] = df.pop('Survived')

# Train-test Split

In [35]:
# Row-wise random split: each row goes to the training set with probability 0.7.
# The generator is seeded so that the split (and every number computed from it
# below) is the same on each Restart & Run All.
SPLIT_SEED = 0
train_rand = np.random.default_rng(SPLIT_SEED).choice(a = [True, False], p = [0.7, 0.3], size = df.shape[0])
# the test mask is the exact complement of the training mask
test_rand = ~train_rand

In [36]:
# Independent copies, so later column additions do not touch `df`.
df_train = df.loc[train_rand].copy()
df_test = df.loc[test_rand].copy()

In [37]:
# Feature columns fed to the models: the numeric columns followed by the
# indicator columns named '<column>___<value>'.
INDEX = (
    ['Sex', 'Age', 'SibSp', 'Parch', 'Fare']
    + ['Pclass___' + level for level in ('1', '2')]
    + ['Cabin___' + deck for deck in ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'NAN')]
    + ['Embarked___' + port for port in ('C', 'Q')]
)

In [38]:
# Features (the INDEX columns) and target ('y') for each side of the split.
X_train, y_train = df_train[INDEX], df_train['y']
X_test, y_test = df_test[INDEX], df_test['y']

# SKLEARN

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Logistic regression with otherwise default settings, fitted on the training split.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)



### Get Accuracy

In [41]:
# Training accuracy. The prediction thresholds the second predict_proba column
# at 0.5 (the alternative would be a plain clf.predict(X_train)).
y_train_pred = clf.predict_proba(X_train)[:, 1] > 0.5
(y_train_pred == y_train).sum() / len(y_train)

0.826580226904376

In [42]:
# Test accuracy: share of held-out rows whose predicted label equals the true one.
y_test_pred = clf.predict(X_test)
(y_test_pred == y_test).sum() / len(y_test)

0.781021897810219

### Prepare for C++
* Retrieve coef & bias, then compute the probability from them
* COEF & BIAS are passed to 【.conf】; the probability formula is implemented in 【.cc】 

In [48]:
# Fitted weights and intercept; the class order is displayed as the cell output.
COEF, BIAS = clf.coef_, clf.intercept_
clf.classes_

array([0, 1])

In [49]:
# Manual check on the first test row: linear score x·w + b ...
raw_val = X_test.iloc[0].values @ COEF.reshape(-1,1) + BIAS
# ... turned into exp(-z) / (exp(-z) + 1), to be compared with the first
# entry of the predict_proba row printed underneath.
actual_prob = np.exp(-raw_val) / (np.exp(-raw_val) + 1.0)
print(actual_prob)
print(clf.predict_proba(X_test)[0])

[0.92724499]
[0.92724499 0.07275501]


### Y_hat VS true prob

In [45]:
# Linear score (logit) of every test row. COEF is flattened so the product is a
# plain length-n vector instead of an (n, 1) matrix assigned to a column.
df_test['sklearn_raw'] = df_test[INDEX].values @ COEF.ravel() + BIAS
# Sigmoid of the score, 1 / (1 + exp(-z)), written as exp(-log(1 + exp(-z))).
# This equals the former 1 - exp(-z) / (exp(-z) + 1) but cannot turn into
# inf / inf = NaN when a score is strongly negative.
df_test['sklearn_prob'] = np.exp(-np.logaddexp(0, -df_test['sklearn_raw'].to_numpy()))

In [46]:
# Mean / std / count of the true label 'y' inside three ranges of the
# predicted probability (table only, no plot).
BINPLOT_VA1_parted_by_list_of_pairs_of_VA2(
    df_test, 'y', 'sklearn_prob',
    ls = [[None, 0.2], [0.5, 0.7], [0.8, 1]],
    whether_plot = False,
)

Unnamed: 0_level_0,mean,std,count
RANGE of {sklearn_prob},Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"[None, 0.2)",0.169492,0.376785,118.0
"[0.5, 0.7)",0.627907,0.489083,43.0
"[0.8, 1)",0.906977,0.293903,43.0


# TORCH

In [111]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

In [122]:
class LogisticClassifier(nn.Module):
    """
    Two-layer classifier: Linear -> ReLU -> Linear.

    output_dim MATCHES the number of classes:
        binary ==> output dim = 2
        n-classes ==> output dim = n

    Attributes:
        fc1, fc2: the two linear layers.
        W1, B1, W2, B2: the weight / bias Parameter objects of fc1 and
            fc2 (the same objects, exposed under shorter names).
    """

    def __init__(self, input_dim, hidden_dim, output_dim = 2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        [self.W1, self.B1] = list(self.fc1.parameters())
        [self.W2, self.B2] = list(self.fc2.parameters())

    def forward(self, x_in, whether_predict = False):
        """
        Compute class scores for `x_in`.

        Params:
            x_in: float tensor whose last dimension has size input_dim;
                either a batch (N, input_dim) or a single sample (input_dim,).
            whether_predict: False returns the raw scores (logits) of the
                last layer; True returns softmax probabilities.
        Returns:
            Tensor with the same leading dimensions as `x_in` and
            output_dim entries along the last dimension.
        """
        a_1 = self.fc1(x_in)
        a_1 = torch.relu(a_1)
        y_pred = self.fc2(a_1)
        if whether_predict:
            # normalise over the class dimension, which is always the last one;
            # dim=-1 also works for a single unbatched sample
            y_pred = F.softmax(y_pred, dim=-1)
        return y_pred

In [123]:
# One input per feature column, 5 hidden units, 2 output classes.
model = LogisticClassifier(
    input_dim = X_train.shape[1],
    hidden_dim = 5,
    output_dim = 2,
)
# NOTE(review): this prints the bound method object; its repr happens to embed
# the module summary shown below.
print (model.named_modules)

<bound method Module.named_modules of LogisticClassifier(
  (fc1): Linear(in_features=17, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=2, bias=True)
)>


In [124]:
criterion = nn.CrossEntropyLoss()  # takes the raw scores of the model and integer class labels

In [125]:
optimizer = optim.SGD(params = model.parameters(), lr = 0.001)  # plain SGD over all model parameters

In [127]:
# Full-batch training: every epoch is one gradient step on the whole training
# split; the test accuracy (in percent) is printed every EVAL_EVERY epochs.
N_EPOCHS = 100000
EVAL_EVERY = 1000

# The data never changes between epochs, so the tensors are built once.
inputs = torch.tensor(X_train.values, dtype = torch.float32)
labels = torch.tensor(y_train.values, dtype = torch.long)
test_inputs = torch.tensor(X_test.values, dtype = torch.float32)
test_labels = torch.tensor(y_test.values, dtype = torch.long)

for epoch in range(N_EPOCHS):
    optimizer.zero_grad()

    # raw scores (no softmax): CrossEntropyLoss expects them in this form
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    loss.backward()
    optimizer.step()

    if (epoch + 1) % EVAL_EVERY == 0:
        # evaluation needs no gradients
        with torch.no_grad():
            outputs = model(test_inputs, whether_predict = True)
        predicted = outputs.argmax(dim = 1)
        total = test_labels.size(0)
        # .item() gives a Python int, so the division below is ordinary float
        # division and the printed value is a plain number, not a tensor
        correct = (predicted == test_labels).sum().item()
        print(100 * correct / total)

# ALGO
### At anytime: no huge imbalance
* 【Hard margin】 keep an imbalance variable, abs() of which, if approaching certain bound, nothing that will make it cross this bound is allowed.
    * How big of a margin? How to decide?
    * Smaller than or equal to the "warning limit" by a factor
* 【Soft margin】 keep a counter variable (regarding trade volumes), make every addition to this counter increasingly hard 
    * Make sure that this counter should zero out when there is no imbalance ==> choose trade volume as subject (not # trades)
    * This counter variable can be used to decide "how much to trade each time"
* 【Hybrid margin】 within a "hard semi-margin", we can do anything; outside this margin, we start getting more and more "reluctant" to get more imbalanced.

* 【Reverse hybrid margin】 "reluctancy" soft margin always happens, but there is an outer hard margin that one can never cross.
    * This may be the way to go, since it inherits the good property of the soft margin (it efficiently allocates buy size based on "reluctancy"), and it will never trigger the "huge imbalance" warning, because the hard margin is set to the actual "warning limit".
    
* 【Triple hybrid margin】 
    * IF WE CAN set the soft margin's "reluctancy constant" to gently adhere to the actual "warning limit", the above approach is not really that "hybrid", & soft margin that does not adhere well is not an efficient soft margin.
    * We need a hard "freedom margin", with an imbalance underneath which, one can do anything.
    * We have a soft margin kicking in above this "freedom margin" that gently adhere to the "warning limit"
    * We add a hard "dead margin" that is the actual "warning limit" just to make sure.

### End of day: no position

##### Time in day based:
1. 18 hours to trade based on any kind of margin
2. 5 hours to hopefully tie loose ends:
    * For new opportunities: "able to buys at px that will rise" / "able to short at px that will drop", we only take those that may reduce imbalance. 
        * (Since the "reluctancy score" is on the other side, we would naturally take these quite willingly, BUT here we say that we take only these)
        * When these "balancing new opportunities" are captured: two cases, one we make profit, but imbalance remains, two the opportunity half-completes, we reduce imbalance.
        
    * Reflection: how did we arrive at this point of imbalance?
        * Say: we are in a state where we hold a lot of BTC
        * We must have bought BTC a lot, with regression telling me that px would rise in 30s, we bought BTC instantly, and set a limit order at a higher price, hoping it would get picked up, it did not. 
        * This means that, at any time later than that initial trade, the px never rose to the level of the limit order.
        * That AOC order might mark a peak of the entire day's trading: "We have bought at the high peak of the day", SHIT!!!
        * Takeaway 1: Want to AVOID peak, with some kind of mechanism. Want to be a part of any slope. (NOT TOO DO-ABLE)
        * Having a ton of imbalance, means that, we have bought at a lot of peaks / bought once at peak with large amount ==> the day has not been a very volatile (with large fluctuation) day (or hours) in general.
        * Takeaway 2: For less volatile days (or hours), we are more likely to get on lots of peaks on one side, need a tighter "freedom margin", soft margin, and "dead margin".
            * We have the volatility in 30s thing, as "y" of regression (need log since FutureStd always positive)
            * We might need some "Volatility in past" signal, or just use existing 5 past px signals in the regression.
            * If see "futureStd" is large, more willing to initiate an opportunity (larger amount)
        
    * Mode recognition: what if we balance out the imbalance before the 6 hours finish?
        * TEMPORARY: Cease trading & cancel hanging orders that might create new imbalances.
        * If this happens a lot, we want to shorten the previously determined 5 hours.
    
3. 1 hour to tie up loose ends at the cost of profit loss
    * 

    
### General logic of Trade size (first 18h)
* More volatile in 30s (larger "futureStd"), larger amount <== less scared to get stuck
* Having had huge imbalance on the same side, less amount <== chance to worsen the current imbalance
* Having had imbalance on the other side, larger amount (not 2 times larger though) <== chance to alleviate the current imbalance
* Larger regressed change in px, larger amount
* Larger classification prob of "profit making change", larger amount (based on Kelly Criterion?)

### Need to check the return info from limit orders and keep a "floating imbalance" 
* If a limit order fails on exchange side, send a same order with worse price (MAYBE)