In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import math
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
import pickle
import os

def get_weekday(date):
    """Return the weekday of a ``YYYY-MM-DD`` date string, as a string.

    The numbering is the one used by ``datetime.weekday``: Monday is "0"
    and Sunday is "6".
    """
    parsed = datetime.strptime(date, "%Y-%m-%d")
    return str(parsed.weekday())

In [2]:
def _bucket_labels(percentages):
    """Turn a Series of percentage moves into string class labels "0".."20".

    Each value is floored to a whole percent, clamped to [-10, 10] and then
    offset by +10, so "10" stands for a move in [0%, 1%).
    """
    buckets = np.floor(percentages).clip(-10, 10).astype(int) + 10
    return buckets.astype(str)


def analyse_ticker(ticker, history=60):
    """Load ``raw/<ticker>.csv`` and derive labels and lagged-label features.

    The CSV must provide at least a ``Date`` column (``YYYY-MM-DD`` strings)
    and a ``Close`` column.

    Columns added:
      * ``prev_close`` / ``prev_week_close``: Close 1 and 5 rows earlier.
      * ``change`` / ``change_week`` and their ``percentage_*`` versions.
      * ``weekday``: "0" (Monday) .. "6" (Sunday).
      * ``label`` / ``label_week``: bucketed percentage moves, "0".."20".
      * ``label_next_week``: ``label_week`` taken 5 rows later.
      * ``prev_label_<j>`` / ``prev_label_week_<j>`` for j in 1..history:
        the labels j rows earlier.

    Rows that lack any of these values (the first 5 rows, the first
    ``history`` remaining rows and the last 5 rows) are dropped, as are rows
    with missing values in the source data.

    Parameters
    ----------
    ticker : str
        File stem of the CSV inside ``raw/``.
    history : int, default 60
        Number of lagged label pairs to add. No lag columns are added when
        the cleaned frame has ``history`` rows or fewer.

    Returns
    -------
    pandas.DataFrame
        The enriched frame with a fresh 0..n-1 index.
    """
    data = pd.read_csv("raw/" + ticker + ".csv")

    close = data["Close"]
    data["prev_close"] = close.shift(1)
    data["prev_week_close"] = close.shift(5)
    data["change"] = close - data["prev_close"]
    data["change_week"] = close - data["prev_week_close"]
    # NOTE(review): the move is divided by the current Close, not by the
    # earlier close a conventional percent change would use. Kept as-is
    # because changing it would alter every label -- confirm intent.
    data["percentage_change"] = data["change"] * 100 / close
    data["percentage_change_week"] = data["change_week"] * 100 / close

    # The first five rows have no week-ago close to compare against.
    data = data.iloc[5:]
    data = data.dropna(axis=0, how="any")
    data = data.reset_index(drop=True)

    data["weekday"] = (
        pd.to_datetime(data["Date"], format="%Y-%m-%d").dt.weekday.astype(str)
    )
    data["label"] = _bucket_labels(data["percentage_change"])
    data["label_week"] = _bucket_labels(data["percentage_change_week"])
    # The last five rows get NaN here and are removed by the final dropna.
    data["label_next_week"] = data["label_week"].shift(-5)

    # Lagged labels via shift() instead of a per-cell Python loop. The guard
    # keeps the "too little data" case free of lag columns.
    if history >= 1 and len(data) > history:
        lagged = []
        for lag in range(1, history + 1):
            for source, prefix in (("label", "prev_label_"),
                                   ("label_week", "prev_label_week_")):
                column = data[source].shift(lag)
                column.name = prefix + str(lag)
                lagged.append(column)
        # A single concat avoids inserting 2 * history columns one by one.
        data = pd.concat([data] + lagged, axis=1)

    data = data.dropna(axis=0, how="any")
    data = data.reset_index(drop=True)
    return data

In [3]:
def get_train_test(df, num_train, skip_days=0):
    """Split an analysed ticker frame into one-hot features and daily labels.

    The raw price columns, the daily change columns, ``label`` and
    ``label_next_week`` are removed from the features; ``label_week`` is
    kept. When ``skip_days`` is non-zero, the lag columns
    ``prev_label_<i>`` and ``prev_label_week_<i>`` for i in 1..skip_days are
    removed as well. Remaining columns go through ``pd.get_dummies``.

    The first ``num_train`` rows form the training set and the rest form the
    test set. ``df`` itself is not modified.

    Returns
    -------
    tuple
        ``(train_data, train_labels, test_data, test_labels)``.
    """
    labels = df["label"].copy()

    unwanted = [
        "Volume", "prev_close", "change", "percentage_change", "Adj Close",
        "Open", "High", "Low", "Close", "Date",
        "label", "label_next_week",
    ]
    if skip_days:
        for lag in range(1, skip_days + 1):
            unwanted.extend(["prev_label_" + str(lag),
                             "prev_label_week_" + str(lag)])

    features = pd.get_dummies(df.drop(unwanted, axis=1))

    train_data, test_data = features[:num_train].copy(), features[num_train:].copy()
    train_labels, test_labels = labels[:num_train].copy(), labels[num_train:].copy()
    return train_data, train_labels, test_data, test_labels

def get_average_prediction(probabilities, classes):
    """Blend the three most probable classes of each row into one label.

    For every probability row, the class labels at the three highest
    probabilities are read as integers, averaged and rounded; the result is
    returned as a string.

    Parameters
    ----------
    probabilities : iterable of numpy arrays
        One array of class probabilities per sample.
    classes : numpy array
        Class labels (integer-like strings), indexed like each row.

    Returns
    -------
    list of str
    """
    averaged = []
    for row in probabilities:
        top_three = classes[row.argsort()[-3:]]
        mean_label = np.mean([int(name) for name in top_three])
        averaged.append(str(int(round(mean_label))))
    return averaged

def evaluate_confusion_matrix(mat):
    """Compute a "soft" accuracy from a 21 x 21 confusion matrix.

    Each row is split at a column index: the entries from that column
    onwards count as correct, the entries before it as incorrect. Rows 0..10
    are split at their own index; rows 11..20 are all split at column 11.

    Returns
    -------
    float
        ``correct / (correct + incorrect)``.
    """
    hits = 0
    misses = 0
    for row_index in range(21):
        row = mat[row_index]
        split = 11 if row_index > 10 else row_index
        hits += sum(row[split:])
        misses += sum(row[:split])
    return hits / float(hits + misses)

def get_model(train_data, train_labels, test_data, test_labels):
    """Fit an ``MLPClassifier`` and report its loss and exact accuracies.

    The network has hidden layers of sizes (200, 100, 100, 100, 100, 200),
    ``alpha=1e-5``, at most 1000 iterations and a fixed ``random_state=1``.

    Returns
    -------
    tuple
        ``(clf, loss, train_accuracy, test_accuracy)`` where ``loss`` is the
        fitted classifier's ``loss_`` and the accuracies come from
        ``clf.score`` on the respective split.
    """
    network = MLPClassifier(
        hidden_layer_sizes=(200, 100, 100, 100, 100, 200),
        alpha=1e-5,
        max_iter=1000,
        random_state=1,
    )
    network.fit(train_data, train_labels)

    accuracy_on_train = network.score(train_data, train_labels)
    accuracy_on_test = network.score(test_data, test_labels)
    return network, network.loss_, accuracy_on_train, accuracy_on_test

def evaluate_model(clf, test_data, test_labels):
    """Score a fitted classifier with the soft accuracy metric.

    Two sets of predictions are scored: the classifier's own ``predict``
    output, and the top-three averaged labels from
    ``get_average_prediction``. Each is turned into a confusion matrix over
    the labels "0".."20" and passed to ``evaluate_confusion_matrix``.

    The predictions are passed as the first argument of
    ``confusion_matrix``.
    NOTE(review): scikit-learn documents that position as ``y_true``, so the
    matrix rows presumably index predicted labels here -- confirm this is
    the orientation ``evaluate_confusion_matrix`` is meant to receive.

    Returns
    -------
    tuple
        ``(soft_accuracy_exact, soft_accuracy_avg)``.
    """
    all_labels = [str(bucket) for bucket in range(21)]

    hard_predictions = clf.predict(test_data)
    class_probabilities = clf.predict_proba(test_data)
    averaged_predictions = get_average_prediction(class_probabilities, clf.classes_)

    exact_matrix = confusion_matrix(hard_predictions, test_labels, labels=all_labels)
    averaged_matrix = confusion_matrix(averaged_predictions, test_labels, labels=all_labels)

    return (evaluate_confusion_matrix(exact_matrix),
            evaluate_confusion_matrix(averaged_matrix))

#print datasets[0][["Date", "Close", "prev_close", "label_next_week", "label_week"]]


In [None]:
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file handle even on error."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


# Train one daily model per file found in raw/. A ticker whose model file
# already exists in models_daily/ is skipped, so an interrupted run can be
# resumed. Results are re-read and re-written on every iteration so that
# progress is saved after each ticker.
#tickers = ["INFY", "SBIN", "APOLLOTYRE", "LUMAXTECH", "JUBLFOOD", "PRAKASH"]
tickers = os.listdir("raw")
for symbol in tickers:
    if os.path.isfile("results-daily.pkl"):
        # This file is written by this loop. pickle.load can execute
        # arbitrary code, so never point it at an untrusted file.
        # Binary mode matches the "wb" mode used when writing it.
        with open("results-daily.pkl", "rb") as results_file:
            results = pickle.load(results_file)
    else:
        results = dict()
    try:
        # splitext only strips the final extension, so file names that
        # contain additional dots keep their full stem.
        symbol = os.path.splitext(symbol)[0]
        if os.path.isfile("models_daily/" + symbol + ".pkl"):
            print("Skipping: %s" % symbol)
            continue
        dataset = analyse_ticker(symbol)
        _dump_pickle(dataset, "data_daily/" + symbol + ".pkl")
        # The last 20 rows are held out as the test set.
        train_split_index = len(dataset) - 20
        train_data, train_labels, test_data, test_labels = get_train_test(dataset, train_split_index)
        _dump_pickle(train_data, "train_daily/" + symbol + "_data.pkl")
        _dump_pickle(train_labels, "train_daily/" + symbol + "_labels.pkl")
        _dump_pickle(test_data, "test_daily/" + symbol + "_data.pkl")
        _dump_pickle(test_labels, "test_daily/" + symbol + "_labels.pkl")
        clf, loss, train_accuracy, test_accuracy = get_model(train_data, train_labels, test_data, test_labels)
        soft_accuracy_exact, soft_accuracy_avg = evaluate_model(clf, test_data, test_labels)
        results[symbol] = {
            "loss": loss,
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy,
            "soft_accuracy_exact": soft_accuracy_exact,
            "soft_accuracy_avg": soft_accuracy_avg,
        }
        print("%s %s" % (symbol, soft_accuracy_avg))
        # The model is written last: its presence marks the ticker as done.
        joblib.dump(clf, "models_daily/" + symbol + ".pkl")
    except Exception as e:
        # Best effort: report the failure and move on to the next ticker.
        print("%s %s" % (symbol, str(e)))
    _dump_pickle(results, "results-daily.pkl")

MAHLOG 0.8571428571428571
OISL 0.8
GKWLIMITED 0.6
NAGREEKCAP 0.65
GUFICBIO 0.65
LYCOS 0.4
AUSOMENT 0.6
GESHIP 0.7
THEMISMED 0.6
DOLPHINOFF 0.7
KOTAKBANK 0.65
INDIANHUME 0.55
ORBTEXP 0.5
DTIL 0.65
BANKBEES 0.6
FMGOETZE 0.55
KOTAKGOLD 0.55
TVSELECT 0.75
