In [None]:
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
%pylab inline

## Read PCA parameters

In [None]:
# Load the saved PCA results: a dict holding the column names ('columns'),
# the eigenvector matrix ('eigvec') and the eigenvalues ('eigval').
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# The `with` block guarantees the file handle is closed after reading.
with open('data/PCA.pickle', 'rb') as pca_file:
    D = pickle.load(pca_file)
col = D['columns']
eigvec = D['eigvec']
eigval = D['eigval']

### Sanity check
The following scatter plot should look very similar to the one you produced in notebook 3 for eigenvectors 1 and 2 (numbering there starts at 1, so they are columns 0 and 1 here).

In [None]:
# Plot eigenvector column 0 against column 1.
# `scatter` is brought into scope by the `%pylab inline` magic in the first cell.
scatter(eigvec[:,0],eigvec[:,1])

## Compute features

In [None]:
# Use the first 20 columns of the eigenvector matrix as the feature set;
# each row is later matched to the entry of `col` with the same index.
features = eigvec[:, 0:20]
features.shape

## Compute labels (sectors)

In [None]:
# Read the tab-separated ticker metadata, print its dimensions and
# preview the first rows.
TickerInfo = pd.read_csv(
    'data/TickerInfo.tsv',
    sep='\t',
)
print(TickerInfo.shape)
TickerInfo.head()

In [None]:
# Full sector name -> short sector ID.
Sectors = {
    'Consumer Discretionary': 'CD',
    'Consumer Staples': 'CS',
    'Energy': 'EN',
    'Financials': 'FIN',
    'Health Care': 'HC',
    'Industrials': 'INDS',
    'Information Technology': 'IT',
    'Materials': 'MAT',
    'Real Estate': 'RE',
    'Telecommunication Services': 'TS',
    'Utilities': 'UTIL',
}

# Short sector ID -> integer class label, numbered 0, 1, 2, ... in the
# insertion order of `Sectors`.
sector2number = {short: number for number, short in enumerate(Sectors.values())}
sector2number

In [None]:
# Build the training set: one integer label and one feature row for every
# column whose name contains 'train' and whose ticker has exactly one
# SECTOR_ID entry in TickerInfo.
labels = []
feature_vectors = []
for row_idx, column_name in enumerate(col):
    if 'train' not in column_name:
        continue
    # Drop the 6-character prefix and 2-character suffix to get the ticker.
    ticker = column_name[6:-2]
    matches = list(TickerInfo.loc[TickerInfo.Ticker == ticker, 'SECTOR_ID'])
    if len(matches) != 1:
        print('error: could not find sector for ticker:', ticker)
        continue
    labels.append(sector2number[matches[0]])
    feature_vectors.append(features[row_idx, :])


In [None]:
# Sanity check: the number of labels and of feature vectors should match.
len(labels),len(feature_vectors)

In [None]:
# Invert `Sectors`: short sector ID -> full sector name.
sectorID2sectorName = {short: full_name for full_name, short in Sectors.items()}
sectorID2sectorName

In [None]:
# Invert `sector2number`: integer class label -> short sector ID.
number2sectorID = {number: short for short, number in sector2number.items()}
number2sectorID

In [None]:
# Compose the two inverse maps: integer class label -> full sector name.
number2sectorName = {
    number: sectorID2sectorName[short]
    for number, short in number2sectorID.items()
}
number2sectorName

In [None]:
# Stack the collected training rows and labels into arrays
# (one row of X per entry of y).
X = np.array(feature_vectors)
y = np.array(labels)
X.shape, y.shape

In [None]:
# Collect the feature row and the ID embedded in the column name for every
# column whose name contains 'test'. Both lists share the same order.
feature_vectors_test = []
test_nos = []
for row_idx, column_name in enumerate(col):
    if 'test' in column_name:
        # Drop the 5-character prefix and 2-character suffix to get the ID.
        test_nos.append(column_name[5:-2])
        feature_vectors_test.append(features[row_idx, :])

In [None]:
# Number of test samples collected.
len(feature_vectors_test)

In [None]:
# Test-set arrays. Note that y_test holds the IDs parsed from the test
# column names, not sector labels.
y_test = np.array(test_nos)
X_test = np.array(feature_vectors_test)

In [None]:
# Hold out 30% of the labelled data for validation; the fixed random_state
# keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=6,
)
X_train.shape, X_valid.shape

In [None]:
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dvalid = xgb.DMatrix(X_valid, label=y_valid)

In [None]:
# dtrain.num_col(),dtrain.num_row(),dvalid.num_col(),dvalid.num_row()

In [None]:
#You can change this cell if you wish to, but you aren't expected to
param = {
    'max_depth': 3,                # depth of tree
    'eta': 0.3,                    # shrinkage parameter
    'silent': 1,                   # 1 = silent mode (XGBoost prints no running messages)
    'objective': 'multi:softmax',
    'nthread': 7,                  # number of threads used
    'num_class': 11,               # one class per entry in `Sectors`
}

# Number of boosting rounds passed to xgb.train.
num_round = 100

In [None]:
def get_margin_scores(X_train, X_valid, y_train, y_valid, X_test, param):
    """Train an XGBoost model and return raw margin scores.

    The booster is trained on (X_train, y_train) for the module-level
    ``num_round`` boosting rounds, with the training and validation sets
    registered as evaluation sets (evaluation output is suppressed).

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_valid, y_valid : validation features and labels.
    X_test : test features (no labels).
    param : dict of XGBoost training parameters. It is copied before use,
        so the caller's dict is never modified.

    Returns
    -------
    (y_valid, y_pred_valid, y_pred_test)
        The validation labels as read back from the validation DMatrix,
        and the untransformed margin predictions (``output_margin=True``)
        for the validation and test sets.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    evallist = [(dtrain, 'train'), (dvalid, 'eval')]
    # Hand xgb.train a real dict. The previous `param.items()` is a
    # dict_items view in Python 3, which is not a documented params type
    # and is rejected by XGBoost releases that copy/inspect the params.
    bst = xgb.train(dict(param), dtrain, num_round, evallist, verbose_eval=False)
    # NOTE(review): ntree_limit / best_ntree_limit are legacy XGBoost API --
    # confirm the installed version still supports them.
    y_pred_valid = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit, output_margin=True)
    valid_labels = dvalid.get_label()
    dtest = xgb.DMatrix(X_test)
    y_pred_test = bst.predict(dtest, ntree_limit=bst.best_ntree_limit, output_margin=True)
    return valid_labels, y_pred_valid, y_pred_test

In [None]:
# Train the model and score the validation set. Note that y_valid is
# replaced here by the labels returned from get_margin_scores.
y_valid, y_pred_valid, y_pred_test = get_margin_scores(
    X_train, X_valid, y_train, y_valid, X_test, param
)
# Predicted class = index of the largest margin in each row.
predictions_valid = [np.argmax(margins) for margins in y_pred_valid]
n_correct = sum(predictions_valid == y_valid)
accuracy_valid = n_correct / len(y_valid)

In [None]:
# Peek at the first five predicted class numbers for the validation set.
np.array(predictions_valid[:5]).T

In [None]:
#Top5 values
# Margins for the first validation sample, followed by the indices of its
# five largest margins in descending order.
arr = y_pred_valid[0]
print(arr)
np.argsort(arr)[::-1][:5]

In [None]:
# For every validation sample, class indices ordered from highest to lowest
# margin: column 0 is the top prediction, column 1 the runner-up, and so on.
predictions_topn = np.array(
    [np.argsort(margins)[::-1] for margins in y_pred_valid]
)

In [None]:
# For each rank position, print how many validation samples have their true
# label at that rank, and the corresponding fraction of the validation set.
for rank in range(11):
    hits = sum(predictions_topn[:, rank] == y_valid)
    print(hits, "\t", hits / len(y_valid))

In [None]:
# Sanity check: predictions_topn should have one row per validation label.
y_valid.shape, predictions_topn.shape

In [None]:
# 11 x 11 integer count matrix, indexed [top prediction, second prediction].
conf = np.zeros(shape=(11, 11), dtype=int)

In [None]:
# Tally "near misses": samples whose true label was the model's SECOND
# choice. Rows index the (wrong) top prediction, columns index the
# runner-up, which equals the true label for every counted sample.
conf.fill(0)  # reset first so re-running this cell does not double-count
for true_label, (top1, top2) in zip(y_valid, predictions_topn[:, :2]):
    if top2 == true_label:
        conf[top1, top2] += 1

In [None]:
# Print one row of the count matrix per sector, prefixed with the sector
# name right-aligned in a 25-character column.
for sector_no, counts in enumerate(conf):
    print("%25s" % number2sectorName[sector_no], "\t", counts)

In [None]:
# Predicted class for each test sample = index of its largest margin.
predictions_test = [np.argmax(margins) for margins in y_pred_test]

In [None]:
# Predicted class number for every test sample.
np.array(predictions_test).T

In [None]:
# IDs parsed from the test column names, in the same order as the test
# feature rows (and therefore as predictions_test).
np.array(test_nos).T