In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [0]:
os.chdir("/content/drive/My Drive/Colab Notebooks/ntu_bda/bda2020_dataset")
! ls

## Part 1 & 2

### EDA

In [0]:
df = pd.read_csv("keyword.csv", encoding="utf-8")

In [0]:
df.shape

(7198, 202)

In [0]:
df.head()

In [0]:
X = df.iloc[:,1:-1]
y = df['up'].values

In [0]:
# formatting the column names
keyword_list = X.columns
X.columns = ['x'+str(i) for i in range(1, X.shape[1]+1)]

Evaluating the correlation between the keywords to further tweak vectorization methods.

In [0]:
X.corr()

In [0]:
import seaborn as sns

plt.figure(figsize=(13,10))
sns.heatmap(X.corr(), cmap=sns.cm.rocket_r)

In [0]:
# evaluating the most occuring keywords
top_keywords = X.sum().sort_values(ascending=False).head(20).index.tolist()
top_keywords

### Preprocessing

Standardizing the training and testing sets.

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [0]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.fit_transform(X_test)

In [0]:
print(scaled_X_train.shape)
print(y_train.shape)

(70592, 200)
(70592,)


Testing PCA's viability.

In [0]:
from sklearn.decomposition import PCA

pca = PCA().fit(scaled_X_train)
np.cumsum(pca.explained_variance_ratio_)

In [0]:
pca = PCA(n_components=60)
pca_scaled_X_train = pca.fit_transform(scaled_X_train)
pca_scaled_X_test = pca.fit_transform(scaled_X_test)

In [0]:
print(pca_scaled_X_train.shape)

### Testing Models

In [0]:
# importing the modules
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils import class_weight

In [0]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)

#### Naive Bayes

In [0]:
nb_param = {'alpha': np.arange(0, 0.51, 0.01)}

nb = BernoulliNB()
nb_cv = GridSearchCV(nb, nb_param, cv=4, refit=True, verbose=3) 
nb_cv.fit(scaled_X_train, y_train)

In [0]:
print(nb_cv.best_score_)
print(nb_cv.best_params_)

0.7325468351864721
{'alpha': 0.0}


#### KNN

In [0]:
scores = []

for i in range(3,10):
    knn = KNeighborsClassifier(n_neighbors = i)
    score = cross_val_score(knn, scaled_X_train, y_train, cv=4, verbose=3).mean()
    scores.append(score)

In [0]:
plt.plot(np.arange(3,10), scores, '-o')

#### Random Forest

In [0]:
rf_param = {'max_depth': np.arange(2, 11),
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': np.arange(50, 201, 50)}

param = {'alpha':np.arange(0,1,0.1)}

rf = RandomForestClassifier()
rf_cv = GridSearchCV(rf, rf_param, cv=4, refit=True, verbose=3) 
rf_cv.fit(scaled_X_train, y_train)

In [0]:
print(rf_cv.best_score_)
print(rf_cv.best_params_)

0.7375824019380742
{'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


#### SVM


In [0]:
svm_param = {'C': [0.1, 1, 10, 100, 1000],  
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
             'kernel': ['rbf']}

svm = SVC()
svm_cv = GridSearchCV(svm, svm_param, cv=2, refit=True, verbose=3) 
svm_cv.fit(scaled_X_train, y_train)

In [0]:
print(svm_cv.best_score_)
print(svm_cv.best_params_)

0.738971865230983
{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}


#### DNN

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras import losses
from keras import optimizers
from keras import metrics

def build_model():
    model = Sequential()
    model.add(Dense(64, activation="relu", input_shape=(scaled_X_train.shape[1],)))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(16, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    
    # assigning optimizer, loss function, and evaluation metric
    model.compile(optimizer=optimizers.RMSprop(lr=0.001),
                  loss=losses.binary_crossentropy,
                  metrics=[metrics.binary_accuracy]) # using custom MCC metric to calculate accuracy
    return model

Using TensorFlow backend.


In [0]:
from sklearn.model_selection import StratifiedKFold

# hyperparameters
epochs = 100
batch_size = 1000

# splitting the data into 4 folds for cross validation 
kfold = StratifiedKFold(n_splits=4)
all_history = []

# fitting model & cross validation
for index, (train, val) in enumerate(kfold.split(scaled_X_train, y_train)):
    print("Fold #", index+1)
    model = build_model()
    history = model.fit(scaled_X_train[train],
                        y_train[train],
                        validation_data=(scaled_X_train[val], y_train[val]),
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=1)
    acc_history = history.history['val_binary_accuracy']
    all_history.append(acc_history)

print(all_history)

# used to optimize number of epochs
average_acc_history = [np.mean([x[i] for x in all_history]) for i in range(epochs)]

In [0]:
plt.plot(range(1, len(average_acc_history) + 1), average_acc_history)
plt.xlabel('Epochs')
plt.ylabel('Validation Accuracy')
plt.show()

In [0]:
model = build_model()
model.fit(scaled_X_train,
          y_train,
          epochs=10,
          batch_size=1000,
          verbose=1,
          class_weight=class_weights)

y_prob = model.predict(scaled_X_test)
y_pred = np.where(y_prob>0.5, 1, 0)

In [0]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#### Gradient Boosting

In [0]:
gb_param = {
    "learning_rate": [0.0001, 0.001, 0.01],
    'max_depth': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    "n_estimators":[10]
    }

gb = GradientBoostingClassifier()
gb_cv = GridSearchCV(gb, gb_param, cv=4, refit=True, verbose=3) 
gb_cv.fit(scaled_X_train, y_train)

In [0]:
print(gb_cv.best_score_)
print(gb_cv.best_params_)

0.6932963670759014
{'learning_rate': 0.0001, 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}


### Comparing Models

In [0]:
classifiers = [KNeighborsClassifier(n_neighbors=6),
               BernoulliNB(alpha=0.1),
               SVC(kernel="rbf", C=1000, gamma=0.0001),
               DecisionTreeClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=2),
               RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100),
               GradientBoostingClassifier(learning_rate=0.0001, max_depth=2, min_samples_leaf=1, min_samples_split=2, n_estimators=10)]

In [0]:
acc = []
pre = []
rec = []
f1 = []

for classifier in classifiers:
  classifier.fit(scaled_X_train, y_train)
  y_pred = classifier.predict(scaled_X_test)
  acc.append(accuracy_score(y_test, y_pred))
  pre.append(precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))
  rec.append(recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))
  f1.append(f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))

#### Voting Classifier

In [0]:
estimators = [('knn', KNeighborsClassifier(n_neighbors=6)),
              ('nb', BernoulliNB(alpha=0.1)),
              ('svm', SVC(kernel="rbf", C=1000, gamma=0.0001)),
              ('dt', DecisionTreeClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=2)),
              ('rf', RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100)),
              ('gb', GradientBoostingClassifier(learning_rate=0.0001, max_depth=2, min_samples_leaf=1, min_samples_split=2, n_estimators=10))]

voting = VotingClassifier(estimators=estimators, voting = 'hard')
voting.fit(scaled_X_train,y_train)
y_pred = voting.predict(scaled_X_test)
acc.append(accuracy_score(y_test, y_pred))
pre.append(precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))
rec.append(recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))
f1.append(f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))

#### DNN

In [0]:
model = build_model()
model.fit(scaled_X_train,
          y_train,
          epochs=10,
          batch_size=1000,
          verbose=1)

y_prob = model.predict(scaled_X_test)
y_pred = np.where(y_prob>0.5, 1, 0)

In [0]:
acc.append(accuracy_score(y_test, y_pred))
pre.append(precision_score(y_test, y_pred, average='weighted'))
rec.append(recall_score(y_test, y_pred, average='weighted'))
f1.append(f1_score(y_test, y_pred, average='weighted'))

#### Full Comparison

In [0]:
models = pd.DataFrame({'Model': ['KNN','Naive Bayes','SVM','Decision Tree','Random Forest',
                                 'Gradient Boosting','Voting Classifier','DNN'],
                       'Accuracy':acc,
                       'Precision':pre,
                       'Recall':rec,
                       'F1':f1})

models.sort_values(by='F1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
5,Gradient Boosting,0.703472,0.703472,1.0,0.825927
2,SVM,0.736111,0.718544,0.736111,0.689497
4,Random Forest,0.74375,0.753603,0.74375,0.682746
7,DNN,0.726389,0.700736,0.726389,0.679805
6,Voting Classifier,0.740972,0.748056,0.740972,0.678608
1,Naive Bayes,0.739583,0.74728,0.739583,0.675462
3,Decision Tree,0.733333,0.723142,0.733333,0.673694
0,KNN,0.717361,0.68509,0.717361,0.670126


## Part 3

### Loading & Processing Data

Predetermining the keywords of our interest, i.e., the variables for our model. 

In [0]:
keyword = pd.read_csv('keyword.csv', encoding='utf-8').iloc[:,1:-1].columns.tolist() # count_matrix_0001_y9bbs5.csv
news = pd.read_csv('news.csv', encoding='utf-8')
news.post_time = pd.to_datetime(news.post_time).dt.date
news['all_content'] = news['title'] + news['content'] # concatenating title and content 

In [0]:
print(keyword)

In [0]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

Determining whether the stock price of D+5 increased or decreased (using an absolute value of 0.3% growth).

In [0]:
stock_data_2016 = pd.read_csv("stock_data_2016.csv", usecols=(0,1,5), thousands=',')
stock_data_2017 = pd.read_csv("stock_data_2017.csv", usecols=(0,1,5), thousands=',')
stock_data_2018 = pd.read_csv("stock_data_2018.csv", usecols=(0,1,5), thousands=',')
stock_data = pd.concat([stock_data_2016, stock_data_2017, stock_data_2018])
y9999 = stock_data.loc[stock_data['證券代碼'] == 'Y9999 加權指數'].drop(columns='證券代碼')
y9999.columns = ['time','price']
y9999.time = pd.to_datetime(y9999.time).dt.date

In [0]:
y9999.sort_values(by = 'time', inplace = True)
y9999.reset_index(drop = True, inplace = True)
y9999['diff'] = y9999.price.pct_change()

In [0]:
y9999.loc[y9999['diff'].shift(-5)>0.003, "up"] = 1 # determining movement based on the daily stock return of D+5
y9999.loc[y9999['diff'].shift(-5)<-0.003, "up"] = 0
y9999.fillna(-1, inplace=True)

In [0]:
ans = y9999[['time', 'up']]

Determining dates with number of posts higher than the 3rd quartile.

In [0]:
forum = pd.read_csv('forum.csv')
bbs = pd.read_csv('bbs.csv')
discuss = forum.append(bbs)
discuss.post_time = discuss.post_time.astype('datetime64[ns]').apply(lambda x: x.date())
n_discuss = discuss.groupby('post_time').count().sort_values('id', ascending=False)
dates = n_discuss[n_discuss.id>np.percentile(n_discuss.id, 75)]
date_list = dates.index.tolist() # list of dates with high volume

### Functions & Modeling Preps

In [0]:
# obtaining each keyword's occurences in each piece of news
def get_keyword_ct(news_qt):
  ct_dicts = []
  for i in range(0, len(news_qt)):
    ct_dict = {}
    for key in keyword:
      try:
        # count number of occurences in "all_content"
        key_ct = news_qt.iloc[i].all_content.count(key) 
      except AttributeError:
        key_ct = 0
      ct_dict[key] = key_ct
    ct_dicts.append(ct_dict)
  df = pd.DataFrame(ct_dicts)
  df.index = news_qt.id
  df.insert(0, "post_time", news_qt.post_time.values)
  return df

In [0]:
# keeping 0s and 1s, producing final confusion matrix
def get_results(y_ans, predictions):
  y_ans_keep = [i for i, x in enumerate(y_ans) if x==0 or x==1] 
  predictions_keep = [i for i, x in enumerate(predictions) if x==0 or x==1]
  keep = set(y_ans_keep).intersection(predictions_keep) # list of indices with 0s and 1s in both the answer list and the prediction list
  predictions_final = [predictions[i] for i in keep]
  y_ans_final = [y_ans[i] for i in keep]
  CM = confusion_matrix(y_ans_final, predictions_final)
  return CM

In [0]:
# determining voting classifier estimators (SVM & RF)
# hyperparameters predetermined using GridSearchCV
estimators = [('svm', SVC(kernel="rbf", C=10, gamma=0.001,
                          class_weight="balanced")),
              ('rf', RandomForestClassifier(max_depth=9, min_samples_leaf=2, 
                                      min_samples_split=5, n_estimators=200,
                                      class_weight="balanced"))]

                                            

### Modeling & Final Results

In [0]:
TN = 0 # true negatives (correct sell prediction)
FN = 0 # false negatives (wrong sell prediction)
TP = 0 # true positives (correct buy prediction)
FP = 0 # false positives (wrong buy prediction)

for i in range(33): # split into 33 sets / months
  print('Processing Month #%d' % (i+1))
  start_date = datetime.strptime('2016-01-01', '%Y-%m-%d').date() + relativedelta(months=i)
  end_date = start_date + relativedelta(months=4)
  test_month = end_date - relativedelta(months=1)
  news_qt = news[(news.post_time >= start_date) & (news.post_time < end_date)] # df of news in our training / testing sets
  news_price = get_keyword_ct(news_qt).merge(y9999[['time', 'up']], how='left', left_on='post_time', right_on='time').drop(columns='time')
  test_all = news_price[news_price.post_time >= test_month] # leaving out the last month as testing sets
  train = news_price[(news_price.up.isin([0,1])) & (news_price.post_time.isin(date_list)) & (news_price.post_time < test_month)]
  X_train = train.iloc[:,1:-1]
  y_train = train['up'].values
  try:
    scaled_X_train = scaler.fit_transform(X_train)
  except ValueError:
    continue
  # fit voting classifier
  voting = VotingClassifier(estimators=estimators, voting = 'hard')
  try:
    voting.fit(scaled_X_train,y_train)
  except ValueError:
    continue
  
  predictions = []
  ans_time = ans.loc[(ans.time>=test_month) & (ans.time<end_date), 'time']
  # predict with testing sets (for each day in month) 
  for date in ans_time:
    test = test_all[test_all.post_time==date]
    X_test = test.iloc[:,1:-1]
    try:
      scaled_X_test = scaler.fit_transform(X_test)
    except ValueError:
      continue
    y_pred = voting.predict(scaled_X_test)

    # if (number of 1 predictions/number of 0 predictions) -1 >= 0.2, return 1; if <= -0.2, return 0; otherwise, -1
    try:
      y_pred_prob = np.count_nonzero(y_pred==1)/np.count_nonzero(y_pred==0)-1
    except ZeroDivisionError:
      y_pred_prob = 1
    if y_pred_prob >= 0.2:
      predictions.append(1)
    elif y_pred_prob <= -0.2:
      predictions.append(0)
    else:
      predictions.append(-1)

  y_ans = ans.loc[(ans.time>=test_month) & (ans.time<end_date), 'up'].tolist() # true stock movements

  # tally final results
  try:
    CM = get_results(y_ans, predictions)
  except IndexError:
    continue

  TN += CM[0][0] 
  FN += CM[1][0] 
  TP += CM[1][1] 
  FP += CM[0][1] 

At long last, the final results!

In [0]:
final_results = pd.DataFrame([[TP,FP],[FN,TN]], columns=['真實為漲','真實為跌'], index=['預測為漲','預測為跌'])
final_results.to_csv('final_results.csv')

In [0]:
final_results