# 0. Import and train_test_split

## (1) Import

In [103]:
# Basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

# ML
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# others
from datetime import datetime, timedelta

In [104]:
df = pd.read_csv('data/processed_data_for_training.csv')

## (2) train_test_split

In [105]:
# 1. Set up X, y
X = df.drop(columns=['Tomorrow_trend_cate'])
y = df['Tomorrow_trend_cate']

In [106]:
# Check X, y shape
print('X:', X.shape)
print('y:', y.shape)

X: (2518, 6)
y: (2518,)


In [107]:
# 2. train_test_split
# val dataset for final examination

# Problem: Shuffle or not
# It's panel data but the features already contain the temporal information 

suffle_param = True

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=suffle_param, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, shuffle=suffle_param, random_state=42)

In [108]:
# Check 
print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)

X_train: (2039, 6)
X_test: (252, 6)
y_train: (2039,)
y_test: (252,)


# 1. ML methods

Benchmark

Overal Accuracy

1. Logistic Regression: 0.61
2. Random Forest: 0.63
3. Gradient Boosting: 0.61

Uptrend Accuracy

1. Logistic Regression: 0.58
2. Random Forest: 0.61
3. Gradient Boosting: 0.59

Downtrend Accuracy

1. Logistic Regression: 0.66
2. Random Forest: 0.68
3. Gradient Boosting: 0.63


## (0) Before training, Check some numbers


In [109]:
# min()

In [110]:
# Time period
print('Time Period')
# print('From:', time_start)
# print('To:', time_end, '\n')

# Sample size
print('Sample size:', X.shape[0])
print('Feature:', X.shape[1])
print(X.columns.values, '\n')
print('Target:', y.name, '\n')
print('Train: Test: Val = 0.81: 0.1: 0.09=', X_train.shape[0], X_test.shape[0], X_val.shape[0])

Time Period
Sample size: 2518
Feature: 6
['High' 'Close' 'Sma' 'title_sentiment_score' 'title_midterm_sentiment'
 'title_longterm_sentiment'] 

Target: Tomorrow_trend_cate 

Train: Test: Val = 0.81: 0.1: 0.09= 2039 252 227


## (1) Logistic Regression

In [111]:
model = LogisticRegression(multi_class='multinomial', random_state=42)

pipeline = Pipeline([('scaler', MinMaxScaler()), ('classifier', model)])

# Don't forget. fit doesn't accept string input
pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 LogisticRegression(multi_class='multinomial',
                                    random_state=42))])

In [112]:
# Evaluation

# Accuracy
print('Overall Accuracy:')
score_train = pipeline.score(X_train, y_train)
score_test = pipeline.score(X_test, y_test)
print('Training Accuracy:', score_train)
print('Testing Accuracy:', score_test, '\n')

# Train confusion matrix
y_pred = pipeline.predict(X_train)

print('===========================')
print('Training Result:')
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred), '\n')

# Test confusion matrix
y_pred = pipeline.predict(X_test)

print('===========================')
print('Testing Result:')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Overall Accuracy:
Training Accuracy: 0.5384992643452673
Testing Accuracy: 0.5317460317460317 

Training Result:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       941
           1       0.54      1.00      0.70      1098

    accuracy                           0.54      2039
   macro avg       0.27      0.50      0.35      2039
weighted avg       0.29      0.54      0.38      2039

[[   0  941]
 [   0 1098]] 

Testing Result:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       118
           1       0.53      1.00      0.69       134

    accuracy                           0.53       252
   macro avg       0.27      0.50      0.35       252
weighted avg       0.28      0.53      0.37       252

[[  0 118]
 [  0 134]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## (2) Random Forest

In [113]:
model = RandomForestClassifier(random_state=42)
pipeline = Pipeline([('scaler', MinMaxScaler()), ('classifier', model)])

# parameter
param_grid = [
    {'classifier__n_estimators': [50, 100, 500, 1000]},
    {'classifier__max_depth': [1, 5, 10, 25]},
    {'classifier__max_features': [*np.arange(0.1, 1.1, 0.1)]}
]

gridsearch = GridSearchCV(pipeline, param_grid)

gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('classifier',
                                        RandomForestClassifier(random_state=42))]),
             param_grid=[{'classifier__n_estimators': [50, 100, 500, 1000]},
                         {'classifier__max_depth': [1, 5, 10, 25]},
                         {'classifier__max_features': [0.1, 0.2,
                                                       0.30000000000000004, 0.4,
                                                       0.5, 0.6,
                                                       0.7000000000000001, 0.8,
                                                       0.9, 1.0]}])

In [114]:
# Evaluation

# Accuracy
print('Overall Accuracy:')
score_train = gridsearch.score(X_train, y_train)
score_test = gridsearch.score(X_test, y_test)
print('Training Accuracy:', score_train)
print('Testing Accuracy:', score_test, '\n')

# Train confusion matrix
y_pred = gridsearch.predict(X_train)

print('===========================')
print('Training Result:')
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred), '\n')

# Test confusion matrix
y_pred = gridsearch.predict(X_test)

print('===========================')
print('Testing Result:')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Overall Accuracy:
Training Accuracy: 0.5473271211378127
Testing Accuracy: 0.5357142857142857 

Training Result:
              precision    recall  f1-score   support

           0       0.72      0.03      0.06       941
           1       0.54      0.99      0.70      1098

    accuracy                           0.55      2039
   macro avg       0.63      0.51      0.38      2039
weighted avg       0.63      0.55      0.41      2039

[[  29  912]
 [  11 1087]] 

Testing Result:
              precision    recall  f1-score   support

           0       0.60      0.03      0.05       118
           1       0.53      0.99      0.69       134

    accuracy                           0.54       252
   macro avg       0.57      0.51      0.37       252
weighted avg       0.57      0.54      0.39       252

[[  3 115]
 [  2 132]]


## (3) Gradient Boosting

In [115]:
# Train
model = GradientBoostingClassifier(random_state=42)
pipeline = Pipeline([('scaler', MinMaxScaler()), ('classifier', model)])

# parameter
param_grid = [
    # {"classifier__learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2]},
    # {"classifier__min_samples_split": np.linspace(0.1, 0.5, 12)},
    # {"classifier__min_samples_leaf": np.linspace(0.1, 0.5, 12)},
    # {"classifier__max_depth":[3,5,8]},
    # {"classifier__max_features":["log2","sqrt"]},
    # {"classifier__criterion": ["friedman_mse",  "absolute_error"]},
    # {"classifier__subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0]},
    {"classifier__n_estimators":[100]}
    ]

gridsearch = GridSearchCV(pipeline, param_grid)

gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('classifier',
                                        GradientBoostingClassifier(random_state=42))]),
             param_grid=[{'classifier__n_estimators': [100]}])

In [116]:
# Evaluation

# Accuracy
print('Overall Accuracy:')
score_train = gridsearch.score(X_train, y_train)
score_test = gridsearch.score(X_test, y_test)
print('Training Accuracy:', score_train)
print('Testing Accuracy:', score_test, '\n')

# Train confusion matrix
y_pred = gridsearch.predict(X_train)

print('===========================')
print('Training Result:')
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred), '\n')

# Test confusion matrix
y_pred = gridsearch.predict(X_test)

print('===========================')
print('Testing Result:')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Overall Accuracy:
Training Accuracy: 0.704757233938205
Testing Accuracy: 0.5238095238095238 

Training Result:
              precision    recall  f1-score   support

           0       0.85      0.44      0.58       941
           1       0.66      0.93      0.77      1098

    accuracy                           0.70      2039
   macro avg       0.75      0.69      0.68      2039
weighted avg       0.75      0.70      0.68      2039

[[ 412  529]
 [  73 1025]] 

Testing Result:
              precision    recall  f1-score   support

           0       0.49      0.28      0.35       118
           1       0.54      0.74      0.62       134

    accuracy                           0.52       252
   macro avg       0.51      0.51      0.49       252
weighted avg       0.51      0.52      0.50       252

[[33 85]
 [35 99]]


# 2. Evaluation