# Models

Here are the final models and results. Many other approaches were tried; most of them are saved in `models_failed.py`, though some were deleted even from there. The data was processed in `process_data.py`, and details of the statements were explored in `EDA.py`. 

The purpose of this analysis was to determine word clustering for the 171 statements issued by the FOMC. In addition, I attempt to train a model to predict statement 'sentiment', which I measure by the daily change in the 10-year Treasury yield after the statement is released. 

For the clustering analysis, I found the straight CountVectorizer was the best way to extract features. For the sentiment analysis, I used TfidfVectorizer, which is the same as CountVectorizer followed by the TfidfTransformer. 

In [41]:
# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import sklearn libraries
from sklearn import svm
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
# Load the labelled FOMC statements into a DataFrame.
statements = pd.read_csv("statements_with_labels.csv")
# Preview the first rows, then show how many statements fall in each label class.
print(statements.head())
print(statements.labels.value_counts())

   Unnamed: 0        date                                               text  \
0           0  1994-02-04  Chairman Alan Greenspan announced today that t...   
1           1  1994-03-22  Chairman Alan Greenspan announced today that t...   
2           2  1994-04-18  Chairman Alan Greenspan announced today that t...   
3           3  1994-05-17  The Federal Reserve today announced two action...   
4           4  1994-08-16  The Federal Reserve announced today the follow...   

   labels  
0       1  
1       0  
2       1  
3       0  
4       0  
1    91
0    80
Name: labels, dtype: int64


## I. Clustering

In [43]:
# Extract bag-of-words count features with CountVectorizer.
# Drop English stop words and any word that appears in more than 90% of the
# documents (max_df=0.90); min_df=1 keeps every remaining word, however rare.
vec = CountVectorizer(max_df=0.90, min_df=1, stop_words='english')

# Learn the vocabulary and build the sparse document-term count matrix.
vectorized = vec.fit_transform(statements['text'])
print(vectorized.shape)

(171, 1592)


In [44]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    model          -- object exposing ``components_``, one weight row per topic
    feature_names  -- sequence mapping a column index to its word
    n_top_words    -- how many words to list for each topic

    One line is printed per topic, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in ranked)
        print(f"Topic #{topic_idx}: {top_terms}")
    print()

In [45]:
# Fit a 5-topic LDA model on the bag-of-words counts using online
# variational Bayes; random_state is pinned so the topics are reproducible.
lda_model = LatentDirichletAllocation(n_components=5, max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=0)
lda_model.fit(vectorized)

print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() (added in 1.0) and fall back on older versions.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
print_top_words(lda_model, feature_names, 10)


Topics in LDA model:
Topic #0: markets states policy emergence financial forces forward inflationary levels conditions
Topic #1: inflation policy securities labor conditions longer agency term monetary mortgage
Topic #2: bank growth inflation central markets demand banks conditions information arrangements
Topic #3: growth approved discount basis today sustainable jr chairman 25 board
Topic #4: growth inflation resource markets chairman housing utilization voting kevin warsh



## II. Sentiment Analysis

#### II (a). Naive Bayes

In [28]:
# Identify labels
y = statements['labels']

# Build TF-IDF features from the statement text with TfidfVectorizer
# (constructed with its default settings)
tf = text.TfidfVectorizer()
X = tf.fit_transform(statements['text'])
print(X.shape)

(171, 1764)


In [29]:
# Measure how dense the TF-IDF matrix is: stored non-zero entries as a
# percentage of all (sample, feature) cells.
n_cells = X.shape[0] * X.shape[1]
p = 100 * X.nnz / float(n_cells)
print(f"Each sample has ~{p:.2f}% non-zero features.")

Each sample has ~11.18% non-zero features.


In [30]:
# Separate train/test data (80% train / 20% test).
# random_state is pinned, as it is for the LDA model, so the split -- and
# therefore every downstream score -- is reproducible across runs.
# NOTE(review): X was produced by fitting TfidfVectorizer on all statements
# before this split, so the IDF weights have already seen the test documents.
(X_train, X_test, y_train, y_test) = ms.train_test_split(
    X, y, test_size=.2, random_state=0)

In [34]:
# Tune the BernoulliNB smoothing parameter alpha with a cross-validated grid
# search over 50 log-spaced candidates between 1e-2 and 1e2.
alpha_grid = {'alpha': np.logspace(-2., 2., 50)}
nb_model = ms.GridSearchCV(nb.BernoulliNB(), param_grid=alpha_grid)
nb_model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-02,   1.20679e-02,   1.45635e-02,   1.75751e-02,
         2.12095e-02,   2.55955e-02,   3.08884e-02,   3.72759e-02,
         4.49843e-02,   5.42868e-02,   6.55129e-02,   7.90604e-02,
         9.54095e-02,   1.15140e-01,   1.38950e-01,   1.67683e-01,
         2....    3.90694e+01,   4.71487e+01,   5.68987e+01,   6.86649e+01,
         8.28643e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
# Score the tuned Naive Bayes model on the held-out test split
# (with scoring=None this should be the classifier's mean accuracy).
nb_model.score(X_test, y_test)

0.65714285714285714

#### II (b). SVM 

In [39]:
# Tune an RBF-kernel SVM: 5-fold cross-validated grid search over the
# regularisation strength C and the kernel width gamma.
svm_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1],
}
svm_model = ms.GridSearchCV(svm.SVC(kernel='rbf'), svm_param_grid, cv=5)
svm_model.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [40]:
# Score the tuned SVM on the held-out test split
# (with scoring=None this should be the classifier's mean accuracy).
svm_model.score(X_test, y_test)

0.59999999999999998