In [68]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

import numpy as np

# Question 2: Applied Machine Learning

We first obtain the data and build the TF-IDF document-term matrix:

In [47]:
# Load the 20 Newsgroups corpus with scikit-learn's default arguments.
data = fetch_20newsgroups()

# Keep a handle on the fitted vectorizer and build the sparse TF-IDF matrix.
# NOTE(review): the vectorizer is fit on every document in `data`, i.e. before
# any train/validation/test split is made -- confirm this is intended.
vectorizer = TfidfVectorizer()
tfidf_dm = vectorizer.fit_transform(data.data)

In [80]:
# Dimensions of the TF-IDF matrix: one row per document, one column per term.
tfidf_dm.shape # shape=(num_docs, num_terms)

(11314, 130107)

Then we randomly split the data:

 - 10% for validation
 - 10% for testing
 - 80% for training

In [59]:
# Randomly partition the document indices into test (10%), validation (10%)
# and training (the remaining ~80%) sets.

# Fixed seed so the split is identical after every kernel restart; without it
# the reported accuracies cannot be reproduced.
SEED = 0
rng = np.random.RandomState(SEED)

N = len(data.data)
t_N = int(0.1 * N)  # number of documents in each held-out split

# Draw both held-out splits in a single call without replacement, so the test
# and validation ids cannot overlap.
indices = rng.choice(N, size=2*t_N, replace=False)

test_ids, valid_ids = indices[:t_N], indices[t_N:]
# Every index that was not drawn above belongs to the training set.
train_ids = np.setdiff1d(np.arange(N), indices, assume_unique=True)

# The three splits together must account for every document exactly once.
assert len(train_ids) + len(test_ids) + len(valid_ids) == N

In [71]:
# Integer class labels, indexed by document; used as the classifier targets.
data.target

array([7, 4, 4, ..., 3, 1, 8])

In [None]:
# Grid search over n_estimators and max_depth.
#
# Candidate settings are compared on the *validation* split. The test split is
# only sliced here, never scored, so that it remains an unbiased estimate for
# whichever configuration is finally selected.

y_train = data.target[train_ids]
x_train = tfidf_dm[train_ids]
x_valid = tfidf_dm[valid_ids]
y_valid = data.target[valid_ids]
x_test = tfidf_dm[test_ids]
y_test = data.target[test_ids]

# Seed for the forests so that the same (n, depth) pair yields the same score
# on every run; otherwise score differences may just be sampling noise.
RF_SEED = 0

# One entry per fitted model: (n_estimators, max_depth, validation accuracy).
# Unbounded depth is recorded as np.inf.
accuracies = []

for n in range(1, 100):
    for depth in range(1, 100):
        rfc = RandomForestClassifier(n_estimators=n,
                                     max_depth=depth,
                                     random_state=RF_SEED)
        rfc.fit(x_train, y_train)
        score = rfc.score(x_valid, y_valid)
        accuracies.append((n, depth, score))
    # run once with no max depth to get an upper bound for this n
    rfc = RandomForestClassifier(n_estimators=n,
                                 random_state=RF_SEED).fit(x_train, y_train)
    score = rfc.score(x_valid, y_valid)
    accuracies.append((n, np.inf, score))
    print('Done with n=', n)

Done with n= 1
Done with n= 2
Done with n= 3
Done with n= 4
Done with n= 5
Done with n= 6
Done with n= 7
Done with n= 8
