In [12]:
import pandas as pd
import os
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
import numpy as np
from time import time

# Seed NumPy's global random number generator so that code drawing from it
# behaves the same way on every fresh top-to-bottom run of the notebook.
np.random.seed(123)

In [2]:
# Directory holding the processed data sets, given relative to the notebook's
# working directory (presumably written by an earlier build step -- TODO confirm).
PATH_PROCESSED_DATA = '../../bld/out/data_processed'

In [3]:
# Load the serialized tweet data from the processed-data directory; later
# cells read its TWEET_SENTIMENT and TWEET_CONTENT attributes.
# NOTE(review): joblib.load unpickles the file, so only point this at files
# you trust. Also, `joblib` is imported from `sklearn.externals` at the top of
# the notebook, which scikit-learn deprecated in 0.21 and removed in 0.23 --
# newer environments need a plain `import joblib` instead.
df = joblib.load(os.path.join(PATH_PROCESSED_DATA, 'twitter_stanford.p.lzma'))

In [16]:
# Targets and features: the integer category codes of the sentiment column
# serve as labels, the tweet content column as input.
y = df.TWEET_SENTIMENT.cat.codes
X = df.TWEET_CONTENT

# Classification pipeline: token counts -> TF-IDF weighting -> SVM.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])

# Cross-validation splitter with five stratified folds.
skf = StratifiedKFold(n_splits=5)

# Helper that prints a summary of the best-ranked search candidates
def report(results, n_top=3):
    """Print the candidates ranked 1..n_top from a search-results mapping.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score' (compared element-wise against each
        rank, so an array-like such as a NumPy array), 'mean_test_score',
        'std_test_score' and 'params', all indexable by candidate position.
    n_top : int, default 3
        Highest rank to report. Every candidate sharing a rank is printed;
        a rank with no candidate prints nothing.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    score_template = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        # Positions of all candidates that hold exactly this rank.
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            print(score_template.format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][pos]))
            print("")


# Candidate hyper-parameters for the SVM step of the pipeline (the `clf__`
# prefix routes each entry to the step named 'clf').
# C is spelled out explicitly instead of being generated with a float-step
# np.arange: that avoids rounding artefacts such as 0.7999999999999999 in the
# reported parameters and makes the number of candidates obvious.
param_dist = {
    'clf__C': [0.7, 0.8, 0.9, 1.0],
    'clf__kernel': ['rbf'],
    'clf__tol': [1e-3],
    'clf__max_iter': [10000],
}

# Randomized search: sample n_iter_search of the candidates above and score
# each with the stratified cross-validator.
# random_state is fixed (same value as the global seed at the top of the
# notebook) so the sampled candidates do not depend on how much of the global
# NumPy random stream was consumed before this cell ran; re-running the cell
# then evaluates the same candidates.
n_iter_search = 3
random_search = RandomizedSearchCV(
    text_clf, param_distributions=param_dist, cv=skf, n_iter=n_iter_search,
    random_state=123)

# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
# The scikit-learn documentation states that SVC fit time scales at least
# quadratically with the number of samples; if the data set is large (size is
# not visible here -- TODO confirm), consider a subsample, LinearSVC or
# SGDClassifier before re-running.
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

KeyboardInterrupt: 