In [14]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option("display.width", 500)
pd.set_option("display.max_columns", 100)
pd.set_option("display.notebook_repr_html", True)
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("poster")

##TF-IDF term weighting 
TF-IDF (term-frequency inverse document-frequency) is a method for extracting the usefulness of words and terms. It weights up for term frequency in document, and weights down for the number of documents a term appears in. The formula is:  $$\text{new_frequency} = tf * (idf + 1)$$
$tf = \text{number of times term appears in document}/\text{number of words in document}$, $idf = \text{total number of documents}/\text{number of documents in which term appears}$

See: http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting, http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

The below function takes a bag-of-words matrix (size n_documents\*vocab_size), and outputs the corresponding tfidf matrix. We show an example below. 

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
#this function takes a matrix of size n_documents*vocab_size, and outputs the corresponding tfidf matrix 
def tfidf_mat_creator(wordmatrix):
    tf_idf_transform=TfidfTransformer()
    return(tf_idf_transform.fit_transform(wordmatrix).toarray())

#Example
samplematrix = np.random.randint(0, high=5, size=(3,4))
print "Sample bag-of-words matrix of 3 documents, vocab size 4 words: "
print samplematrix
print "tfidf transformation of above matrix: " 
print tfidf_mat_creator(samplematrix)

Sample bag-of-words matrix of 3 documents, vocab size 4 words: 
[[0 3 1 1]
 [1 4 4 4]
 [1 2 4 3]]
tfidf transformation of above matrix: 
[[ 0.          0.90453403  0.30151134  0.30151134]
 [ 0.18273153  0.56762934  0.56762934  0.56762934]
 [ 0.23256045  0.36120787  0.72241573  0.5418118 ]]


##SVM
We try running some SVMs on nouns. 

First, we must import the noun training and test matrices, and we convert them to tfidf matrices.

In [33]:
#GET TRAINING AND TEST MATRICES FOR NOUNS
from numpy import genfromtxt
##%%time
noun_train_df = pd.read_csv('noun_train_mat.csv',sep=',',header=None)
noun_train_mat = noun_train_df.values
noun_test_df = pd.read_csv('noun_test_mat.csv',sep=',',header=None)
noun_test_mat = noun_test_df.values
noun_train_issue_areas_df = pd.read_csv('noun_train_issue_areas.csv',sep=',',header=None)
noun_train_issue_areas = noun_train_issue_areas_df.values.ravel()
noun_test_issue_areas_df = pd.read_csv('noun_test_issue_areas.csv',sep=',',header=None)
noun_test_issue_areas = noun_test_issue_areas_df.values.ravel()

In [34]:
#convert the matrices to tfidf matrices 
noun_train_tfidf = tfidf_mat_creator(noun_train_mat)
noun_test_tfidf = tfidf_mat_creator(noun_test_mat)

###Simple practice SVM
We train and run a simple linear SVM. We follow https://www.quantstart.com/articles/Supervised-Learning-for-Document-Classification-with-Scikit-Learn.

In [38]:
from sklearn.svm import SVC
##%%time
def train_svm(X, y):
    #this creates and trains the SVM
    svm = SVC(C=1000000.0, gamma=0.0, kernel='rbf')
    svm.fit(X, y)
    return svm
svm1 = train_svm(noun_train_tfidf,noun_train_issue_areas)

In [39]:
pred = svm1.predict(noun_test_tfidf)
#accuracy rate on test set 
print(svm1.score(noun_test_tfidf, noun_test_issue_areas))

0.663736263736


In [40]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(pred, noun_test_issue_areas))

[[76 11  7  5  1  0  0  3  5  1  0  0]
 [ 3 60 10 10  1  1  1  4  9  3  2  1]
 [ 0  0 19  0  0  0  2  0  1  0  0  0]
 [ 0  1  0  4  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  2  0  0  0  0  0  0]
 [ 0  1  0  0  0  0 22  0  1  3  0  0]
 [ 1  5  0  4  0  0  2 82  7 10  0  7]
 [ 1  6  2  1  0  1  1  7 20  2  0  0]
 [ 0  2  0  1  0  0  0  3  0 10  1  0]
 [ 0  1  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  6]]


In [41]:
len(noun_test_issue_areas)

455

###Trying multiple SVMs
We now test a variety of parameters and do cross-validation to optimize the nouns SVM. To do this, we implement functions cv_optimize and do_classify, which are similar to (and largely taken from) the equivalently named functions in problem set 3.

####NOTE: SHOULD ALSO TRY THIS WITHOUT DOING TFIDF, TO SHOW WE TRIED MANY POSSIBILITIES. THEN, SHOULD TRY THINGS OTHER THAN NOUNS, AND COMBINATIONS OF WORDS (E.G., NOUNS + PRECEDENTS)

In [53]:
"""
Function
--------
cv_optimize

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV
X: a document-word matrix (e.g., noun_train_tfidf). Should be training data.
y: the response vector (e.g., noun_train_issue_areas)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
Two things: (1) The best estimator from the GridSearchCV, after the GridSearchCV has been used to
fit the model; and (2) the best parameter. 
     
Notes
-----
see do_classify below for an example of how this is used
"""
from sklearn.grid_search import GridSearchCV
#note: this code comes directly from lab 6
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X, y)
    best_estimator = gs.best_estimator_
    best_param = gs.best_params_
    return (best_estimator,best_param)

In [97]:
"""
Function
--------
do_classify

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV
Xtrain: a training matrix document-word matrix (e.g., noun_train_tfidf)
ytrain: the corresponding training response vector (e.g., noun_train_issue_areas)
Xtest: a test matrix document-word matrix (e.g., noun_test_tfidf)
ytest: the corresponding test resonse vector (e.g., noun)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
4 things, in the following order: (1) an array of predicted y values (ie, topic areas) for the test data; 
                                  (2) the accuracy score; (3) the confusion matrix of the test data predictions;
                                  (4) the best parameter from the gridsearch.

"""

from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, Xtrain, ytrain, Xtest, ytest, score_func=None, n_folds=5):
    if parameters:
        clf,best_param = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data:     %0.2f" % (test_accuracy)
    print "Best parameter: ", best_param
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return(clf.predict(Xtest),test_accuracy,confusion_matrix(ytest,clf.predict(Xtest)),best_param)

####Linear SVMs

In [109]:
from sklearn.svm import LinearSVC
#Try do_classify on best 6 parameters
%time
parameters1 = {"C": [0.0001, 0.01, 1.0, 100.0, 1000.0, 100000.0]}
predictions1,accuracy1,confusion_matrix1,best_param1=do_classify(LinearSVC(loss='hinge'), parameters1, 
                                                             noun_train_tfidf,noun_train_issue_areas,
                                                             noun_test_tfidf,noun_test_issue_areas)

CPU times: user 7 µs, sys: 3 µs, total: 10 µs
Wall time: 14.1 µs
############# based on standard predict ################
Accuracy on training data: 0.97
Accuracy on test data:     0.68
Best parameter:  {'C': 1.0}
[[76  2  0  0  0  0  0  1  2  0  0  0]
 [10 63  0  0  0  0  1  8  3  1  1  0]
 [ 9  4 24  0  0  0  0  0  1  0  0  0]
 [ 9  6  1  1  0  0  1  5  2  0  0  0]
 [ 1  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0  1  0  0  0  0]
 [ 0  0  1  0  0  0 22  3  2  0  0  0]
 [ 3  3  1  0  0  0  1 86  4  1  0  0]
 [ 6  3  1  0  0  0  1 11 21  0  0  0]
 [ 1  1  0  0  0  1  5 11  2  4  4  0]
 [ 0  0  0  0  0  0  0  1  0  1  2  0]
 [ 0  0  0  1  0  0  0  9  0  0  0  5]]
########################################################


In [105]:
#Rerun do_classify based on the best parameter above
paramvalue = best_param1.values()[0]
parameters2 = {"C": [paramvalue/10.,paramvalue,paramvalue*10]}
predictions2,accuracy2,confusion_matrix2,best_param2=do_classify(LinearSVC(loss='hinge'), parameters2, 
                                                             noun_train_tfidf,noun_train_issue_areas,
                                                             noun_test_tfidf,noun_test_issue_areas)
#oddly, the best parameter here is 10.0, even though it gives a worse result than the prior function.

############# based on standard predict ################
Accuracy on training data: 1.00
Accuracy on test data:     0.66
Best parameter:  {'C': 10.0}
[[72  4  0  0  0  0  0  2  3  0  0  0]
 [10 59  1  0  0  0  1  6  7  2  1  0]
 [ 6  5 25  0  0  0  0  0  2  0  0  0]
 [ 6  6  0  5  0  0  1  6  0  1  0  0]
 [ 1  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0  1  0  0  0  0]
 [ 1  0  2  0  0  0 18  5  2  0  0  0]
 [ 5  3  0  0  0  0  2 77  6  4  0  2]
 [ 3  5  1  1  0  0  1 11 21  0  0  0]
 [ 1  3  0  0  0  1  2  9  1 11  1  0]
 [ 0  0  0  0  0  0  0  1  0  1  2  0]
 [ 0  1  0  1  0  0  0  7  1  0  0  5]]
########################################################


####Non-linear SVCs

In [114]:
from sklearn.svm import SVC
##%time
#parameters grid
##parameters3 = {"C": [1e-5,1e-3,1,1e3,1e5],"gamma":[0,1e-3,1e-5,1e-7]}
#test non-linear (kernelized) SVMs
##predictions3,accuracy3,confusion_matrix3,best_param3=do_classify(SVC(), parameters3, 
##                                                             noun_train_tfidf,noun_train_issue_areas,
##                                                             noun_test_tfidf,noun_test_issue_areas)
accuracy3

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.82 µs
############# based on standard predict ################
Accuracy on training data: 0.99
Accuracy on test data:     0.69
Best parameter:  {'C': 100000.0, 'gamma': 1e-05}
[[77  2  0  0  0  0  0  1  1  0  0  0]
 [ 9 63  0  1  0  0  1  5  5  2  1  0]
 [ 9  3 23  0  0  0  0  2  1  0  0  0]
 [ 7  6  0  4  0  0  0  6  2  0  0  0]
 [ 1  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  2  0  1  1  0  0  0]
 [ 0  0  2  0  0  0 22  3  1  0  0  0]
 [ 3  1  0  0  0  0  0 88  6  1  0  0]
 [ 7  3  1  0  0  0  1 11 20  0  0  0]
 [ 1  2  0  0  0  0  4 11  1  9  1  0]
 [ 0  1  0  0  0  0  0  1  0  1  1  0]
 [ 0  0  0  1  0  0  0  8  0  0  0  6]]
########################################################


0.69230769230769229

###Visualization ideas (for this and other parts of the project) 
- Graph of the SVM confusion matrix to show which issue areas were best predicted, and to show overlap between areas. Maybe a 12x12 (or 14x14) heat map showing overlap between columns. For instance, if issue 12 is frequently predicted as issue 10, then the 12-10 (or 10-12) box of the heat map would be dark.
- Bar graph comparing the accuracy rating of the best iteration of each model 
- Show the words that have the greatest influence (e.g., highest absolute coefficients) in each model 
- Exporatory data analysis: show the most common words; visualize the TF-IDF matrix. Perhaps take the sum of each column (word) of the TF-IDF matrix, as this represents "overall influence" of a given word. Then make a plot of these values -- maybe even a visualization that represents importance with word size.  