In [None]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option("display.width", 500)
pd.set_option("display.max_columns", 100)
pd.set_option("display.notebook_repr_html", True)
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("poster")

TF-IDF converter function

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_mat_creator(trainmatrix, testmatrix):
    """
    Convert a training/test pair of document-word count matrices to TF-IDF.

    Inputs
    ------
    trainmatrix: matrix of size n_documents_training*vocab_size
    testmatrix: matrix of size n_documents_test*vocab_size

    Returns
    -------
    A tuple (tfidf_train, tfidf_test) holding the TF-IDF versions of the two inputs.

    Notes
    -----
    The transformer is fit on the training data only, and that same fit is then
    applied to both matrices, so no information from the test set is used in the fit.
    """
    # fit() returns the transformer itself, so the fitted object can be reused directly
    fitted_transformer = TfidfTransformer().fit(trainmatrix)
    # transform the training matrix first, then the test matrix; .toarray() is
    # called on each transformed result before it is returned
    dense_train, dense_test = (
        fitted_transformer.transform(counts).toarray()
        for counts in (trainmatrix, testmatrix)
    )
    return (dense_train, dense_test)

Note: the cell below is a placeholder, so your own data-loading code will differ. You just need to create the training and test matrices (nouns and verbs here) together with the train and test issue-area vectors. Don't convert to TF-IDF yet.

In [None]:
%%time
# Every CSV below is read without a header row, so all rows are treated as data.

# Noun matrices: read both files, then pull out the underlying arrays
noun_train_df = pd.read_csv('noun_train_mat.csv', header=None, sep=',')
noun_test_df = pd.read_csv('noun_test_mat.csv', header=None, sep=',')
noun_train_mat, noun_test_mat = noun_train_df.values, noun_test_df.values

# Verb matrices: same treatment as the nouns
verb_train_df = pd.read_csv('verb_train_mat.csv', header=None, sep=',')
verb_test_df = pd.read_csv('verb_test_mat.csv', header=None, sep=',')
verb_train_mat, verb_test_mat = verb_train_df.values, verb_test_df.values

# Issue areas (the response variable): ravel() flattens each one to a 1-D vector
train_issue_areas_df = pd.read_csv('train_issue_areas.csv', header=None, sep=',')
test_issue_areas_df = pd.read_csv('test_issue_areas.csv', header=None, sep=',')
train_issue_areas = train_issue_areas_df.values.ravel()
test_issue_areas = test_issue_areas_df.values.ravel()

The function below combines the verb and noun matrices into a single training matrix and a single test matrix.

In [None]:
def matrix_combine(matrix_list):
    """
    Join several (train, test) matrix pairs column-wise.

    Inputs
    ------
    matrix_list: a list of pairs; element 0 of each pair is a training matrix and
                 element 1 is the matching test matrix.

    Returns
    -------
    A tuple (train_mat, test_mat). Each is the concatenation along axis 1 (columns)
    of the corresponding matrices, in the order the pairs appear in matrix_list.
    """
    train_parts = [pair[0] for pair in matrix_list]
    test_parts = [pair[1] for pair in matrix_list]
    combined_train = np.concatenate(train_parts, axis=1)
    combined_test = np.concatenate(test_parts, axis=1)
    return (combined_train, combined_test)

Combine verb and noun; convert to TF-IDF. 

In [None]:
#combine verb and noun matrices: one (train, test) pair per part of speech.
#The pairs are concatenated column-wise in list order, so the verb columns
#come before the noun columns in the combined matrices.
matrix_list = [(verb_train_mat,verb_test_mat),(noun_train_mat,noun_test_mat)]
train_verbnouns,test_verbnouns = matrix_combine(matrix_list)
#convert the combined matrix to TF-IDF (the transformer is fit on the training
#matrix only and then applied to both the training and the test matrix)
train_verbnouns_tfidf,test_verbnouns_tfidf = tfidf_mat_creator(train_verbnouns,test_verbnouns)

SVM-specific functions

In [None]:
"""
Function
--------
confusion_mat_creator

Inputs
------
predictions: a list of length n_documents. Each value in the list is the predicted
             issue area of the corresponding document. ("Predicted" as predicted by the SVM.)
actuals: a list of length n_documents. Each value is the actual issue area of the corresponding document.

Returns
-------
A 14*14 confusion matrix. Cell i,j is the number of cases with actual issue area j 
that were predicted as issue area i. Thus, the diagonal represents correct predictions.

Notes
-----
see do_classify below for an example of how this is used
"""

def _labels_to_indices(labels, n_classes, name):
    """Turn 1-based class labels into validated 0-based integer indices."""
    values = np.asarray(labels).ravel()
    # Silence the cast warning for NaN/inf; such values are rejected just below.
    with np.errstate(invalid='ignore'):
        indices = values.astype(np.intp) - 1
    if values.size:
        not_integral = np.any(indices + 1 != values)
        # Without this check a label of 0 would become index -1 and be silently
        # counted in the LAST row/column instead of being reported.
        out_of_range = indices.min() < 0 or indices.max() >= n_classes
        if not_integral or out_of_range:
            raise ValueError("%s must contain integer labels between 1 and %d"
                             % (name, n_classes))
    return indices


def confusion_mat_creator(predictions, actuals, n_classes=14):
    """
    Build a confusion matrix from predicted and actual issue areas.

    Inputs
    ------
    predictions: a sequence of length n_documents holding the predicted issue area
                 (an integer label from 1 to n_classes) of each document.
    actuals: a sequence of the same length holding the actual issue areas.
    n_classes: the number of issue areas (default 14).

    Returns
    -------
    An n_classes*n_classes float array. Cell i,j is the number of cases with actual
    issue area j+1 that were predicted as issue area i+1 (labels are 1-based, the
    matrix is 0-indexed). Thus, the diagonal represents correct predictions.

    Raises
    ------
    ValueError if the two inputs differ in length, or if any label is not an
    integer between 1 and n_classes.
    """
    pred_idx = _labels_to_indices(predictions, n_classes, "predictions")
    actual_idx = _labels_to_indices(actuals, n_classes, "actuals")
    if pred_idx.size != actual_idx.size:
        raise ValueError("predictions and actuals must have the same length "
                         "(got %d and %d)" % (pred_idx.size, actual_idx.size))
    confusion_mat = np.zeros((n_classes, n_classes))
    # np.add.at accumulates repeated (row, column) pairs correctly, unlike
    # confusion_mat[pred_idx, actual_idx] += 1, which would count each pair once.
    np.add.at(confusion_mat, (pred_idx, actual_idx), 1)  # rows: predicted, columns: actual
    return confusion_mat

In [None]:
"""
Function
--------
cv_optimize

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV
X: a document-word matrix (e.g., noun_train_tfidf). Should be training data.
y: the response vector (train_issue_areas)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
Two things: (1) The best estimator from the GridSearchCV, after the GridSearchCV has been used to
fit the model; and (2) the best parameter. 
     
Notes
-----
see do_classify below for an example of how this is used
"""
try:
    # sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
    # GridSearchCV now lives in sklearn.model_selection.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Fallback for scikit-learn releases older than 0.18.
    from sklearn.grid_search import GridSearchCV

#note: this code comes directly from lab 6
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    """
    Grid-search the hyperparameters of clf with cross-validation.

    Inputs
    ------
    clf: an instance of a scikit-learn classifier
    parameters: a parameter grid dictionary that is passed to GridSearchCV
    X: a document-word matrix; should be training data
    y: the response vector for X
    n_folds: the number of cross-validation folds (default 5)
    score_func: an optional scoring argument for GridSearchCV; any falsy value
                (default None) selects the estimator's own default scorer

    Returns
    -------
    A tuple (best_estimator, best_param): the best estimator found by the grid
    search after it has been fit, and the dictionary of its parameter values.
    """
    # scoring=None is GridSearchCV's default, so a single call covers both the
    # "custom scorer" and the "no scorer" cases.
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func or None)
    gs.fit(X, y)
    return (gs.best_estimator_, gs.best_params_)

In [None]:
"""
Function
--------
do_classify

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV
Xtrain: a training matrix document-word matrix (e.g., noun_train_tfidf)
ytrain: the corresponding training response vector (train_issue_areas)
Xtest: a test matrix document-word matrix (e.g., noun_test_tfidf)
ytest: the corresponding test response vector (e.g., test_issue_areas)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
5 things, in the following order: (1) an array of predicted y values (ie, topic areas) for the test data; 
                                  (2) the accuracy score on the test data; (3) the confusion matrix of the test data predictions;
                                  (4) the best parameter from the gridsearch; (5) the fitted classifier.

"""

##from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, Xtrain, ytrain, Xtest, ytest, score_func=None, n_folds=5):
    """
    Optionally tune clf by grid search, fit it, and report its performance.

    Inputs
    ------
    clf: an instance of a scikit-learn classifier
    parameters: a parameter grid dictionary passed on to cv_optimize; if it is
                empty or None, the grid search is skipped and clf is used as given
    Xtrain: a training document-word matrix
    ytrain: the corresponding training response vector
    Xtest: a test document-word matrix
    ytest: the corresponding test response vector
    score_func: a score function passed on to cv_optimize (default None)
    n_folds: the number of cross-validation folds (default 5)

    Returns
    -------
    5 things, in the following order: (1) an array of predicted y values for the
    test data; (2) the accuracy score on the test data; (3) the confusion matrix
    of the test data predictions; (4) the best parameter from the grid search, or
    None when no grid search was run; (5) the fitted classifier.

    Notes
    -----
    Prints the training/test accuracy, the best parameter and the confusion matrix.
    """
    # Stays None when the grid search is skipped, so the report and the return
    # value below never hit an unbound name.
    best_param = None
    if parameters:
        clf, best_param = cv_optimize(clf, parameters, Xtrain, ytrain,
                                      n_folds=n_folds, score_func=score_func)
    # Fit on the full training set (this is the only fit when no grid was given).
    clf = clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    pred = clf.predict(Xtest)
    # Build the confusion matrix once and reuse it for printing and returning.
    confusion_mat = confusion_mat_creator(pred, ytest)
    # Single-argument print(...) calls give identical output on Python 2 and 3.
    print("############# based on standard predict ################")
    print("Accuracy on training data: %0.2f" % (training_accuracy))
    print("Accuracy on test data:     %0.2f" % (test_accuracy))
    print("Best parameter:  %s" % (best_param,))
    print("########################################################")
    print(confusion_mat)
    return (pred, test_accuracy, confusion_mat, best_param, clf)

Run optimized SVM on the tfidf matrices.

In [None]:
# LinearSVC is not imported anywhere earlier in the notebook; import it here so
# this cell also works on a fresh kernel (Restart & Run All).
from sklearn.svm import LinearSVC

# Linear SVM with hinge loss; the grid holds a single candidate value, C=1.0.
best_predictions, best_accuracy, best_con_mat, best_param, best_clf = do_classify(
    LinearSVC(loss='hinge'),
    {'C': [1.0]},
    train_verbnouns_tfidf, train_issue_areas,
    test_verbnouns_tfidf, test_issue_areas,
)