In [1]:
import scipy
import scipy.sparse.linalg
from scipy.spatial.distance import cdist
import numpy
from collections import defaultdict
import json
import codecs
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
f = codecs.open('parsed.json', 'r', encoding='utf-8')
data = json.load(f)
f.close()

f = codecs.open('win_loss_labels.json', 'r', encoding='utf-8')
results = json.load(f)
f.close()

f = codecs.open('parsed_testing.json', 'r', encoding='utf-8')
test_data = json.load(f)
f.close()

In [3]:
def tfidf_docterm(corpus, freqthresh):
    """Estimate document-term TF-IDF vectors for each document (line in filename),
    where each column is a word, in decreasing order of frequency.
    Ignore words that appear fewer than freqthresh times.
    Return a list consisting of
    1. a list of the m word types with at least freqthresh count, sorted in decreasing order of frequency.
    2. an array with d rows and m columns,
    where row i is the vector for the ith document in filename,
    and col j represents the jth word in the above list.
    """
    tfidf_dict = dict()
    candidate_dict = {}
    wordcounts = defaultdict(int)
    new_corpus = []
    labels = []
    for candidate in corpus.keys(): 
        if candidate == 'Hillary Clinton republican 2008':
            continue
        labels.append(results[candidate])
        debates = [debate for debates in corpus[candidate].values() for debate in debates]
        for word in debates:
                if word not in common:
                    wordcounts[word] +=1
        new_corpus.append([candidate, debates])
    
    sorted_words = sorted(filter(lambda x: wordcounts[x] >= freqthresh, wordcounts.keys()), key=lambda x: wordcounts[x], reverse=True)
    thresholded_words = set(sorted_words)
    word_indices = dict((word, index) for index, word in enumerate(sorted_words))
    context = numpy.zeros((len(new_corpus), len(sorted_words)))
    for di, doc in enumerate(new_corpus):
        for word in doc[1]:
            try:
                #print word
                if word in thresholded_words:
                    context[di,word_indices[word]] +=1
            except: 
                pass
    return [sorted_words, context, labels]

common = stopwords.words('english')

tfidf_vectors = tfidf_docterm(data,50)
vectorizer = tfidf_vectors[1]
result = tfidf_vectors[2]
print vectorizer.shape
print result

(53, 1399)
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]


In [5]:
def dimensionality_reduce(vectors, ndims):
    """Apply SVD on original sparse matrix, return reduced vectors."""
    # Do not modify
    U, s, Vh = scipy.sparse.linalg.svds(vectors, k=ndims)
    sigmatrix = scipy.matrix(scipy.diag(s))
    return U * sigmatrix

train_data_features = dimensionality_reduce(vectorizer, 20)
print train_data_features, type(train_data_features), train_data_features.shape

[[   3.82670867   -1.56969255    0.95818609 ...,   -9.11066367
    -2.13700424  -77.6101968 ]
 [   8.25540067  -11.83665582    3.33705647 ...,  -25.68261294
   -15.38189016 -479.80194031]
 [  50.69721222   80.07391666   27.89746094 ...,  -70.80272083
   -35.77168279 -512.9112537 ]
 ..., 
 [  10.12864488   20.8345388    -4.57299857 ...,   13.95186028
   -11.83080993 -191.22507628]
 [  71.3857324     7.78533655  -16.79635823 ...,   64.28246144
    -3.29635289 -319.76082287]
 [  44.15076785   -5.71621032   -7.47670045 ...,   17.38807814   24.4171316
  -298.50243687]] <class 'numpy.matrixlib.defmatrix.matrix'> (53, 20)


In [6]:
test_tfidf_vectors = tfidf_docterm(test_data,100)
test_vectorizer = test_tfidf_vectors[1]
test_result = test_tfidf_vectors[2]
print test_vectorizer
print test_result
test_data_features = dimensionality_reduce(test_vectorizer, 20)

[[ 111.   87.  141. ...,    6.    0.    6.]
 [ 214.  117.   92. ...,    1.    1.    2.]
 [ 336.  147.  113. ...,    4.    3.    1.]
 ..., 
 [ 526.  226.  269. ...,   23.    0.    4.]
 [ 575.  564.  360. ...,    6.   10.    6.]
 [   0.    0.    0. ...,    0.    0.    0.]]
[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]


In [29]:
def knn(trainpoints, traincats, testpoints, k):
    """Given training data points
    and a 1-d array of the corresponding categories of the points,
    predict category for each test point,
    using k nearest neighbors (with cosine distance).
    Return a 1-d array of predicted categories.
    """
    predictions = []
    distances = cdist(testpoints, trainpoints)
    print 'distances calculated'
    for test_dist in distances:
        sorted_first_k = sorted(enumerate(test_dist), key = lambda e: e[1])[:k]
        best_cat = ''
        best_count = 0
        cat_counts = defaultdict(int) 
        for ci, cat_dist in sorted_first_k:
            cat_counts[traincats[ci]] += 1
            if cat_counts[traincats[ci]] > best_count:
                best_cat = traincats[ci]
                best_count = cat_counts[traincats[ci]]
        predictions.append(best_cat)
    return predictions


# Marena
def knn(trainpoints, traincats, testpoints, k):
    """Given training data points
    and a 1-d array of the corresponding categories of the points,
    predict category for each test point,
    using k nearest neighbors (with cosine distance).
    Return a 1-d array of predicted categories.
    """
    cd = cdist(testpoints, trainpoints, 'cosine') # Creates array where i,j is distance between testpoints[i] and trainpoints[j]
    sorted_dist = numpy.argsort(cd, axis=1)
    testcats = []
    for row in sorted_dist:
        training_labels = defaultdict(lambda: 0)
        nearest_neighbors = row[0:k]
        for index in nearest_neighbors:
            training_labels[traincats[index]] += 1
        winning_label = max(training_labels.iteritems(), key=operator.itemgetter(1))[0]
        testcats.append(winning_label)
    return testcats

test_predict_label = knn(train_data_features,result,test_data_features,2)

distances calculated


In [30]:
print test_predict_label
count = 0 
for index,label in enumerate(test_predict_label):
    if label == test_result[index]:
        count+=1
print count/33. 

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
0.818181818182
