In [1]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np

In [2]:
%matplotlib inline

In [3]:
import feature_extraction
import scoring

from text_preprocessor import TextPreprocessor
from main import get_candidates

In [4]:
# Shared corpus object used as a module-level global by the helpers below:
# it is passed to get_candidates() and to every scoring method, iterated via
# `.articles`, and indexed as `citations[pmid]['mesh']` for the gold labels.
citations = TextPreprocessor()

In [5]:
def show_wordcloud(data):
    """Draw a word cloud for ``data`` on a new 8x6 figure with the axes hidden.

    ``data`` is handed unchanged to ``WordCloud().fit_words``.
    NOTE(review): presumably a word -> frequency mapping; confirm against the
    installed ``wordcloud`` version.
    """
    figure_size = (8, 6)
    plt.figure(figsize=figure_size)
    cloud_image = WordCloud().fit_words(data)
    # aspect='auto' lets the image fill the figure instead of keeping its own ratio.
    plt.imshow(cloud_image, aspect='auto')
    plt.axis("off")

In [6]:
def generate_ranking(method, pmid, bigger_is_better=True):
    """Rank the candidates of one article by the score ``method`` assigns them.

    Args:
        method: callable invoked as ``method(citations, pmid, candidate)``.
        pmid: identifier of the article whose candidates are ranked.
        bigger_is_better: when True the highest score comes first, otherwise
            the lowest score comes first.

    Returns:
        List of candidates ordered by their ``(score, candidate)`` pair, so
        equal scores are ordered by comparing the candidates themselves.
    """
    terms = get_candidates(citations, pmid)
    if len(terms) == 0:
        # Report articles for which no candidate could be found.
        print(pmid)

    scored_terms = []
    for term in terms:
        scored_terms.append((method(citations, pmid, term), term))
    scored_terms.sort(reverse=bigger_is_better)

    return [pair[1] for pair in scored_terms]

In [7]:
def get_predictions(method, bigger_is_better=True):
    """Build one candidate ranking per article in ``citations.articles``.

    Args:
        method: scoring callable forwarded to ``generate_ranking``.
        bigger_is_better: sort direction forwarded to ``generate_ranking``.

    Returns:
        List of rankings, in the iteration order of ``citations.articles``.
    """
    rankings = []
    for pmid in citations.articles:
        rankings.append(generate_ranking(method, pmid, bigger_is_better))
    return rankings

In [8]:
def evaluate_for_k(actual, predicted, k=25):
    """Score the rankings after cutting each one down to its first ``k`` items.

    Args:
        actual: gold labels, passed through to ``scoring.get_scores``.
        predicted: iterable of rankings (one per article).
        k: maximum number of items kept from each ranking.

    Returns:
        Whatever ``scoring.get_scores(trimmed_rankings, actual)`` returns.
    """
    def head(ranking):
        # Rankings that are not longer than k are handed over untouched.
        return ranking[:k] if k < len(ranking) else ranking

    return scoring.get_scores(list(map(head, predicted)), actual)

In [9]:
def get_roc_vals(method, bigger_is_better=True):
    """Evaluate ``method`` at every ranking cut-off k from 1 to 29.

    The rankings are generated once and then re-scored for each k against the
    ``'mesh'`` entry of every article in ``citations.articles``.

    Returns:
        List of ``(k, scores)`` pairs, where ``scores`` is the return value of
        ``evaluate_for_k`` for that k.
    """
    predicted = get_predictions(method, bigger_is_better)

    actual = []
    for pmid in citations.articles:
        actual.append(citations[pmid]['mesh'])

    results = []
    for k in range(1, 30):
        results.append((k, evaluate_for_k(actual, predicted, k=k)))
    return results

In [10]:
def get_plot_points(label, method, color, bigger_is_better=True):
    """Collect the scatter-plot series for one ranking method.

    Each value returned by ``get_roc_vals`` is unpacked as
    ``(k, (precision, recall, fscore, map_))``.

    Args:
        label: legend label stored in every series.
        method: scoring callable forwarded to ``get_roc_vals``.
        color: value stored under ``'c'`` in every series.
        bigger_is_better: sort direction forwarded to ``get_roc_vals``.

    Returns:
        Dict with three keyword-argument dicts (``x``, ``y``, ``label``, ``c``):
        ``'pr'`` (x = precision, y = recall), ``'kf'`` (x = k, y = F-score)
        and ``'km'`` (x = k, y = MAP).
    """
    cutoffs, precisions, recalls, fscores, maps = [], [], [], [], []
    for cutoff, (p_at_k, r_at_k, f_at_k, map_at_k) in get_roc_vals(method, bigger_is_better):
        cutoffs.append(cutoff)
        precisions.append(p_at_k)
        recalls.append(r_at_k)
        fscores.append(f_at_k)
        maps.append(map_at_k)

    def as_series(xs, ys):
        return {'x': xs, 'y': ys, 'label': label, 'c': color}

    return {
        'pr': as_series(precisions, recalls),
        'kf': as_series(cutoffs, fscores),
        'km': as_series(cutoffs, maps),
    }

In [11]:
def plot_figures(features, title='Ranking based on individual features'):
    """Compare ranking methods on three side-by-side scatter panels.

    Panels, left to right: recall against precision, F-score against k, and
    mean average precision against k.

    Args:
        features: mapping of legend label -> ``(method, color)``; every method
            is evaluated through ``get_plot_points``.
        title: super-title drawn above the three panels.
    """
    # Compute every point set before any figure is created.
    series = [
        get_plot_points(name, method, color)
        for name, (method, color) in features.items()
    ]

    plt.figure(1, figsize=(40, 12))
    plt.suptitle(title).set_fontsize(20)

    # (series key, x label, y label, panel title, legend location), left to right.
    panels = (
        ('pr', 'Precision', 'Recall', 'Recall vs Precision', 'upper right'),
        ('kf', 'Number of items in ranking', 'F-Score', 'F-Score vs k', 'lower right'),
        ('km', 'Number of items in ranking', 'MAP', 'Mean Average Precision (MAP) vs k', 'lower right'),
    )
    axes = [plt.subplot(1, 3, position) for position in range(1, 4)]

    for points in series:
        for ax, panel in zip(axes, panels):
            ax.scatter(**points[panel[0]])

    for ax, (_, x_label, y_label, heading, legend_loc) in zip(axes, panels):
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(heading)
        ax.legend(loc=legend_loc)

In [12]:
import random
from collections import OrderedDict
# Private, seeded generator for the 'Random' baseline: the baseline curve is
# the same on every Restart & Run All, and the global `random` state is left
# untouched.
_baseline_rng = random.Random(0)


def _random_score(citations, pmid, mesh_term):
    """Baseline scorer: ignores its arguments and returns a random integer in [0, 200]."""
    return _baseline_rng.randint(0, 200)


def _composite_score(citations, pmid, mesh_term):
    """Sum of the neighboring-count and citation-count feature values."""
    return (feature_extraction.neighboring_count(citations, pmid, mesh_term)
            + feature_extraction.citation_count(citations, pmid, mesh_term))


# Legend label -> (scoring callable, matplotlib color). Every callable is
# invoked as method(citations, pmid, candidate) by generate_ranking().
# NOTE(review): `neigboring_similarities` is spelled exactly as it was
# referenced here before; confirm it matches the name in feature_extraction.
features = OrderedDict([
    ('Bigram Overlap', (feature_extraction.bigram_overlap, 'pink')),
    ('Unigram Overlap', (feature_extraction.unigram_overlap, 'purple')),
    ('Neighboring Count', (feature_extraction.neighboring_count, 'r')),
    ('Neighboring Similarity', (feature_extraction.neigboring_similarities, 'g')),
    ('Citation Count', (feature_extraction.citation_count, 'y')),
    ('Citation Similarity', (feature_extraction.citation_similarities, 'orange')),
    ('Random', (_random_score, 'b')),
    ('Composite', (_composite_score, 'limegreen')),
])

In [None]:
# Draw the precision/recall, F-score and MAP panels for every entry of `features`.
plot_figures(features)

In [13]:
from functools import partial
from feature_extraction import features as all_features
class LNet:
    """Linear scorer whose weight vector is read from a file under ``models/``.

    A score is the dot product of the loaded weights with the values that the
    functions in ``all_features`` return for one (article, term) pair.
    ``switch()`` must be called before ``get_score()``; until then the
    instance has no ``weights`` attribute.
    """

    def __init__(self):
        # Path of the weight file currently held in ``self.weights``;
        # None until switch() has loaded one.
        self.current_file = None

    def switch(self, num_iters, learning_rate, force=False):
        """Load the weights saved for the given training configuration.

        The file ``models/model_iter{num_iters}_gamma{learning_rate}`` holds
        one float per line. The file is only read when it differs from the one
        already loaded, so calling ``switch`` before every single score (as
        ``run_model`` does) no longer re-reads and re-parses it each time.

        Args:
            num_iters: iteration count used in the file name.
            learning_rate: gamma value used in the file name.
            force: reload even if this file is already loaded, e.g. after the
                model file has been regenerated on disk.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a non-blank line is not a valid float.
        """
        path = 'models/model_iter{0}_gamma{1}'.format(num_iters, learning_rate)
        if not force and path == self.current_file:
            return

        with open(path) as params_file:
            # Blank lines (such as a trailing newline) are skipped rather than
            # passed to float(), which would raise on them.
            values = [float(line) for line in params_file if line.strip()]

        self.weights = np.asarray(values)
        # Recorded only after a successful load so a failed switch keeps the
        # previous weights and the previous cache key together.
        self.current_file = path

    def get_score(self, citations, pmid, mesh_term):
        """Return ``weights @ features`` for one (pmid, mesh_term) pair."""
        feats = np.asarray([func(citations, pmid, mesh_term) for func in all_features])
        return self.weights @ feats
                


def listnet_score(lnet, citations, pmid, mesh_term):
    """Score one (pmid, mesh_term) pair with ``lnet``.

    Thin function-style wrapper that delegates to ``lnet.get_score`` and
    returns its result unchanged.
    """
    score = lnet.get_score(citations, pmid, mesh_term)
    return score

# Single shared scorer instance; run_model() switches its weights before each score.
model = LNet()

In [14]:
def run_model(citations, pmid, mesh_term, iterations, gamma):
    """Score one (pmid, mesh_term) pair with the model saved for a configuration.

    The shared ``model`` is first switched to the weights identified by
    ``iterations`` and ``gamma``, then asked for the score. Binding those two
    arguments (e.g. with ``functools.partial``) leaves the
    ``(citations, pmid, mesh_term)`` signature that ``generate_ranking`` calls.
    """
    model.switch(iterations, gamma)
    score = listnet_score(model, citations, pmid, mesh_term)
    return score

In [None]:
import random
from collections import OrderedDict

from itertools import product
# Hyper-parameter grid of the saved models: every (iterations, gamma) pair
# below is expected to exist as a weight file readable by LNet.switch().
iterations = list(range(5, 55, 5))
gamma = [0.01, 0.001]

# The single-feature citation-count ranking is plotted alongside the models.
graphs = [('Citation Count', (feature_extraction.citation_count, 'y'))]
for it, gm in product(iterations, gamma):
    # %g keeps 0.01 and 0.001 distinct. The previous %0.1f rendered both as
    # "0.0", so the two labels for one iteration count collided and
    # OrderedDict kept only one of the two models under a wrong label.
    label = 'Model it: %d, gamma: %g' % (it, gm)
    # NOTE(review): every model is drawn in red, so the curves can only be
    # told apart through the legend order; consider one color per model.
    graphs.append((label, (partial(run_model, iterations=it, gamma=gm), 'r')))

model_comparison = OrderedDict(graphs)
huytay = model_comparison  # previous name, kept bound in case other cells use it
plot_figures(model_comparison, title='Ranking based on trained models')