In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import spacy
from sklearn.model_selection import train_test_split
import gensim
import os
import collections
import smart_open
import random
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

pd.set_option("display.max_columns",100)

In [2]:
def get_and_clean_data():
    df = pd.read_csv('../data/rt_data_dump.csv')
    # Drop duplicate columns
    df.drop(['Unnamed: 0', 'rt_id.1', '_id'], axis=1, inplace=True)
    # Drop non-text annotations
    img_only_idxs = df[df['tate_text'].isna()].index
    df.drop(img_only_idxs, axis=0, inplace=True)
    # All songs are "False" -- therefore, this doesn't add anything!
    df.drop('hot_song', axis=1, inplace=True)
    # Create standardized "votes" feature (takes pageviews into account)
    df['votes_per_1000views'] = (100000 * df['votes_total'] / df['pageviews']).round(2)
    # New features for the number of characters in annotations/referents
    df['chars_in_tate'] = df['tate_text'].str.len()
    df['chars_in_referent'] = df['ref_text'].str.len()
    # list of words, in order, for referents/annotations
    df['ref_word_lst'] = df['ref_text'].str.lower().str.split()
    df['tate_word_lst'] = df['tate_text'].str.lower().str.split()
    # word count for referents/annotations
    df['ref_word_cnt'] = df['ref_word_lst'].str.len()
    df['tate_word_cnt'] = df['tate_word_lst'].str.len()

    # Removing Verse/Speaking Tags, Etc...
    short_refs = df[df['ref_word_cnt'] <= 3]['ref_text'].unique()
    tags_to_remove = []
    short_refs_to_keep = []

    for ref in short_refs:
        if ref[0] == '[' and ref[-1] == ']':
            tags_to_remove.append(ref)
        else:
            short_refs_to_keep.append(ref)

    # COMPLETELY REMOVE
    add_to_remove = ['produced by kanye west mike dean plain pat', 'Intro:', 'ENSEMBLE', 'JEFFERSON', 'Verse 2: Eminem', '[Chorus: KING GEORGE', '*Space Bar Tap*', 'BURR', 'LEE', '(Guitar Solo)', '(21st-Century schizoid man)']
    # CHANGE/EDIT
    edit_values = ['[HAMILTON]\n No', '[HAMILTON]\n Sir!', '[HAMILTON]\n Ha', '[HAMILTON]\n What?']
    # OK
    ok_keep = ['Mr. President', 'Mr. Vice President:', '“President John Adams”', 'Hamilton', 'Maty Noyes']

    replace_dict = {'[HAMILTON]\n No':'No', '[HAMILTON]\n Sir!': 'Sir!', '[HAMILTON]\n Ha': 'Ha', '[HAMILTON]\n What?': 'What?'}

    edit_idxs = []
    for bad_ref in edit_values:
        mask = df['ref_text'] == bad_ref
        bad_idxs = list(df[mask].index)
        for i in bad_idxs:
            edit_idxs.append(i)

    df['ref_text'].replace(replace_dict, inplace=True)

    for i in add_to_remove:
        tags_to_remove.append(i)
        short_refs_to_keep.remove(i)

    rt_idxs_to_drop = []
    for bad_ref in tags_to_remove:
        mask = df['ref_text'] == bad_ref
        bad_idxs = list(df[mask].index)
        for i in bad_idxs:
            rt_idxs_to_drop.append(i)

    df.drop(rt_idxs_to_drop, axis=0, inplace=True)
    return df

def perform_ttsplit(df):
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
#     df_train.to_csv('../data/genius_data_train_319.csv')
#     df_test.to_csv('../data/genius_data_test_319.csv')
    return df_train, df_test

In [3]:
def get_ref_tate_dfs(df_train, df_test):
    ref_df_train = df_train[['ref_text', 'rt_id']]
    tate_df_train = df_train[['tate_text', 'rt_id']]

    ref_df_test = df_test[['ref_text', 'rt_id']]
    tate_df_test = df_test[['tate_text', 'rt_id']]

    ref_df_train.reset_index(drop=True, inplace=True)
    tate_df_train.reset_index(drop=True, inplace=True)

    ref_df_test.reset_index(drop=True, inplace=True)
    tate_df_test.reset_index(drop=True, inplace=True)
    # (tate_df_train['rt_id'] == ref_df_train['rt_id']).all()
    # (tate_df_test['rt_id'] == ref_df_test['rt_id']).all()
    return ref_df_train, tate_df_train, ref_df_test, tate_df_test


In [4]:
# GONNA USE THIS TUTORIAL FOR REST OF ATTEMPT:
# https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb
def isolate_corpuses(ref_df_train, tate_df_train, ref_df_test, tate_df_test):
    refs_train = ref_df_train['ref_text']
    tates_train = tate_df_train['tate_text']

    refs_test = ref_df_test['ref_text']
    tates_test = tate_df_test['tate_text']
    return refs_train, refs_test, tates_train, tates_test

def make_rt_doc_idx_dicts(ref_df_train, ref_df_test):
    rt_to_doc_idx_train = ref_df_train['rt_id']
    rt_to_doc_idx_test = ref_df_test['rt_id']

    rt_doc_idx_train_dict = rt_to_doc_idx_train.to_dict()
    rt_doc_idx_test_dict = rt_to_doc_idx_test.to_dict()
    return rt_doc_idx_train_dict, rt_doc_idx_test_dict


In [5]:
def read_corpus(doc_series, tokens_only=False):
    # with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
    for i, line in enumerate(doc_series):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

def get_rt_tt_corpuses(refs_train, refs_test, tates_train, tates_test):
    train_tate_corpus = list(read_corpus(tates_train))
    test_tate_corpus = list(read_corpus(tates_test, tokens_only=True))

    train_refs_corpus = list(read_corpus(refs_train))
    test_refs_corpus = list(read_corpus(refs_test, tokens_only=True))
    return train_tate_corpus, test_tate_corpus, train_refs_corpus, test_refs_corpus


In [6]:
def assess_model(model, mod_corpus):
    ranks = []
    second_ranks = []
    n_training_docs = len(mod_corpus)
    for doc_id in range(n_training_docs):
        inferred_vector = model.infer_vector(mod_corpus[doc_id].words)
        sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
        rank = [docid for docid, sim in sims].index(doc_id)
        ranks.append(rank)
        second_ranks.append(sims[1])
    # Let's count how each document ranks with respect to the training corpus
    rank_counter = collections.Counter(ranks)  # Results vary between runs due to random seeding and very small corpus
    cnt_correct_self_similarity_docs = rank_counter[0]
    perc_correct_similarity = cnt_correct_self_similarity_docs / n_training_docs
    greater_than_95 = perc_correct_similarity >= 0.95
    return ranks, second_ranks, rank_counter, perc_correct_similarity, greater_than_95

In [7]:
cleaned_df = get_and_clean_data()
df_train, df_test = perform_ttsplit(cleaned_df)
ref_df_train, tate_df_train, ref_df_test, tate_df_test = get_ref_tate_dfs(df_train, df_test)
refs_train, refs_test, tates_train, tates_test = isolate_corpuses(ref_df_train, tate_df_train, ref_df_test, tate_df_test)
rt_doc_idx_train_dict, rt_doc_idx_test_dict = make_rt_doc_idx_dicts(ref_df_train, ref_df_test)
train_tate_corpus, test_tate_corpus, train_refs_corpus, test_refs_corpus = get_rt_tt_corpuses(refs_train, refs_test, tates_train, tates_test)

ref_model1 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
ref_model2 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=80)
ref_model3 = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=1, epochs=80)

tate_model1 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
tate_model2 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=80)
tate_model3 = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=80)

rt_model1 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
rt_model2 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=80)
rt_model3 = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=1, epochs=80)

ref_models = [ref_model1, ref_model2, ref_model3]
ref_mod_text = ['RM1(vs=50, mc=2, e=40)', 'RM2(vs=50, mc=2, e=80)', 'RM3(vs=100, mc=1, e=80)']
tate_models = [tate_model1, tate_model2, tate_model3]
tate_mod_text = ['TM1(vs=50, mc=2, e=40)', 'TM2(vs=50, mc=2, e=80)', 'TM3(vs=100, mc=1, e=80)']
rt_models = [rt_model1, rt_model2, rt_model3]
rt_mod_text = ['RTM1(vs=50, mc=2, e=40)', 'RTM2(vs=50, mc=2, e=80)', 'RTM3(vs=100, mc=1, e=80)']

In [8]:
def print_most_similar_examples(model, mod_corpus, doc_id):
    inferred_vector = model.infer_vector(mod_corpus[doc_id])
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    print('Document ({}): «{}»\n'.format(doc_id, ' '.join(mod_corpus[doc_id].words)))
    print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
    for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
        print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(mod_corpus[sims[index][0]].words)))

def compare_second_most_similar_doc_examples(mod_corpus, mod_second_ranks):
    # We can run the next cell repeatedly to see a sampling other target-document comparisons.
    # Pick a random document from the corpus and infer a vector from the model
    doc_id = random.randint(0, len(mod_corpus) - 1)
    # Compare and print the second-most-similar document
    print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(mod_corpus[doc_id].words)))
    sim_id = second_ranks[doc_id]
    print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(modcorpus[sim_id[0]].words)))

def random_model_tests(model, train_corpus, test_corpus):
    # Testing the Model
    # Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye.
    # Pick a random document from the test corpus and infer a vector from the model
    doc_id = random.randint(0, len(test_corpus) - 1)
    inferred_vector = model.infer_vector(test_corpus[doc_id])
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    # Compare and print the most/median/least similar documents from the train corpus
    print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
    print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
    for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
        print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))
    return sims

In [9]:
def get_top_and_bottom_tate_docs(df, ref_df_train, ref_df_test):
    # 3 rt_ids for annotations we know to be particularly "bad"
    bottom_3_rtid = [rt_id for rt_id in df.nsmallest(3, 'votes_per_1000views')['rt_id']]
    # 3 rt_ids for annotations we know to be "good", minus the one that's only annotating a '[VERSE]' tag
    top_3_rtid = [rt_id for rt_id in df.nlargest(3, 'votes_per_1000views')['rt_id']]

    b3_doc_id = []
    b3_tot = []
    for rt_id in bottom_3_rtid:
        if rt_id in list(ref_df_train['rt_id']):    
            mask = ref_df_train['rt_id'] == rt_id 
            b3_doc_id.append(ref_df_train[mask].index[0])
            b3_tot.append('train')
        else:
            mask = ref_df_test['rt_id'] == rt_id
            b3_doc_id.append(ref_df_test[mask].index[0])
            b3_tot.append('test')

    t3_doc_id = []
    t3_tot = []
    for rt_id in top_3_rtid:
        if rt_id in list(ref_df_train['rt_id']):    
            mask = ref_df_train['rt_id'] == rt_id 
            t3_doc_id.append(ref_df_train[mask].index[0])
            t3_tot.append('train')
        else:
            mask = ref_df_test['rt_id'] == rt_id
            t3_doc_id.append(ref_df_test[mask].index[0])
            t3_tot.append('test')
    return b3_doc_id, b3_tot, t3_doc_id, t3_tot

def get_cos_sim_for_best_worst_rt_pairs(b3_doc_id, b3_tot, t3_doc_id, t3_tot, df, base_model, r_train_corpus, r_test_corpus, t_train_corpus, t_test_corpus):          
    top = []
    bottom = []
    
    for idx, b_doc_id in enumerate(b3_doc_id):
        if b3_tot[idx] == 'train':
            if b3_tot[idx] == 'train':
                r_inf_vec = base_model.infer_vector(r_train_corpus[b_doc_id].words, epochs=base_model.epochs).reshape(-1, 1)
                t_inf_vec = base_model.infer_vector(t_train_corpus[b_doc_id].words, epochs=base_model.epochs).reshape(-1, 1)
            else:
                r_inf_vec = base_model.infer_vector(r_test_corpus[b_doc_id], epochs=base_model.epochs).reshape(-1, 1)
                t_inf_vec = base_model.infer_vector(t_test_corpus[b_doc_id], epochs=base_model.epochs).reshape(-1, 1)
            # might need to just do straight up np.cosine similarity calc between vecs
            rt_iv_cs = 1 - cosine(r_inf_vec, t_inf_vec)
            bottom.append(rt_iv_cs)
#         bottom.append([mean_cs, bad_cs])

    for idx, t_doc_id in enumerate(t3_doc_id):
        if t3_tot[idx] == 'train':
            if t3_tot[idx] == 'train':
                r_inf_vec = base_model.infer_vector(r_train_corpus[t_doc_id].words, epochs=base_model.epochs).reshape(-1, 1)
                t_inf_vec = base_model.infer_vector(t_train_corpus[t_doc_id].words, epochs=base_model.epochs).reshape(-1, 1)
            else:
                r_inf_vec = base_model.infer_vector(r_test_corpus[t_doc_id], epochs=base_model.epochs).reshape(-1, 1)
                t_inf_vec = base_model.infer_vector(t_test_corpus[t_doc_id], epochs=base_model.epochs).reshape(-1, 1)
            # might need to just do straight up np.cosine similarity calc between vecs
            rt_iv_cs = 1 - cosine(r_inf_vec, t_inf_vec)
            top.append(rt_iv_cs)
#         top.append([mean_cs, good_cs])
    
    return top, bottom

In [10]:
df_train.shape

(2674, 28)

In [11]:
df_test.shape

(669, 28)

In [12]:
fname_rm1 = get_tmpfile("doc2vec_rm1")
fname_rm2 = get_tmpfile("doc2vec_rm2")
fname_rm3 = get_tmpfile("doc2vec_rm3")

fname_tm1 = get_tmpfile("doc2vec_tm1")
fname_tm2 = get_tmpfile("doc2vec_tm2")
fname_tm3 = get_tmpfile("doc2vec_tm3")

In [13]:
b3_doc_id, b3_tot, t3_doc_id, t3_tot = get_top_and_bottom_tate_docs(cleaned_df, ref_df_train, ref_df_test)

In [14]:
for idx, ref_mod in enumerate(ref_models):
    model_header = ref_mod_text[idx]
    print("EVALUATING:", model_header)
    ref_mod.build_vocab(train_refs_corpus)
    %time ref_mod.train(train_refs_corpus, total_examples=ref_mod.corpus_count, epochs=ref_mod.epochs)

    mod_ranks, mod_second_ranks, mod_rank_counter, perc_correct_similarity, greater_than_95 = assess_model(ref_mod, train_refs_corpus)
    print("Model Self-Similarity Test Passed?:", greater_than_95)
    print("Model % Self-Similar:", perc_correct_similarity)

    doc_id = 4
#     print_most_similar_examples(ref_mod, train_refs_corpus, doc_id)
#     compare_second_most_similar_doc_examples(train_refs_corpus, mod_second_ranks)

    random_model_tests(ref_mod, train_refs_corpus, test_refs_corpus)
    
    top, bottom = get_cos_sim_for_best_worst_rt_pairs(b3_doc_id, b3_tot, t3_doc_id, t3_tot, cleaned_df, ref_mod, train_refs_corpus, test_refs_corpus, train_tate_corpus, test_tate_corpus)
    print("Mean of CS between BEST 3 Annotations:", np.array(top).mean())
    print(top)
    print("Mean of CS between WORST 3 Annotations:", np.array(bottom).mean())
    print(bottom)
    print('NOW, ONTO THE NEXT MODEL!')

# ref_model1.save(fname_rm1)
# ref_model2.save(fname_rm2)
# ref_model3.save(fname_rm3)

EVALUATING: RM1(vs=50, mc=2, e=40)
CPU times: user 3.42 s, sys: 845 ms, total: 4.26 s
Wall time: 2.88 s
Model Self-Similarity Test Passed?: False
Model % Self-Similar: 0.8799551234106208
Test Document (517): «know»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (2491, 0.9033986926078796): «but»

MEDIAN (2446, 0.5495301485061646): «produced by young chop co production by kanye west noah goldstein and the twilite tone»

LEAST (2643, -0.49779078364372253): «ayy got somethin hol up we gon function no assumptions»

Mean of CS between BEST 3 Annotations: 0.44861310720443726
[0.44861310720443726]
Mean of CS between WORST 3 Annotations: 0.1788015173127254
[0.48971104621887207, -0.015330487862229347, 0.06202399358153343]
NOW, ONTO THE NEXT MODEL!
EVALUATING: RM2(vs=50, mc=2, e=80)
CPU times: user 7.64 s, sys: 1.77 s, total: 9.41 s
Wall time: 6.06 s
Model Self-Similarity Test Passed?: False
Model % Self-Similar: 0.9323111443530292
Test Document (68): «produced by

In [15]:
for idx, tate_mod in enumerate(tate_models):
    model_header = tate_mod_text[idx]
    print("EVALUATING:", model_header)
    tate_mod.build_vocab(train_tate_corpus)
    %time tate_mod.train(train_tate_corpus, total_examples=tate_mod.corpus_count, epochs=tate_mod.epochs)

    mod_ranks, mod_second_ranks, mod_rank_counter, perc_correct_similarity, greater_than_95 = assess_model(tate_mod, train_tate_corpus)
    print("Model Self-Similarity Test Passed?:", greater_than_95)
    print("Model % Self-Similar:", perc_correct_similarity)

    doc_id = 4
#     print_most_similar_examples(ref_mod, train_refs_corpus, doc_id)
#     compare_second_most_similar_doc_examples(train_refs_corpus, mod_second_ranks)

    random_model_tests(tate_mod, train_tate_corpus, test_tate_corpus)

    
    top, bottom = get_cos_sim_for_best_worst_rt_pairs(b3_doc_id, b3_tot, t3_doc_id, t3_tot, cleaned_df, tate_mod, train_refs_corpus, test_refs_corpus, train_tate_corpus, test_tate_corpus)
    print("Mean of CS between BEST 3 Annotations:", np.array(top).mean())
    print(top)
    print("Mean of CS between WORST 3 Annotations:", np.array(bottom).mean())
    print(bottom)
    print('NOW, ONTO THE NEXT MODEL!')
    
    
# tate_model1.save(fname_tm1)
# tate_model2.save(fname_tm2)
# tate_model3.save(fname_tm3)

EVALUATING: TM1(vs=50, mc=2, e=40)
CPU times: user 12.9 s, sys: 1.21 s, total: 14.1 s
Wall time: 6.45 s
Model Self-Similarity Test Passed?: True
Model % Self-Similar: 0.9973821989528796
Test Document (308): «in october women accused american film producer harvey weinstein of rape sexual assault and sexual abuse over period of at least years over eighty women have since come forward and accused weinstein he was arrested in may and released on bail according to reports weinstein invited young actresses or models into hotel room or office on the pretext of discussing their career and then demanded massages or sex the accusations resulted in other women coming forward with their own experiences concerning sexual harassment and rape on social media under the hashtag metoo»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (1109, 0.5570858716964722): «you may not be able to turn bad girl good but jay career is testament to the male equivalent being possible most

In [16]:
# # Persist a model to disk
# fname = get_tmpfile("my_doc2vec_model")
# model.save(fname)
# model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
