In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

pd.set_option("display.max_columns",100)

In [70]:
df = pd.read_csv('../data/rt_data_dump.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,ref_text,tate_text,votes_total,verified,state,song_id,song_title,full_title,song_tate_cnt,hot_song,pageviews,n_unreviewed_tates,full_lyrics,url,n_tate_contributors,primary_contributor_id,primary_contributor_IQ,has_voters,comment_cnt,artist_name,artist_id,rt_id,rt_id.1,_id
0,3910,"‘Cause I don't write shit, ‘cause I ain't got ...",Important to note that he’s saying he doesn’t ...,45,False,accepted,59,A Milli,A Milli by Lil Wayne,39,False,936461,0,"['Bangladesh', 'Young Money!', 'You dig?', ""Ma...",https://genius.com/3910/Lil-wayne-a-milli/Caus...,1,3,101534,True,2,Lil Wayne,4,3910,3910,5c8b26d28533e663329276a7
1,7445,With coke in her derriere,Playing on the familiar theme of using a girl ...,30,False,accepted,59,A Milli,A Milli by Lil Wayne,39,False,936461,0,"['Bangladesh', 'Young Money!', 'You dig?', ""Ma...",https://genius.com/7445/Lil-wayne-a-milli/With...,1,7,478892,True,0,Lil Wayne,4,7445,7445,5c8b26d28533e663329276a6
2,11740,But I would like for you to pay me by the hour,"This is a play on the previous line, where Wee...",7,False,accepted,59,A Milli,A Milli by Lil Wayne,39,False,936461,0,"['Bangladesh', 'Young Money!', 'You dig?', ""Ma...",https://genius.com/11740/Lil-wayne-a-milli/But...,4,10380,171456,True,1,Lil Wayne,4,11740,11740,5c8b26d28533e663329276a5
3,17243,And it ain't trickin' if you got it,A popular phrase (also used in T.I.’s song “Wh...,10,False,accepted,59,A Milli,A Milli by Lil Wayne,39,False,936461,0,"['Bangladesh', 'Young Money!', 'You dig?', ""Ma...",https://genius.com/17243/Lil-wayne-a-milli/And...,2,2,102789,True,0,Lil Wayne,4,17243,17243,5c8b26d28533e663329276a4
4,19669,"Bloodsuckin' succubuses, what the fuck is up w...","Succubi (it’s a LATIN plural, Em! But that wou...",45,False,accepted,561,Space Bound,Space Bound by Eminem,19,False,952636,0,"[""We touch, I feel a rush; we clutch, it isn't...",https://genius.com/19669/Eminem-space-bound/Bl...,4,7,478892,True,1,Eminem,45,19669,19669,5c8b26d28533e66332926f78


In [71]:
# Drop duplicate columns
df.drop(['Unnamed: 0', 'rt_id.1', '_id'], axis=1, inplace=True)

# Drop non-text annotations
img_only_idxs = df[df['tate_text'].isna()].index
df.drop(img_only_idxs, axis=0, inplace=True)

# All songs are "False" -- therefore, this doesn't add anything!
df.drop('hot_song', axis=1, inplace=True)

# Create standardized "votes" feature (takes pageviews into account)
df['votes_per_1000views'] = (100000 * df['votes_total'] / df['pageviews']).round(2)
# New features for the number of characters in annotations/referents
df['chars_in_tate'] = df['tate_text'].str.len()
df['chars_in_referent'] = df['ref_text'].str.len()

# Can we do this for:
#   total # of lines in song?
#   word count in referent/annotation?

# https://stackoverflow.com/questions/18936957/count-distinct-words-from-a-pandas-data-frame

# list of words, in order, for referents/annotations
df['ref_word_lst'] = df['ref_text'].str.lower().str.split()
df['tate_word_lst'] = df['tate_text'].str.lower().str.split()

# word count for referents/annotations
df['ref_word_cnt'] = df['ref_word_lst'].str.len()
df['tate_word_cnt'] = df['tate_word_lst'].str.len()


In [13]:
import spacy

In [14]:
from sklearn.model_selection import train_test_split

In [72]:
df.columns

Index(['ref_text', 'tate_text', 'votes_total', 'verified', 'state', 'song_id',
       'song_title', 'full_title', 'song_tate_cnt', 'pageviews',
       'n_unreviewed_tates', 'full_lyrics', 'url', 'n_tate_contributors',
       'primary_contributor_id', 'primary_contributor_IQ', 'has_voters',
       'comment_cnt', 'artist_name', 'artist_id', 'rt_id',
       'votes_per_1000views', 'chars_in_tate', 'chars_in_referent',
       'ref_word_lst', 'tate_word_lst', 'ref_word_cnt', 'tate_word_cnt'],
      dtype='object')

In [73]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [74]:
df_train.shape

(2858, 28)

In [75]:
df_test.shape

(715, 28)

In [23]:
df_train.to_csv('genius_data_train.csv')
df_test.to_csv('genius_data_test.csv')

In [76]:
df_train.head()

Unnamed: 0,ref_text,tate_text,votes_total,verified,state,song_id,song_title,full_title,song_tate_cnt,pageviews,n_unreviewed_tates,full_lyrics,url,n_tate_contributors,primary_contributor_id,primary_contributor_IQ,has_voters,comment_cnt,artist_name,artist_id,rt_id,votes_per_1000views,chars_in_tate,chars_in_referent,ref_word_lst,tate_word_lst,ref_word_cnt,tate_word_cnt
1040,Maybe I'm the sinner and you're the saint\n Go...,Ariana has always put up an innocent and cute ...,15,False,accepted,472285,Best Mistake,Best Mistake by Ariana Grande (Ft. Big Sean),27,283606,0,"['How soon do we forget how we felt?', 'Dealin...",https://genius.com/3903717/Ariana-grande-best-...,3,58812,65745,True,1,Ariana Grande,26507,3903717,5.29,340,78,"[maybe, i'm, the, sinner, and, you're, the, sa...","[ariana, has, always, put, up, an, innocent, a...",14,62
141,For my niggas out tonight\n And they high off ...,Shakespeare lines are known for being quite le...,56,False,accepted,56953,Initiation,Initiation by The Weeknd,16,749522,0,"['Oh, yeah, got you drinking out them white cu...",https://genius.com/392109/The-weeknd-initiatio...,6,164938,37636,True,0,The Weeknd,2358,392109,7.47,499,123,"[for, my, niggas, out, tonight, and, they, hig...","[shakespeare, lines, are, known, for, being, q...",22,84
1879,The Reynolds Pamphlet,The Reynolds Pamphlet was a 98-page document t...,28,False,accepted,2314627,Hurricane,Hurricane by Original Broadway Cast of Hamilton,27,462233,1,"['In the eye of a hurricane', 'There is quiet'...",https://genius.com/8844383/Original-broadway-c...,2,244275,11784,True,1,Original Broadway Cast of Hamilton,572149,8844383,6.06,531,21,"[the, reynolds, pamphlet]","[the, reynolds, pamphlet, was, a, 98-page, doc...",3,83
666,‘Cause I just shitted on the mic and I like ge...,The previous bars double the entendre on “bow”...,44,False,accepted,204622,Survival,Survival by Eminem (Ft. Liz Rodrigues),53,1443321,0,"['This is survival of the fittest', 'This is d...",https://genius.com/2101012/Eminem-survival/Cau...,2,1859,21209,True,3,Eminem,45,2101012,3.05,531,55,"[‘cause, i, just, shitted, on, the, mic, and, ...","[the, previous, bars, double, the, entendre, o...",12,97
1254,Small town nigga Hollywood dreams\n I know tha...,This is a challenge to all those people who sa...,30,True,verified,599407,A Tale of 2 Citiez,A Tale of 2 Citiez by J. Cole,30,1368093,0,"[""Since a youngin' always dreamed of gettin' r...",https://genius.com/5023846/J-cole-a-tale-of-2-...,1,1316816,139,True,0,J. Cole,69,5023846,2.19,313,219,"[small, town, nigga, hollywood, dreams, i, kno...","[this, is, a, challenge, to, all, those, peopl...",42,57


In [77]:
ref_df_train = df_train[['ref_text', 'rt_id']]
tate_df_train = df_train[['tate_text', 'rt_id']]

ref_df_test = df_test[['ref_text', 'rt_id']]
tate_df_test = df_test[['tate_text', 'rt_id']]

In [78]:
ref_df_train.head()

Unnamed: 0,ref_text,rt_id
1040,Maybe I'm the sinner and you're the saint\n Go...,3903717
141,For my niggas out tonight\n And they high off ...,392109
1879,The Reynolds Pamphlet,8844383
666,‘Cause I just shitted on the mic and I like ge...,2101012
1254,Small town nigga Hollywood dreams\n I know tha...,5023846


In [79]:
ref_df_train.reset_index(drop=True, inplace=True)
tate_df_train.reset_index(drop=True, inplace=True)

ref_df_test.reset_index(drop=True, inplace=True)
tate_df_test.reset_index(drop=True, inplace=True)

In [80]:
ref_df_train.head()

Unnamed: 0,ref_text,rt_id
0,Maybe I'm the sinner and you're the saint\n Go...,3903717
1,For my niggas out tonight\n And they high off ...,392109
2,The Reynolds Pamphlet,8844383
3,‘Cause I just shitted on the mic and I like ge...,2101012
4,Small town nigga Hollywood dreams\n I know tha...,5023846


In [95]:
tate_df_train.head()

Unnamed: 0,tate_text,rt_id
0,Ariana has always put up an innocent and cute ...,3903717
1,Shakespeare lines are known for being quite le...,392109
2,The Reynolds Pamphlet was a 98-page document t...,8844383
3,The previous bars double the entendre on “bow”...,2101012
4,This is a challenge to all those people who sa...,5023846


In [97]:
(tate_df_train['rt_id'] == ref_df_train['rt_id']).all()

True

In [98]:
(tate_df_test['rt_id'] == ref_df_test['rt_id']).all()

True

In [81]:
ref_df_test.head()

Unnamed: 0,ref_text,rt_id
0,I've been a menace for the longest\n But I ain...,1834794
1,[HAMILTON]\n I have an early meeting out of to...,8390453
2,And your mom was savin' money for you in a jar...,1339057
3,"Yeezy, Yeezy, Yeezy, I might do my own hotel",8455485
4,"'Cause it's my business, God as my witness",8802573


GONNA USE THIS TUTORIAL FOR REST OF ATTEMPT:
https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb

In [30]:
import gensim
import os
import collections
import smart_open
import random

In [82]:
refs_train = ref_df_train['ref_text']
tates_train = tate_df_train['tate_text']

refs_test = ref_df_test['ref_text']
tates_test = tate_df_test['tate_text']

refs_train.head()

0    Maybe I'm the sinner and you're the saint\n Go...
1    For my niggas out tonight\n And they high off ...
2                                The Reynolds Pamphlet
3    ‘Cause I just shitted on the mic and I like ge...
4    Small town nigga Hollywood dreams\n I know tha...
Name: ref_text, dtype: object

In [83]:
refs_test.head()

0    I've been a menace for the longest\n But I ain...
1    [HAMILTON]\n I have an early meeting out of to...
2    And your mom was savin' money for you in a jar...
3         Yeezy, Yeezy, Yeezy, I might do my own hotel
4           'Cause it's my business, God as my witness
Name: ref_text, dtype: object

In [84]:
tates_train.head()

0    Ariana has always put up an innocent and cute ...
1    Shakespeare lines are known for being quite le...
2    The Reynolds Pamphlet was a 98-page document t...
3    The previous bars double the entendre on “bow”...
4    This is a challenge to all those people who sa...
Name: tate_text, dtype: object

In [85]:
tates_test.head()

0    Kanye is operating as a musician in the best p...
1    Hamilton doesn’t tell Eliza what he is going t...
2    At the time Eminem held a position as a short ...
3    Kanye previously rapped about designing his ow...
4    Ariana relies on herself and trusts her own de...
Name: tate_text, dtype: object

In [86]:
for i, line in enumerate(tates_train):
    print('i', i)
    print('line:', line)
    if i > 3:
        break

i 0
line: Ariana has always put up an innocent and cute act. She hasn’t really done anything scandalous like most child actors do after they stop acting, however now she is telling us that she is not who she pretends to be.

Essentially she’s saying “Hey, maybe I’m the bad guy and you’re the good guy. Either way, let’s not keep blaming each other.”
i 1
line: Shakespeare lines are known for being quite lengthy – that’s how lengthy the lines of cocaine  they’re creating and snorting are. Thus, there are no worries about not getting enough; this girl doesn’t have to wait for others to use the line either. She can cut the “line” of people in front of her to hit the “line” on the table.

Ironically, Shakespeare reportedly used cocaine and marijuana actively during his lifetime, making “Shakespeare lines” a direct metaphor for doing these kinds of drugs.
i 2
line: The Reynolds Pamphlet was a 98-page document that could be considered the first major political sex scandal in American history.


In [90]:
rt_to_doc_idx_train = ref_df_train['rt_id']
rt_to_doc_idx_train.head()

0    3903717
1     392109
2    8844383
3    2101012
4    5023846
Name: rt_id, dtype: int64

In [91]:
rt_to_doc_idx_test = ref_df_test['rt_id']
rt_to_doc_idx_test.head()

0    1834794
1    8390453
2    1339057
3    8455485
4    8802573
Name: rt_id, dtype: int64

In [93]:
rt_doc_idx_train_dict = rt_to_doc_idx_train.to_dict()
rt_doc_idx_train_dict

{0: 3903717,
 1: 392109,
 2: 8844383,
 3: 2101012,
 4: 5023846,
 5: 10996698,
 6: 2074369,
 7: 1846666,
 8: 14186953,
 9: 9059712,
 10: 16056982,
 11: 2409353,
 12: 12137106,
 13: 11101192,
 14: 9087785,
 15: 10943555,
 16: 508809,
 17: 1871819,
 18: 5088855,
 19: 10279769,
 20: 7695230,
 21: 2395766,
 22: 2895571,
 23: 612911,
 24: 1889390,
 25: 8786268,
 26: 12281038,
 27: 8984060,
 28: 13302882,
 29: 2358326,
 30: 12926317,
 31: 1968215,
 32: 8244220,
 33: 6852720,
 34: 436517,
 35: 575920,
 36: 4966993,
 37: 13250407,
 38: 8672408,
 39: 15206594,
 40: 8082565,
 41: 1884012,
 42: 15738896,
 43: 15296965,
 44: 8914503,
 45: 1915301,
 46: 8239763,
 47: 11671071,
 48: 15574924,
 49: 10960839,
 50: 1950646,
 51: 8217366,
 52: 15378865,
 53: 8539314,
 54: 2302454,
 55: 7966363,
 56: 2411723,
 57: 2397113,
 58: 1942178,
 59: 10979448,
 60: 2804547,
 61: 11690400,
 62: 247495,
 63: 179735,
 64: 13485679,
 65: 4211146,
 66: 5049450,
 67: 14964745,
 68: 9107219,
 69: 2339671,
 70: 8447852,
 

In [94]:
rt_doc_idx_test_dict = rt_to_doc_idx_test.to_dict()
rt_doc_idx_test_dict

{0: 1834794,
 1: 8390453,
 2: 1339057,
 3: 8455485,
 4: 8802573,
 5: 9575227,
 6: 5203239,
 7: 234706,
 8: 4913267,
 9: 4927704,
 10: 9180701,
 11: 612923,
 12: 5123161,
 13: 8112421,
 14: 8420876,
 15: 2139565,
 16: 3948385,
 17: 14370056,
 18: 16448505,
 19: 8907785,
 20: 11861128,
 21: 706887,
 22: 5061944,
 23: 326446,
 24: 3156176,
 25: 8664834,
 26: 10995122,
 27: 3906282,
 28: 475441,
 29: 3431871,
 30: 1921922,
 31: 8049903,
 32: 12756545,
 33: 10489998,
 34: 16620993,
 35: 3880506,
 36: 6922028,
 37: 8035592,
 38: 13572633,
 39: 14692598,
 40: 11720701,
 41: 8506695,
 42: 4888941,
 43: 1140033,
 44: 5063244,
 45: 8663549,
 46: 15314095,
 47: 12580479,
 48: 1004631,
 49: 13104372,
 50: 4829580,
 51: 2240479,
 52: 2729702,
 53: 9096643,
 54: 4489468,
 55: 14712135,
 56: 8665130,
 57: 2552286,
 58: 1889010,
 59: 1666307,
 60: 8608220,
 61: 969857,
 62: 1790675,
 63: 11816930,
 64: 1106682,
 65: 14841892,
 66: 3351386,
 67: 11256815,
 68: 5049418,
 69: 13214697,
 70: 12596887,
 71

# Define a Function to Read and Preprocess Text

Below, we define a function to open the train/test file (with latin encoding), read the file line-by-line, pre-process each line using a simple gensim pre-processing tool (i.e., tokenize text into individual words, remove punctuation, set to lowercase, etc), and return a list of words. Note that, for a given file (aka corpus), each continuous line constitutes a single document and the length of each line (i.e., document) can vary. Also, to train the model, we'll need to associate a tag/number with each document of the training corpus. In our case, the tag is simply the zero-based line number.

In [99]:
def read_corpus(doc_series, tokens_only=False):
    # with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
    for i, line in enumerate(doc_series):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [100]:
train_tate_corpus = list(read_corpus(tates_train))
test_tate_corpus = list(read_corpus(tates_test, tokens_only=True))

train_refs_corpus = list(read_corpus(refs_train))
test_refs_corpus = list(read_corpus(refs_test, tokens_only=True))

In [102]:
train_tate_corpus[:2]

[TaggedDocument(words=['ariana', 'has', 'always', 'put', 'up', 'an', 'innocent', 'and', 'cute', 'act', 'she', 'hasn', 'really', 'done', 'anything', 'scandalous', 'like', 'most', 'child', 'actors', 'do', 'after', 'they', 'stop', 'acting', 'however', 'now', 'she', 'is', 'telling', 'us', 'that', 'she', 'is', 'not', 'who', 'she', 'pretends', 'to', 'be', 'essentially', 'she', 'saying', 'hey', 'maybe', 'the', 'bad', 'guy', 'and', 'you', 're', 'the', 'good', 'guy', 'either', 'way', 'let', 'not', 'keep', 'blaming', 'each', 'other'], tags=[0]),
 TaggedDocument(words=['shakespeare', 'lines', 'are', 'known', 'for', 'being', 'quite', 'lengthy', 'that', 'how', 'lengthy', 'the', 'lines', 'of', 'cocaine', 'they', 're', 'creating', 'and', 'snorting', 'are', 'thus', 'there', 'are', 'no', 'worries', 'about', 'not', 'getting', 'enough', 'this', 'girl', 'doesn', 'have', 'to', 'wait', 'for', 'others', 'to', 'use', 'the', 'line', 'either', 'she', 'can', 'cut', 'the', 'line', 'of', 'people', 'in', 'front', '

In [103]:
test_tate_corpus[:2]

[['kanye',
  'is',
  'operating',
  'as',
  'musician',
  'in',
  'the',
  'best',
  'possible',
  'time',
  'by',
  'using',
  'both',
  'technical',
  'ability',
  'in',
  'the',
  'production',
  'of',
  'his',
  'beats',
  'and',
  'his',
  'plain',
  'musicianship',
  'west',
  'is',
  'easily',
  'keepin',
  'it',
  'going',
  'with',
  'album',
  'after',
  'album',
  'of',
  'hits',
  'he',
  'definitely',
  'doin',
  'it'],
 ['hamilton',
  'doesn',
  'tell',
  'eliza',
  'what',
  'he',
  'is',
  'going',
  'to',
  'do',
  'considering',
  'the',
  'hamilton',
  'family',
  'track',
  'record',
  'with',
  'duels',
  'you',
  'think',
  'he',
  'would',
  'be',
  'smarter',
  'out',
  'of',
  'town',
  'refers',
  'to',
  'as',
  'with',
  'philip',
  'duel',
  'the',
  'duel',
  'across',
  'the',
  'river',
  'in',
  'weehawken',
  'new',
  'jersey',
  'where',
  'everything',
  'is',
  'legal',
  'as',
  'agreed',
  'in',
  'your',
  'obedient',
  'servant',
  'the',
  'due

In [104]:
train_refs_corpus[:2]

[TaggedDocument(words=['maybe', 'the', 'sinner', 'and', 'you', 're', 'the', 'saint', 'gotta', 'stop', 'pretending', 'what', 'we', 'ain'], tags=[0]),
 TaggedDocument(words=['for', 'my', 'niggas', 'out', 'tonight', 'and', 'they', 'high', 'off', 'shakespeare', 'lines', 'there', 'enough', 'to', 'pass', 'around', 'you', 'don', 'gotta', 'wait', 'in', 'line'], tags=[1])]

In [105]:
test_refs_corpus[:2]

[['ve',
  'been',
  'menace',
  'for',
  'the',
  'longest',
  'but',
  'ain',
  'finished',
  'devoted',
  'and',
  'you',
  'know',
  'it',
  'and',
  'you',
  'know',
  'it'],
 ['hamilton',
  'have',
  'an',
  'early',
  'meeting',
  'out',
  'of',
  'town',
  'eliza',
  'it',
  'still',
  'dark',
  'outside']]

In [107]:
print(test_tate_corpus[:2])

[['kanye', 'is', 'operating', 'as', 'musician', 'in', 'the', 'best', 'possible', 'time', 'by', 'using', 'both', 'technical', 'ability', 'in', 'the', 'production', 'of', 'his', 'beats', 'and', 'his', 'plain', 'musicianship', 'west', 'is', 'easily', 'keepin', 'it', 'going', 'with', 'album', 'after', 'album', 'of', 'hits', 'he', 'definitely', 'doin', 'it'], ['hamilton', 'doesn', 'tell', 'eliza', 'what', 'he', 'is', 'going', 'to', 'do', 'considering', 'the', 'hamilton', 'family', 'track', 'record', 'with', 'duels', 'you', 'think', 'he', 'would', 'be', 'smarter', 'out', 'of', 'town', 'refers', 'to', 'as', 'with', 'philip', 'duel', 'the', 'duel', 'across', 'the', 'river', 'in', 'weehawken', 'new', 'jersey', 'where', 'everything', 'is', 'legal', 'as', 'agreed', 'in', 'your', 'obedient', 'servant', 'the', 'duel', 'will', 'begin', 'at', 'dawn', 'hence', 'hamilton', 'need', 'to', 'leave', 'before', 'the', 'sun', 'is', 'up']]


In [109]:
print(test_refs_corpus[:2])

[['ve', 'been', 'menace', 'for', 'the', 'longest', 'but', 'ain', 'finished', 'devoted', 'and', 'you', 'know', 'it', 'and', 'you', 'know', 'it'], ['hamilton', 'have', 'an', 'early', 'meeting', 'out', 'of', 'town', 'eliza', 'it', 'still', 'dark', 'outside']]


# Training the Model
## Instantiate a Doc2Vec Object

Now, we'll instantiate a Doc2Vec model with a vector size with 50 words and iterating over the training corpus 40 times. We set the minimum word count to 2 in order to discard words with very few occurrences. (Without a variety of representative examples, retaining such infrequent words can often make a model worse!) Typical iteration counts in published 'Paragraph Vectors' results, using 10s-of-thousands to millions of docs, are 10-20. More iterations take more time and eventually reach a point of diminishing returns.

However, this is a very very small dataset (300 documents) with shortish documents (a few hundred words). Adding training passes can sometimes help with such small datasets.

In [115]:
ref_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [116]:
ref_model2 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=80)

In [117]:
tate_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
tate_model2 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=80)

In [118]:
rt_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
rt_model2 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=80)

Essentially, the vocabulary is a dictionary (accessible via *model.wv.vocab*) of all of the unique words extracted from the training corpus along with the count (e.g., *model.wv.vocab['penalty'].count* for counts for the word penalty).

In [119]:
ref_model.build_vocab(train_refs_corpus)

In [120]:
tate_model.build_vocab(train_tate_corpus)

In [None]:
# model2.build_vocab(train_corpus)

If the BLAS library is being used, this should take no more than 3 seconds. If the BLAS library is not being used, this should take no more than 2 minutes, so use BLAS if you value your time.

In [122]:
%time ref_model.train(train_refs_corpus, total_examples=ref_model.corpus_count, epochs=ref_model.epochs)

CPU times: user 3.98 s, sys: 962 ms, total: 4.95 s
Wall time: 3.34 s


In [123]:
%time tate_model.train(train_tate_corpus, total_examples=tate_model.corpus_count, epochs=tate_model.epochs)

CPU times: user 11.4 s, sys: 977 ms, total: 12.4 s
Wall time: 5.6 s


# Inferring a Vector

One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function. This vector can then be compared with other vectors via cosine similarity.

In [125]:
# model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])

Note that infer_vector() does not take a string, but rather a list of string tokens, which should have already been tokenized the same way as the words property of original training document objects.

Also note that because the underlying training/inference algorithms are an iterative approximation problem that makes use of internal randomization, repeated inferences of the same text will return slightly different vectors.

# Assessing Model

To assess our new model, we'll first infer new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity. Basically, we're pretending as if the training corpus is some new unseen data and then seeing how they compare with the trained model. The expectation is that we've likely overfit our model (i.e., all of the ranks will be less than 2) and so we should be able to find similar documents very easily. Additionally, we'll keep track of the second ranks for a comparison of less similar documents.

In [126]:
# ref_model, train_refs_corpus

ref_ranks = []
ref_second_ranks = []
for doc_id in range(len(train_refs_corpus)):
    inferred_vector = ref_model.infer_vector(train_refs_corpus[doc_id].words)
    sims = ref_model.docvecs.most_similar([inferred_vector], topn=len(ref_model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ref_ranks.append(rank)
    
    ref_second_ranks.append(sims[1])

In [127]:
# tate_model, train_tate_corpus

tate_ranks = []
tate_second_ranks = []
for doc_id in range(len(train_tate_corpus)):
    inferred_vector = tate_model.infer_vector(train_tate_corpus[doc_id].words)
    sims = tate_model.docvecs.most_similar([inferred_vector], topn=len(tate_model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    tate_ranks.append(rank)
    
    tate_second_ranks.append(sims[1])

Let's count how each document ranks with respect to the training corpus

In [131]:
ref_rank_counter = collections.Counter(ref_ranks)  # Results vary between runs due to random seeding and very small corpus
ref_rank_counter


Counter({0: 2388,
         14: 4,
         1: 110,
         2: 39,
         6: 17,
         21: 7,
         120: 1,
         26: 3,
         5: 14,
         83: 1,
         838: 1,
         15: 11,
         10: 4,
         16: 9,
         2701: 1,
         12: 7,
         241: 1,
         781: 1,
         7: 9,
         11: 6,
         116: 1,
         3: 19,
         530: 1,
         95: 1,
         27: 4,
         341: 1,
         30: 2,
         8: 10,
         1071: 1,
         2702: 1,
         33: 2,
         844: 1,
         296: 1,
         28: 3,
         938: 1,
         77: 2,
         76: 2,
         625: 1,
         31: 3,
         215: 1,
         149: 1,
         240: 1,
         600: 1,
         4: 15,
         516: 1,
         383: 1,
         20: 3,
         69: 1,
         337: 2,
         221: 2,
         94: 2,
         469: 1,
         404: 1,
         46: 2,
         159: 2,
         523: 1,
         22: 2,
         9: 5,
         154: 1,
         476: 1,
       

In [132]:
ref_rank_counter[0]

2388

In [135]:
n_training_docs = len(train_tate_corpus)

ref_correct_similarity_docs = ref_rank_counter[0]

ref_perc_correct_similarity = ref_correct_similarity_docs / n_training_docs

ref_perc_correct_similarity

0.8355493351994402

In [134]:
tate_rank_counter = collections.Counter(tate_ranks)  # Results vary between runs due to random seeding and very small corpus
tate_rank_counter

Counter({0: 2839, 2198: 1, 1: 12, 2: 1, 11: 2, 3: 1, 10: 1, 503: 1})

In [136]:
n_training_docs = len(train_tate_corpus)

tate_correct_similarity_docs = tate_rank_counter[0]

tate_perc_correct_similarity = tate_correct_similarity_docs / n_training_docs

tate_perc_correct_similarity

0.9933519944016795

Basically, greater than 95% of the inferred documents are found to be most similar to itself and about 5% of the time it is mistakenly most similar to another document. the checking of an inferred-vector against a training-vector is a sort of 'sanity check' as to whether the model is behaving in a usefully consistent manner, though not a real 'accuracy' value.

This is great and not entirely surprising. We can take a look at an example:

In [None]:
ref_doc_id = 4

In [144]:
train_refs_corpus[4]

TaggedDocument(words=['small', 'town', 'nigga', 'hollywood', 'dreams', 'know', 'that', 'everything', 'that', 'glitters', 'ain', 'gold', 'know', 'the', 'shit', 'ain', 'always', 'good', 'as', 'it', 'seems', 'but', 'tell', 'me', 'till', 'you', 'get', 'it', 'how', 'could', 'you', 'know', 'how', 'could', 'you', 'know', 'how', 'could', 'you', 'know'], tags=[4])

In [145]:
print('Document ({}): «{}»\n'.format(ref_doc_id, ' '.join(train_refs_corpus[ref_doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % ref_model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_refs_corpus[sims[index][0]].words)))

Document (4): «small town nigga hollywood dreams know that everything that glitters ain gold know the shit ain always good as it seems but tell me till you get it how could you know how could you know how could you know»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (2857, 0.9452176690101624): «kiki do you love me are you riding say you ll never ever leave from beside me cause want ya and need ya and down for you always kb do you love me are you riding say you ll never ever leave from beside me cause want ya and need ya and down for you always»

SECOND-MOST (134, 0.6661272048950195): «well that nothin»

MEDIAN (947, 0.26956313848495483): «and if those mirrors could talk it say you gotta go»

LEAST (2583, -0.15551932156085968): «verse»



In [146]:
rt_doc_idx_train_dict[2583]

1948205

In [147]:
df[df['rt_id'] == 1948205]

Unnamed: 0,ref_text,tate_text,votes_total,verified,state,song_id,song_title,full_title,song_tate_cnt,pageviews,n_unreviewed_tates,full_lyrics,url,n_tate_contributors,primary_contributor_id,primary_contributor_IQ,has_voters,comment_cnt,artist_name,artist_id,rt_id,votes_per_1000views,chars_in_tate,chars_in_referent,ref_word_lst,tate_word_lst,ref_word_cnt,tate_word_cnt
609,[Verse 3],The last verse shows Cole’s acceptance of the ...,15,False,accepted,164383,Let Nas Down,Let Nas Down by J. Cole,46,889777,1,"[""Freedom or jail, clips inserted, a baby's be...",https://genius.com/1948205/J-cole-let-nas-down...,1,15786,4724,True,0,J. Cole,69,1948205,1.69,1065,9,"[[verse, 3]]","[the, last, verse, shows, cole’s, acceptance, ...",2,191


In [148]:
rt_doc_idx_train_dict[134]

2364234

In [149]:
df[df['rt_id'] == 2364234]

Unnamed: 0,ref_text,tate_text,votes_total,verified,state,song_id,song_title,full_title,song_tate_cnt,pageviews,n_unreviewed_tates,full_lyrics,url,n_tate_contributors,primary_contributor_id,primary_contributor_IQ,has_voters,comment_cnt,artist_name,artist_id,rt_id,votes_per_1000views,chars_in_tate,chars_in_referent,ref_word_lst,tate_word_lst,ref_word_cnt,tate_word_cnt
768,"Well, that's nothin'",While many fans thought this lyric was “not fa...,143,False,accepted,235732,The Monster,The Monster by Eminem (Ft. Rihanna),43,4281504,1,"[""I'm friends with the monster that's under my...",https://genius.com/2364234/Eminem-the-monster/...,9,27018,10562,True,2,Eminem,45,2364234,3.34,166,20,"[well,, that's, nothin']","[while, many, fans, thought, this, lyric, was,...",3,27


In [150]:
tate_doc_id = 4

In [151]:
train_tate_corpus[4]

TaggedDocument(words=['this', 'is', 'challenge', 'to', 'all', 'those', 'people', 'who', 'say', 'money', 'fame', 'women', 'can', 'buy', 'you', 'happiness', 'it', 'may', 'or', 'not', 'be', 'true', 'but', 'truth', 'is', 'often', 'relative', 'until', 'you', 'get', 'it', 'how', 'could', 'you', 'know', 'if', 'it', 'will', 'give', 'you', 'happiness', 'similar', 'idea', 'expressed', 'by', 'drake', 'in', 'up', 'all', 'night', 'niggas', 'with', 'no', 'money', 'act', 'like', 'money', 'isn', 'everything'], tags=[4])

In [152]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_tate_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % tate_model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_tate_corpus[sims[index][0]].words)))

Document (2857): «while some fans speculated that kiki was singer keshia chanté credible source told genius kiki is actually model and social media personality yanna barber kb although barber has not directly acknowledged that she the kiki in these lyrics she tweeted about them liked tweet where fan speculated they were referring to her and even quoted the kb lyric in tweet https twitter com daloveofkj status»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (2857, 0.9452176690101624): «while some fans speculated that kiki was singer keshia chanté credible source told genius kiki is actually model and social media personality yanna barber kb although barber has not directly acknowledged that she the kiki in these lyrics she tweeted about them liked tweet where fan speculated they were referring to her and even quoted the kb lyric in tweet https twitter com daloveofkj status»

SECOND-MOST (134, 0.6661272048950195): «while many fans thought this lyric was n

Notice above that the most similar document (usually the same text) is has a similarity score approaching 1.0. However, the similarity score for the second-ranked documents should be significantly lower (assuming the documents are in fact different) and the reasoning becomes obvious when we examine the text itself.

We can run the next cell repeatedly to see a sampling other target-document comparisons.

In [153]:
# Pick a random document from the corpus and infer a vector from the model
ref_doc_id = random.randint(0, len(train_refs_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(ref_doc_id, ' '.join(train_refs_corpus[ref_doc_id].words)))
sim_id = ref_second_ranks[ref_doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_refs_corpus[sim_id[0]].words)))

Train Document (820): «so you better trade your fuckin mics in for some tool box es»

Similar Document (1634, 0.6862548589706421): «thank you for your service»



In [156]:
# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(ref_doc_id, ' '.join(train_tate_corpus[ref_doc_id].words)))
sim_id = tate_second_ranks[ref_doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_tate_corpus[sim_id[0]].words)))

Train Document (820): «geniuses considered as such in their respective fields are often prone to mastering that field and none other einstein excelling at sciences and failing at all other subjects it is known condition amongst specialising type brains given that eminem honed his rapping skills up to point where competitors who didn have their lives solely focused on rap can ever dare to approach he then advises them to trade their rapping career mics for another line of work or craft tool boxes the toolboxes metaphor also sets up for the wordplay in the next bar»

Similar Document (2003, 0.5984135270118713): «drake referring to his past days back when his music wasn as popular or good back when nobody really listened to it over the other rappers stuff this basically outlines everybody struggles when they re trying to start new phase for themselves of course this is outlining drake past experiences»



Are these the same across tates/refs?

In [154]:
# Pick a random document from the corpus and infer a vector from the model
tate_doc_id = random.randint(0, len(train_tate_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(tate_doc_id, ' '.join(train_tate_corpus[tate_doc_id].words)))
sim_id = tate_second_ranks[tate_doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_tate_corpus[sim_id[0]].words)))

Train Document (1566): «kendrick asks the man in prison to listen more carefully to the first verse of this song in the first verse kendrick uses these walls to describe the inside of woman vagina the woman being the baby mama of the man in prison who killed kendrick homie in sing about me kendrick ended up getting revenge by fucking the killer girl kendrick fame allows him to take advantage of the girl which is blatant abuse of his power as celebrity the walls of the woman also tell us that the man in prison put this woman through lot of pain and suffering probably because of the man stupid decision to kill somebody and get caught wall telling me they full of pain resentment need someone to live in them just to relieve tension»

Similar Document (2443, 0.5956164598464966): «drizzy confesses that he wants this girl by his side rather than in the the hood where she grew up this contradicts what he said in the first verse don make me give you back to the hood don make me give you back he

In [155]:
# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(tate_doc_id, ' '.join(train_refs_corpus[tate_doc_id].words)))
sim_id = ref_second_ranks[tate_doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_refs_corpus[sim_id[0]].words)))

Train Document (1566): «so when you play this song rewind the first verse about me abusing my power so you can hurt about me and her in the shower whenever she horny about me and her in the after hours of the morning about her baby daddy currently serving life»

Similar Document (1067, 0.6442455053329468): «man that thing in them jeans too fat for her rebounds so caught her off the backboard»



Testing the Model
Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye.

In [158]:
# Pick a random document from the test corpus and infer a vector from the model
ref_doc_id = random.randint(0, len(test_refs_corpus) - 1)
inferred_vector = ref_model.infer_vector(test_refs_corpus[ref_doc_id])
sims = ref_model.docvecs.most_similar([inferred_vector], topn=len(ref_model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(ref_doc_id, ' '.join(test_refs_corpus[ref_doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % ref_model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_refs_corpus[sims[index][0]].words)))

Test Document (343): «lying say hit you he sitting there consoling you rubbin my name through the mud who provoking you»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (1809, 0.49396881461143494): «only way he comin back is through his unborns»

MEDIAN (1178, 0.07365314662456512): «and everyone who has doubt remind now take your best rhyme outdo it now do it thousand times now let em tell ya the world no longer cares or gives fuck about your rhymes and as grow outta site outta mind might go outta mine»

LEAST (671, -0.4515109360218048): «rock me real slowly put bib on me just like baby droolin over you the things you do»



In [159]:
# Pick a random document from the test corpus and infer a vector from the model
tate_doc_id = random.randint(0, len(test_tate_corpus) - 1)
inferred_vector = tate_model.infer_vector(test_tate_corpus[tate_doc_id])
sims = tate_model.docvecs.most_similar([inferred_vector], topn=len(tate_model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(tate_doc_id, ' '.join(test_tate_corpus[tate_doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % tate_model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_tate_corpus[sims[index][0]].words)))

Test Document (83): «there are two ways to interpret this that all of the statements here are lies meaning the short relationship wasn fun he was indifferent about the future baby smile and he wanted to abort or there is just one lie hence the one more lie that the relationship was fun and that he would have loved the child smile although he really did want the mother to get the abortion»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (2095, 0.4892362058162689): «no one wanted to support or show love during the early stages of his career when he was just simply jimmy jonah hill agrees with this sentiment https twitter com jonahhill status»

MEDIAN (2239, 0.1515483856201172): «loyalty to his squad has always been very important to drake he implying that he helps his squad to get to the top and won do anything to take them down in the song headlines from his classic album take care he states cause one of us goes in and we all go through it similarly to th

In [160]:
inferred_vector = ref_model.infer_vector(test_refs_corpus[tate_doc_id])
sims = ref_model.docvecs.most_similar([inferred_vector], topn=len(ref_model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(tate_doc_id, ' '.join(test_refs_corpus[tate_doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % ref_model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_refs_corpus[sims[index][0]].words)))

Test Document (83): «what else can say it was fun for while bet really woulda loved your smile didn really wanna abort but fuck it what one more lie to tell our unborn child»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (668, 0.6929534077644348): «didn wanna do it gave me every reason the point tryin to make is don ever need em seen what you do for fame what would you do for freedom»

MEDIAN (1311, 0.03979530185461044): «fraud niggas all niggas that that shit don like»

LEAST (1413, -0.5425337553024292): «oh yeah got you drinking out them white cups sodas all this shit so foreign to you thick smoke choking baby get familiar with the order just crack it then pour it then sip slow then tip low my eyes red but my brim low that xo»



...How about we try to measure cosine similarity across these document pairs....??

What does this actually mean, since the model itself is supposed to be trained on different corpuses of documents...?!


In [163]:
# 3 rt_ids for annotations we know to be particularly "bad"
bottom_3_rtid = [rt_id for rt_id in df.nsmallest(3, 'votes_per_1000views')['rt_id']]
bottom_3_rtid

[2222801, 5023358, 13660449]

In [186]:
b3_doc_id = []
b3_tot = []
for rt_id in bottom_3_rtid:
    if rt_id in list(ref_df_train['rt_id']):    
        mask = ref_df_train['rt_id'] == rt_id 
        b3_doc_id.append(ref_df_train[mask].index[0])
        b3_tot.append('train')
    else:
        mask = ref_df_test['rt_id'] == rt_id
        b3_doc_id.append(ref_df_test[mask].index[0])
        b3_tot.append('test')

print(b3_doc_id)
print(b3_tot)

[410, 633, 73]
['test', 'train', 'test']


In [268]:
mask = ref_df_test['rt_id'] == 5023358
ref_df_test[mask].shape[0]

0

In [172]:
mask = ref_df_train['rt_id'] == 5023358
ref_df_train[mask]

Unnamed: 0,ref_text,rt_id
633,"Everyday, I was strugglin' to learn what life'...",5023358


In [214]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

In [194]:
train_refs_corpus[410].words

['wait', 'for', 'it', 'wait', 'for', 'it']

In [196]:
ref_model.infer_vector(train_refs_corpus[410].words).shape

(50,)

In [208]:
t_rm_v = ref_model.infer_vector(train_refs_corpus[410].words)
# test_refs_corpus[410]

r_rm_v = ref_model.infer_vector(train_tate_corpus[410].words)
# .reshape(-1, 1)
t_rm_v

array([ 0.14395861,  0.15045872,  0.01599213,  0.17205262, -0.05231715,
       -0.0454246 ,  0.13048874, -0.11429933,  0.02252444,  0.07466868,
        0.05702126,  0.08374254,  0.07011436,  0.3089437 ,  0.12883076,
        0.17158018, -0.31449294,  0.06310054,  0.03068377,  0.07128427,
        0.15906012, -0.01953335, -0.01834335,  0.09780873, -0.06564244,
        0.02863793, -0.16270128,  0.07432632, -0.03422998,  0.13957839,
       -0.15767352,  0.45527083, -0.01328564,  0.04846782, -0.06781717,
        0.10024025, -0.0584981 ,  0.18782988,  0.08991164,  0.16263822,
       -0.16068429, -0.18666303, -0.11512092, -0.15562011,  0.18331145,
        0.11604247,  0.06938348,  0.03668731, -0.08721007, -0.10798368],
      dtype=float32)

In [230]:
bad_cs_rm = []
# bad_rank_rm = []
bad_cs_tm = []
# bad_rank_tm = []

for idx, b_doc_id in enumerate(b3_doc_id):
    # let's start with using tate model
    if b3_tot[idx] == 'train':
        rm_inferred_ref_vector = ref_model.infer_vector(train_refs_corpus[b_doc_id].words, epochs=100).reshape(-1, 1)
        rm_inferred_tate_vector = ref_model.infer_vector(train_tate_corpus[b_doc_id].words, epochs=100).reshape(-1, 1)
        
        tm_inferred_ref_vector = tate_model.infer_vector(train_refs_corpus[b_doc_id].words, epochs=100).reshape(-1, 1)
        tm_inferred_tate_vector = tate_model.infer_vector(train_tate_corpus[b_doc_id].words, epochs=100).reshape(-1, 1)
    else:
        rm_inferred_ref_vector = ref_model.infer_vector(test_refs_corpus[b_doc_id], epochs=100).reshape(-1, 1)
        rm_inferred_tate_vector = ref_model.infer_vector(test_tate_corpus[b_doc_id], epochs=100).reshape(-1, 1)
        
        tm_inferred_ref_vector = tate_model.infer_vector(test_refs_corpus[b_doc_id], epochs=100).reshape(-1, 1)
        tm_inferred_tate_vector = tate_model.infer_vector(test_tate_corpus[b_doc_id], epochs=100).reshape(-1, 1)
        
    # might need to just do straight up np.cosine similarity calc between vecs
    rm_cs = 1 - cosine(rm_inferred_ref_vector, rm_inferred_tate_vector)
    tm_cs = 1 - cosine(tm_inferred_ref_vector, tm_inferred_tate_vector)
    # idk if rank will work...
#     rm_rank = ref_model.docvecs.rank(rm_inferred_ref_vector, rm_inferred_tate_vector)
    bad_cs_rm.append(rm_cs)
    bad_cs_tm.append(tm_cs)
#     bad_rank_rm.append(rm_rank)

print(bad_cs_rm)
print(bad_cs_tm)
# print(bad_rank_rm)

[0.23593929409980774, 0.012102792970836163, 0.20475910604000092]
[0.3581741750240326, -0.07085774093866348, 0.3834901750087738]


In [233]:
print(np.array(bad_cs_rm).mean())
print(np.array(bad_cs_tm).mean())

0.1509337310368816
0.22360220303138098


In [228]:
bad_cs_rm[0]

0.23188668489456177

In [164]:
# 3 rt_ids for annotations we know to be "good", minus the one that's only annotating a '[VERSE]' tag
top_3_rtid = [rt_id for rt_id in df.nlargest(4, 'votes_per_1000views')['rt_id']]
top_3_rtid.remove(5140263)
top_3_rtid

[8828663, 5140224, 5140209]

In [187]:
t3_doc_id = []
t3_tot = []
for rt_id in top_3_rtid:
    if rt_id in list(ref_df_train['rt_id']):    
        mask = ref_df_train['rt_id'] == rt_id 
        t3_doc_id.append(ref_df_train[mask].index[0])
        t3_tot.append('train')
    else:
        mask = ref_df_test['rt_id'] == rt_id
        t3_doc_id.append(ref_df_test[mask].index[0])
        t3_tot.append('test')

print(t3_doc_id)
print(t3_tot)

[654, 1711, 2435]
['train', 'train', 'train']


In [176]:
mask = ref_df_train['rt_id'] == 5140209
ref_df_train[mask]

Unnamed: 0,ref_text,rt_id
2435,"Hi, my name is, what? My name is, who?\n My na...",5140209


In [237]:
good_cs_rm = []
# bad_rank_rm = []
good_cs_tm = []
# bad_rank_tm = []

for idx, t_doc_id in enumerate(t3_doc_id):
    print(t_doc_id)
    # let's start with using tate model
    if t3_tot[idx] == 'train':
        rm_inferred_ref_vector = ref_model.infer_vector(train_refs_corpus[t_doc_id].words, epochs=100).reshape(-1, 1)
        rm_inferred_tate_vector = ref_model.infer_vector(train_tate_corpus[t_doc_id].words, epochs=100).reshape(-1, 1)
        
        tm_inferred_ref_vector = tate_model.infer_vector(train_refs_corpus[t_doc_id].words, epochs=100).reshape(-1, 1)
        tm_inferred_tate_vector = tate_model.infer_vector(train_tate_corpus[t_doc_id].words, epochs=100).reshape(-1, 1)
    else:
        rm_inferred_ref_vector = ref_model.infer_vector(test_refs_corpus[t_doc_id], epochs=100).reshape(-1, 1)
        rm_inferred_tate_vector = ref_model.infer_vector(test_tate_corpus[t_doc_id], epochs=100).reshape(-1, 1)
        
        tm_inferred_ref_vector = tate_model.infer_vector(test_refs_corpus[t_doc_id], epochs=100).reshape(-1, 1)
        tm_inferred_tate_vector = tate_model.infer_vector(test_tate_corpus[t_doc_id], epochs=100).reshape(-1, 1)
        
    # might need to just do straight up np.cosine similarity calc between vecs
    rm_cs = 1 - cosine(rm_inferred_ref_vector, rm_inferred_tate_vector)
    tm_cs = 1 - cosine(tm_inferred_ref_vector, tm_inferred_tate_vector)
    # idk if rank will work...
#     rm_rank = ref_model.docvecs.rank(rm_inferred_ref_vector, rm_inferred_tate_vector)
    good_cs_rm.append(rm_cs)
    good_cs_tm.append(tm_cs)
#     bad_rank_rm.append(rm_rank)

print(good_cs_rm)
print(good_cs_tm)
# print(bad_rank_rm)

654
1711
2435
[0.20860128104686737, 0.09683160483837128, 0.00964298751205206]
[0.20694983005523682, 0.059031177312135696, 0.215658038854599]


In [236]:
print(np.array(good_cs_rm).mean())
print(np.array(good_cs_tm).mean())

0.0989088596155246
0.14913278073072433


In [None]:
good_cs = []


# Wrapping Up
That's it! Doc2Vec is a great way to explore relationships between documents.

In [None]:
# Initialize & train a mode
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [None]:
# Persist a model to disk
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)
model = Doc2Vec.load(fname)  # you can continue training with the loaded model!

In [None]:
# If you’re finished training a model (=no more updates, only querying, reduce memory usage), you can do:
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [None]:
# Infer vector for a new document:
vector = model.infer_vector(["system", "response"])

In [239]:
len(sims)

2858