# word2vec averaged repr. of a sentence

## training word2vec on rock corpus

In [3]:
import gensim,os
import numpy as np
from scipy.spatial.distance import cosine
class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

train_path='rank/train/'
sentences = MySentences(train_path) 
model = gensim.models.Word2Vec(sentences,size=100, window=5, min_count=5, workers=4)

In [9]:
import numpy as np
from scipy.spatial.distance import cosine

In [5]:
def make_feature_vec(words, model, num_features):
    # average all of the word vectors in a sentence
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    featureVec = np.divide(featureVec,nwords)
    return featureVec

In [10]:
v1=make_feature_vec('I love you',model,100)
v2=make_feature_vec('I love you baby',model,100)
v3=make_feature_vec('moon is shining',model,100)

In [11]:
print cosine(v1,v2)
print cosine(v1,v3)
print cosine(v2,v3)

0.145084607601
0.191834890844
0.314424242576


## save, load model

In [12]:
fname='rock_train.w2v'
model.save(fname)
model = Word2Vec.load(fname)  # you can continue training with the loaded model!

# D2V

In [32]:
# gensim modules: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import smart_open


def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

def remove_empty_lines(fn):
    f=open(fn,'r').read().split('\n')
    output=fn.replace('.txt','_noemp.txt')
    open(output,'w').close
    g=open(output,'a')
    for line in f:
        if line!='':
            g.write(line+'\n')

## train

In [31]:
#clean file to remove empty lines if necessary
train_file='rank/train/lyrics_train_data_clean.txt'
test_file='rank/test/lyrics_test_data_clean.txt'
remove_empty_lines(train_file)
remove_empty_lines(test_file)

In [33]:
#preprocess corpus into gensim format
train_file='rank/train/lyrics_train_data_clean_noemp.txt'
test_file='rank/test/lyrics_test_data_clean_noemp.txt'
train_corpus = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))

In [34]:
train_corpus[:2]

[TaggedDocument(words=[u'may', u'seem', u'unusual', u'but', u'there', u'is', u'really', u'nothing', u'wrong', u'with', u'me'], tags=[0]),
 TaggedDocument(words=[u'tell', u'you', u'my', u'secrets', u'but', u'you', u'think', u'that', u'boring'], tags=[1])]

In [39]:
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2)
model.build_vocab(train_corpus)

In [40]:
for epoch in range(10):
    print epoch
    model.train(train_corpus, total_examples=model.corpus_count)
    model.alpha -= 0.002  
    model.min_alpha = model.alpha  

0
1
2
3
4
5
6
7
8
9


In [None]:
#alternative training
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2,iter=15)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count)

In [45]:
fname='rock_train.d2v'
model.save(fname)
model = Doc2Vec.load(fname)

## inference

In [44]:
x='Oh never let you go'
y="I'll never let you go"
z="It's my turn, searching for another break"

a=model.infer_vector(x.split(' '))
b=model.infer_vector(y.split(' '))
c=model.infer_vector(z.split(' '))
print cosine(a,b)
print cosine(a,c)
print cosine(b,c)

0.296794090269
0.952921933758
0.901668460463


# other text discourse coherence features:
 - vocabulary introduction: count the new vocab in consecutive blockes of texts
 - lexical chains: looks at a sliding window of text and see the lexical overlaps 

https://corpling.uis.georgetown.edu/compdisc/webtile.html

# Rhyming


## generic rhyming function

In [46]:
import pronouncing
def is_rhyme(word1,word2):
    return word1 in pronouncing.rhymes(word2)
print is_rhyme('hood?','good')
print is_rhyme('hood','hooded')

False
False


In [47]:
def is_rhyme(word1,word2):
    word1=re.sub(u'[^A-Za-z]','',wor)
    return int(word1 in pronouncing.rhymes(word2))

In [49]:
import re
print is_rhyme('hood?','good')

TypeError: sub() takes at least 3 arguments (2 given)

## sent internal rhyming

for a list of words, check if the last one rhymes with any one of the rest



# loglik

In [None]:
import kenlm
model = kenlm.LanguageModel('train.lm')
sent='in the beginning was the word'
loglik=model.score(sent)/len(sent)

# build training data

- for a line, extract features based on this line and previous line, label=1 
- for a line, extract features based on this line and a random line, label=0
- generate a large number of sents, for a low loglik generated line (bottom ranked K in the normalized-loglik list), extract features for this line and a random line from the corpus, label=0
- do we need to generate a lot of sentences for the negative examples?
- we could also impose some hard coded constraints on the loglik threshold.