In [1]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression

In [2]:
class LabeledLineSentence(object):
    """Turn several line-per-document text files into tagged documents.

    ``sources`` maps a file path to a tag prefix. Line ``n`` (0-based) of a
    file becomes one ``LabeledSentence`` whose words are the whitespace-split
    line and whose single tag is ``'<prefix>_<n>'``.

    Raises:
        Exception: if two files share the same prefix (their tags would clash).
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled in by to_array(). Starting at None lets sentences_perm()
        # build the list on demand instead of failing with AttributeError.
        self.sentences = None

        # make sure that prefixes are unique
        seen = set()
        for prefix in sources.values():
            if prefix in seen:
                raise Exception('Non-unique prefix encountered')
            seen.add(prefix)

    def __iter__(self):
        """Yield one LabeledSentence per line, re-reading the files each time."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source file and return (and cache) the list of sentences."""
        # Reuse __iter__ so the streaming and in-memory paths cannot drift apart.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return the cached sentences in random order as a new list.

        Only the indices are permuted: handing the sentences themselves to
        numpy.random.permutation would coerce them into an ndarray, and the
        items coming back would no longer be LabeledSentence objects.
        ``self.sentences`` itself is left in its original order.
        """
        if self.sentences is None:
            self.to_array()
        order = numpy.random.permutation(len(self.sentences))
        return [self.sentences[i] for i in order]

In [4]:
# Two distinct stages:
#   1. doc2vec (this stage): learn a vector for every sentence. No
#      classification happens here, so the model does not need to know which
#      class a sentence belongs to.
#   2. classifier (later): only now are the classes needed; each sentence's
#      vector is looked up through its tag, i.e. its prefix plus line number
#      (TRAIN_NEG_1 etc. in the original sentiment example).

# Source mapping of the original sentiment example, kept for reference:
#sources = {'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS', 'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS'}

# The data has to be divided into train and test files.
sources = {
    'manban_all_test.txt': 'TEST_MB',
    'manban_all_train.txt': 'TRAIN_MB',
    'kuaiban_all_test.txt': 'TEST_KB',
    'kuaiban_all_train.txt': 'TRAIN_KB',
    'yaoban_all_test.txt': 'TEST_YAB',
    'yaoban_all_train.txt': 'TRAIN_YAB',
    'yuanban_all_test.txt': 'TEST_YB',
    'yuanban_all_train.txt': 'TRAIN_YB',
}
sentences = LabeledLineSentence(sources)


In [22]:
# Read every source file into memory as a list of tagged sentences.
# NOTE(review): the files are opened by relative name, so this presumably has
# to run after the working directory was changed to the data folder - confirm.
sary = sentences.to_array()

In [7]:
# Change the working directory to plbs/ (the source files are opened by relative name).
cd plbs/

/Users/zangsir/Desktop/ISMIR17/doc2vec_shuo/plbs


In [28]:
# Show the words of the first tagged sentence, one per line.
for i in sary[0][0]:
    print(i)

在
此
金殿
用
目睁
倒
叫
某
家
喜
心中
我
今
不
打
万历主
只怕
江山
不太
平
我
把
铜锤
拿
在手
我
今
要
打主
圣龙
一
见
孙安绑
下殿
好不叫人
心痛酸
职都
押
在
龙
书案
收心
务本
种
庄田
一
见
千
岁
下
金殿
倒
叫
杨溥泪
不
干
职
都
押
在
龙
书案
山西
蒲州
快乐
安然
众位
大人
下
金殿
倒
叫
下官
不
耐烦
职都
押
在
龙
书案
收心
务本
不
居
官
众位
大人
把
殿下
倒叫杨
太
泪
汪汪
在
金殿
满
朝
文武
俱
下殿
留
我
一
人保
江山


In [8]:
# Set up an untrained doc2vec model ...
model = Doc2Vec(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=7
)

# ... and build its vocabulary from all sentences (this re-reads the files).
model.build_vocab(sentences.to_array())

In [9]:
# Load a saved model from ./jingju.d2v; this rebinds `model`, replacing the
# object created with Doc2Vec(...) above.
model = Doc2Vec.load('./jingju.d2v')

In [23]:
import codecs

# Nearest neighbours of the word '悲伤', passed as a unicode string decoded
# from UTF-8. Each result item is iterated and its parts (word, then
# similarity score in the output below) are printed on separate lines.
s = model.most_similar(codecs.decode('悲伤', 'utf-8'))
for i in s:
    for j in i:
        print(j)

伤心
0.96675914526
气丧
0.962376117706
由人
0.960183501244
垂掉
0.95185571909
陈炳顺
0.95154863596
陵寝
0.95066344738
落泪儿
0.949612498283
五内如焚
0.941762208939
牢骚
0.941731333733
白如银
0.939425051212


## disclaimer

Even though the neighbours of this sentiment word do not all make sense, remember that we did not train these vectors on sentiment class labels; we trained on banshi labels. The real question is therefore whether words from the same banshi end up close together, which is something we have no intuition about.

In [27]:
# Vector learned for the document tagged 'TEST_MB_3', i.e. the line with
# index 3 of the file whose prefix is TEST_MB.
model.docvecs['TEST_MB_3']

array([ 0.02341812,  0.5366019 , -0.24876203, -0.20512319,  0.01443002,
       -0.37027919,  0.07008942,  0.63358134,  0.13886771, -0.19116218,
        0.20827897,  0.0724121 , -0.20983098, -0.12775764,  0.10004643,
       -0.07850786, -0.05763858, -0.2642684 , -0.3929947 ,  0.1294845 ,
       -0.28692403,  0.26159072, -0.20410521,  0.57262319, -0.01929191,
        0.34114429, -0.12959751,  0.54861474,  0.14315757, -0.05447926,
       -0.15363109,  0.37652737, -0.33509755,  0.52424955,  0.00793828,
        0.46900639,  0.21108279,  0.09353911,  0.64020413, -0.20744608,
       -0.34293309, -0.94957787,  0.56172746, -0.60609156,  0.3805725 ,
        0.42465335,  0.50341278, -0.02044366, -1.02513921,  0.43874472,
       -0.35291159, -0.2463025 , -0.2448426 , -0.78223664, -0.02623495,
        0.10846089,  0.37997806,  0.32956275, -0.03049704,  0.17964257,
        0.44880497, -0.33024496, -0.19095866,  0.5198248 , -0.11150498,
        0.6178382 ,  0.64241058,  0.58021307,  0.12484313, -0.14

# creating data for classification

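This step builds a 2-D array with one row per line (document), holding that line's doc2vec vector, together with a matching array of class labels.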

In [40]:
from sklearn import preprocessing
import numpy as np
def count_num_lines(file_name):
    """Return the number of lines in the file ``file_name``.

    Lines are counted by iterating over the file, so a last line that lacks a
    trailing newline is counted as well. The result therefore equals the
    number of items seen when the same file is enumerated line by line,
    which is how the document tags ``<prefix>_<n>`` are numbered.
    The file is opened in binary mode and closed before returning.
    """
    with open(file_name, 'rb') as f:
        return sum(1 for _ in f)



def build_train_data(sources):
    """Build the training matrix and encoded labels from the doc2vec model.

    For every entry of ``sources`` whose file name contains ``'train'``, the
    vector of each line is read from the global ``model`` under the tag
    ``'<prefix>_<line index>'`` and paired with the class whose name occurs
    in the prefix.

    Args:
        sources: dict mapping file name -> tag prefix (e.g. ``'TRAIN_MB'``).

    Returns:
        ``(data, labels)``: ``data`` is a numpy array with one row per line;
        ``labels`` holds the matching class ids from a LabelEncoder fitted on
        ``['MB', 'KB', 'YAB', 'YB']``.

    Raises:
        IndexError: if a training prefix contains none of the known classes.
    """
    classes = ['MB', 'KB', 'YAB', 'YB']
    le = preprocessing.LabelEncoder()
    le.fit(classes)

    data = []
    labels = []
    for k, prefix in sources.items():
        # training data only, four sets
        if 'train' not in k:
            continue
        print(k)
        size = count_num_lines(k)
        print('size: %s' % size)
        # The class depends only on the prefix, so resolve it once per file
        # instead of once per line.
        lab = [j for j in classes if j in prefix][0]
        for i in range(size):
            data.append(model.docvecs[prefix + "_" + str(i)])
            labels.append(lab)  # nominal label, encoded to an integer below
    data = np.array(data)
    labels = le.transform(labels)
    return data, labels







# File name -> tag prefix for the four banshi classes (test and train files).
sources = {
    'manban_all_test.txt': 'TEST_MB',
    'manban_all_train.txt': 'TRAIN_MB',
    'kuaiban_all_test.txt': 'TEST_KB',
    'kuaiban_all_train.txt': 'TRAIN_KB',
    'yaoban_all_test.txt': 'TEST_YAB',
    'yaoban_all_train.txt': 'TRAIN_YAB',
    'yuanban_all_test.txt': 'TEST_YB',
    'yuanban_all_train.txt': 'TRAIN_YB',
}


In [41]:
# Training matrix and encoded labels for the classifier.
data, labels = build_train_data(sources)

yuanban_all_train.txt
size: 335
manban_all_train.txt
size: 174
kuaiban_all_train.txt
size: 210
yaoban_all_train.txt
size: 405


In [42]:
# Display the training matrix: one row of doc2vec features per training line.
data

array([[ 0.04494349,  0.06091836,  0.01283445, ...,  0.1158127 ,
        -0.21338543,  0.15746288],
       [ 0.14202277,  0.53619534, -0.29973727, ..., -0.0308947 ,
        -0.90494668,  0.75839096],
       [ 0.0028049 ,  0.45299432,  0.11641489, ...,  0.0364399 ,
        -1.23158395,  0.63742286],
       ..., 
       [ 0.27983087,  0.92724085,  0.19805695, ..., -0.32091567,
        -1.18062317,  0.99392557],
       [ 0.03852019,  0.78996921, -0.22482361, ...,  0.42976123,
        -1.08289778,  0.73219556],
       [-0.00581133,  0.67735714, -0.22459288, ...,  0.29841894,
        -0.90452051,  0.56273407]], dtype=float32)

In [44]:
# First ten encoded labels. NOTE(review): presumably 3 stands for 'YB'
# (the yuanban file is processed first in the run above) - confirm.
labels[:10]

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [45]:
def build_test_data(sources):
    """Build the test matrix and encoded labels from the doc2vec model.

    For every entry of ``sources`` whose file name contains ``'test'``, the
    vector of each line is read from the global ``model`` under the tag
    ``'<prefix>_<line index>'`` and paired with the class whose name occurs
    in the prefix.

    Args:
        sources: dict mapping file name -> tag prefix (e.g. ``'TEST_MB'``).

    Returns:
        ``(data, labels)``: ``data`` is a numpy array with one row per line;
        ``labels`` holds the matching class ids from a LabelEncoder fitted on
        ``['MB', 'KB', 'YAB', 'YB']``.

    Raises:
        IndexError: if a test prefix contains none of the known classes.
    """
    classes = ['MB', 'KB', 'YAB', 'YB']
    le = preprocessing.LabelEncoder()
    le.fit(classes)

    data = []
    labels = []
    for k, prefix in sources.items():
        # test data only, four sets
        if 'test' not in k:
            continue
        print(k)
        size = count_num_lines(k)
        print('size: %s' % size)
        # The class depends only on the prefix, so resolve it once per file
        # instead of once per line.
        lab = [j for j in classes if j in prefix][0]
        for i in range(size):
            data.append(model.docvecs[prefix + "_" + str(i)])
            labels.append(lab)  # nominal label, encoded to an integer below
    data = np.array(data)
    labels = le.transform(labels)
    return data, labels

In [46]:
# Test matrix and encoded labels for evaluating the classifier.
test_data, test_labs = build_test_data(sources)

yaoban_all_test.txt
size: 44
kuaiban_all_test.txt
size: 23
yuanban_all_test.txt
size: 37
manban_all_test.txt
size: 19


In [49]:
# Sanity check: there should be exactly one label per test vector.
print(len(test_data))
print(len(test_labs))

123
123


In [50]:
# Go back up to the parent directory.
cd ..

/Users/zangsir/Desktop/ISMIR17/doc2vec_shuo


In [51]:
# Change the working directory to plsqbs/.
cd plsqbs/

/Users/zangsir/Desktop/ISMIR17/doc2vec_shuo/plsqbs


In [66]:
# List the training files in the current directory.
ls *train.txt


ehmb_all_train.txt     xpkb_all_train.txt     xpyuanb_all_train.txt
ehyaob_all_train.txt   xpmb_all_train.txt
ehyuanb_all_train.txt  xpyaob_all_train.txt


In [54]:
# Column-formatted listing of the test files, kept as one multi-line string.
a = """ehmb_all_test.txt     xpkb_all_test.txt     xpyuanb_all_test.txt
ehyaob_all_test.txt   xpmb_all_test.txt
ehyuanb_all_test.txt  xpyaob_all_test.txt"""

In [64]:
# Flatten the listing onto one line, then split on single spaces. Runs of
# spaces between the columns leave empty strings in b.
a = a.replace('\n', ' ')
b = a.split(' ')

In [67]:
# Column-formatted listing of the training files, kept as one multi-line string.
aa = """ehmb_all_train.txt     xpkb_all_train.txt     xpyuanb_all_train.txt
ehyaob_all_train.txt   xpmb_all_train.txt
ehyuanb_all_train.txt  xpyaob_all_train.txt"""

In [68]:
# Flatten the listing onto one line, then split on single spaces. Runs of
# spaces between the columns leave empty strings in bb.
aa = aa.replace('\n', ' ')
bb = aa.split(' ')

In [69]:
for i in bb:
    if i!='':
        print "'" + i + "'"+":'TRAIN_',",

'ehmb_all_train.txt':'TRAIN_', 'xpkb_all_train.txt':'TRAIN_', 'xpyuanb_all_train.txt':'TRAIN_', 'ehyaob_all_train.txt':'TRAIN_', 'xpmb_all_train.txt':'TRAIN_', 'ehyuanb_all_train.txt':'TRAIN_', 'xpyaob_all_train.txt':'TRAIN_',


In [None]:
# File name -> tag prefix for the seven files of this directory, test entries
# first, then the matching train entries.
d = {
    'ehmb_all_test.txt': 'TEST_EHMB',
    'xpkb_all_test.txt': 'TEST_XPKB',
    'xpyuanb_all_test.txt': 'TEST_XPYB',
    'ehyaob_all_test.txt': 'TEST_EHYAB',
    'xpmb_all_test.txt': 'TEST_XPMB',
    'ehyuanb_all_test.txt': 'TEST_EHYB',
    'xpyaob_all_test.txt': 'TEST_XPYAB',
    'ehmb_all_train.txt': 'TRAIN_EHMB',
    'xpkb_all_train.txt': 'TRAIN_XPKB',
    'xpyuanb_all_train.txt': 'TRAIN_XPYB',
    'ehyaob_all_train.txt': 'TRAIN_EHYAB',
    'xpmb_all_train.txt': 'TRAIN_XPMB',
    'ehyuanb_all_train.txt': 'TRAIN_EHYB',
    'xpyaob_all_train.txt': 'TRAIN_XPYAB',
}

In [70]:
# Average of eight hand-entered scores.
# NOTE(review): presumably accuracies of a tf-idf baseline - confirm the source.
tfidf = [
    0.6, 0.55, 0.48, 0.48,
    0.616, 0.536, 0.472, 0.488,
]
np.mean(tfidf)

0.52774999999999994

In [71]:
# Average of five hand-entered values.
# NOTE(review): their meaning is not stated in the notebook - presumably scores
# of another run; confirm and label them.
np.mean([39,36,40,45,44])

40.799999999999997