In [1]:
import os
import io
import sys
cwd = os.getcwd()
sys.path.insert(0, cwd)
from module.importData import importData
from module.toWordList import toWordList
from module.steamingWiki import steamingWiki
from module.makeModelGensim import makeModelGensim
from module.toVectore import toVectore
#from module.modelLSTM import modelLSTM
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score
import pandas as pd
import numpy as np
import gensim
import keras
from keras.models import load_model

Using TensorFlow backend.


In [2]:
# Notebook display settings: print NumPy arrays in full and raise the pandas
# row/column display limits.
np.set_printoptions(threshold=sys.maxsize)
# This line used to call `pd.get_option(...)`, which only reads the option and
# therefore never raised the row limit.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [3]:
# File names and embedding settings used by the later cells.
wikiSource = 'idwiki'  # base name shared by the corpus, text and model files
answerData = 'DataAnswerExam_SMP.csv'
questionData = 'qes.csv'
dirData = '{}/data/'.format(cwd)
corpusInput = '{}.bz2'.format(wikiSource)
wikiOutput = '{}.txt'.format(wikiSource)
fileExtension = 'bin'
# Forwarded as-is to makeModelGensim; presumably selects the word2vec training
# variant -- TODO confirm against module.makeModelGensim.
trainingAlgoritm = 0
# Vector size handed to makeModelGensim (numDimension) and toVectore (numFeature).
numDimension = 200
modelOutput = '{}_word2vec_{}_{}.{}'.format(wikiSource, numDimension, trainingAlgoritm, fileExtension)

In [4]:
# Read the answer and question tables from the data directory.
dAnswer, dQuestion = importData(
	answer=dirData + answerData,
	question=dirData + questionData,
).openData()

# Build the word2vec model only when its file is not on disk yet; steamingWiki
# is run first when the wiki text file is missing as well.
# NOTE(review): the existence checks use dirData-prefixed paths while bare file
# names are passed to the builders -- presumably they prepend the data
# directory themselves; confirm.
if not os.path.exists(dirData + modelOutput):
	if not os.path.exists(dirData + wikiOutput):
		steamingWiki(corpusInput=corpusInput, wikiOutput=wikiOutput).execute()
	makeModelGensim(
		wikiOutput=wikiOutput,
		modelOutput=modelOutput,
		numDimension=numDimension,
		trainingAlgoritm=trainingAlgoritm,
	).execute()

# 'bin' files go through the word2vec-format loader, any other extension
# through the native gensim loader.
if fileExtension == 'bin':
	model = gensim.models.KeyedVectors.load_word2vec_format(dirData + modelOutput, unicode_errors='ignore')
else:
	model = gensim.models.word2vec.Word2Vec.load(dirData + modelOutput)


In [5]:
# Shuffled 2-fold cross-validation. `random_state` expects an integer seed;
# the previous value `True` only worked through bool-to-int coercion, so the
# explicit seed 1 keeps the same shuffling.
crossVal = KFold(n_splits=2, random_state=1, shuffle=True)

# Fold number shown in the per-fold banner of the training loop.
count = 1

In [6]:
# Preview the first five rows of the answer table.
dAnswer.head(n=5)

Unnamed: 0,Essay_id,Answer,Score,TrueAnswer
0,1,Alumni; ; Kelebihan; Permisalan; Tutorial; Im...,3,lulusan : alumnus; rencana : agenda; keuntunga...
120,1,lulusan : alumnus; rencana : agenda; keuntunga...,5,lulusan : alumnus; rencana : agenda; keuntunga...
365,1,alumni; argument; hasil; example; teknik; imaj...,4,lulusan : alumnus; rencana : agenda; keuntunga...
650,1,alumnus; agenda; laba; sampel; teknik; imajina...,5,lulusan : alumnus; rencana : agenda; keuntunga...
125,1,lulusan : alumnus; rencana : agenda; keuntunga...,5,lulusan : alumnus; rencana : agenda; keuntunga...


In [7]:
# Keep only the student answer and the reference answer columns.
xtrain = dAnswer[['Answer', 'TrueAnswer']]

In [8]:
# Preview the first five rows of the two text columns.
xtrain.head(n=5)

Unnamed: 0,Answer,TrueAnswer
0,Alumni; ; Kelebihan; Permisalan; Tutorial; Im...,lulusan : alumnus; rencana : agenda; keuntunga...
120,lulusan : alumnus; rencana : agenda; keuntunga...,lulusan : alumnus; rencana : agenda; keuntunga...
365,alumni; argument; hasil; example; teknik; imaj...,lulusan : alumnus; rencana : agenda; keuntunga...
650,alumnus; agenda; laba; sampel; teknik; imajina...,lulusan : alumnus; rencana : agenda; keuntunga...
125,lulusan : alumnus; rencana : agenda; keuntunga...,lulusan : alumnus; rencana : agenda; keuntunga...


In [9]:
#keras.utils.to_categorical(dAnswer.loc[:,['Score']], num_classes=6)

In [10]:
class modelLSTM(object):
	"""Builder for a two-input bidirectional-LSTM regression network (Keras).

	The Keras names below are imported in the class body, so they become class
	attributes and are reached through ``self`` inside the methods.
	"""
	from keras.models import Sequential, Model
	from keras.layers import LSTM, Dense, Dropout, Bidirectional, Lambda, Dot, Subtract, Multiply
	import keras.backend as K
	from keras import optimizers

	def biSiamenseModel(self, inputD, distance=False):
		"""Build, compile and return the two-branch model.

		Each input runs through its own stack of two bidirectional LSTM layers
		(200 units with ``return_sequences=True``, then 150 units). The two
		encodings are merged and fed to a single ReLU unit. The model is
		compiled with mean-squared-error loss, Nesterov SGD and an MAE metric,
		and its summary is printed.

		Args:
			inputD: per-sample input shape (no batch axis) used for both
				inputs; the notebook passes the last two dimensions of the
				vectorised answers.
			distance: when True the merge is the 2-feature vector
				[cosine term, Euclidean distance] of the two encodings;
				otherwise it is their element-wise product.

		Returns:
			The compiled Keras ``Model`` taking ``[MA, MB]`` as inputs.
		"""
		# Class-body names are not visible from method scope, hence the local
		# imports of the names used directly here.
		from keras.layers import Input
		from keras import backend as K

		def euclidean_distance(vects):
			"""Row-wise Euclidean distance, floored at sqrt(epsilon) for a stable gradient."""
			x, y = vects
			sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
			return K.sqrt(K.maximum(sum_square, K.epsilon()))

		def cosine_distance(vects):
			"""Negative mean of the element-wise product of the L2-normalised inputs.

			NOTE(review): this is the negative cosine similarity divided by
			the feature count (mean, not sum) -- confirm the scaling is
			intended.
			"""
			x, y = vects
			x = K.l2_normalize(x, axis=-1)
			y = K.l2_normalize(y, axis=-1)
			return -K.mean(x * y, axis=-1, keepdims=True)

		def euccos(vects):
			"""Stack the cosine term and the Euclidean distance into a 2-column tensor."""
			# Backend op instead of the Keras layer helper: this code runs
			# inside a Lambda layer, where no nested layer should be created.
			return K.concatenate([cosine_distance(vects), euclidean_distance(vects)], axis=-1)

		def manhattan_distance(vects):
			"""Row-wise L1 distance (not used by the current merge; kept as an alternative)."""
			x, y = vects
			return K.sum((K.abs(x - y)), axis=1, keepdims=True)

		def exponent_neg_manhattan_distance(vects):
			"""exp(-L1 distance) per row (not used by the current merge; kept as an alternative)."""
			x, y = vects
			return K.exp(-K.sum(K.abs(x-y), axis=1, keepdims=True))

		MA = Input(shape= inputD, dtype="float32")
		MB = Input(shape= inputD, dtype="float32")

		# NOTE(review): the two branches use separate layer instances, so no
		# weights are shared between them -- confirm this is intended for a
		# "siamese" model.
		x = self.Bidirectional(self.LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=True), input_shape=inputD)(MA)
		x = self.Bidirectional(self.LSTM(150, dropout=0.2, recurrent_dropout=0.2))(x)

		y = self.Bidirectional(self.LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=True), input_shape=inputD)(MB)
		y = self.Bidirectional(self.LSTM(150, dropout=0.2, recurrent_dropout=0.2))(y)

		if distance:
			# A tuple output_shape gives the per-sample shape; Keras prepends
			# the batch axis. The previous callable returned
			# (list_of_input_shapes, 2), which is not a valid shape.
			a2 = self.Lambda(euccos, output_shape=(2,))([x, y])
		else:
			a2 = self.Multiply()([x, y])

		out = self.Dense(1, activation='relu')(a2)

		model = self.Model(inputs=[MA, MB], outputs=out)
		sgd = self.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
		model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mae'])

		model.summary()

		return model

In [11]:
def _toWordLists(frame, column):
	"""Return the word list of every sentence in ``frame[column]``.

	A fresh ``toWordList`` is created per sentence, as in the original cell.
	"""
	return [
		toWordList().sentenceToWordList(sentence, changeNumber2Word=True)
		for sentence in frame[column].values
	]


# Train and evaluate one network per cross-validation fold. `enumerate` keeps
# `count` in step with the fold; it used to stay at 1, so every banner read
# "Fold 1".
# Note: `pred`, `ytest` and `modelNetwork` are overwritten on every iteration,
# so after the loop they hold the last fold only.
for count, (dx, dy) in enumerate(crossVal.split(dAnswer), start=1):

	print("\n--------Fold {}--------\n".format(count))
	train, test = dAnswer.iloc[dx], dAnswer.iloc[dy]

	xtrain = train.loc[:, ['Answer', 'TrueAnswer']]
	xtest = test.loc[:, ['Answer', 'TrueAnswer']]
	ytrain = train.loc[:, ['Score']].values
	ytest = test.loc[:, ['Score']].values

	# This part is only here to keep things generic: both the student answers
	# and the reference answers are turned into word lists the same way.
	trainSAnswer = _toWordLists(xtrain, 'Answer')
	trainTAnswer = _toWordLists(xtrain, 'TrueAnswer')
	testSAnswer = _toWordLists(xtest, 'Answer')
	testTAnswer = _toWordLists(xtest, 'TrueAnswer')

	vtrainSAnswer, vtrainTAnswer = toVectore(essays=trainSAnswer, trueAnswer=trainTAnswer, model=model, numFeature=numDimension, average=False, distance=True).changeToVector()
	vtestSAnswer, vtestTAnswer = toVectore(essays=testSAnswer, trueAnswer=testTAnswer, model=model, numFeature=numDimension, average=False, distance=True).changeToVector()

	# The network input shape is taken from the last two dimensions of the
	# vectorised training answers.
	modelNetwork = modelLSTM().biSiamenseModel(inputD=(vtrainSAnswer.shape[1], vtrainSAnswer.shape[2]), distance=True)
	modelNetwork.fit([vtrainSAnswer, vtrainTAnswer], ytrain, batch_size=100, epochs=10)
	pred = modelNetwork.predict([vtestSAnswer, vtestTAnswer])


--------Fold 1--------

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200, 200)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 200, 200)     0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 200, 400)     641600      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 200, 400)     641600      input_2[0][0]                    
____________________________________________________________________________________

In [12]:
# Empty DataFrame; `dfa` is not referenced by any other cell visible here.
dfa = pd.DataFrame()

In [13]:
# Display the predictions left over from the last `modelNetwork.predict` call.
pred

array([[4.707743 ],
       [3.5621681],
       [5.2957926],
       [4.900425 ],
       [5.092463 ],
       [4.93578  ],
       [5.341336 ],
       [5.431609 ],
       [4.6800823],
       [5.105651 ],
       [4.919582 ],
       [4.107125 ],
       [4.775106 ],
       [5.315917 ],
       [4.124079 ],
       [5.120642 ],
       [5.431609 ],
       [5.083535 ],
       [4.9515824],
       [5.0882807],
       [5.431609 ],
       [5.3765526],
       [5.4302335],
       [4.404651 ],
       [5.265621 ],
       [5.431609 ],
       [5.431609 ],
       [5.2350163],
       [4.862376 ],
       [4.9754796],
       [5.431609 ],
       [5.431609 ],
       [5.431609 ],
       [5.1675634],
       [5.1802235],
       [4.885205 ],
       [5.2249136],
       [4.961373 ],
       [4.8409033],
       [4.8114853],
       [5.431609 ],
       [5.0173526],
       [5.431609 ],
       [4.8817186],
       [4.8668456],
       [4.9921584],
       [5.229173 ],
       [5.431609 ],
       [4.915197 ],
       [5.431609 ],


In [14]:
# Quadratic-weighted Cohen's kappa between the true scores and the rounded
# predictions currently held in `ytest` / `pred`.
cohen_kappa_score(
	y1=ytest,
	y2=np.around(pred),
	weights='quadratic',
)

0.8070782929351981

In [15]:
# Mean squared error of the rounded predictions against the true scores.
mean_squared_error(
	y_true=ytest,
	y_pred=np.around(pred),
)

0.4512820512820513

In [16]:
# Mean absolute error of the rounded predictions against the true scores.
mean_absolute_error(
	y_true=ytest,
	y_pred=np.around(pred),
)

0.3487179487179487