In [1]:
import os
import io
import sys
cwd = os.getcwd()
sys.path.insert(0, cwd)
from module.importData import importData
from module.toWordList import toWordList
from module.steamingWiki import steamingWiki
from module.makeModelGensim import makeModelGensim
from module.toVectore import toVectore
from module.modelLSTM import modelLSTM
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score
import pandas as pd
import numpy as np
import gensim
from keras.models import load_model

Using TensorFlow backend.


In [2]:
# Print NumPy arrays without truncating them to a summary.
np.set_printoptions(threshold=sys.maxsize)
# Raise the pandas display limits. The row limit used to go through
# pd.get_option(...), which only reads an option, so it was never applied.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [3]:
# Input files; everything is resolved relative to <cwd>/data/.
wikiSource = 'idwiki'
dirData = cwd + '/data/'
answerData = 'DataAnswerExam_SMP.csv'
questionData = 'qes.csv'
corpusInput = '{}.bz2'.format(wikiSource)
wikiOutput = '{}.txt'.format(wikiSource)

# Embedding settings. The dimension and the algorithm id are passed on to
# makeModelGensim and are also part of the model file name.
numDimension = 200
trainingAlgoritm = 0
fileExtension = 'bin'
modelOutput = '{}_word2vec_{}_{}.{}'.format(
	wikiSource, numDimension, trainingAlgoritm, fileExtension
)

In [4]:
# Load the answer and question tables from the data directory.
dAnswer, dQuestion = importData(
	answer=dirData + answerData,
	question=dirData + questionData,
).openData()

# Train the embedding only when its output file is missing; the wiki text
# extraction is likewise skipped when its output file already exists.
if not os.path.exists(dirData + modelOutput):
	if not os.path.exists(dirData + wikiOutput):
		steamingWiki(corpusInput=corpusInput, wikiOutput=wikiOutput).execute()
	makeModelGensim(
		wikiOutput=wikiOutput,
		modelOutput=modelOutput,
		numDimension=numDimension,
		trainingAlgoritm=trainingAlgoritm,
	).execute()

# 'bin' files are read with the word2vec-format loader; any other extension
# is loaded as a saved gensim Word2Vec model.
# NOTE(review): the loader is called without binary=True, so the '.bin' file
# is presumably written in text format by makeModelGensim - confirm.
if fileExtension == 'bin':
	model = gensim.models.KeyedVectors.load_word2vec_format(dirData + modelOutput, unicode_errors='ignore')
else:
	model = gensim.models.word2vec.Word2Vec.load(dirData + modelOutput)


In [5]:
# Two-fold shuffled split with a fixed seed. The seed is now spelled as the
# integer 1: the earlier `random_state=True` read like an on/off flag, but
# bool is an int subclass, so it was simply used as seed 1.
crossVal = KFold(n_splits=2, random_state=1, shuffle=True)

# Fold number shown in the "Fold N" banner of the cross-validation loop.
count = 1

In [6]:
# Preview the first rows of the loaded answer table.
dAnswer.head()

Unnamed: 0,Essay_id,Answer,Score,TrueAnswer
0,1,Alumni; ; Kelebihan; Permisalan; Tutorial; Im...,3,lulusan : alumnus; rencana : agenda; keuntunga...
120,1,lulusan : alumnus; rencana : agenda; keuntunga...,5,lulusan : alumnus; rencana : agenda; keuntunga...
365,1,alumni; argument; hasil; example; teknik; imaj...,4,lulusan : alumnus; rencana : agenda; keuntunga...
650,1,alumnus; agenda; laba; sampel; teknik; imajina...,5,lulusan : alumnus; rencana : agenda; keuntunga...
125,1,lulusan : alumnus; rencana : agenda; keuntunga...,5,lulusan : alumnus; rencana : agenda; keuntunga...


In [7]:
# Keep only the student answer and the reference answer columns.
xtrain = dAnswer[['Answer', 'TrueAnswer']]

In [8]:
# Preview the first rows of the Answer / TrueAnswer selection.
xtrain.head()

Unnamed: 0,Answer,TrueAnswer
0,Alumni; ; Kelebihan; Permisalan; Tutorial; Im...,lulusan : alumnus; rencana : agenda; keuntunga...
120,lulusan : alumnus; rencana : agenda; keuntunga...,lulusan : alumnus; rencana : agenda; keuntunga...
365,alumni; argument; hasil; example; teknik; imaj...,lulusan : alumnus; rencana : agenda; keuntunga...
650,alumnus; agenda; laba; sampel; teknik; imajina...,lulusan : alumnus; rencana : agenda; keuntunga...
125,lulusan : alumnus; rencana : agenda; keuntunga...,lulusan : alumnus; rencana : agenda; keuntunga...


In [9]:
def _toWordLists(frame, column):
	"""Run toWordList().sentenceToWordList over every entry of frame[column].

	A fresh toWordList instance is created for each entry and
	changeNumber2Word=True is always passed. Returns a list holding one
	result per row, in row order.
	"""
	return [
		toWordList().sentenceToWordList(sentence, changeNumber2Word=True)
		for sentence in frame[column].values
	]


# enumerate() supplies the fold number. Previously `count` was never advanced
# inside the loop, so every fold was announced as "Fold 1".
for count, (dx, dy) in enumerate(crossVal.split(dAnswer), start=1):

	print("\n--------Fold {}--------\n".format(count))
	train, test = dAnswer.iloc[dx], dAnswer.iloc[dy]

	xtrain = train.loc[:, ['Answer', 'TrueAnswer']]
	xtest = test.loc[:, ['Answer', 'TrueAnswer']]
	ytrain = train.loc[:, ['Score']].values
	ytest = test.loc[:, ['Score']].values

	# This part is kept generic: the same preprocessing is applied to the
	# student answers (S) and the reference answers (T) of both splits.
	trainSAnswer = _toWordLists(xtrain, 'Answer')
	trainTAnswer = _toWordLists(xtrain, 'TrueAnswer')
	testSAnswer = _toWordLists(xtest, 'Answer')
	testTAnswer = _toWordLists(xtest, 'TrueAnswer')

	vtrainSAnswer, vtrainTAnswer = toVectore(essays=trainSAnswer, trueAnswer=trainTAnswer, model=model, numFeature=numDimension, average=False, distance=True).changeToVector()
	vtestSAnswer, vtestTAnswer = toVectore(essays=testSAnswer, trueAnswer=testTAnswer, model=model, numFeature=numDimension, average=False, distance=True).changeToVector()

	# A new network is built for every fold; its input shape is taken from
	# the second and third axes of the vectorised training answers.
	modelNetwork = modelLSTM().biSiamenseModel(inputD=(vtrainSAnswer.shape[1], vtrainSAnswer.shape[2]))
	modelNetwork.fit([vtrainSAnswer, vtrainTAnswer], ytrain, batch_size=100, epochs=30)

	# NOTE: ytest and pred are reassigned on every pass, so after the loop
	# they hold the values of the last fold only.
	pred = modelNetwork.predict([vtestSAnswer, vtestTAnswer])


--------Fold 1--------

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100, 200)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100, 200)     0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 400)     641600      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 100, 400)     641600      input_2[0][0]                    
____________________________________________________________________________________

In [16]:
# Empty results table; it is populated in the following cell.
dfa = pd.DataFrame()

In [17]:
# Build the comparison table in a single step so the cell can be re-run.
# The earlier dfa.insert(...) calls raised ValueError on a second run because
# the columns already existed.
dfa = pd.DataFrame(
	{
		'actual': ytest.flatten(),                  # true scores
		'roundpredict': np.around(pred).flatten(),  # predictions rounded with np.around
		'predict': pred.flatten(),                  # raw network output
	},
	columns=['actual', 'roundpredict', 'predict'],
)

In [18]:
# Display the actual / rounded / raw prediction table.
dfa

Unnamed: 0,actual,roundpredict,predict
0,5,5.0,5.033740
1,4,3.0,2.900996
2,5,5.0,5.149524
3,5,5.0,5.048888
4,5,5.0,5.096538
5,5,5.0,4.994704
6,5,5.0,5.133086
7,5,5.0,5.162828
8,5,5.0,5.057400
9,5,5.0,5.029446


In [19]:
# Quadratic-weighted Cohen's kappa of the rounded predictions against ytest.
# ytest and pred are reassigned on every pass of the cross-validation loop,
# so this scores the last fold only.
cohen_kappa_score(ytest, np.around(pred), weights='quadratic')

0.8258392362180474

In [20]:
# Mean squared error of the rounded predictions against ytest.
# ytest and pred come from the last pass of the cross-validation loop only.
mean_squared_error(ytest, np.around(pred))

0.4461538461538462

In [21]:
# Mean absolute error of the rounded predictions against ytest.
# ytest and pred come from the last pass of the cross-validation loop only.
mean_absolute_error(ytest, np.around(pred))

0.2923076923076923