In [2]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |███████████████████▋            | 314.2 MB 105.6 MB/s eta 0:00:02

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 511.7 MB 43 kB/s s eta 0:00:011
Collecting keras<2.10.0,>=2.9.0rc0
  Downloading keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 26.6 MB/s eta 0:00:01
Collecting tensorboard<2.10,>=2.9
  Downloading tensorboard-2.9.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 21.2 MB/s eta 0:00:01
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 6.6 MB/s  eta 0:00:01
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers<2,>=1.12
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting keras-preprocessing>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.7 MB/s  eta 0:00:01
[?25hCollecting tensorflow-io-gcs-filesystem>=0.23.1
  Downloading tensorflow_

In [3]:
!pip install numpy



In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model

In [2]:
def readData(dataFile,maxSequenceLength,embeddings):
	"""Read a tab-separated sentence file and encode every sentence as word indices.

	Each non-blank line is split on tabs into six leading fields (label,
	source, document number, document-CV fold, topic-CV fold, source-CV fold)
	followed by the words of the sentence, one word per field.

	Words are mapped to integer indices: 0 is '<PAD>', 1 is '<UNK>' (used for
	every word that is not in ``embeddings``), and known words receive the
	next free index in order of first appearance.  Only the first
	``maxSequenceLength`` words of a sentence are looked at; shorter sentences
	are right-padded with 0.

	Args:
		dataFile: path of the TSV file.
		maxSequenceLength: number of word indices stored per sentence.
		embeddings: container supporting ``in`` that holds the known words.

	Returns:
		Tuple ``(labels,sources,nums,wordsL,contents,documentCVs,topicCVs,sourceCVs)``.
		All metadata lists hold the raw strings from the file, ``wordsL`` is the
		vocabulary ordered by index and ``contents`` holds one index list per line.

	Raises:
		IndexError: if a non-blank line has fewer than six fields.
	"""
	labels=[]
	sources=[]
	nums=[]
	contents=[]
	documentCVs=[]
	topicCVs=[]
	sourceCVs=[]
	wordsD={'<PAD>':0,'<UNK>':1}
	wordsL=['<PAD>','<UNK>']
	# The context manager closes the file even when a malformed line raises.
	with open(dataFile,'r') as fileIn:
		for line in fileIn:
			stripped=line.strip()
			if stripped=='':
				# Blank lines (e.g. a trailing newline) carry no record.
				continue
			parts=stripped.split('\t')
			label=parts[0]
			source=parts[1]
			num=parts[2]
			documentCV=parts[3]
			topicCV=parts[4]
			sourceCV=parts[5]
			# Words beyond the sentence limit are dropped before the vocabulary is updated.
			contentWords=parts[6:][0:maxSequenceLength]
			content=[]
			for contentWord in contentWords:
				if not contentWord in embeddings:
					contentWord='<UNK>'
				elif not contentWord in wordsD:
					wordsD[contentWord]=len(wordsD)
					wordsL.append(contentWord)
				content.append(wordsD[contentWord])
			# Right-pad with the '<PAD>' index; a non-positive count adds nothing.
			content=content+[0]*(maxSequenceLength-len(content))
			labels.append(label)
			sources.append(source)
			contents.append(content)
			nums.append(num)
			documentCVs.append(documentCV)
			topicCVs.append(topicCV)
			sourceCVs.append(sourceCV)
	return(labels,sources,nums,wordsL,contents,documentCVs,topicCVs,sourceCVs)


def readDocuments(dataFile,maxSequenceLength,embeddings,maxDocumentLength):
	"""Read sentences with readData() and group consecutive ones into documents.

	Consecutive lines that share the same document number (third field) form
	one document.  Every document is cut or padded to exactly
	``maxDocumentLength`` sentences; padding sentences consist of
	``maxSequenceLength`` zeros.  The metadata of a document is taken from
	its last sentence.

	Args:
		dataFile: path of the TSV file, forwarded to readData().
		maxSequenceLength: words per sentence, forwarded to readData().
		embeddings: known words, forwarded to readData().
		maxDocumentLength: number of sentences stored per document.

	Returns:
		Tuple ``(labelsD,sourcesD,numsD,wordsL,contentsD,documentCVsD,topicCVsD,
		sourceCVsD,lengthsD)`` with one entry per document; ``lengthsD`` holds the
		number of real (non-padding) sentences kept for each document.  All
		per-document lists are empty when the file contains no records.
	"""
	(labels,sources,nums,wordsL,contents,documentCVs,topicCVs,sourceCVs)=readData(dataFile,maxSequenceLength,embeddings)
	labelsD=[]
	sourcesD=[]
	numsD=[]
	contentsD=[]
	documentCVsD=[]
	topicCVsD=[]
	sourceCVsD=[]
	lengthsD=[]
	contentHere=[]
	lastIndex=len(nums)-1
	# A bounded loop: with no records the body never runs and empty lists are returned.
	for i in range(len(nums)):
		contentHere.append(contents[i])
		if i!=lastIndex and nums[i+1]==nums[i]:
			# The next sentence still belongs to the current document.
			continue
		labelsD.append(labels[i])
		sourcesD.append(sources[i])
		numsD.append(nums[i])
		documentCVsD.append(documentCVs[i])
		topicCVsD.append(topicCVs[i])
		sourceCVsD.append(sourceCVs[i])
		lengthHere=min(len(contentHere),maxDocumentLength)
		contentHere=contentHere[0:maxDocumentLength]
		# Each padding sentence is a fresh list so documents never share rows.
		for j in range(maxDocumentLength-lengthHere):
			contentHere.append([0]*maxSequenceLength)
		lengthsD.append(lengthHere)
		contentsD.append(contentHere)
		contentHere=[]
	return(labelsD,sourcesD,numsD,wordsL,contentsD,documentCVsD,topicCVsD,sourceCVsD,lengthsD)


def readEmbeddings(vectorsFile,embeddingSize=300):
	"""Load word vectors from a whitespace-separated text file.

	Every line is expected to hold a word followed by ``embeddingSize``
	numbers.  Lines with any other number of fields (for example a header
	line) are skipped.  If a word occurs more than once the last occurrence
	wins.

	Args:
		vectorsFile: path of the vectors file.
		embeddingSize: number of coefficients per word (default 300).

	Returns:
		Dict mapping each word to a float32 numpy array of length ``embeddingSize``.
	"""
	expectedFields=embeddingSize+1
	embeddings={}
	# The context manager guarantees the file handle is released.
	with open(vectorsFile) as vectorsIn:
		for line in vectorsIn:
			parts=line.split()
			if len(parts)!=expectedFields:
				continue
			embeddings[parts[0]]=np.asarray(parts[1:],dtype='float32')
	return(embeddings)

def prepareEmbeddings(embeddings,wordsL,embeddingSize=None):
	"""Build the embedding matrix for the vocabulary, one L2-normalised row per word.

	Row ``i`` belongs to ``wordsL[i]``.  The rows of '<PAD>' and '<UNK>' stay
	all-zero; every other row is the word's vector divided by its Euclidean
	norm.  A vector whose norm is zero is left as a zero row instead of
	turning into NaNs.

	Args:
		embeddings: mapping from word to its vector.
		wordsL: vocabulary ordered by index.
		embeddingSize: number of columns; when None it is taken from the first
			regular word of ``wordsL`` and falls back to 300 if there is none.

	Returns:
		Float64 numpy array of shape ``(len(wordsL), embeddingSize)``.

	Raises:
		KeyError: if a regular word of ``wordsL`` is missing from ``embeddings``.
	"""
	special=('<PAD>','<UNK>')
	if embeddingSize is None:
		embeddingSize=300
		for word in wordsL:
			if word not in special:
				embeddingSize=len(embeddings[word])
				break
	embeddingMatrix=np.zeros((len(wordsL),embeddingSize))
	for i,word in enumerate(wordsL):
		if word in special:
			continue
		# Normalise in float64, the dtype of the target matrix.
		row=np.asarray(embeddings[word],dtype='float64')
		norm=np.sqrt(row@row)
		if norm>0:
			embeddingMatrix[i]=row/norm
	return(embeddingMatrix)

def lengthToAverageMask(lengthD,maxDocumentLength,binary=False,labelsNum=2):
	"""Build per-document masks that blank out padding sentences.

	For a document with ``length`` real sentences the mask has ``length`` rows
	filled with a multiplier followed by ``maxDocumentLength-length`` zero
	rows.  With ``binary=False`` the multiplier is
	``maxDocumentLength/length``, so that averaging the masked values over all
	``maxDocumentLength`` positions equals the mean over the real sentences
	only.  With ``binary=True`` the multiplier is 1.

	Args:
		lengthD: number of real sentences of each document.
		maxDocumentLength: number of rows per mask.
		binary: use 1.0 instead of the averaging multiplier.
		labelsNum: number of columns per row (default 2).

	Returns:
		numpy array of shape ``(len(lengthD), maxDocumentLength, labelsNum)``.
		A document of length 0 gets an all-zero mask.
	"""
	result=[]
	for length in lengthD:
		if binary or length==0:
			# For length 0 no multiplier row is emitted, so the value is irrelevant;
			# the branch only avoids dividing by zero.
			multiplier=1.0
		else:
			multiplier=maxDocumentLength*1.0/length
		activeRow=[multiplier]*labelsNum
		paddingRow=[0.0]*labelsNum
		result.append([activeRow]*length+[paddingRow]*(maxDocumentLength-length))
	return(np.array(result))

	
def evaluateD(resultPred,resultTrue):
	"""Return the share of rows where prediction and gold label agree.

	A row counts as predicted positive when column 1 of ``resultPred`` is
	above 0.5, and as truly positive when column 1 of ``resultTrue`` equals 1.
	"""
	predictedPositive=resultPred[:,1]>0.5
	actuallyPositive=resultTrue[:,1]==1
	agreement=predictedPositive==actuallyPositive
	return(np.mean(agreement))






class Style1():
	"""Document classifier that averages sentence-level bidirectional LSTM predictions.

	Every sentence of a document is embedded with frozen word vectors, read by
	a forward and a backward LSTM (100 units each) and classified by a softmax
	layer.  The per-sentence class probabilities are multiplied by a mask and
	averaged over the document.

	The layers are created once in the constructor, so every model returned
	by getModel() of the same instance uses the same layer objects.
	"""

	def __init__(self,embeddingMatrix,labelsNum,maxSequenceLength,maxDocumentLength,onGPU):
		"""Create the layers.

		Args:
			embeddingMatrix: 2-D array with one row per vocabulary entry; used as
				non-trainable Embedding weights.
			labelsNum: number of output classes.
			maxSequenceLength: words per sentence.
			maxDocumentLength: sentences per document.
			onGPU: accepted for backward compatibility.  Both branches of the
				former GPU/CPU switch built identical layers, so the flag does not
				influence the model.
		"""
		self.maxSequenceLength=maxSequenceLength
		self.maxDocumentLength=maxDocumentLength
		self.labelsNum=labelsNum

		matrixShape=np.shape(embeddingMatrix)
		self.embeddingSize=matrixShape[1]
		self.embL=keras.layers.Embedding(matrixShape[0],self.embeddingSize,input_length=maxSequenceLength,weights=[embeddingMatrix],trainable=False)
		self.reshape1L=keras.layers.Lambda(self.backend_reshape,output_shape=(maxSequenceLength,self.embeddingSize))
		# NOTE(review): per the TensorFlow LSTM documentation the layer picks a cuDNN
		# kernel by itself when a GPU is available and its arguments allow it, which
		# is presumably why no separate GPU layer is needed here.
		self.LSTMforL=keras.layers.LSTM(units=100,go_backwards=False,return_sequences=False)
		self.LSTMrevL=keras.layers.LSTM(units=100,go_backwards=True,return_sequences=False)
		self.conL=keras.layers.Concatenate(axis=1)
		self.denseL=keras.layers.Dense(labelsNum,activation="softmax")
		self.reshape2L=keras.layers.Lambda(self.backend_reshape2,output_shape=(maxDocumentLength,labelsNum))
		self.multiply=keras.layers.Multiply()
		self.poolingL=keras.layers.GlobalAveragePooling1D()

	def backend_reshape(self,x):
		"""Flatten documents into a batch of sentences: (-1, maxSequenceLength, embeddingSize)."""
		return keras.backend.reshape(x,(-1,self.maxSequenceLength,self.embeddingSize))

	def backend_reshape2(self,x):
		"""Regroup sentence outputs into documents: (-1, maxDocumentLength, labelsNum)."""
		return keras.backend.reshape(x,(-1,self.maxDocumentLength,self.labelsNum))

	def getMask(self,lengthD):
		"""Return the averaging mask for documents with the given sentence counts.

		The mask comes from lengthToAverageMask(); when its last axis does not
		have ``labelsNum`` columns, its first column is replicated so the result
		matches the mask input of getModel().
		"""
		mask=lengthToAverageMask(lengthD,self.maxDocumentLength,binary=False)
		if mask.ndim==3 and mask.shape[2]!=self.labelsNum:
			# All columns of a mask row carry the same value, so column 0 is enough.
			mask=mask[:,:,[0]*self.labelsNum]
		return(mask)

	def getModel(self):
		"""Wire the layers into a Keras Model.

		Inputs are the word indices, shape (maxDocumentLength, maxSequenceLength)
		per document, and the mask, shape (maxDocumentLength, labelsNum).  The
		output is the masked average of the sentence-level softmax outputs.
		"""
		inputWords = keras.layers.Input(shape=(self.maxDocumentLength,self.maxSequenceLength,))
		inputMask = keras.layers.Input(shape=(self.maxDocumentLength,self.labelsNum,))
		# Embed the words, then treat every sentence as a separate sample.
		x=self.embL(inputWords)
		x=self.reshape1L(x)
		# Read each sentence in both directions and join the two final states.
		lstm1=self.LSTMforL(x)
		lstm2=self.LSTMrevL(x)
		x=self.conL([lstm1,lstm2])
		# Classify each sentence, regroup by document, mask padding and average.
		Ps=self.denseL(x)
		Ps=self.reshape2L(Ps)
		Ps=self.multiply([Ps,inputMask])
		Ps=self.poolingL(Ps)
		model=Model(inputs=[inputWords,inputMask], outputs=Ps)
		return(model)

In [3]:
# --- Local setup: paths and training hyper-parameters ---
dataPath="./"
dataset="all_data.tsv"
batch_size=64
epochs=10
onGPU=True
# Sentences are cut/padded to MAX_SEQUENCE_LENGTH words and documents to
# MAX_DOCUMENT_LENGTH sentences.
MAX_SEQUENCE_LENGTH=120
MAX_DOCUMENT_LENGTH=50

# --- Reading data: word vectors first, then the documents encoded against them ---
print('Reading data')
embeddings=readEmbeddings(dataPath+"GoogleNewsUnigrams.txt")
(labels,sources,nums,wordsL,contents,documentCV,topicCV,sourceCV,lengthD)=readDocuments(dataPath+dataset,MAX_SEQUENCE_LENGTH,embeddings,MAX_DOCUMENT_LENGTH)
embeddingMatrix=prepareEmbeddings(embeddings,wordsL)

# --- Converting to numpy ---
print("Converting to numpy")
y=np.asarray(labels,dtype='float32')
# Two columns per document: (1-label, label).
allY=np.concatenate(((1-y)[:,np.newaxis],y[:,np.newaxis]),axis=1)
allX=np.array(contents)

# --- Prepare CV scenarios: fold ids as integers; the source-based split is used ---
documentCV,topicCV,sourceCV=(np.asarray(folds,dtype='int32') for folds in (documentCV,topicCV,sourceCV))
scenarioCV=sourceCV

Reading data
Converting to numpy


In [None]:

print('Start')
# One row of predictions per document; -1 marks documents that were never
# part of an evaluated test fold.
result=np.array([[-1,-1]]*len(scenarioCV),dtype='float32')
# Fold ids run from 1 to the largest id found in scenarioCV.
for fold in range(1,max(scenarioCV)+1):
	whichTest=np.isin(scenarioCV,fold)
	if not whichTest.any() or whichTest.all():
		# Without test documents or without training documents the fold cannot be run.
		print("Skipping fold "+str(fold)+": it has no test or no training documents")
		continue
	print("Evaluating on fold "+str(fold)+"...")
	# A new model is built for every fold; drop the Keras state of the previous
	# one so memory use does not grow over the loop.
	keras.backend.clear_session()
	trainY=allY[~whichTest,]
	develY=allY[whichTest,]
	trainX=allX[~whichTest,]
	develX=allX[whichTest,]
	style1=Style1(embeddingMatrix,2,MAX_SEQUENCE_LENGTH,MAX_DOCUMENT_LENGTH,onGPU)
	mask=style1.getMask(lengthD)
	# Boolean indexing already returns copies, no extra np.array() needed.
	trainM=mask[~whichTest]
	develM=mask[whichTest]
	model=style1.getModel()
	model.compile(optimizer=tf.optimizers.Adam(),loss="binary_crossentropy",metrics=["accuracy"])
	fit=model.fit([trainX,trainM],trainY, epochs=epochs,validation_data=([develX,develM],develY),batch_size=batch_size)
	predictions=model.predict([develX,develM])
	result[whichTest,:]=predictions

# Overall accuracy across all folds (displayed as the cell's output).
evaluateD(result,allY)


Start
Evaluating on fold 1...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10