Showing 48 changed files with 4,435 additions and 1,653 deletions.
@@ -0,0 +1,282 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# data.py
#
# Copyright 2017 yves <yves.mercadier@ac-montpellier.fr>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
import collections
import math
import random

import numpy as np
from nltk.corpus import wordnet

class Augmentation:

	r"""
	This is the class to do data augmentation.

	Args:

		documents (:obj:`list`, optional, defaults to []):
			A list of documents.

		labels (:obj:`list`, optional, defaults to []):
			A list of labels.

		strategy (:obj:`string`, optional, defaults to 'daia'):
			Name of the augmentation strategy: 'eda', 'uda' or 'pyramid'.

		verbose (:obj:`bool`, optional, defaults to True):
			Print progress messages.

	Example::

		from Manteia.Augmentation import Augmentation

		documents = ['a text', 'text b']
		labels    = ['a', 'b']

		Augmentation(documents, labels, strategy='eda')

	Attributes:

		documents_augmented (:obj:`list`): the augmented documents.
		labels_augmented (:obj:`list`): the labels matching the augmented documents.
	"""
	def __init__(self, documents=[], labels=[], strategy='daia', verbose=True):

		self.documents = documents
		self.labels    = labels
		self.verbose   = verbose

		if verbose:
			print('Augmentation %s.' % strategy)

		if strategy == 'eda':
			self.documents_augmented, self.labels_augmented = eda(self.documents, self.labels)
		if strategy == 'uda':
			self.documents_augmented, self.labels_augmented = uda(self.documents, self.labels)
		if strategy == 'pyramid':
			self.documents_augmented, self.labels_augmented = pyramid(self.documents, self.labels)

	def test(self):
		return "Mantéïa Augmentation."

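# A minimal usage sketch (illustrative, not part of the original file); the
# WordNet corpus must be available, e.g. after nltk.download('wordnet'):
#
#   documents = ['a first small text', 'a second small text']
#   labels    = ['a', 'b']
#   aug = Augmentation(documents, labels, strategy='eda')
#   print(aug.documents_augmented)
#   print(aug.labels_augmented)
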
def uda(documents, labels):
	documents_augmented = []
	labels_augmented    = []

	data_stats = get_data_stats(documents)
	token_prob = 0.9
	op = TfIdfWordRep(token_prob, data_stats)

	for text, label in zip(documents, labels):
		text_aug = op(text)
		documents_augmented.append(text_aug)
		labels_augmented.append(label)
	return documents_augmented, labels_augmented

# Adapted from:
# https://github.com/google-research/uda/blob/master/text/augmentation/word_level_augment.py
def get_data_stats(texts):
	"""Compute the IDF score for each word. Then compute the TF-IDF score."""
	word_doc_freq = collections.defaultdict(int)
	# Compute IDF.
	for text in texts:
		cur_word_dict = {}
		cur_sent = text.split(' ')
		for word in cur_sent:
			cur_word_dict[word] = 1
		for word in cur_word_dict:
			word_doc_freq[word] += 1
	idf = {}
	for word in word_doc_freq:
		idf[word] = math.log(len(texts) * 1. / word_doc_freq[word])
	# Compute TF-IDF.
	tf_idf = {}
	for text in texts:
		cur_word_dict = {}
		cur_sent = text.split(' ')
		for word in cur_sent:
			if word not in tf_idf:
				tf_idf[word] = 0
			tf_idf[word] += 1. / len(cur_sent) * idf[word]
	return {
		"idf": idf,
		"tf_idf": tf_idf,
	}

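# Worked example (hypothetical two-document corpus):
#
#   stats = get_data_stats(['the cat sat', 'the dog ran'])
#   stats['idf']['the']    -> log(2/2) = 0.0      ('the' occurs in both documents)
#   stats['idf']['cat']    -> log(2/1) ~= 0.693   ('cat' occurs in one document)
#   stats['tf_idf']['cat'] -> (1/3) * 0.693 ~= 0.231
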
class EfficientRandomGen(object):
	"""A base class that generates multiple random numbers at the same time."""

	def reset_random_prob(self):
		"""Generate many random numbers at the same time and cache them."""
		cache_len = 100000
		self.random_prob_cache = np.random.random(size=(cache_len,))
		self.random_prob_ptr = cache_len - 1

	def get_random_prob(self):
		"""Get a random number."""
		value = self.random_prob_cache[self.random_prob_ptr]
		self.random_prob_ptr -= 1
		if self.random_prob_ptr == -1:
			self.reset_random_prob()
		return value

	def get_random_token(self):
		"""Get a random token."""
		token = self.token_list[self.token_ptr]
		self.token_ptr -= 1
		if self.token_ptr == -1:
			self.reset_token_list()
		return token

class TfIdfWordRep(EfficientRandomGen):
	"""TF-IDF based word replacement."""

	def __init__(self, token_prob, data_stats):
		super(TfIdfWordRep, self).__init__()
		self.token_prob = token_prob
		self.data_stats = data_stats
		self.idf    = data_stats["idf"]
		self.tf_idf = data_stats["tf_idf"]
		tf_idf_items = data_stats["tf_idf"].items()
		tf_idf_items = sorted(tf_idf_items, key=lambda item: -item[1])
		self.tf_idf_keys = []
		self.tf_idf_values = []
		for key, value in tf_idf_items:
			self.tf_idf_keys += [key]
			self.tf_idf_values += [value]
		self.normalized_tf_idf = np.array(self.tf_idf_values)
		self.normalized_tf_idf = (self.normalized_tf_idf.max()
		                          - self.normalized_tf_idf)
		self.normalized_tf_idf = (self.normalized_tf_idf
		                          / self.normalized_tf_idf.sum())
		self.reset_token_list()
		self.reset_random_prob()

	def get_replace_prob(self, all_words):
		"""Compute the probability of replacing tokens in a sentence."""
		cur_tf_idf = collections.defaultdict(int)
		for word in all_words:
			cur_tf_idf[word] += 1. / len(all_words) * self.idf[word]
		replace_prob = []
		for word in all_words:
			replace_prob += [cur_tf_idf[word]]
		replace_prob = np.array(replace_prob)
		replace_prob = np.max(replace_prob) - replace_prob
		replace_prob = (replace_prob / replace_prob.sum() *
		                self.token_prob * len(all_words))
		return replace_prob

	def __call__(self, example):
		all_words = example.split(' ')

		replace_prob = self.get_replace_prob(all_words)
		all_words = self.replace_tokens(
			all_words,
			replace_prob[:len(all_words)]
		)

		return " ".join(all_words)

	def replace_tokens(self, word_list, replace_prob):
		"""Replace tokens in a sentence."""
		for i in range(len(word_list)):
			if self.get_random_prob() < replace_prob[i]:
				word_list[i] = self.get_random_token()
		return word_list

	def reset_token_list(self):
		cache_len = len(self.tf_idf_keys)
		token_list_idx = np.random.choice(
			cache_len, (cache_len,), p=self.normalized_tf_idf)
		self.token_list = []
		for idx in token_list_idx:
			self.token_list += [self.tf_idf_keys[idx]]
		self.token_ptr = len(self.token_list) - 1
		print("sampled token list: {:s}".format(" ".join(self.token_list)))

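# A minimal usage sketch for TfIdfWordRep (illustrative names):
#
#   stats = get_data_stats(documents)
#   op = TfIdfWordRep(token_prob=0.9, data_stats=stats)
#   augmented = op('some whitespace tokenised text')
#
# Words with a low TF-IDF score receive a high replacement probability, so the
# most informative words of a document tend to be kept.
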
def eda(documents, labels):
	documents_augmented = []
	labels_augmented    = []

	for document, label in zip(documents, labels):
		text_list, label_list = eda_text(document, label)
		documents_augmented = documents_augmented + text_list
		labels_augmented    = labels_augmented + label_list
	return documents_augmented, labels_augmented

def eda_text(text, label):
	text_list, label_list = [], []

	# Split into words.
	word_list_1 = text.split(' ')
	# Random Swap (RS): swap two words.
	idx_1 = random.randint(0, len(word_list_1)-1)
	idx_2 = random.randint(0, len(word_list_1)-1)
	word_list_1[idx_1], word_list_1[idx_2] = word_list_1[idx_2], word_list_1[idx_1]
	text_list  = [' '.join(word_list_1)]
	label_list = [label]
	# Random Deletion (RD): delete a word.
	word_list_2 = text.split(' ')
	idx_3 = random.randint(0, len(word_list_2)-1)
	del word_list_2[idx_3]
	text_list.append(' '.join(word_list_2))
	label_list.append(label)
	# Synonym Replacement (SR).
	word_list_3 = text.split(' ')
	idx_4 = random.randint(0, len(word_list_3)-1)
	if len(wordnet.synsets(word_list_3[idx_4])) > 0:
		idx_synonym = random.randint(0, len(wordnet.synsets(word_list_3[idx_4]))-1)
		synonym = wordnet.synsets(word_list_3[idx_4])[idx_synonym].lemma_names()[0]
		if synonym != word_list_3[idx_4]:
			word_list_3[idx_4] = synonym
			text_list.append(' '.join(word_list_3))
			label_list.append(label)
	# Random Insertion (RI): insert a synonym at a random position.
	word_list_4 = text.split(' ')
	idx_5 = random.randint(0, len(word_list_4)-1)
	idx_6 = random.randint(0, len(word_list_4)-1)
	if len(wordnet.synsets(word_list_4[idx_5])) > 0:
		idx_synonym = random.randint(0, len(wordnet.synsets(word_list_4[idx_5]))-1)
		synonym = wordnet.synsets(word_list_4[idx_5])[idx_synonym].lemma_names()[0]
		if synonym != word_list_4[idx_5]:
			word_list_4.insert(idx_6, synonym)
			text_list.append(' '.join(word_list_4))
			label_list.append(label)
	return text_list, label_list

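# For example, eda_text('the cat sat on the mat', 'pos') may return something
# like (output is random; shown for illustration only):
#
#   ['the mat sat on the cat', 'the cat sat on mat', ...], ['pos', 'pos', ...]
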
def split_text(text, label):
	text_list, label_list = [], []

	decoup_1a = int(0.05 * len(text))
	decoup_1b = int(0.95 * len(text))
	decoup_2  = int(len(text) / 2)
	decoup_3  = int(len(text) / 3)

	# Split at three granularities: the trimmed text, its halves and its thirds.
	text_list = text_list + [text[decoup_1a:decoup_1b],
	                         text[:decoup_2], text[decoup_2:],
	                         text[:decoup_3], text[decoup_3:2*decoup_3], text[2*decoup_3:]]
	label_list = label_list + [label] * 6

	return text_list, label_list

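# For a 100-character text the six slices above are:
#   text[5:95]                           (middle 90%),
#   text[:50],  text[50:]                (halves),
#   text[:33],  text[33:66],  text[66:]  (thirds).
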
def pyramid(documents, labels):
	documents_augmented = []
	labels_augmented    = []

	for text, label in zip(documents, labels):
		text_list, label_list = split_text(text, label)
		documents_augmented = documents_augmented + text_list
		labels_augmented    = labels_augmented + label_list
	return documents_augmented, labels_augmented
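
if __name__ == '__main__':
	# Smoke test (an illustrative sketch, not part of the original commit);
	# assumes the WordNet corpus is available, e.g. after nltk.download('wordnet').
	docs = ['the quick brown fox jumps over the lazy dog',
	        'a stitch in time saves nine']
	labs = ['a', 'b']
	aug = Augmentation(docs, labs, strategy='eda')
	for doc, lab in zip(aug.documents_augmented, aug.labels_augmented):
		print('%s: %s' % (lab, doc))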