df

ym001 · May 7, 2020 · f14070e · f14070e
1 parent b45f6c7
commit f14070e
Show file tree

Hide file tree

Showing 46 changed files with 1,366 additions and 387 deletions.
diff --git a/Exemples/exemple_Classification.py b/Exemples/exemple_Classification.py
@@ -41,7 +41,7 @@ def main(args):
 			['funny'],['not funny'],['funny'],['not funny'],['funny'],['not funny'],['not funny'],['not funny'],['funny'],['not funny'],
 			]
 
-	cl=Classification(documents=documents,labels=labels)
+	cl=Classification(documents=documents,labels=labels,process=True)
 	print(cl.predict(documents[:2]))
 	return 0
 

diff --git a/Exemples/exemple_Dataset.py b/Exemples/exemple_Dataset.py
@@ -0,0 +1,8 @@
+from Manteia.Dataset import Dataset
+
+ds=Dataset('20newsgroups')
+documents=ds.get_documents()
+labels=ds.get_labels()
+
+print(documents[:5])
+print(labels[:5])
diff --git a/Manteia/Classification.py b/Manteia/Classification.py
@@ -43,21 +43,25 @@ class Classification:
 			
 		Attributes:
 	"""
-	def __init__(self,model_name ='bert',documents = [],labels = []): 
-		self.MAX_SEQ_LEN = 64
-		self.model_name  = model_name
+	def __init__(self,documents = [],labels = [],model=None,process=False,verbose=True):
 
 		if documents!=[] and labels!=[]:
 			pp               = Preprocess(documents=documents,labels=labels)
 			self.list_labels = pp.list_labels
-			print(self.list_labels)
-			self.model       = Model(num_labels=len(pp.list_labels))
+
+		if model!=None:
+			self.model = model
+		else:
+			self.model = Model(num_labels=len(pp.list_labels),early_stopping=True)
+
+		if process:
+			print('Process...')
 			self.model.load()
 
 			train_text, validation_text, train_labels, validation_labels = train_test_split(pp.documents, pp.labels, random_state=2018, test_size=0.1)
 
-			train_ids,train_masks           = encode_text(train_text,self.model.tokenizer,self.MAX_SEQ_LEN)
-			validation_ids,validation_masks = encode_text(validation_text,self.model.tokenizer,self.MAX_SEQ_LEN)
+			train_ids,train_masks           = encode_text(train_text,self.model.tokenizer,self.model.MAX_SEQ_LEN)
+			validation_ids,validation_masks = encode_text(validation_text,self.model.tokenizer,self.model.MAX_SEQ_LEN)
 			train_labels                    = encode_label(train_labels,pp.list_labels)
 			validation_labels               = encode_label(validation_labels,pp.list_labels)
 

diff --git a/Manteia/Dataset.py b/Manteia/Dataset.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#  Dataset.py
+#  
+#  Copyright 2017 yves <yves.mercadier@ac-montpellier.fr>
+#  
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#  
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#  
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#  MA 02110-1301, USA.
+#  
+#  '
+
+from .Preprocess import Preprocess
+import numpy as np
+from nltk.corpus import reuters,brown,webtext
+from sklearn.datasets import fetch_20newsgroups
+
+class Dataset:
+
+	r"""
+		This is the class to give datasets.
+		
+		Args:
+
+			name (:obj:`string`, optional, defaults to '20newsgroups'):
+				Name of the dataset.
+				 
+		Example::
+
+			from Manteia.Dataset import Dataset
+
+			ds=Dataset('20newsgroups')
+			documents=ds.get_documents()
+			labels=ds.get_labels()
+
+			print(documents[:5])
+			print(labels[:5])
+
+		Attributes:
+	"""
+	def __init__(self,name='20newsgroups'):
+
+		self.name=name
+		self.load()
+
+	def test(self):
+		return "Mantéïa Dataset."
+
+	def load(self):
+		if self.name=="20newsgroups":
+			self.documents,self.labels=self.load_20newsgroups()
+
+	def load_20newsgroups(self):
+		#categorie = ['sci.crypt', 'sci.electronics','sci.med', 'sci.space']
+		categorie = ['sci.crypt', 'sci.electronics','sci.med', 'sci.space','rec.autos','rec.motorcycles','rec.sport.baseball','rec.sport.hockey','talk.politics.guns','talk.politics.mideast','talk.politics.misc','talk.religion.misc']
+		twenty_train = fetch_20newsgroups(subset='train',categories=categorie, shuffle=True, random_state=42)
+		doc=twenty_train.data
+		label=[]
+		for i in range(len(twenty_train.target)):
+			label.append(categorie[twenty_train.target[i]])
+		return doc,label
+
+	def get_documents(self):
+		return self.documents
+
+	def get_labels(self):
+		return self.labels
diff --git a/Manteia/Generation.py b/Manteia/Generation.py
@@ -26,11 +26,9 @@ class Generation:
 				 
 		Example::
 		
-			from Manteia.Classification import Classification
-			# Initializing a list of texts,labels
-			documents=['a text','text b']
-			labels=['a','b']
-			Classification(documents,labels)
+			from Manteia.Generation import Generation 
+						
+			Generation(seed='What do you do if a bird shits on your car?')
 			
 		Attributes:
 	"""

diff --git a/Manteia/Model.py b/Manteia/Model.py
@@ -96,13 +96,13 @@ class Model:
 			
 		Attributes:
 	"""
-	def __init__(self,model_name ='bert',num_labels=0): # constructeur
+	def __init__(self,model_name ='bert',num_labels=0,early_stopping=False,path='./model'): # constructeur
 		self.model_name = model_name
-		self.batch_size = 32
-		self.epochs = 4
-		self.MAX_SEQ_LEN = 12
-
+		self.early_stopping=early_stopping
 		self.num_labels=num_labels
+		self.path=path
+		if self.early_stopping:
+			self.es=EarlyStopping(path=path)
 	def test(self):
 		return "Model Mantéïa."
 	def load(self):
@@ -162,7 +162,11 @@ def device(self):
 			print('No GPU available, using the CPU instead.')
 			self.device = torch.device("cpu")
 
-	def configuration(self,train_dataloader):
+	def configuration(self,train_dataloader,batch_size = 16,epochs = 20,MAX_SEQ_LEN = 128):
+		self.batch_size = batch_size
+		self.epochs = epochs
+		self.MAX_SEQ_LEN = MAX_SEQ_LEN
+
 		self.model.cuda()
 		self.optimizer = AdamW(self.model.parameters(),lr = 2e-5,eps = 1e-8)
 		self.total_steps = len(train_dataloader) * self.epochs
@@ -224,9 +228,7 @@ def fit(self,train_dataloader,validation_dataloader):
 			print("")
 			print("  Average training loss: {0:.2f}".format(avg_train_loss))
 			print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
-
-
-			print("")
+
 			print("Running Validation...")
 
 			t0 = time.time()
@@ -258,14 +260,22 @@ def fit(self,train_dataloader,validation_dataloader):
 					eval_accuracy += tmp_eval_accuracy
 
 					nb_eval_steps += 1
-
-			print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
+			acc_validation=eval_accuracy/nb_eval_steps
+			print("  Accuracy: {0:.2f}".format(acc_validation))
 			print("  Validation took: {:}".format(format_time(time.time() - t0)))
 
+			if self.early_stopping:
+				self.es(acc_validation, self.model)
+
+				if self.es.early_stop:
+					print("Early stopping")
+					break
 		print("")
 		print("Training complete!")
 
 	def predict(self,predict_dataloader):
+		if self.early_stopping:
+			self.model.from_pretrained(self.path)
 		self.model.eval()
 		prediction=[]
 		for batch in predict_dataloader:
@@ -285,6 +295,7 @@ def predict(self,predict_dataloader):
 						tmp_logits = tmp_logits.detach().cpu().numpy()
 					prediction.extend(flat_prediction(tmp_logits))
 		return prediction
+
 	def fit_generation(self,text_loader):
 
 		self.model.train()
@@ -500,9 +511,69 @@ def format_time(elapsed):
 
 # Function to calculate the accuracy of our predictions vs labels
 def flat_accuracy(preds, labels):
-    pred_flat = np.argmax(preds, axis=1).flatten()
-    labels_flat = labels.flatten()
-    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+	pred_flat = np.argmax(preds, axis=1).flatten()
+	labels_flat = labels.flatten()
+	return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+def accuracy(preds, labels):
+	return np.sum(preds == labels) / len(labels)
 
 def flat_prediction(preds):
-    return np.argmax(preds, axis=1).flatten()
+	return np.argmax(preds, axis=1).flatten()
+
+
+class EarlyStopping:
+	"""Early stops the training if validation accuracy doesn't improve after a given patience."""
+	def __init__(self, patience=2, delta=0,path=None, verbose=True):
+		"""
+        Args:
+            patience (int): How long to wait after the last time validation accuracy improved.
+                            Default: 2
+            verbose (bool): If True, prints a message for each validation accuracy improvement.
+                            Default: True
+            delta (float): Minimum change in the monitored quantity to qualify as an improvement (stored, but not currently used by __call__).
+                            Default: 0
+		"""
+		self.patience = patience
+		self.verbose = verbose
+		self.counter = 0
+		self.best_score = None
+		self.early_stop = False
+		self.acc_validation_min = 0
+		self.delta = delta
+		self.path=path
+
+	def __call__(self, acc_validation , model):
+
+		score = acc_validation
+
+		if self.best_score is None:
+			self.best_score = score
+			self.save_checkpoint(acc_validation, model)
+		elif score < self.best_score:
+			self.counter += 1
+			print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
+			if self.counter >= self.patience:
+				self.early_stop = True
+		else:
+			print(f'Save model : {self.counter} out of {self.patience}')
+			self.best_score = score
+			self.save_checkpoint(acc_validation, model)
+			self.counter = 0
+
+	def save_checkpoint(self, acc_validation, model):
+		'''Saves the model when validation accuracy does not fall below the best score so far.'''
+		print(self.path)
+		if self.verbose:
+			print(f'Validation accuracy increased ({self.acc_validation_min:.6f} --> {acc_validation:.6f}).  Saving model ...')
+		import os
+		if not os.path.isdir(self.path):
+			# create the checkpoint directory if it does not exist yet
+			try:
+				os.mkdir(self.path)
+			except OSError:
+				print ("Creation of the directory %s failed" % self.path)
+			else:
+				print ("Successfully created the directory %s " % self.path)
+		model.save_pretrained(self.path)
+		self.acc_validation_min = acc_validation
diff --git a/README.md b/README.md
@@ -5,23 +5,27 @@ This module proclaims the good word. May they
 regain total freedom of artificial thought towards a new age
 reminiscent.
 
-You can install it with pip:
+You can install it with pip :
 
      pip install Manteia
 
-Example of use Classification:
+Example of use Classification :
 
      >>> from Manteia.Classification import Classification
      >>> # Initializing a list of texts,labels
      >>> documents=['a text','text b']
      >>> labels=['a','b']
      >>> Classification(documents,labels)
 
-Example of use Generation:
+Example of use Generation :
 
      >>> from Manteia.Generation import Generation
      >>> Generation(seed='What do you do if a bird shits on your car?')
      If you're a car owner, you're supposed to be able to call the police
      and have them take the bird off the car.
-
+
+documentation :
+
+     https://manteia.readthedocs.io/en/latest/#
+
 This code is licensed under MIT.
diff --git a/docs/Classification.rst b/docs/Classification.rst
@@ -0,0 +1,5 @@
+Classification
+==============
+
+.. automodule:: Manteia.Classification
+    :members:
diff --git a/docs/Generation.rst b/docs/Generation.rst
@@ -0,0 +1,5 @@
+Generation
+==========
+
+.. automodule:: Manteia.Generation
+    :members:
diff --git a/docs/Model.rst b/docs/Model.rst
@@ -0,0 +1,5 @@
+Model
+=====
+
+.. automodule:: Manteia.Model
+    :members:
diff --git a/docs/Preprocess.rst b/docs/Preprocess.rst
@@ -0,0 +1,5 @@
+Preprocess
+==========
+
+.. automodule:: Manteia.Preprocess
+    :members:
diff --git a/docs/Statistic.rst b/docs/Statistic.rst
@@ -0,0 +1,5 @@
+Statistic
+=========
+
+.. automodule:: Manteia.Statistic
+    :members:
diff --git a/docs/Task.rst b/docs/Task.rst
@@ -0,0 +1,5 @@
+Task
+====
+
+.. automodule:: Manteia.Task
+    :members:
diff --git a/docs/Visualisation.rst b/docs/Visualisation.rst
@@ -0,0 +1,5 @@
+Visualisation
+=============
+
+.. automodule:: Manteia.Visualisation
+    :members:
diff --git a/docs/_build/doctrees/Classification.doctree b/docs/_build/doctrees/Classification.doctree
diff --git a/docs/_build/doctrees/Generation.doctree b/docs/_build/doctrees/Generation.doctree
diff --git a/docs/_build/doctrees/Model.doctree b/docs/_build/doctrees/Model.doctree
diff --git a/docs/_build/doctrees/Preprocess.doctree b/docs/_build/doctrees/Preprocess.doctree
diff --git a/docs/_build/doctrees/Statistic.doctree b/docs/_build/doctrees/Statistic.doctree
diff --git a/docs/_build/doctrees/Task.doctree b/docs/_build/doctrees/Task.doctree
diff --git a/docs/_build/doctrees/Visualisation.doctree b/docs/_build/doctrees/Visualisation.doctree
diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle
diff --git a/docs/_build/doctrees/index.doctree b/docs/_build/doctrees/index.doctree