Mantéïa

ym001 · May 27, 2020 · 644f5f1 · 644f5f1
1 parent 2b147b4
commit 644f5f1
Show file tree

Hide file tree

Showing 7 changed files with 173 additions and 48 deletions.
diff --git a/Exemples/exemple_Classification1.py b/Exemples/exemple_Classification1.py
@@ -5,5 +5,5 @@
 
 labels = ['funny','not funny']
 
-model = Model(model_name ='roberta')
+model = Model(model_name ='bert')
 cl=Classification(model,documents,labels,process_classif=True)
diff --git a/Exemples/exemple_Classification_dataset.py b/Exemples/exemple_Classification_dataset.py
@@ -1,26 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-#
-#  exemple_Data.py
-#  
-#  Copyright 2020 Yves <yves@mercadier>
-#  
-#  This program is free software; you can redistribute it and/or modify
-#  it under the terms of the GNU General Public License as published by
-#  the Free Software Foundation; either version 2 of the License, or
-#  (at your option) any later version.
-#  
-#  This program is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#  GNU General Public License for more details.
-#  
-#  You should have received a copy of the GNU General Public License
-#  along with this program; if not, write to the Free Software
-#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
-#  MA 02110-1301, USA.
-#  
-#
+
 from Manteia.Classification import Classification 
 from Manteia.Preprocess import Preprocess
 from Manteia.Dataset import Dataset
@@ -33,13 +13,11 @@ def main(args):
 	pp             = Preprocess(documents=documents,labels=labels,nb_sample=500)
 	documents      = pp.documents
 	labels         = pp.labels
+
 	cl             = Classification(documents_train=documents,labels_train=labels)
 	cl.list_labels = pp.list_labels
 
-	cl.load_model()
-	dt_train ,dt_validation = cl.process_text()
-	cl.model.configuration(dt_train)
-	cl.model.fit(dt_train,dt_validation)
+	cl.process()
 
 	print(cl.predict(documents[:5]))
 

diff --git a/Manteia/Model.py b/Manteia/Model.py
@@ -19,6 +19,7 @@
 from transformers import (
     WEIGHTS_NAME,
     BertTokenizer,
+    BartTokenizer,
     XLNetTokenizer,
     XLMTokenizer,
     RobertaTokenizer,
@@ -28,6 +29,7 @@
     FlaubertTokenizer
 )
 from transformers import BertForSequenceClassification
+from transformers import BartForSequenceClassification
 from transformers import RobertaForSequenceClassification
 from transformers import XLMForSequenceClassification
 from transformers import XLNetForSequenceClassification
@@ -37,6 +39,8 @@
 from transformers import FlaubertForSequenceClassification
 from transformers import GPT2Tokenizer, GPT2LMHeadModel
 
+from transformers import BartForConditionalGeneration, BartConfig
+
 from Manteia.Utils import progress
 
 import numpy as np
@@ -48,7 +52,7 @@
 import datetime
 import gc
 
-#model'distilbert','albert','xlnet','roberta','camenbert','scibert'
+#model'bert','distilbert','albert','bart','xlnet','roberta','camenbert','scibert'
 class Model:
 	r"""
 		This is the class to construct model.
@@ -89,10 +93,11 @@ class Model:
 			
 		Attributes:
 	"""
-	def __init__(self,model_name ='bert',model_type=None,num_labels=0,epochs=None,MAX_SEQ_LEN = 128,early_stopping=False,path='./model',verbose=True): 
+	def __init__(self,model_name ='bert',model_type=None,task='classification',num_labels=0,epochs=None,MAX_SEQ_LEN = 128,early_stopping=False,path='./model',verbose=True): 
 
 		self.model_name      = model_name
 		self.model_type      = model_type
+		self.task            = task
 		self.early_stopping  = early_stopping
 		self.num_labels      = num_labels
 		self.MAX_SEQ_LEN     = MAX_SEQ_LEN
@@ -137,7 +142,17 @@ def load_type(self):
 				else:
 					raise TypeError("{} Model type not in : {}".format(self.model_name,model_dict))
 
-		if self.model_name=='xlnet':
+		if self.model_name=='bart':
+			model_dict=['bart-large','bart-large-mnli','bart-large-cnn','bart-large-xsum','mbart-large-en-ro']
+			if self.model_type is None:
+				self.model_type=model_dict[0]
+			else:
+				if self.model_type in model_dict:
+					print('type compatible')
+				else:
+					raise TypeError("{} Model type not in : {}".format(self.model_name,model_dict))
+
+		if self.model_name=='albert':
 			model_dict=['albert-base-v1','albert-large-v1','albert-xlarge-v1','albert-xxlarge-v1','albert-base-v2','albert-large-v2','albert-xlarge-v2','albert-xxlarge-v2']
 			if self.model_type is None:
 				self.model_type=model_dict[0]
@@ -186,6 +201,15 @@ def load_type(self):
 					print('type compatible')
 				else:
 					raise TypeError("{} Model type not in : {}".format(self.model_name,model_dict))
+		if self.model_name=='flaubert':
+			model_dict=['flaubert-base-uncased', 'flaubert-small-cased', 'flaubert-base-cased', 'flaubert-large-cased']
+			if self.model_type is None:
+				self.model_type=model_dict[0]
+			else:
+				if self.model_type in model_dict:
+					print('type compatible')
+				else:
+					raise TypeError("{} Model type not in : {}".format(self.model_name,model_dict))
 
 
 	def load_tokenizer(self):
@@ -195,19 +219,21 @@ def load_tokenizer(self):
 		if self.model_name=='bert':
 			self.tokenizer = BertTokenizer.from_pretrained      (self.model_type, do_lower_case=True)
 		if self.model_name=='distilbert':
-			self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
+			self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_type, do_lower_case=True)
 		if self.model_name=='albert':
-			self.tokenizer = AlbertTokenizer.from_pretrained    ('albert-base-v1', do_lower_case=True)	
+			self.tokenizer = AlbertTokenizer.from_pretrained    (self.model_type, do_lower_case=True)
+		if self.model_name=='bart':
+			self.tokenizer = BartTokenizer.from_pretrained    (self.model_type, do_lower_case=True)
 		if self.model_name=='xlnet':
-			self.tokenizer = XLNetTokenizer.from_pretrained     ('xlnet-base-cased', do_lower_case=True)
+			self.tokenizer = XLNetTokenizer.from_pretrained     (self.model_type, do_lower_case=True)
 		if self.model_name=='roberta':
-			self.tokenizer = RobertaTokenizer.from_pretrained   ('roberta-base', do_lower_case=True)
+			self.tokenizer = RobertaTokenizer.from_pretrained   (self.model_type, do_lower_case=True)
 		if self.model_name=='camenbert':
-			self.tokenizer = CamembertTokenizer.from_pretrained ('camembert-base', do_lower_case=True)
+			self.tokenizer = CamembertTokenizer.from_pretrained (self.model_type, do_lower_case=True)
 		if self.model_name=='flaubert':
-			self.tokenizer = FlaubertTokenizer.from_pretrained  ('flaubert-base-uncased', do_lower_case=True)
+			self.tokenizer = FlaubertTokenizer.from_pretrained  (self.model_type, do_lower_case=True)
 		if self.model_name=='gpt2':
-			self.tokenizer = GPT2Tokenizer.from_pretrained      ('gpt2-medium')
+			self.tokenizer = GPT2Tokenizer.from_pretrained      (self.model_type)
 
 	def load_class(self):
 		# Load the tokenizer.
@@ -223,19 +249,26 @@ def load_class(self):
 			output_hidden_states = False, # Whether the model returns all hidden-states.
 		)
 		if self.model_name=='distilbert':
-			self.model     = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
+			self.model     = DistilBertForSequenceClassification.from_pretrained(self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
 		if self.model_name=='albert':
-			self.model     = AlbertForSequenceClassification.from_pretrained    ("albert-base-v1",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
+			self.model     = AlbertForSequenceClassification.from_pretrained    (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
+		if self.model_name=='bart':
+			if self.task=='classification':
+				self.model = BartForSequenceClassification.from_pretrained      (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
+			if self.task=='summarize':
+				self.model = BartForConditionalGeneration.from_pretrained       (self.model_type)
+
+
 		if self.model_name=='xlnet':
-			self.model     = XLNetForSequenceClassification.from_pretrained     ("xlnet-base-cased",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
+			self.model     = XLNetForSequenceClassification.from_pretrained     (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
 		if self.model_name=='roberta':
-			self.model     = RobertaForSequenceClassification.from_pretrained   ("roberta-base",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
+			self.model     = RobertaForSequenceClassification.from_pretrained   (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
 		if self.model_name=='camenbert':
-			self.model     = CamembertForSequenceClassification.from_pretrained ("camembert-base",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
+			self.model     = CamembertForSequenceClassification.from_pretrained (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
 		if self.model_name=='flaubert':
-			self.model     = FlaubertForSequenceClassification.from_pretrained  ("flaubert-base-uncased",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
+			self.model     = FlaubertForSequenceClassification.from_pretrained  (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
 		if self.model_name=='gpt2-medium':
-			self.model     = GPT2LMHeadModel.from_pretrained('gpt2-medium')
+			self.model     = GPT2LMHeadModel.from_pretrained                    (self.model_type)
 
 	def devices(self):
 		# If there's a GPU available...
@@ -533,12 +566,12 @@ def save(self,file_name):
 			else:
 				print ("Successfully created the directory %s " % self.path)
 		self.model.to(torch.device('cpu'))
-		torch.save(self.model.module.state_dict(),self.path+file_name)
+		torch.save(self.model.module.state_dict(),os.path.join(self.path,file_name))
 		self.model.to(self.device)
 
 	def load(self,file_name):
 			self.load_class()
-			self.model.load_state_dict(torch.load(self.path+file_name))
+			self.model.load_state_dict(torch.load(os.path.join(self.path,file_name)))
 			self.model.to(self.device)
 
 def choose_from_top(probs, n=5):

diff --git a/Manteia/Summarize.py b/Manteia/Summarize.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+.. module:: Summarize
+   :platform: Unix, Windows
+   :synopsis: Summarize documents with a pretrained BART model.
+
+.. moduleauthor:: Yves Mercadier <manteia.ym001@gmail.com>
+
+
+"""
+import numpy as np
+import random
+import pandas as pd
+import sklearn
+from sklearn.model_selection import train_test_split,KFold
+import time
+import datetime
+import gc
+from .Model import *
+from .Preprocess import Preprocess,list_labels
+
+class Summarize:
+	r"""
+		This is the class to summarize text.
+
+		Args:
+
+			model (:obj:`Model`, optional, defaults to None):
+				A Manteia Model. When None, load_model builds a 'bart'
+				model of type 'bart-large-cnn' with task='summarize'.
+
+			documents (:obj:`list`, optional, defaults to []):
+				A list of documents (str) to summarize.
+
+			verbose (:obj:`bool`, optional, defaults to True):
+				Stored on the instance; not read inside this class.
+
+
+		Example 1::
+
+			from Manteia.Summarize import Summarize
+
+			Summarize(documents=['A long text to summarize.'])
+
+	"""
+	def __init__(self,model=None,documents = [],verbose=True):
+
+		self.verbose   = verbose
+		self.model     = model
+		# Copy so neither the shared default list nor the caller's list is aliased.
+		self.documents = list(documents) if documents is not None else []
+
+		self.load_model()
+		# Nothing to tokenize or summarize when no document was given.
+		if self.documents:
+			inputs=self.process_text()
+			print(self.predict(inputs))
+
+	def load_model(self):
+		"""
+		Build the default model when none was supplied, then load its
+		type, tokenizer and class.
+		"""
+		if self.model is None:
+			self.model = Model(model_name ='bart',model_type='bart-large-cnn',task='summarize')
+		self.model.load_type()
+		self.model.load_tokenizer()
+		self.model.load_class()
+
+	def process_text(self):
+		r"""
+		Tokenize self.documents with the model tokenizer.
+
+		Returns:
+			The value returned by tokenizer.batch_encode_plus, called
+			with max_length=1024 and return_tensors='pt'.
+		"""
+		# NOTE(review): no padding option is passed - confirm documents of
+		# different lengths can be batched into one tensor.
+		inputs = self.model.tokenizer.batch_encode_plus(self.documents, max_length=1024, return_tensors='pt')
+		return inputs
+
+	def predict(self,inputs):
+		r"""
+		Generate one summary per encoded document.
+
+		Args:
+
+			inputs:
+				The value returned by process_text; only
+				inputs['input_ids'] is read.
+
+		Returns:
+			A list with one decoded summary per generated sequence.
+		"""
+		# Use the wrapper's own generate() when it has one; otherwise fall back
+		# to the network that Model.load_class stores on its .model attribute.
+		generator   = self.model if hasattr(self.model,'generate') else self.model.model
+		# NOTE(review): max_length=5 caps each summary at 5 tokens - confirm intended.
+		summary_ids = generator.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
+		summary     = [self.model.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
+		return summary
+
+
+
+
diff --git a/Manteia/__init__.py b/Manteia/__init__.py
@@ -28,7 +28,7 @@
 	reminiscent.
 """
 
-__version__ = "0.0.26"
+__version__ = "0.0.29"
 
 
 from Manteia import Classification

diff --git a/README.md b/README.md
@@ -11,6 +11,13 @@ You can install it with pip :
 
      __pip install Manteia__
 
+For use with a GPU and CUDA, we recommend using [Anaconda](https://www.anaconda.com/open-source):
+
+     __conda create -n manteia_env python=3.7__
+     __conda activate manteia_env__
+     __conda install pytorch__
+     __pip install manteia__
+
 Example of use Classification :
 
 

diff --git a/setup.py b/setup.py
@@ -6,13 +6,13 @@
 
 #with open('requirements.txt') as f:
 #    requirements = f.read().splitlines()
-requirements = ['wget==3.2','matplotlib==3.2.1','nltk==3.5','numpy==1.18.4','pandas==1.0.3','scikit_learn==0.23.1','seaborn==0.10.1','torch==1.5.0','transformers==2.9.1']
+requirements = ['wget==3.2','matplotlib==3.2.1','nltk==3.5','numpy==1.18.4','pandas==1.0.3','scikit_learn==0.23.1','seaborn==0.10.1','torch','transformers==2.9.1']
 setup(
 
     name='Manteia',
 
 #    version=Manteia.__version__,
-    version="0.0.26",
+    version="0.0.29",
 
     packages=find_packages(),