hg

ym001 · May 6, 2020 · abf1805 · abf1805
1 parent 3181f02
commit abf1805
Show file tree

Hide file tree

Showing 12 changed files with 332 additions and 45 deletions.
diff --git a/Manteia/Classification.py b/Manteia/Classification.py
@@ -43,13 +43,14 @@ class Classification:
 			
 		Attributes:
 	"""
-	def __init__(self,model_name ='bert',documents = None,labels = None): 
+	def __init__(self,model_name ='bert',documents = [],labels = []): 
 		self.MAX_SEQ_LEN = 64
 		self.model_name  = model_name
 
-		if documents!=None and labels!=None:
-			pp               = Preprocess(documents,labels)
+		if documents!=[] and labels!=[]:
+			pp               = Preprocess(documents=documents,labels=labels)
 			self.list_labels = pp.list_labels
+			print(self.list_labels)
 			self.model       = Model(num_labels=len(pp.list_labels))
 			self.model.load()
 
@@ -65,6 +66,7 @@ def __init__(self,model_name ='bert',documents = None,labels = None):
 
 			self.model.configuration(dt_train)
 			self.model.fit(dt_train,dt_validation)
+
 	def test(self):
 		return "Classification Mantéïa."
 

diff --git a/Manteia/Model.py b/Manteia/Model.py
@@ -33,7 +33,8 @@
     RobertaTokenizer,
     DistilBertTokenizer,
     AlbertTokenizer,
-    CamembertTokenizer
+    CamembertTokenizer,
+    FlaubertTokenizer
 )
 from transformers import BertForSequenceClassification
 from transformers import RobertaForSequenceClassification
@@ -42,6 +43,7 @@
 from transformers import DistilBertForSequenceClassification
 from transformers import AlbertForSequenceClassification
 from transformers import CamembertForSequenceClassification
+from transformers import FlaubertForSequenceClassification
 from transformers import GPT2Tokenizer, GPT2LMHeadModel
 
 import numpy as np
@@ -55,25 +57,66 @@
 
 #model'distilbert','albert','xlnet','roberta','camenbert','scibert'
 class Model:
-	def __init__(self,model_name ='bert',num_labels=None): # constructeur
+	r"""
+		This is the class used to construct a model.
+		
+		Args:
+		
+			model_name (:obj:`string`, optional, defaults to 'bert'):
+				Name of the model to load.
+			num_labels (:obj:`int`, optional, defaults to 0):
+				Number of categories for classification.
+				
+
+				 
+		Example::
+		
+			from Manteia.Preprocess import Preprocess
+			from Manteia.Model import Model,encode_text,encode_label,Create_DataLoader_train
+			from sklearn.model_selection import train_test_split
+
+			documents=['a text','text b']
+			labels=['a','b']
+			pp               = Preprocess(documents=documents,labels=labels)
+			model       = Model(model_name='bert',num_labels=len(pp.list_labels))
+			model.load()
+
+			train_text, validation_text, train_labels, validation_labels = train_test_split(pp.documents, pp.labels, random_state=2018, test_size=0.1)
+
+			train_ids,train_masks           = encode_text(train_text,model.tokenizer,model.MAX_SEQ_LEN)
+			validation_ids,validation_masks = encode_text(validation_text,model.tokenizer,model.MAX_SEQ_LEN)
+			train_labels                    = encode_label(train_labels,pp.list_labels)
+			validation_labels               = encode_label(validation_labels,pp.list_labels)
+
+			dt_train          = Create_DataLoader_train(train_ids,train_masks,train_labels)
+			dt_validation     = Create_DataLoader_train(validation_ids,validation_masks,validation_labels)
+		
+			model.configuration(dt_train)
+			model.fit(dt_train,dt_validation)
+			
+		Attributes:
+	"""
+	def __init__(self,model_name ='bert',num_labels=0): # constructeur
 		self.model_name = model_name
 		self.batch_size = 32
 		self.epochs = 4
-		self.MAX_SEQ_LEN = 64
+		self.MAX_SEQ_LEN = 12
+
 		self.num_labels=num_labels
 	def test(self):
 		return "Model Mantéïa."
 	def load(self):
 		# Load the tokenizer.
 		print('Loading {} tokenizer...'.format(self.model_name))
-
+		num_labels = self.num_labels # The number of output labels
 		if self.model_name=='bert':
-			self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
+			#model_type='bert-base-uncased'
+			model_type='bert-base-multilingual-cased'
+			self.tokenizer = BertTokenizer.from_pretrained(model_type, do_lower_case=True)
 
 			# Load BertForSequenceClassification, the pretrained BERT model with a single 
 			# linear classification layer on top. 
-			self.model     = BertForSequenceClassification.from_pretrained("bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
-			num_labels = self.num_labels, # The number of output labels--2 for binary classification.
+			self.model     = BertForSequenceClassification.from_pretrained(model_type, # Use the 12-layer BERT model, with an uncased vocab.
 			# You can increase this for multi-class tasks.   
 			output_attentions = False, # Whether the model returns attentions weights.
 			output_hidden_states = False, # Whether the model returns all hidden-states.
@@ -97,6 +140,10 @@ def load(self):
 		if self.model_name=='camenbert':
 			self.tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)
 			self.model     = CamembertForSequenceClassification.from_pretrained("camembert-base",num_labels = num_labels,output_attentions = False,output_hidden_states = False,)
+
+		if self.model_name=='flaubert':
+			self.tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-uncased', do_lower_case=True)
+			self.model     = FlaubertForSequenceClassification.from_pretrained("flaubert-base-uncased",num_labels = num_labels,output_attentions = False,output_hidden_states = False,)
 		if self.model_name=='gpt2-medium':
 			self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
 			self.model     = GPT2LMHeadModel.from_pretrained('gpt2-medium')
@@ -174,22 +221,22 @@ def fit(self,train_dataloader,validation_dataloader):
 
 				loss_values.append(avg_train_loss)
 
-				print("")
-				print("  Average training loss: {0:.2f}".format(avg_train_loss))
-				print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
+			print("")
+			print("  Average training loss: {0:.2f}".format(avg_train_loss))
+			print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
 
 
-				print("")
-				print("Running Validation...")
+			print("")
+			print("Running Validation...")
 
-				t0 = time.time()
+			t0 = time.time()
 
-				self.model.eval()
+			self.model.eval()
 
-				eval_loss, eval_accuracy = 0, 0
-				nb_eval_steps, nb_eval_examples = 0, 0
+			eval_loss, eval_accuracy = 0, 0
+			nb_eval_steps, nb_eval_examples = 0, 0
 
-				for batch in validation_dataloader:
+			for batch in validation_dataloader:
 
 					batch = tuple(t.to(self.device) for t in batch)
 
@@ -212,8 +259,8 @@ def fit(self,train_dataloader,validation_dataloader):
 
 					nb_eval_steps += 1
 
-				print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
-				print("  Validation took: {:}".format(format_time(time.time() - t0)))
+			print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
+			print("  Validation took: {:}".format(format_time(time.time() - t0)))
 
 		print("")
 		print("Training complete!")

diff --git a/Manteia/Preprocess.py b/Manteia/Preprocess.py
@@ -75,36 +75,40 @@ class Preprocess:
 			
 		Attributes:
 	"""
-	def __init__(self,documents=None,labels=None,percentage=1.0,size_by_nb_sample=False,nb_sample=None,path='./Document/',lang='english',preprocess=True):
+	def __init__(self,documents=[],labels=[],percentage=1.0,size_by_nb_sample=False,nb_sample=None,path='./Document/',lang='english',preprocess=True):
 
 		self.documents=documents
 		self.labels=labels
 		self.percentage=percentage
 		self.size_by_nb_sample=size_by_nb_sample
 		self.path=path
 		self.lang=lang
-		if preprocess and documents!=None and labels!=None:
+		if preprocess and documents!=[] and labels!=[]:
 			print('Preprocess...')
+			################
+			for i in range(len(documents)):
+				documents[i]=str(documents[i])
+			for i in range(len(labels)):
+				labels[i]=str(labels[i])
+			################
 			self.load()
 			self.reduction()
 			self.df_documents=clean(self.df_documents)
 			self.list_labels=self.list_labels(self.df_labels[LABEL_COLUMN].values.tolist())
 
 			self.documents=self.df_documents[TEXT_COLUMN].values.tolist()
-			print(self.documents)
 			self.labels=self.df_labels[LABEL_COLUMN].values.tolist()
-			#self.construct_id()
 
 	def test(self):
 		return "Preprocess Mantéïa."
 
 	def load(self): # load data -> dataframe df
-		if self.documents!=None:
+		if self.documents!=[]:
 			self.df_documents=pd.DataFrame({TEXT_COLUMN:self.documents})
-		if self.labels!=None:
+		if self.labels!=[]:
 			self.df_labels  =pd.DataFrame({LABEL_COLUMN:self.labels})
 			#multiclass
-			self.df_labels[LABEL_COLUMN] = self.df_labels[LABEL_COLUMN].apply(lambda x: x[0])
+			#self.df_labels[LABEL_COLUMN] = self.df_labels[LABEL_COLUMN].apply(lambda x: x[0])
 
 	def reduction(self):
 		if self.size_by_nb_sample==True:
@@ -127,11 +131,19 @@ def get_labels(self):
 
 	def get_df(self):
 		return pd.DataFrame({TEXT_COLUMN:self.df_documents[TEXT_COLUMN] , LABEL_COLUMN:self.df_labels[LABEL_COLUMN]})
-
+
+
 	def list_labels(self,labels):
 		return list(np.sort(np.unique(np.array(labels)), axis=0))
-
-
+	'''
+	def list_labels(self,labels):
+		label=[]
+		for l in labels:
+				if l not in label:
+					label.append(l)
+		label.sort(reverse=False)
+		return label
+	'''
 
 def clean_stop_word(df,lang='english'):
 		stop_unicode = stopwords.words(lang)

diff --git a/Manteia/Statistic.py b/Manteia/Statistic.py
@@ -26,13 +26,39 @@
 
 class Statistic:
 
-
-	def __init__(self,documents=None,labels=None,name=None,path='',statistic=True):
+	r"""
+		This is the class to compute statistics on texts.
+		
+		Args:
+		
+				
+			documents (:obj:`list`, optional, defaults to []):
+				A list of documents.
+				
+			labels (:obj:`list`, optional, defaults to []):
+				A list of labels.
+
+			dataset_name (:obj:`string`, optional, defaults to ''):
+				Name of the dataset.
+
+			path (:obj:`string`, optional, defaults to ''):
+				Path to save the report.
+				 
+		Example::
+		
+			from Manteia.Statistic import Statistic
+			documents=['a text','text b']
+			labels=['a','b']
+			Statistic(documents,labels)
+			
+		Attributes:
+	"""
+	def __init__(self,documents=[],labels=[],dataset_name='',path='',statistic=True):
 		self.documents=documents
 		self.labels=labels
 		self.path=path
-		self.name=name
-		if statistic==True and documents!=None and labels!=None:
+		self.dataset_name=dataset_name
+		if statistic==True and documents!=[] and labels!=[]:
 			self.list_labels=self.list_labels(labels)
 			self.print_report()
 
@@ -139,7 +165,7 @@ def class_imbalance(self):
 
 	def report(self):
 		report=''
-		report+="Dataset : {}\n".format(self.name)
+		report+="Dataset : {}\n".format(self.dataset_name)
 		report+="Number of documents : {}\n".format(self.number_text())
 		report+="Type : {}\n".format(self.type(self.labels))
 		report+="List of labels : {}\n".format(self.list_labels)
@@ -155,9 +181,9 @@ def print_report(self):
 		print(self.report())
 
 	def save_report(self):
-		fichier=self.path+"statistical_report_"+self.name+".txt"
+		fichier=self.path+"statistical_report_"+self.dataset_name+".txt"
 		mon_fichier = open(fichier, "w") 
-		mon_fichier.write(self.rapport)
+		mon_fichier.write(self.report)
 		mon_fichier.close()
 
 	def list_labels(self,labels):

diff --git a/Manteia/Visualisation.py b/Manteia/Visualisation.py
@@ -29,9 +29,65 @@
 import matplotlib.pyplot as plt
 
 class Visualisation:
+
+	r"""
+		This is the class to produce visualisations for NLP tasks.
+		
+		Args:
+		
+				
+			documents (:obj:`list`, optional, defaults to None):
+				A list of documents.
+				
+			labels (:obj:`float`, optional, defaults to None):
+				A list of labels.
+
+			name (:obj:`string`, optional, defaults to ''):
+				Name of the dataset.
+
+			path (:obj:`string`, optional, defaults to ''):
+				Path to save the report.
+
+			save (:obj:`bool`, optional, defaults to False):
+				save the graph to the path.
+
+			show (:obj:`bool`, optional, defaults to True):
+				show the graph.
+				 
+		Example::
+		
+			from Manteia.Statistic import Statistic 
+			from Manteia.Visualisation import Visualisation
+			
+			documents = [
+			'  !?? What do you call a potato in space? Spudnik:::13 ;;    //   ',
+			'What should you do before criticizing Pac-Man? WAKA WAKA WAKA mile in his shoe.',
+			'What did Arnold Schwarzenegger say at the abortion clinic? Hasta last vista, baby.',
+			'Why do you never see elephants hiding in trees? \'Cause they are freaking good at it',
+			'My son just got a tattoo of a heart, a spade, a club, and a diamond, all without my permission. I guess I\'ll deal with him later.',
+			'Mom: "Do you want this?" Me: "No." Mom: "Ok I\'ll give it to your brother." Me: "No I want it."',
+			'Ibuprofen is my favorite headache medicine that also sounds like a reggae professor.',
+			'INTERVIEWER: Why do you want to work here? ME: *crumbs tumbling from my mouth* Oh, I don\'t. I was just walking by and saw you had donuts.',
+			'I\'ve struggled for years to be above the influence... But I\'ve never been able to get that high',
+			'With Facebook, you can stay in touch with people you would otherwise never talk to, but that\'s only one of the many awful things about it',
+			]
+			
+			labels = [
+			['funny'],['not funny'],['funny'],['not funny'],['funny'],['not funny'],['not funny'],['not funny'],['funny'],['not funny'],
+			]
+			
+			stat=Statistic(documents,labels)
+			dictionary=stat.dictionnary_stat_labels()
+			path='./visu.png'
+			visu = Visualisation(path)
+			visu.format_data(dictionary)
+			visu.plot_bar()
+			
+		Attributes:
+	"""
 
 
-	def __init__(self,path='',name='',save=True,show=False):
+	def __init__(self,path='',name='',save=False,show=True):
 		self.path=path
 		self.name=name
 		self.save=save

diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle
diff --git a/docs/_build/doctrees/index.doctree b/docs/_build/doctrees/index.doctree