Skip to content

Commit

Permalink
Mantéïa
Browse files Browse the repository at this point in the history
  • Loading branch information
ym001 committed May 27, 2020
1 parent 2b147b4 commit 644f5f1
Show file tree
Hide file tree
Showing 7 changed files with 173 additions and 48 deletions.
2 changes: 1 addition & 1 deletion Exemples/exemple_Classification1.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@

labels = ['funny','not funny']

model = Model(model_name ='roberta')
model = Model(model_name ='bert')
cl=Classification(model,documents,labels,process_classif=True)
28 changes: 3 additions & 25 deletions Exemples/exemple_Classification_dataset.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# exemple_Data.py
#
# Copyright 2020 Yves <yves@mercadier>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#

from Manteia.Classification import Classification
from Manteia.Preprocess import Preprocess
from Manteia.Dataset import Dataset
Expand All @@ -33,13 +13,11 @@ def main(args):
pp = Preprocess(documents=documents,labels=labels,nb_sample=500)
documents = pp.documents
labels = pp.labels

cl = Classification(documents_train=documents,labels_train=labels)
cl.list_labels = pp.list_labels

cl.load_model()
dt_train ,dt_validation = cl.process_text()
cl.model.configuration(dt_train)
cl.model.fit(dt_train,dt_validation)
cl.process()

print(cl.predict(documents[:5]))

Expand Down
71 changes: 52 additions & 19 deletions Manteia/Model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from transformers import (
WEIGHTS_NAME,
BertTokenizer,
BartTokenizer,
XLNetTokenizer,
XLMTokenizer,
RobertaTokenizer,
Expand All @@ -28,6 +29,7 @@
FlaubertTokenizer
)
from transformers import BertForSequenceClassification
from transformers import BartForSequenceClassification
from transformers import RobertaForSequenceClassification
from transformers import XLMForSequenceClassification
from transformers import XLNetForSequenceClassification
Expand All @@ -37,6 +39,8 @@
from transformers import FlaubertForSequenceClassification
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from transformers import BartForConditionalGeneration, BartConfig

from Manteia.Utils import progress

import numpy as np
Expand All @@ -48,7 +52,7 @@
import datetime
import gc

#model'distilbert','albert','xlnet','roberta','camenbert','scibert'
#model'bert','distilbert','albert','bart','xlnet','roberta','camenbert','scibert'
class Model:
r"""
This is the class to construct model.
Expand Down Expand Up @@ -89,10 +93,11 @@ class Model:
Attributes:
"""
def __init__(self,model_name ='bert',model_type=None,num_labels=0,epochs=None,MAX_SEQ_LEN = 128,early_stopping=False,path='./model',verbose=True):
def __init__(self,model_name ='bert',model_type=None,task='classification',num_labels=0,epochs=None,MAX_SEQ_LEN = 128,early_stopping=False,path='./model',verbose=True):

self.model_name = model_name
self.model_type = model_type
self.task = task
self.early_stopping = early_stopping
self.num_labels = num_labels
self.MAX_SEQ_LEN = MAX_SEQ_LEN
Expand Down Expand Up @@ -137,7 +142,17 @@ def load_type(self):
else:
raise TypeError("{} Model type not in : {}".format(self.model_name,model_dict))

if self.model_name=='xlnet':
if self.model_name=='bart':
model_dict=['bart-large','bart-large-mnli','bart-large-cnn','bart-large-xsum','mbart-large-en-ro']
if self.model_type is None:
self.model_type=model_dict[0]
else:
if self.model_type in model_dict:
print('type compatible')
else:
raise TypeError("{} Model type not in : {}".format(self.model_name,model_dict))

if self.model_name=='albert':
model_dict=['albert-base-v1','albert-large-v1','albert-xlarge-v1','albert-xxlarge-v1','albert-base-v2','albert-large-v2','albert-xlarge-v2','albert-xxlarge-v2']
if self.model_type is None:
self.model_type=model_dict[0]
Expand Down Expand Up @@ -186,6 +201,15 @@ def load_type(self):
print('type compatible')
else:
raise TypeError("{} Model type not in : {}".format(self.model_name,model_dict))
if self.model_name=='flaubert':
model_dict=['flaubert-base-uncased', 'flaubert-small-cased', 'flaubert-base-cased', 'flaubert-large-cased']
if self.model_type is None:
self.model_type=model_dict[0]
else:
if self.model_type in model_dict:
print('type compatible')
else:
raise TypeError("{} Model type not in : {}".format(self.model_name,model_dict))


def load_tokenizer(self):
Expand All @@ -195,19 +219,21 @@ def load_tokenizer(self):
if self.model_name=='bert':
self.tokenizer = BertTokenizer.from_pretrained (self.model_type, do_lower_case=True)
if self.model_name=='distilbert':
self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_type, do_lower_case=True)
if self.model_name=='albert':
self.tokenizer = AlbertTokenizer.from_pretrained ('albert-base-v1', do_lower_case=True)
self.tokenizer = AlbertTokenizer.from_pretrained (self.model_type, do_lower_case=True)
if self.model_name=='bart':
self.tokenizer = BartTokenizer.from_pretrained (self.model_type, do_lower_case=True)
if self.model_name=='xlnet':
self.tokenizer = XLNetTokenizer.from_pretrained ('xlnet-base-cased', do_lower_case=True)
self.tokenizer = XLNetTokenizer.from_pretrained (self.model_type, do_lower_case=True)
if self.model_name=='roberta':
self.tokenizer = RobertaTokenizer.from_pretrained ('roberta-base', do_lower_case=True)
self.tokenizer = RobertaTokenizer.from_pretrained (self.model_type, do_lower_case=True)
if self.model_name=='camenbert':
self.tokenizer = CamembertTokenizer.from_pretrained ('camembert-base', do_lower_case=True)
self.tokenizer = CamembertTokenizer.from_pretrained (self.model_type, do_lower_case=True)
if self.model_name=='flaubert':
self.tokenizer = FlaubertTokenizer.from_pretrained ('flaubert-base-uncased', do_lower_case=True)
self.tokenizer = FlaubertTokenizer.from_pretrained (self.model_type, do_lower_case=True)
if self.model_name=='gpt2':
self.tokenizer = GPT2Tokenizer.from_pretrained ('gpt2-medium')
self.tokenizer = GPT2Tokenizer.from_pretrained (self.model_type)

def load_class(self):
# Load the tokenizer.
Expand All @@ -223,19 +249,26 @@ def load_class(self):
output_hidden_states = False, # Whether the model returns all hidden-states.
)
if self.model_name=='distilbert':
self.model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
self.model = DistilBertForSequenceClassification.from_pretrained(self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
if self.model_name=='albert':
self.model = AlbertForSequenceClassification.from_pretrained ("albert-base-v1",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
self.model = AlbertForSequenceClassification.from_pretrained (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
if self.model_name=='bart':
if self.task=='classification':
self.model = BartForSequenceClassification.from_pretrained (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
if self.task=='summarize':
self.model = BartForConditionalGeneration.from_pretrained (self.model_type)


if self.model_name=='xlnet':
self.model = XLNetForSequenceClassification.from_pretrained ("xlnet-base-cased",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
self.model = XLNetForSequenceClassification.from_pretrained (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
if self.model_name=='roberta':
self.model = RobertaForSequenceClassification.from_pretrained ("roberta-base",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
self.model = RobertaForSequenceClassification.from_pretrained (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
if self.model_name=='camenbert':
self.model = CamembertForSequenceClassification.from_pretrained ("camembert-base",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
self.model = CamembertForSequenceClassification.from_pretrained (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
if self.model_name=='flaubert':
self.model = FlaubertForSequenceClassification.from_pretrained ("flaubert-base-uncased",num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
self.model = FlaubertForSequenceClassification.from_pretrained (self.model_type,num_labels = self.num_labels,output_attentions = False,output_hidden_states = False,)
if self.model_name=='gpt2-medium':
self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
self.model = GPT2LMHeadModel.from_pretrained (self.model_type)

def devices(self):
# If there's a GPU available...
Expand Down Expand Up @@ -533,12 +566,12 @@ def save(self,file_name):
else:
print ("Successfully created the directory %s " % self.path)
self.model.to(torch.device('cpu'))
torch.save(self.model.module.state_dict(),self.path+file_name)
torch.save(self.model.module.state_dict(),os.path.join(self.path,file_name))
self.model.to(self.device)

def load(self,file_name):
self.load_class()
self.model.load_state_dict(torch.load(self.path+file_name))
self.model.load_state_dict(torch.load(os.path.join(self.path,file_name)))
self.model.to(self.device)

def choose_from_top(probs, n=5):
Expand Down
107 changes: 107 additions & 0 deletions Manteia/Summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
.. module:: Summarize
   :platform: Unix, Windows
   :synopsis: Text summarization with a pretrained BART model.
.. moduleauthor:: Yves Mercadier <manteia.ym001@gmail.com>
"""
import numpy as np
import random
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split,KFold
import time
import datetime
import gc
from .Model import *
from .Preprocess import Preprocess,list_labels

class Summarize:
    r"""
    Summarize a list of documents with a pretrained sequence-to-sequence model.

    Constructing an instance loads the model and, when ``documents`` is not
    empty, tokenizes the documents and prints the generated summaries.

    Args:
        model (:obj:`Model`, optional, defaults to None):
            A Manteia ``Model`` wrapper. When None, a BART wrapper is created
            with ``model_type='bart-large-cnn'`` and ``task='summarize'``.
        documents (:obj:`list`, optional, defaults to None):
            A list of documents (str) to summarize. None is treated as an
            empty list.
        verbose (:obj:`bool`, optional, defaults to True):
            Stored on the instance; not otherwise used by this class.

    Example 1::

        from Manteia.Summarize import Summarize

        Summarize(documents=['A long text to summarize ...'])
    """
    def __init__(self, model=None, documents=None, verbose=True):
        self.verbose = verbose
        self.model = model
        # A fresh list per instance: a mutable default argument would be
        # shared between every instance created without ``documents``.
        self.documents = [] if documents is None else documents

        self.load_model()
        # Nothing to tokenize or generate for an empty document list.
        if self.documents:
            inputs = self.process_text()
            print(self.predict(inputs))

    def load_model(self):
        r"""
        Create the default BART summarization wrapper when no model was
        given, then load its type, tokenizer and network.

        Example 3::

            from Manteia.Summarize import Summarize
        """
        if self.model is None:
            self.model = Model(model_name ='bart',model_type='bart-large-cnn',task='summarize')
        # NOTE(review): the three load steps are assumed to run for a
        # caller-supplied wrapper as well as for the default one - confirm.
        self.model.load_type()
        self.model.load_tokenizer()
        self.model.load_class()

    def process_text(self):
        r"""
        Tokenize ``self.documents`` with the wrapper's tokenizer.

        Returns:
            The value returned by ``tokenizer.batch_encode_plus`` (requested
            as PyTorch tensors); ``predict`` reads its ``'input_ids'`` entry.

        Example 4::

            from Manteia.Summarize import Summarize
        """
        # NOTE(review): no padding is requested here; documents of different
        # token lengths may not fit into a single tensor - confirm.
        inputs = self.model.tokenizer.batch_encode_plus(self.documents, max_length=1024, return_tensors='pt')
        return inputs

    def predict(self, inputs, num_beams=4, max_length=5, early_stopping=True):
        r"""
        Generate one summary per encoded document.

        Args:
            inputs (:obj:`dict`):
                Encoded documents as returned by ``process_text``; only
                ``inputs['input_ids']`` is read.
            num_beams (:obj:`int`, optional, defaults to 4):
                Forwarded to ``generate``.
            max_length (:obj:`int`, optional, defaults to 5):
                Forwarded to ``generate``.
            early_stopping (:obj:`bool`, optional, defaults to True):
                Forwarded to ``generate``.

        Returns:
            :obj:`list`: the decoded summaries (str), one per generated
            sequence.

        Example 5::

            from Manteia.Summarize import Summarize
        """
        wrapper = self.model
        # ``Model.load_class`` stores the transformers network on the
        # wrapper's ``model`` attribute. Use a wrapper-level ``generate`` when
        # one exists, otherwise call the underlying network directly.
        generate = getattr(wrapper, 'generate', None)
        if generate is None:
            generate = wrapper.model.generate

        summary_ids = generate(
            inputs['input_ids'],
            num_beams=num_beams,
            max_length=max_length,
            early_stopping=early_stopping,
        )
        decode = wrapper.tokenizer.decode
        return [
            decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for g in summary_ids
        ]




2 changes: 1 addition & 1 deletion Manteia/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
reminiscent.
"""

__version__ = "0.0.26"
__version__ = "0.0.29"


from Manteia import Classification
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@ You can install it with pip :

     __pip install Manteia__

To use Manteia with a GPU and CUDA, we recommend installing it in an [Anaconda](https://www.anaconda.com/open-source) environment:

     __conda create -n manteia_env python=3.7__
     __conda activate manteia_env__
     __conda install pytorch__
     __pip install manteia__

Example of use for classification:


Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@

#with open('requirements.txt') as f:
# requirements = f.read().splitlines()
requirements = ['wget==3.2','matplotlib==3.2.1','nltk==3.5','numpy==1.18.4','pandas==1.0.3','scikit_learn==0.23.1','seaborn==0.10.1','torch==1.5.0','transformers==2.9.1']
requirements = ['wget==3.2','matplotlib==3.2.1','nltk==3.5','numpy==1.18.4','pandas==1.0.3','scikit_learn==0.23.1','seaborn==0.10.1','torch','transformers==2.9.1']
setup(

name='Manteia',

# version=Manteia.__version__,
version="0.0.26",
version="0.0.29",

packages=find_packages(),

Expand Down

0 comments on commit 644f5f1

Please sign in to comment.