Skip to content

Commit

Permalink
Mantéïa
Browse files Browse the repository at this point in the history
  • Loading branch information
ym001 committed May 23, 2020
1 parent 6679bc0 commit c16e07a
Show file tree
Hide file tree
Showing 30 changed files with 355 additions and 188 deletions.
7 changes: 4 additions & 3 deletions Manteia/Classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def load_model(self):
"""
if self.model is None:
self.model = Model()
self.model.load_type()
self.model.load_tokenizer()
self.model.num_labels=len(self.list_labels)
self.model.load_class()
Expand Down Expand Up @@ -160,8 +161,8 @@ def process_text(self):
train_labels = encode_label(train_labels,self.list_labels)
validation_labels = encode_label(validation_labels,self.list_labels)

dt_train = Create_DataLoader_train(train_ids,train_masks,train_labels)
dt_validation = Create_DataLoader_train(validation_ids,validation_masks,validation_labels)
dt_train = Create_DataLoader(train_ids,train_masks,train_labels)
dt_validation = Create_DataLoader(validation_ids,validation_masks,validation_labels)
return dt_train ,dt_validation

def predict(self,documents):
Expand Down Expand Up @@ -193,7 +194,7 @@ def predict(self,documents):
inputs,masks = encode_text(documents,self.model.tokenizer)
predict_inputs = totensors(inputs)
predict_masks = totensors(masks)
dt = Create_DataLoader_predict(inputs=predict_inputs,masks=predict_masks)
dt = Create_DataLoader(inputs=predict_inputs,masks=predict_masks)
prediction = self.model.predict(dt)
prediction = decode_label(prediction,self.list_labels)
return prediction
Expand Down
190 changes: 134 additions & 56 deletions Manteia/Model.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,20 @@ class Model:
Attributes:
"""
def __init__(self,model_name ='bert',num_labels=0,MAX_SEQ_LEN = 128,early_stopping=False,path='./model',verbose=True):

self.model_name = model_name
self.early_stopping = early_stopping
self.num_labels = num_labels
self.MAX_SEQ_LEN = MAX_SEQ_LEN
self.path = path
self.verbose = verbose

def __init__(self,model_name ='bert',model_type=None,num_labels=0,epochs=None,MAX_SEQ_LEN = 128,early_stopping=False,path='./model',verbose=True):

self.model_name = model_name
self.model_type = model_type
self.early_stopping = early_stopping
self.num_labels = num_labels
self.MAX_SEQ_LEN = MAX_SEQ_LEN
self.epochs = epochs
self.path = path
self.verbose = verbose
self.history = {}
self.history['loss'] = []
self.history['step'] = []
self.history['accuracy'] = []

seed_val = 42
random.seed(seed_val)
Expand All @@ -108,15 +112,88 @@ def __init__(self,model_name ='bert',num_labels=0,MAX_SEQ_LEN = 128,early_stoppi

def test(self):
return "Model Mantéïa."

def load_type(self):
    """Resolve ``self.model_type`` for the family named by ``self.model_name``.

    * If ``self.model_type`` is ``None`` it is set to the first checkpoint
      listed for the family (the family default).
    * If it is already set, it must be one of the checkpoints listed for
      the family.
    * Families that have no table here are left untouched.

    Raises:
        TypeError: ``self.model_type`` is set but is not listed for
            ``self.model_name``.
    """
    # One table per family; the first entry of each list is the default.
    supported_types = {
        'bert': [
            'bert-base-uncased', 'bert-large-uncased', 'bert-base-cased',
            'bert-large-cased', 'bert-base-multilingual-uncased',
            'bert-base-multilingual-cased', 'bert-base-chinese',
            'bert-base-german-cased', 'bert-large-uncased-whole-word-masking',
            'bert-large-cased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad',
            'bert-large-cased-whole-word-masking-finetuned-squad',
            'bert-base-cased-finetuned-mrpc', 'bert-base-german-dbmdz-cased',
            'bert-base-german-dbmdz-uncased', 'bert-base-japanese',
            'bert-base-japanese-whole-word-masking', 'bert-base-japanese-char',
            'bert-base-japanese-char-whole-word-masking',
            'bert-base-finnish-cased-v1', 'bert-base-finnish-uncased-v1',
            'bert-base-dutch-cased',
        ],
        'xlnet': ['xlnet-base-cased', 'xlnet-large-cased'],
        # Keyed on 'albert': each family needs its own key, otherwise the
        # ALBERT list would be checked against XLNet checkpoints.
        'albert': [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1', 'albert-base-v2', 'albert-large-v2',
            'albert-xlarge-v2', 'albert-xxlarge-v2',
        ],
        'roberta': [
            'roberta-base', 'roberta-large', 'roberta-large-mnli',
            'distilroberta-base', 'roberta-base-openai-detector',
            'roberta-large-openai-detector', 'xlm-roberta-base',
            'xlm-roberta-large',
        ],
        'distilbert': [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad',
            'distilbert-base-cased',
            'distilbert-base-cased-distilled-squad',
        ],
        'gpt2': ['openai-gpt', 'GPT-2', 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'],
        'camembert': ['camembert-base'],
    }

    model_dict = supported_types.get(self.model_name)
    if model_dict is None:
        # No table for this family: nothing to default and nothing to check.
        return

    if self.model_type is None:
        self.model_type = model_dict[0]
    elif self.model_type in model_dict:
        print('type compatible')
    else:
        raise TypeError("{} Model type not in : {}".format(self.model_name, model_dict))


def load_tokenizer(self):
# Load the tokenizer.
if self.verbose==True:
print('Loading {} tokenizer...'.format(self.model_name))
if self.model_name=='bert':
#model_type='bert-base-uncased'
model_type='bert-base-multilingual-cased'
self.tokenizer = BertTokenizer.from_pretrained (model_type, do_lower_case=True)
self.tokenizer = BertTokenizer.from_pretrained (self.model_type, do_lower_case=True)
if self.model_name=='distilbert':
self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
if self.model_name=='albert':
Expand All @@ -129,39 +206,37 @@ def load_tokenizer(self):
self.tokenizer = CamembertTokenizer.from_pretrained ('camembert-base', do_lower_case=True)
if self.model_name=='flaubert':
self.tokenizer = FlaubertTokenizer.from_pretrained ('flaubert-base-uncased', do_lower_case=True)
if self.model_name=='gpt2-medium':
if self.model_name=='gpt2':
self.tokenizer = GPT2Tokenizer.from_pretrained ('gpt2-medium')

def load_class(self):
    """Load the pretrained network for ``self.model_name`` into ``self.model``.

    Every family except GPT-2 is loaded as a ``...ForSequenceClassification``
    model with ``self.num_labels`` outputs and with attention / hidden-state
    outputs disabled. GPT-2 is loaded as ``GPT2LMHeadModel``.

    Only the BERT branch uses ``self.model_type``; the other families load a
    fixed checkpoint name. An unrecognised ``self.model_name`` leaves
    ``self.model`` unchanged.
    """
    if self.verbose==True:
        print('Loading {} class...'.format(self.model_name))

    # Keyword arguments shared by all sequence-classification heads.
    head_options = dict(
        num_labels = self.num_labels,
        output_attentions = False,      # do not return attention weights
        output_hidden_states = False,   # do not return all hidden states
    )

    if self.model_name=='bert':
        self.model = BertForSequenceClassification.from_pretrained(self.model_type, **head_options)
    elif self.model_name=='distilbert':
        self.model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", **head_options)
    elif self.model_name=='albert':
        self.model = AlbertForSequenceClassification.from_pretrained("albert-base-v1", **head_options)
    elif self.model_name=='xlnet':
        self.model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", **head_options)
    elif self.model_name=='roberta':
        self.model = RobertaForSequenceClassification.from_pretrained("roberta-base", **head_options)
    elif self.model_name in ('camembert', 'camenbert'):
        # 'camenbert' is the historical misspelling; it stays accepted so
        # existing callers keep working.
        self.model = CamembertForSequenceClassification.from_pretrained("camembert-base", **head_options)
    elif self.model_name=='flaubert':
        self.model = FlaubertForSequenceClassification.from_pretrained("flaubert-base-uncased", **head_options)
    elif self.model_name in ('gpt2', 'gpt2-medium'):
        # 'gpt2' is the name load_tokenizer matches on; 'gpt2-medium' is the
        # older spelling and stays accepted for backward compatibility.
        self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

def devices(self):
# If there's a GPU available...
if torch.cuda.is_available():
Expand All @@ -176,11 +251,11 @@ def devices(self):
print('No GPU available, using the CPU instead.')
self.device = torch.device("cpu")

def configuration(self,train_dataloader,batch_size = 16,epochs = 4,n_gpu=1):
def configuration(self,train_dataloader,batch_size = 16,epochs = 20,n_gpu=1):
self.batch_size = batch_size
self.epochs = epochs
if self.epochs is None:
self.epochs = epochs
self.n_gpu = n_gpu
#self.model.cuda()
self.optimizer = AdamW(self.model.parameters(),lr = 2e-5,eps = 1e-8)
self.total_steps = len(train_dataloader) * self.epochs
self.scheduler = get_linear_schedule_with_warmup(self.optimizer,num_warmup_steps = 0,num_training_steps = self.total_steps)
Expand Down Expand Up @@ -232,13 +307,19 @@ def fit(self,train_dataloader,validation_dataloader):
outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)


loss = outputs[0]
#loss = outputs[0]
loss, logits = outputs[:2]

if self.n_gpu > 1:
loss = loss.mean()
loss.backward()
total_loss += loss.item()


###metric
self.history['loss'].append(loss.item())
acc=accuracy(np.argmax(logits.detach().cpu().numpy(), axis=1), batch[2].numpy())
self.history['accuracy'].append(acc)
self.history['step'].append(step)
##########
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
self.optimizer.step()
self.scheduler.step()
Expand Down Expand Up @@ -323,8 +404,9 @@ def predict(self,predict_dataloader,p_type='class'):

self.model.eval()
predictions = None
print('Predicting :')

for batch in predict_dataloader:
for step,batch in enumerate(predict_dataloader):

batch = tuple(t.to(self.device) for t in batch)

Expand All @@ -348,6 +430,8 @@ def predict(self,predict_dataloader,p_type='class'):
if p_type=='probability':
if predictions is None:predictions=torch.softmax(logits, dim=1).numpy()
else:predictions=np.append(predictions,torch.softmax(logits, dim=1).numpy(), axis=0)
progress(count=step+1, total=len(predict_dataloader))

return predictions

def fit_generation(self,text_loader):
Expand Down Expand Up @@ -449,8 +533,8 @@ def save(self,file_name):
else:
print ("Successfully created the directory %s " % self.path)
self.model.to(torch.device('cpu'))
torch.save(self.model.module.state_dict(),self.path+file_name+'.pt')
model.to(self.device)
torch.save(self.model.module.state_dict(),self.path+file_name)
self.model.to(self.device)

def load(self,file_name):
self.load_class()
Expand All @@ -466,12 +550,25 @@ def choose_from_top(probs, n=5):
return int(token_id)


def Create_DataLoader(inputs=None,masks=None,labels=None,batch_size=16):
    """Build a ``DataLoader`` over encoded documents.

    Args:
        inputs: encoded token ids; passed through ``totensors``.
        masks: attention masks matching ``inputs``; passed through ``totensors``.
        labels: encoded labels. When given, the loader yields
            ``(inputs, masks, labels)`` batches drawn with a ``RandomSampler``
            (training / validation). When ``None``, it yields
            ``(inputs, masks)`` batches drawn with a ``SequentialSampler``
            (prediction).
        batch_size: number of samples per batch.

    Returns:
        The configured ``DataLoader``.

    Raises:
        ValueError: ``inputs`` or ``masks`` is missing. Failing here avoids
            returning ``None`` and crashing later when the loader is iterated.
    """
    if inputs is None or masks is None:
        raise ValueError("Create_DataLoader requires both 'inputs' and 'masks'.")

    if labels is not None:
        # Labelled data: training / validation.
        td = TensorDataset(totensors(inputs), totensors(masks), totensors(labels))
        return DataLoader(td, sampler=RandomSampler(td), batch_size=batch_size)

    # Unlabelled data: prediction.
    td = TensorDataset(totensors(inputs), totensors(masks))
    return DataLoader(td, sampler=SequentialSampler(td), batch_size=batch_size)

#deprecate
def Create_DataLoader_train(inputs,masks,labels,batch_size=16):
    """Build a randomly-sampled ``DataLoader`` over ``(inputs, masks, labels)``.

    Each of the three arguments is converted with ``totensors`` before being
    wrapped in a ``TensorDataset``.
    """
    columns = [totensors(part) for part in (inputs, masks, labels)]
    dataset = TensorDataset(*columns)
    return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

# Deprecated, to be removed: use Create_DataLoader(inputs=..., masks=...) instead.
def Create_DataLoader_predict(inputs,masks,batch_size=16):
td = TensorDataset(totensors(inputs), totensors(masks))
ss = SequentialSampler(td)
Expand Down Expand Up @@ -651,35 +748,16 @@ def save_checkpoint(self, acc_validation, model,device_model):
self.acc_validation_min = acc_validation

'''
bert-base-uncased,bert-large-uncased,bert-base-cased,bert-large-cased,bert-base-multilingual-uncased,bert-base-multilingual-cased,bert-base-chinese
,bert-base-german-cased,bert-large-uncased-whole-word-masking,bert-large-cased-whole-word-masking,bert-large-uncased-whole-word-masking-finetuned-squad
,bert-large-cased-whole-word-masking-finetuned-squad,bert-base-cased-finetuned-mrpc,bert-base-german-dbmdz-cased
,bert-base-german-dbmdz-uncased,bert-base-japanese,bert-base-japanese-whole-word-masking,bert-base-japanese-char,bert-base-japanese-char-whole-word-masking
,bert-base-finnish-cased-v1,bert-base-finnish-uncased-v1,bert-base-dutch-cased
openai-gpt,GPT-2,gpt2,gpt2-medium,gpt2-large,gpt2-xl,Transformer-XL,transfo-xl-wt103
,Transformer-XL,transfo-xl-wt103
xlnet-base-cased,xlnet-large-cased,
xlm-mlm-en-2048,xlm-mlm-ende-1024,xlm-mlm-enfr-1024,xlm-mlm-enro-1024,xlm-mlm-xnli15-1024,xlm-mlm-tlm-xnli15-1024,xlm-clm-enfr-1024,xlm-clm-ende-1024
,xlm-mlm-17-1280,xlm-mlm-100-1280
roberta-base,roberta-large,roberta-large-mnli,distilroberta-base,roberta-base-openai-detector,roberta-large-openai-detector
,distilbert-base-uncased,distilbert-base-uncased-distilled-squad,distilbert-base-cased,distilbert-base-cased-distilled-squad
distilgpt2,distilbert-base-german-cased,distilbert-base-multilingual-cased
ctrl
camembert-base,
albert-base-v1,albert-large-v1,albert-xlarge-v1,albert-xxlarge-v1,albert-base-v2,albert-large-v2,albert-xlarge-v2,albert-xxlarge-v2
t5-small,t5-base,t5-large,t5-3B,t5-11B
,xlm-roberta-base,xlm-roberta-large
flaubert-small-cased,flaubert-base-uncased,flaubert-base-cased,flaubert-large-cased
bart-large,bart-large-mnli,bart-large-cnn,mbart-large-en-ro
Expand Down
35 changes: 33 additions & 2 deletions Manteia/Visualisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
"""
import os

from .Preprocess import Preprocess
from .Statistic import Statistic
from .Model import Model
Expand Down Expand Up @@ -115,7 +117,36 @@ def plot_boxplot(self,labels,longueur_des_doc,ylim=200):
plt.xticks(rotation=90)
sns.boxplot(x='Labels', y='Length of document', data=data, palette='Set2',notch=True,showfliers=True, showmeans=True, meanline=True)
ax.set_ylim(0, ylim)
plt.show()
#plt.savefig('/home/mercadier/these/resultat/image/longueur-doc-by-classe.png')
if self.show:
plt.show()
if self.save:
path=os.path.join(self.path,'boxplot.png')
plt.savefig(path)

def plot_train(self,loss,accuracy,granularity=None):
    """Plot per-step training accuracy and loss on twin y-axes.

    Args:
        loss: sequence of loss values, one per training step.
        accuracy: sequence of accuracy values, one per training step.
        granularity: when given, each series is averaged over consecutive
            groups of ``granularity`` steps before plotting. Trailing steps
            that do not fill a complete group are dropped.

    The figure is written to ``<self.path>/train.png`` when ``self.save`` is
    set and displayed when ``self.show`` is set.
    """
    if granularity is not None:
        # Reshape to (n_groups, granularity) and average each row.
        loss = np.resize(np.array(loss),(int(len(loss)/granularity),granularity))
        loss = np.mean(loss, axis=1)
        accuracy = np.resize(np.array(accuracy),(int(len(accuracy)/granularity),granularity))
        accuracy = np.mean(accuracy, axis=1)

    fig, ax1 = plt.subplots()
    ax1.plot(accuracy, color='r')

    # Second y-axis sharing the same x-axis, so both curves stay readable
    # despite their different scales.
    ax2 = ax1.twinx()
    ax2.plot(loss, color='b')

    plt.title('Model train')
    ax1.set_xlabel('Step')
    ax1.set_ylabel('Accuracy', color='r')
    ax2.set_ylabel('Loss', color='b')

    # Save BEFORE showing: with blocking backends the figure is destroyed
    # once the window is closed, and a later savefig would write a blank image.
    if self.save:
        path=os.path.join(self.path,'train.png')
        fig.savefig(path)
    if self.show:
        plt.show()



2 changes: 1 addition & 1 deletion Manteia/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
reminiscent.
"""

__version__ = "0.0.21"
__version__ = "0.0.25"


from Manteia import Classification
Expand Down
Binary file modified docs/_build/doctrees/Model.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/Statistic.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/Visualisation.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/_build/doctrees/index.doctree
Binary file not shown.

0 comments on commit c16e07a

Please sign in to comment.