Skip to content

Commit

Permalink
df
Browse files Browse the repository at this point in the history
  • Loading branch information
ym001 committed May 7, 2020
1 parent b45f6c7 commit f14070e
Show file tree
Hide file tree
Showing 46 changed files with 1,366 additions and 387 deletions.
2 changes: 1 addition & 1 deletion Exemples/exemple_Classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def main(args):
['funny'],['not funny'],['funny'],['not funny'],['funny'],['not funny'],['not funny'],['not funny'],['funny'],['not funny'],
]

cl=Classification(documents=documents,labels=labels)
cl=Classification(documents=documents,labels=labels,process=True)
print(cl.predict(documents[:2]))
return 0

Expand Down
8 changes: 8 additions & 0 deletions Exemples/exemple_Dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from Manteia.Dataset import Dataset

# Number of leading items shown as a quick sanity check of the loaded data.
PREVIEW_SIZE = 5

# Build the dataset wrapper; loading happens inside the constructor.
newsgroups = Dataset('20newsgroups')
documents, labels = newsgroups.get_documents(), newsgroups.get_labels()

# Print the first documents, then the labels that go with them.
print(documents[:PREVIEW_SIZE])
print(labels[:PREVIEW_SIZE])
18 changes: 11 additions & 7 deletions Manteia/Classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,25 @@ class Classification:
Attributes:
"""
def __init__(self,model_name ='bert',documents = [],labels = []):
self.MAX_SEQ_LEN = 64
self.model_name = model_name
def __init__(self,documents = [],labels = [],model=None,process=False,verbose=True):

if documents!=[] and labels!=[]:
pp = Preprocess(documents=documents,labels=labels)
self.list_labels = pp.list_labels
print(self.list_labels)
self.model = Model(num_labels=len(pp.list_labels))

if model!=None:
self.model = model
else:
self.model = Model(num_labels=len(pp.list_labels),early_stopping=True)

if process:
print('Process...')
self.model.load()

train_text, validation_text, train_labels, validation_labels = train_test_split(pp.documents, pp.labels, random_state=2018, test_size=0.1)

train_ids,train_masks = encode_text(train_text,self.model.tokenizer,self.MAX_SEQ_LEN)
validation_ids,validation_masks = encode_text(validation_text,self.model.tokenizer,self.MAX_SEQ_LEN)
train_ids,train_masks = encode_text(train_text,self.model.tokenizer,self.model.MAX_SEQ_LEN)
validation_ids,validation_masks = encode_text(validation_text,self.model.tokenizer,self.model.MAX_SEQ_LEN)
train_labels = encode_label(train_labels,pp.list_labels)
validation_labels = encode_label(validation_labels,pp.list_labels)

Expand Down
79 changes: 79 additions & 0 deletions Manteia/Dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Dataset.py
#
# Copyright 2017 yves <yves.mercadier@ac-montpellier.fr>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
# '

from .Preprocess import Preprocess
import numpy as np
from nltk.corpus import reuters,brown,webtext
from sklearn.datasets import fetch_20newsgroups

class Dataset:

    r"""
        Load a named text dataset and expose it as two parallel lists:
        the documents and the category name of each document.

        Args:

            name (:obj:`string`, optional, defaults to '20newsgroups'):
                Name of the dataset. Only '20newsgroups' is handled here.

        Example::

            from Manteia.Dataset import Dataset

            ds=Dataset('20newsgroups')
            documents=ds.get_documents()
            labels=ds.get_labels()

            print(documents[:5])
            print(labels[:5])

        Attributes:
            name: name of the dataset given to the constructor.
            documents: list of document texts (set by :meth:`load`).
            labels: list of category names, aligned with ``documents``.
    """
    def __init__(self, name='20newsgroups'):
        self.name = name
        # Loading is eager: the data is fetched as soon as the object exists.
        self.load()

    def test(self):
        """Return a fixed identification string for this class."""
        return "Mantéïa Dataset."

    def load(self):
        """Populate ``self.documents`` and ``self.labels`` for ``self.name``.

        Raises:
            ValueError: if ``self.name`` is not a supported dataset. Failing
                here is clearer than the ``AttributeError`` that the getters
                would otherwise raise later on the missing attributes.
        """
        if self.name == "20newsgroups":
            self.documents, self.labels = self.load_20newsgroups()
        else:
            raise ValueError(
                "Unknown dataset %r (supported: '20newsgroups')" % (self.name,)
            )

    def load_20newsgroups(self):
        """Fetch the training split of 20 Newsgroups for a fixed category set.

        Returns:
            tuple: ``(documents, labels)`` where ``labels[i]`` is the category
            name of ``documents[i]``.
        """
        categorie = [
            'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
            'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
            'rec.sport.hockey', 'talk.politics.guns',
            'talk.politics.mideast', 'talk.politics.misc',
            'talk.religion.misc',
        ]
        twenty_train = fetch_20newsgroups(subset='train', categories=categorie, shuffle=True, random_state=42)
        # The integer targets index ``target_names`` as returned by
        # scikit-learn, which is not guaranteed to follow the order of the
        # ``categories`` argument. Looking names up in ``categorie`` would
        # therefore attach the wrong category to the documents.
        target_names = twenty_train.target_names
        label = [target_names[index] for index in twenty_train.target]
        return twenty_train.data, label

    def get_documents(self):
        """Return the list of loaded documents."""
        return self.documents

    def get_labels(self):
        """Return the list of category names, aligned with the documents."""
        return self.labels
8 changes: 3 additions & 5 deletions Manteia/Generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,9 @@ class Generation:
Example::
from Manteia.Classification import Classification
# Initializing a list of texts,labels
documents=['a text','text b']
labels=['a','b']
Classification(documents,labels)
from Manteia.Generation import Generation
Generation(seed='What do you do if a bird shits on your car?')
Attributes:
"""
Expand Down
101 changes: 86 additions & 15 deletions Manteia/Model.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,13 @@ class Model:
Attributes:
"""
def __init__(self,model_name ='bert',num_labels=0): # constructeur
def __init__(self,model_name ='bert',num_labels=0,early_stopping=False,path='./model'): # constructeur
self.model_name = model_name
self.batch_size = 32
self.epochs = 4
self.MAX_SEQ_LEN = 12

self.early_stopping=early_stopping
self.num_labels=num_labels
self.path=path
if self.early_stopping:
self.es=EarlyStopping(path=path)
def test(self):
return "Model Mantéïa."
def load(self):
Expand Down Expand Up @@ -162,7 +162,11 @@ def device(self):
print('No GPU available, using the CPU instead.')
self.device = torch.device("cpu")

def configuration(self,train_dataloader):
def configuration(self,train_dataloader,batch_size = 16,epochs = 20,MAX_SEQ_LEN = 128):
self.batch_size = batch_size
self.epochs = epochs
self.MAX_SEQ_LEN = MAX_SEQ_LEN

self.model.cuda()
self.optimizer = AdamW(self.model.parameters(),lr = 2e-5,eps = 1e-8)
self.total_steps = len(train_dataloader) * self.epochs
Expand Down Expand Up @@ -224,9 +228,7 @@ def fit(self,train_dataloader,validation_dataloader):
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epoch took: {:}".format(format_time(time.time() - t0)))


print("")

print("Running Validation...")

t0 = time.time()
Expand Down Expand Up @@ -258,14 +260,22 @@ def fit(self,train_dataloader,validation_dataloader):
eval_accuracy += tmp_eval_accuracy

nb_eval_steps += 1

print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
acc_validation=eval_accuracy/nb_eval_steps
print(" Accuracy: {0:.2f}".format(acc_validation))
print(" Validation took: {:}".format(format_time(time.time() - t0)))

if self.early_stopping:
self.es(acc_validation, self.model)

if self.es.early_stop:
print("Early stopping")
break
print("")
print("Training complete!")

def predict(self,predict_dataloader):
if self.early_stopping:
self.model.from_pretrained(self.path)
self.model.eval()
prediction=[]
for batch in predict_dataloader:
Expand All @@ -285,6 +295,7 @@ def predict(self,predict_dataloader):
tmp_logits = tmp_logits.detach().cpu().numpy()
prediction.extend(flat_prediction(tmp_logits))
return prediction

def fit_generation(self,text_loader):

self.model.train()
Expand Down Expand Up @@ -500,9 +511,69 @@ def format_time(elapsed):

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        preds: array-like of per-class scores with one row per sample; the
            predicted class is the arg-max along axis 1.
        labels: array-like of class ids; it is flattened before comparison,
            so both ``(n,)`` and ``(n, 1)`` layouts work.

    Returns:
        The ratio of matching predictions to the number of labels.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    # np.asarray lets plain Python lists through; arrays are used unchanged.
    labels_flat = np.asarray(labels).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    Args:
        preds: array-like of predicted class ids.
        labels: array-like of expected class ids, same length as ``preds``.

    Returns:
        The ratio of matching positions to ``len(labels)``.
    """
    # Coerce to arrays so the comparison is element-wise; with two Python
    # lists ``preds == labels`` would collapse to a single boolean.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    return np.sum(preds == labels) / len(labels)

def flat_prediction(preds):
    """Return the arg-max class index of each row of ``preds`` as a 1-D array.

    Args:
        preds: array-like of per-class scores with one row per sample.
    """
    return np.argmax(preds, axis=1).flatten()


class EarlyStopping:
    """Stop training when the validation accuracy stops improving.

    The instance is called once per validation pass with the current accuracy
    and the model. Each improvement triggers a checkpoint; after ``patience``
    consecutive passes without improvement ``early_stop`` becomes True.
    """
    def __init__(self, patience=2, delta=0,path=None, verbose=True):
        """
        Args:
            patience (int): Number of consecutive validations without
                            improvement after which ``early_stop`` is set.
                            Default: 2
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Directory handed to ``model.save_pretrained``. When
                            None, scores are tracked but nothing is saved.
                            Default: None
            verbose (bool): If True, prints a message for each validation accuracy improvement.
                            Default: True
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.acc_validation_min = 0
        self.delta = delta
        self.path = path

    def __call__(self, acc_validation, model):
        """Record one validation result and update the stopping state.

        Args:
            acc_validation: validation accuracy of the current epoch.
            model: object exposing ``save_pretrained(path)``; only used when a
                checkpoint is written.
        """
        score = acc_validation

        if self.best_score is None:
            # First validation: always becomes the reference score.
            self.best_score = score
            self.save_checkpoint(acc_validation, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by at least ``delta``.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            print(f'Save model : {self.counter} out of {self.patience}')
            self.best_score = score
            self.save_checkpoint(acc_validation, model)
            self.counter = 0

    def save_checkpoint(self, acc_validation, model):
        '''Save the model when the validation accuracy improves.'''
        import os

        if self.path is None:
            # No checkpoint directory configured: remember the score only.
            self.acc_validation_min = acc_validation
            return

        print(self.path)
        if self.verbose:
            print(f'Validation accuracy increased ({self.acc_validation_min:.6f} --> {acc_validation:.6f}).  Saving model ...')
        if not os.path.isdir(self.path):
            try:
                # makedirs also creates missing parent directories.
                os.makedirs(self.path)
            except OSError:
                print ("Creation of the directory %s failed" % self.path)
            else:
                print ("Successfully created the directory %s " % self.path)
        model.save_pretrained(self.path)
        self.acc_validation_min = acc_validation
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,27 @@ This module proclaims the good word. May they
regain total freedom of artificial thought towards a new age
reminiscent.

You can install it with pip:
You can install it with pip :

     pip install Manteia

Example of use Classification:
Example of use Classification :

     >>> from Manteia.Classification import Classification
     >>> # Initializing a list of texts,labels
     >>> documents=['a text','text b']
     >>> labels=['a','b']
     >>> Classification(documents,labels)

Example of use Generation:
Example of use Generation :

     >>> from Manteia.Generation import Generation
     >>> Generation(seed='What do you do if a bird shits on your car?')
If you're a car owner, you're supposed to be able to call the police
and have them take the bird off the car.


Documentation:

https://manteia.readthedocs.io/en/latest/#

This code is licensed under MIT.
5 changes: 5 additions & 0 deletions docs/Classification.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Classification
==============

.. automodule:: Manteia.Classification
:members:
5 changes: 5 additions & 0 deletions docs/Generation.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Generation
==========

.. automodule:: Manteia.Generation
:members:
5 changes: 5 additions & 0 deletions docs/Model.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Model
=====

.. automodule:: Manteia.Model
:members:
5 changes: 5 additions & 0 deletions docs/Preprocess.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Preprocess
==========

.. automodule:: Manteia.Preprocess
:members:
5 changes: 5 additions & 0 deletions docs/Statistic.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Statistic
=========

.. automodule:: Manteia.Statistic
:members:
5 changes: 5 additions & 0 deletions docs/Task.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Task
====

.. automodule:: Manteia.Task
:members:
5 changes: 5 additions & 0 deletions docs/Visualisation.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Visualisation
=============

.. automodule:: Manteia.Visualisation
:members:
Binary file added docs/_build/doctrees/Classification.doctree
Binary file not shown.
Binary file added docs/_build/doctrees/Generation.doctree
Binary file not shown.
Binary file added docs/_build/doctrees/Model.doctree
Binary file not shown.
Binary file added docs/_build/doctrees/Preprocess.doctree
Binary file not shown.
Binary file added docs/_build/doctrees/Statistic.doctree
Binary file not shown.
Binary file added docs/_build/doctrees/Task.doctree
Binary file not shown.
Binary file added docs/_build/doctrees/Visualisation.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/_build/doctrees/index.doctree
Binary file not shown.

0 comments on commit f14070e

Please sign in to comment.