Skip to content

Commit

Permalink
hg
Browse files Browse the repository at this point in the history
  • Loading branch information
ym001 committed May 6, 2020
1 parent 3181f02 commit abf1805
Show file tree
Hide file tree
Showing 12 changed files with 332 additions and 45 deletions.
8 changes: 5 additions & 3 deletions Manteia/Classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@ class Classification:
Attributes:
"""
def __init__(self,model_name ='bert',documents = None,labels = None):
def __init__(self,model_name ='bert',documents = [],labels = []):
self.MAX_SEQ_LEN = 64
self.model_name = model_name

if documents!=None and labels!=None:
pp = Preprocess(documents,labels)
if documents!=[] and labels!=[]:
pp = Preprocess(documents=documents,labels=labels)
self.list_labels = pp.list_labels
print(self.list_labels)
self.model = Model(num_labels=len(pp.list_labels))
self.model.load()

Expand All @@ -65,6 +66,7 @@ def __init__(self,model_name ='bert',documents = None,labels = None):

self.model.configuration(dt_train)
self.model.fit(dt_train,dt_validation)

def test(self):
return "Classification Mantéïa."

Expand Down
85 changes: 66 additions & 19 deletions Manteia/Model.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
RobertaTokenizer,
DistilBertTokenizer,
AlbertTokenizer,
CamembertTokenizer
CamembertTokenizer,
FlaubertTokenizer
)
from transformers import BertForSequenceClassification
from transformers import RobertaForSequenceClassification
Expand All @@ -42,6 +43,7 @@
from transformers import DistilBertForSequenceClassification
from transformers import AlbertForSequenceClassification
from transformers import CamembertForSequenceClassification
from transformers import FlaubertForSequenceClassification
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import numpy as np
Expand All @@ -55,25 +57,66 @@

#model'distilbert','albert','xlnet','roberta','camenbert','scibert'
class Model:
def __init__(self,model_name ='bert',num_labels=None): # constructeur
r"""
This is the class to construct model.
Args:
model_name (:obj:`string`, optional, defaults to 'bert'):
give the name of a model.
num_labels (:obj:`int`, optional, defaults to 0):
The number of categories (output labels) for classification.
Example::
from Manteia.Preprocess import Preprocess
from Manteia.Model import Model,encode_text,encode_label,Create_DataLoader_train
from sklearn.model_selection import train_test_split
documents=['a text','text b']
labels=['a','b']
pp = Preprocess(documents=documents,labels=labels)
model = Model(model_name=model_name,num_labels=len(pp.list_labels))
model.load()
train_text, validation_text, train_labels, validation_labels = train_test_split(pp.documents, pp.labels, random_state=2018, test_size=0.1)
train_ids,train_masks = encode_text(train_text,model.tokenizer,MAX_SEQ_LEN)
validation_ids,validation_masks = encode_text(validation_text,model.tokenizer,MAX_SEQ_LEN)
train_labels = encode_label(train_labels,pp.list_labels)
validation_labels = encode_label(validation_labels,pp.list_labels)
dt_train = Create_DataLoader_train(train_ids,train_masks,train_labels)
dt_validation = Create_DataLoader_train(validation_ids,validation_masks,validation_labels)
model.configuration(dt_train)
model.fit(dt_train,dt_validation)
Attributes:
"""
def __init__(self,model_name ='bert',num_labels=0): # constructeur
self.model_name = model_name
self.batch_size = 32
self.epochs = 4
self.MAX_SEQ_LEN = 64
self.MAX_SEQ_LEN = 12

self.num_labels=num_labels
def test(self):
return "Model Mantéïa."
def load(self):
# Load the tokenizer.
print('Loading {} tokenizer...'.format(self.model_name))

num_labels = self.num_labels # The number of output labels
if self.model_name=='bert':
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#model_type='bert-base-uncased'
model_type='bert-base-multilingual-cased'
self.tokenizer = BertTokenizer.from_pretrained(model_type, do_lower_case=True)

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
self.model = BertForSequenceClassification.from_pretrained("bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
num_labels = self.num_labels, # The number of output labels--2 for binary classification.
self.model = BertForSequenceClassification.from_pretrained(model_type, # Use the 12-layer BERT model, with an uncased vocab.
# You can increase this for multi-class tasks.
output_attentions = False, # Whether the model returns attentions weights.
output_hidden_states = False, # Whether the model returns all hidden-states.
Expand All @@ -97,6 +140,10 @@ def load(self):
if self.model_name=='camenbert':
self.tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)
self.model = CamembertForSequenceClassification.from_pretrained("camembert-base",num_labels = num_labels,output_attentions = False,output_hidden_states = False,)

if self.model_name=='flaubert':
self.tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-uncased', do_lower_case=True)
self.model = FlaubertForSequenceClassification.from_pretrained("flaubert-base-uncased",num_labels = num_labels,output_attentions = False,output_hidden_states = False,)
if self.model_name=='gpt2-medium':
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
Expand Down Expand Up @@ -174,22 +221,22 @@ def fit(self,train_dataloader,validation_dataloader):

loss_values.append(avg_train_loss)

print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epcoh took: {:}".format(format_time(time.time() - t0)))
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epoch took: {:}".format(format_time(time.time() - t0)))


print("")
print("Running Validation...")
print("")
print("Running Validation...")

t0 = time.time()
t0 = time.time()

self.model.eval()
self.model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in validation_dataloader:
for batch in validation_dataloader:

batch = tuple(t.to(self.device) for t in batch)

Expand All @@ -212,8 +259,8 @@ def fit(self,train_dataloader,validation_dataloader):

nb_eval_steps += 1

print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print(" Validation took: {:}".format(format_time(time.time() - t0)))
print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print(" Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")
Expand Down
32 changes: 22 additions & 10 deletions Manteia/Preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,36 +75,40 @@ class Preprocess:
Attributes:
"""
def __init__(self,documents=None,labels=None,percentage=1.0,size_by_nb_sample=False,nb_sample=None,path='./Document/',lang='english',preprocess=True):
def __init__(self,documents=[],labels=[],percentage=1.0,size_by_nb_sample=False,nb_sample=None,path='./Document/',lang='english',preprocess=True):

self.documents=documents
self.labels=labels
self.percentage=percentage
self.size_by_nb_sample=size_by_nb_sample
self.path=path
self.lang=lang
if preprocess and documents!=None and labels!=None:
if preprocess and documents!=[] and labels!=[]:
print('Preprocess...')
################
for i in range(len(documents)):
documents[i]=str(documents[i])
for i in range(len(labels)):
labels[i]=str(labels[i])
################
self.load()
self.reduction()
self.df_documents=clean(self.df_documents)
self.list_labels=self.list_labels(self.df_labels[LABEL_COLUMN].values.tolist())

self.documents=self.df_documents[TEXT_COLUMN].values.tolist()
print(self.documents)
self.labels=self.df_labels[LABEL_COLUMN].values.tolist()
#self.construct_id()

def test(self):
return "Preprocess Mantéïa."

def load(self): # load data -> dataframe df
if self.documents!=None:
if self.documents!=[]:
self.df_documents=pd.DataFrame({TEXT_COLUMN:self.documents})
if self.labels!=None:
if self.labels!=[]:
self.df_labels =pd.DataFrame({LABEL_COLUMN:self.labels})
#multiclass
self.df_labels[LABEL_COLUMN] = self.df_labels[LABEL_COLUMN].apply(lambda x: x[0])
#self.df_labels[LABEL_COLUMN] = self.df_labels[LABEL_COLUMN].apply(lambda x: x[0])

def reduction(self):
if self.size_by_nb_sample==True:
Expand All @@ -127,11 +131,19 @@ def get_labels(self):

def get_df(self):
return pd.DataFrame({TEXT_COLUMN:self.df_documents[TEXT_COLUMN] , LABEL_COLUMN:self.df_labels[LABEL_COLUMN]})



def list_labels(self,labels):
return list(np.sort(np.unique(np.array(labels)), axis=0))


'''
def list_labels(self,labels):
label=[]
for l in labels:
if l not in label:
label.append(l)
label.sort(reverse=False)
return label
'''

def clean_stop_word(df,lang='english'):
stop_unicode = stopwords.words(lang)
Expand Down
40 changes: 33 additions & 7 deletions Manteia/Statistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,39 @@

class Statistic:


def __init__(self,documents=None,labels=None,name=None,path='',statistic=True):
r"""
This is the class to make statistic of text.
Args:
documents (:obj:`list`, optional, defaults to []):
A list of documents.
labels (:obj:`list`, optional, defaults to []):
A list of labels.
dataset_name (:obj:`string`, optional, defaults to ''):
Name of the dataset.
path (:obj:`string`, optional, defaults to ''):
Path to save the report.
Example::
from Manteia.Statistic import Statistic
documents=['a text','text b']
labels=['a','b']
Statistic(documents,labels)
Attributes:
"""
def __init__(self,documents=[],labels=[],dataset_name='',path='',statistic=True):
self.documents=documents
self.labels=labels
self.path=path
self.name=name
if statistic==True and documents!=None and labels!=None:
self.dataset_name=dataset_name
if statistic==True and documents!=[] and labels!=[]:
self.list_labels=self.list_labels(labels)
self.print_report()

Expand Down Expand Up @@ -139,7 +165,7 @@ def class_imbalance(self):

def report(self):
report=''
report+="Dataset : {}\n".format(self.name)
report+="Dataset : {}\n".format(self.dataset_name)
report+="Number of documents : {}\n".format(self.number_text())
report+="Type : {}\n".format(self.type(self.labels))
report+="List of labels : {}\n".format(self.list_labels)
Expand All @@ -155,9 +181,9 @@ def print_report(self):
print(self.report())

def save_report(self):
fichier=self.path+"statistical_report_"+self.name+".txt"
fichier=self.path+"statistical_report_"+self.dataset_name+".txt"
mon_fichier = open(fichier, "w")
mon_fichier.write(self.rapport)
mon_fichier.write(self.report)
mon_fichier.close()

def list_labels(self,labels):
Expand Down
58 changes: 57 additions & 1 deletion Manteia/Visualisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,65 @@
import matplotlib.pyplot as plt

class Visualisation:

r"""
This is the class to make visualisation of NLP task.
Args:
documents (:obj:`list`, optional, defaults to None):
A list of documents.
labels (:obj:`list`, optional, defaults to None):
A list of labels.
dataset_name (:obj:`string`, optional, defaults to ''):
Name of the dataset.
path (:obj:`string`, optional, defaults to ''):
Path to save the report.
save (:obj:`bool`, optional, defaults to False):
save the graph to the path.
show (:obj:`bool`, optional, defaults to True):
Show the graph.
Example::
from Manteia.Statistic import Statistic
from Manteia.Visualisation import Visualisation
documents = [
' !?? What do you call a potato in space? Spudnik:::13 ;; // ',
'What should you do before criticizing Pac-Man? WAKA WAKA WAKA mile in his shoe.',
'What did Arnold Schwarzenegger say at the abortion clinic? Hasta last vista, baby.',
'Why do you never see elephants hiding in trees? \'Cause they are freaking good at it',
'My son just got a tattoo of a heart, a spade, a club, and a diamond, all without my permission. I guess I\'ll deal with him later.',
'Mom: "Do you want this?" Me: "No." Mom: "Ok I\'ll give it to your brother." Me: "No I want it."',
'Ibuprofen is my favorite headache medicine that also sounds like a reggae professor.',
'INTERVIEWER: Why do you want to work here? ME: *crumbs tumbling from my mouth* Oh, I don\'t. I was just walking by and saw you had donuts.',
'I\'ve struggled for years to be above the influence... But I\'ve never been able to get that high',
'With Facebook, you can stay in touch with people you would otherwise never talk to, but that\'s only one of the many awful things about it',
]
labels = [
['funny'],['not funny'],['funny'],['not funny'],['funny'],['not funny'],['not funny'],['not funny'],['funny'],['not funny'],
]
stat=Statistic(documents,labels)
dictionary=stat.dictionnary_stat_labels()
path='./visu.png'
visu = Visualisation(path)
visu.format_data(dictionary)
visu.plot_bar()
Attributes:
"""


def __init__(self,path='',name='',save=True,show=False):
def __init__(self,path='',name='',save=False,show=True):
self.path=path
self.name=name
self.save=save
Expand Down
Binary file modified docs/_build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/_build/doctrees/index.doctree
Binary file not shown.

0 comments on commit abf1805

Please sign in to comment.