In [9]:
import urllib.request

url_train = "https://raw.githubusercontent.com/qanastek/ANTILLES/main/ANTILLES/train.conllu"
url_dev = "https://raw.githubusercontent.com/qanastek/ANTILLES/main/ANTILLES/dev.conllu"
url_test = "https://raw.githubusercontent.com/qanastek/ANTILLES/main/ANTILLES/test.conllu"

urllib.request.urlretrieve(url_train, "train.conllu")
urllib.request.urlretrieve(url_dev, "dev.conllu")
urllib.request.urlretrieve(url_test, "test.conllu")

('test.conllu', <http.client.HTTPMessage at 0x1ed18e6b9d0>)

In [10]:
def extract_sentences(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        sentences = []
        sentence = []
        for line in f:
            line = line.strip()
            if len(line) == 0:
                if len(sentence) > 0:
                    sentences.append(sentence)
                    sentence = []
            else:
                tokens = line.split('\t')
                if len(tokens) >= 4:
                    sentence.append((tokens[1], tokens[3]))
        if len(sentence) > 0:
            sentences.append(sentence)
        return sentences

train_sents = extract_sentences('train.conllu')
dev_sents = extract_sentences('dev.conllu')
test_sents = extract_sentences('test.conllu')



In [19]:
dev_sents

[[('Aviator', 'PROPN'),
  (',', 'PUNCT'),
  ('un', 'DINTMS'),
  ('film', 'NMS'),
  ('sur', 'PREP'),
  ('la', 'DETFS'),
  ('vie', 'NFS'),
  ('de', 'PREP'),
  ('Hughes', 'PROPN'),
  ('.', 'YPFOR')],
 [('Les', 'DET'),
  ('études', 'NFP'),
  ('durent', 'VERB'),
  ('six', 'CHIF'),
  ('ans', 'NMP'),
  ('mais', 'COCO'),
  ('leur', 'DET'),
  ('contenu', 'NMS'),
  ('diffère', 'VERB'),
  ('donc', 'ADV'),
  ('selon', 'PREP'),
  ('les', 'DET'),
  ('Facultés', 'NOUN'),
  ('.', 'YPFOR')],
 [('Mais', 'COCO'),
  ('comment', 'ADV'),
  ('faire', 'VERB'),
  ('dans', 'PREP'),
  ('un', 'DINTMS'),
  ('contexte', 'NMS'),
  ('structurellement', 'ADV'),
  ('raciste', 'ADJMS'),
  ('?', 'PUNCT')],
 [("L'", 'DET'),
  ('«', 'PUNCT'),
  ('oasis', 'NFS'),
  ('de', 'PREP'),
  ('vie', 'NFS'),
  ('»', 'PUNCT'),
  (',', 'PUNCT'),
  ('dans', 'PREP'),
  ('un', 'DINTMS'),
  ('milieu', 'NMS'),
  ('où', 'ADV'),
  ('règne', 'VERB'),
  ("l'", 'DET'),
  ('obscurité', 'NFS'),
  ('totale', 'ADJFS'),
  ('et', 'COCO'),
  ('une', 'D

In [11]:
train_sents

[[('Les', 'DET'),
  ('commotions', 'NFP'),
  ('cérébrales', 'ADJFP'),
  ('sont', 'AUX'),
  ('devenu', 'VPPMS'),
  ('si', 'ADV'),
  ('courantes', 'ADJFP'),
  ('dans', 'PREP'),
  ('ce', 'PDEMMS'),
  ('sport', 'NMS'),
  ("qu'", 'COSUB'),
  ('on', 'PINDMS'),
  ('les', 'PPOBJMS'),
  ('considére', 'VERB'),
  ('presque', 'ADV'),
  ('comme', 'PREP'),
  ('la', 'DETFS'),
  ('routine', 'NFS'),
  ('.', 'YPFOR')],
 [("L'", 'DET'),
  ('œuvre', 'NFS'),
  ('est', 'AUX'),
  ('située', 'VPPFS'),
  ('dans', 'PREP'),
  ('la', 'DETFS'),
  ('galerie', 'NFS'),
  ('des', '_'),
  ('de', 'PREP'),
  ('les', 'DET'),
  ('batailles', 'NFP'),
  (',', 'PUNCT'),
  ('dans', 'PREP'),
  ('le', 'DETMS'),
  ('château', 'NMS'),
  ('de', 'PREP'),
  ('Versailles', 'PROPN'),
  ('.', 'YPFOR')],
 [('Le', 'DETMS'),
  ('comportement', 'NMS'),
  ('de', 'PREP'),
  ('la', 'DETFS'),
  ('Turquie', 'PROPN'),
  ('vis-à-vis', 'ADV'),
  ('du', '_'),
  ('de', 'PREP'),
  ('le', 'DETMS'),
  ('problème', 'NMS'),
  ('palestinien', 'ADJMS'),
  (

In [12]:
def train_hmm_pos_tagger(train_sents):
    tagged_sents = [nltk.tag.untag(sent) for sent in train_sents]
    unigram_tagger = nltk.UnigramTagger(train_sents)
    bigram_tagger = nltk.BigramTagger(train_sents, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(train_sents, backoff=bigram_tagger)

    return trigram_tagger

hmm_pos_tagger = train_hmm_pos_tagger(train_sents)


In [16]:
import spacy

#le modèle français
nlp = spacy.load("fr_core_news_sm")

def evaluate_spacy_pos_tagger(test_sents):
    correct = 0
    total = 0

    for sent in test_sents:
        words, gold_tags = zip(*sent)
        doc = nlp(" ".join(words))  # Concaténez les mots en une chaîne de caractères
        tags = [token.pos_ for token in doc]
        for word, gold_tag, tag in zip(words, gold_tags, tags):
            total += 1
            if gold_tag == tag:
                correct += 1

    accuracy = correct / total
    return accuracy

test_texts = [' '.join(sent[0] for sent in sents) for sents in test_sents]
test_docs = list(nlp.pipe(test_texts))
test_spacy_sents = [[(token.text, token.pos_) for token in doc] for doc in test_docs]

test_accuracy = evaluate_spacy_pos_tagger(test_spacy_sents)
print("Test set accuracy:", test_accuracy)


Test set accuracy: 0.977339992600814


In [17]:
for i in range(10):
    print(f"Phrase {i + 1}:")
    for word, tag in test_spacy_sents[i]:
        print(f"{word}\t{tag}")
    print()

Phrase 1:
Je	PRON
sens	VERB
qu	NUM
'	PUNCT
entre	ADP
ça	PRON
et	CCONJ
les	DET
films	NOUN
de	ADP
médecins	NOUN
et	CCONJ
scientifiques	ADJ
fous	ADJ
que	SCONJ
nous	PRON
avons	AUX
déjà	ADV
vus	VERB
,	PUNCT
nous	PRON
pourrions	VERB
emprunter	VERB
un	DET
autre	ADJ
chemin	NOUN
pour	ADP
l	DET
'	DET
origine	NOUN
.	PUNCT

Phrase 2:
On	PRON
pourra	VERB
toujours	ADV
parler	VERB
à	ADP
propos	NOUN
d	ADP
'	ADP
Averroès	NOUN
de	ADP
"	PUNCT
décentrement	NOUN
du	ADP
de	ADP
le	DET
Sujet	NOUN
"	PUNCT
.	PUNCT

Phrase 3:
«	NOUN
Il	PRON
a	AUX
été	AUX
largement	ADV
démontré	VERB
que	SCONJ
la	DET
population	NOUN
civile	ADJ
du	ADP
de	ADP
le	DET
territoire	NOUN
non	ADV
autonome	ADJ
du	ADP
de	ADP
le	DET
Sahara	PROPN
occidental	ADJ
est	AUX
l	DET
'	DET
objet	NOUN
de	ADP
diverses	ADJ
atteintes	VERB
aux	ADP
à	ADP
les	DET
droits	NOUN
humains	ADJ
,	PUNCT
comme	ADP
la	DET
détention	NOUN
arbitraire	ADJ
,	PUNCT
les	DET
coups	NOUN
et	CCONJ
les	DET
tortures	NOUN
»	PUNCT
,	PUNCT
écrit	VERB
l	DET
'	DET
ONG	NOUN
internationale

In [18]:
test_sentence = "Le chat mange la souris."

doc = nlp(test_sentence)

print("Test sentence:")
for token in doc:
    print(f"{token.text}\t{token.pos_}")

Test sentence:
Le	DET
chat	NOUN
mange	VERB
la	DET
souris	PROPN
.	PUNCT
