In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import string

class ConLLUParser:

    def __init__(self, path, verbose=False) -> None:
        # open .conllu file and split each line and remove the '\n' character at the end
        with open(path) as f:
            raw_lines = f.read().splitlines()
            #splitting lines from the conlu parser 
        
        # retrieve the name of the columns
        self.col_names = [
            'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'
        ]
        
        # all_words stores each word of the corpus, identified by sent_id (id of the sentence in the corpus) and the ID (position of the word in the sentence)
        # all_contractions stores the contractions, e.g. 'du' -> 'de le' ; 'didn't' -> 'did n't'
        # all_sentences stores each sentence of the corpus, identified by sent_id and computes the length of the sentence
        all_words, all_contractions, all_sentences = [], [], []
        
        # temporary variables
        temp, count, n_exceptions = [], 0, 0

        # treat the corpus line by line using pop function
        while len(raw_lines) > 0:
            
            line = raw_lines.pop(0)
            if len(line) > 0:
                splits = line.split('\t')
            
            # the sentences always start by a line giving the sent_id "# sent_id = ..."
            if line.startswith('# sent_id = '):
                id = line.replace('# sent_id = ', '')
                temp.append(id)
                
            
            # then another one gives the sentence "# text = ..."
            elif line.startswith('# text = '):
                text = line.replace('# text = ', '')
                temp.append(text)
            
            # a sentence is ended by a blank line
            elif line == '':
                if count > 0:
                    temp.append(count)
                    all_sentences.append(temp)
                count = 0
                temp = []

            # print all lines that don't start by a digit as their are not useful
            # count them for later sanity checks
            elif not line[0].isdigit():
                if verbose:
                    print(f'IS OMITTED: {line}')
                n_exceptions += 1

            # the lines that start by a number contain the words and their characteristics
            elif splits[0].isdigit():
                assert len(splits) == len(self.col_names)
                all_words.append([id] + splits)
                count += 1

            # if the start of a line contains a dash, then it represents a contradiction
            elif '-' in splits[0] or '.' in splits[0]:
                all_contractions.append([id] + splits[:2])
            
            else:
                raise ValueError(f'Cannot handle this line: {line}')
            

        self.sentences = pd.DataFrame(all_sentences, columns=['sent_id', 'text', 'length'])
        self.words = pd.DataFrame(all_words, columns=['sent_id'] + self.col_names)
        self.contradictions = pd.DataFrame(all_contractions, columns=['sent_id', 'ID', 'FORM'])
            


if __name__=="__main__":

    verbose = False
    fr_train = ConLLUParser('fr_gsd-ud-train.conllu', verbose)
    fr_dev = ConLLUParser('fr_gsd-ud-dev.conllu', verbose)
    fr_test = ConLLUParser('fr_gsd-ud-test.conllu', verbose)



In [3]:
train_contradictions=fr_train.contradictions
train_sentences=fr_train.sentences
train_words=fr_train.words

In [4]:
train_sentences

Unnamed: 0,sent_id,text,length
0,fr-ud-train_00001,Les commotions cérébrales sont devenu si coura...,19
1,fr-ud-train_00002,L'œuvre est située dans la galerie des bataill...,17
2,fr-ud-train_00003,Le comportement de la Turquie vis-à-vis du pro...,34
3,fr-ud-train_00004,"Toutefois, les filles adorent les desserts.",8
4,fr-ud-train_00005,Ismene entre et annonce que c'est Farnace qui ...,18
...,...,...,...
14445,fr-ud-train_14550,"Le 28 mars 1792, ces territoires formèrent deu...",22
14446,fr-ud-train_14551,Ce débutant de l'année 1983 et double All-Star...,58
14447,fr-ud-train_14552,La population est alors indigène et fait parti...,25
14448,fr-ud-train_14553,"Mais MSI propose aussi, pour 699 euros, une ve...",25


In [5]:
fr_train.words

Unnamed: 0,sent_id,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
0,fr-ud-train_00001,1,Les,le,DET,_,Definite=Def|Number=Plur|PronType=Art,2,det,_,wordform=les
1,fr-ud-train_00001,2,commotions,commotion,NOUN,_,Gender=Fem|Number=Plur,5,nsubj,_,_
2,fr-ud-train_00001,3,cérébrales,cérébral,ADJ,_,Gender=Fem|Number=Plur,2,amod,_,_
3,fr-ud-train_00001,4,sont,être,AUX,_,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,5,aux:tense,_,_
4,fr-ud-train_00001,5,devenu,devenir,VERB,_,Gender=Masc|Number=Sing|Tense=Past|Typo=Yes|Ve...,0,root,_,CorrectForm=devenues|CorrectGender=Fem|Correct...
...,...,...,...,...,...,...,...,...,...,...,...
354562,fr-ud-train_14554,39,d',de,ADP,_,_,40,case,_,SpaceAfter=No
354563,fr-ud-train_14554,40,en,en,ADP,_,_,38,advmod,_,ExtPos=ADV|Idiom=Yes
354564,fr-ud-train_14554,41,haut,haut,NOUN,_,Gender=Masc|Number=Sing,40,fixed,_,InIdiom=Yes
354565,fr-ud-train_14554,42,»,»,PUNCT,_,_,34,punct,_,SpaceAfter=No


In [6]:
words_interest=train_words[['FORM','UPOS']]
words_interest.groupby('UPOS')
words_interest['UPOS'].value_counts(normalize=False)


NOUN     66421
ADP      56473
DET      54144
PUNCT    39011
VERB     28185
PROPN    24750
ADJ      20981
PRON     16050
ADV      12443
AUX      11587
CCONJ     9301
NUM       9255
X         2680
SCONJ     2598
SYM        619
INTJ        69
Name: UPOS, dtype: int64

In [7]:
import string
import pandas as pd
import random

POS_LABEL = [
    "NOUN", "ADP", "DET", "PUNCT", "VERB", "PROPN", "ADJ", "PRON", "ADV",
    "AUX", "CCONJ", "NUM", "X", "SCONJ", "SYM", "INTJ"
]

from sklearn.metrics import accuracy_score




class NaiveClassifier:
    

    def __init__(self) -> None:
        pass

    def predict(self, data: pd.DataFrame):

        assert 'FORM' in data.columns, 'Column FORM is missing from data'
        
        def naive_rules(word):
            word = word.lower()  # Convert word to lowercase for consistent rule checking
            
            if word.endswith(('tion', 'sion', 'ment')) or word in ['ville', 'année', 'pays', 'nom', 'jour', 'mois', 'homme', 'femme', 'vie', 'enfant']:
                return 'NOUN'
            
            if word.endswith(('er', 'ir', 're')) or word in ['être', 'avoir', 'est', 'était', 'été', 'fut', 'étant']:
                return 'VERB'
            if word in ['être', 'avoir', 'est', 'était', 'été', 'fut', 'étant']:
                return 'AUX'
            if word.endswith(('eux', 'euse')) or word in ['premier', 'première', 'français', 'française', 'nouveau', 'nouvelle', 'grand', 'grande', 'petit', 'petite']:
                return 'ADJ'
            if word.endswith('ment') or word in ['très', 'bien', 'aussi', 'toujours', 'surtout', 'encore', 'trop', 'jamais', 'vraiment']:
                return 'ADV'
            if word in ['le', 'la', 'les', 'l', 'un', 'une', 'des']:
                return 'DET'
            if word in ['il', 'elle', 'nous', 'vous', 'ils', 'elles', 'lui', 'on', 'je', 'me', 'ma', 'mon', 'leur', 'leurs']:
                return 'PRON'
            if word in ['et', 'ou', 'mais', 'donc']:
                return 'CCONJ'
            if word in ['que', 'quand', 'si', 'lorsque']:
                return 'SCONJ'
            if word in ['de', 'à', 'en', 'sur', 'avec', 'dans', 'par', 'pour', 'sans', 'sous', 'contre', 'après', 'avant', 'vers', 'chez']:
                return 'ADP'
            if word.isdigit() or word in ['un', 'deux', 'trois', 'deuxième', 'troisième']:
                return 'NUM'
            if word[0].isupper() and word not in ['et', 'ou', 'mais', 'donc']:
                return 'PROPN'
            if word in [',', '.', '(', ')', ':', ';', '«', '»', '!', '?']:
                return 'PUNCT'
            if word in ['%', '€']:
                return 'SYM'
            if word in ['oh', 'ah', 'eh']:
                return 'INTJ'
            return 'NOUN'

        
        return data.FORM.apply(naive_rules).tolist()
    def evaluate(self, y_true, y_pred):
        return accuracy_score(y_true, y_pred)

class RandomClassifier:

    def __init__(self, random_seed=0) -> None:
        self.random_seed = random_seed
        self.label = POS_LABEL
    
    def predict(self, data: pd.DataFrame):
        random.seed(self.random_seed)
        return [random.choice(self.label) for _ in range(len(data))]
    def evaluate(self, y_true, y_pred):
        return accuracy_score(y_true, y_pred)

class StratifiedClassifier: 
    def __init__(self) -> None :
        pass
    def predict(self, data: pd.DataFrame):
        valuecounts= data['UPOS'].value_counts(normalize=True).to_dict()
        probabilities = list(valuecounts.values())
        labels = list(valuecounts.keys())
        N=len(data)
        
        return np.random.choice(POS_LABEL, size=N, p=probabilities)
    def evaluate(self, data: pd.DataFrame):
        y_true = data['UPOS'].tolist()
        y_pred = self.predict(data)
        
        assert len(y_true) == len(y_pred), "y_true and y_pred do not have the same length"
        
        metrics = {}
        metrics['accuracy'] = accuracy_score(y_true, y_pred)
        return metrics

class MostCommonPOSClassifier:
    def __init__(self) -> None:
        self.most_common_pos = None

    def predict(self, data: pd.DataFrame):
        self.most_common_pos = data['UPOS'].value_counts().idxmax()    
        return [self.most_common_pos] * len(data)
    
    def evaluate(self, data: pd.DataFrame):
        y_true = data['UPOS'].tolist()
        y_pred = self.predict(data)
        assert len(y_true) == len(y_pred), "y_true and y_pred do not have the same length"
        metrics = {}
        metrics['accuracy'] = accuracy_score(y_true, y_pred)
        return metrics

class WordDistributedRandomClassifier:
    def __init__(self):
        self.word_distributions = {}
        self.global_distribution = None

    def fit(self, data: pd.DataFrame):
        #on train_data
        # what's the POS distribution on each word 
        self.global_distribution = data['UPOS'].value_counts(normalize=True).to_dict()
        for word, group in data.groupby('FORM'):
            self.word_distributions[word] = group['UPOS'].value_counts(normalize=True).to_dict()

    def predict(self, data: pd.DataFrame):
        predictions = []
        for word in data['FORM']:
            if word in self.word_distributions:
                pos_tags = list(self.word_distributions[word].keys())
                probabilities = list(self.word_distributions[word].values())
                predictions.append(np.random.choice(pos_tags, p=probabilities))
            else:
                pos_tags = list(self.global_distribution.keys())
                probabilities = list(self.global_distribution.values())
                predictions.append(np.random.choice(pos_tags, p=probabilities))
        return predictions

    def evaluate(self, data: pd.DataFrame):
        y_true = data['UPOS'].tolist()
        y_pred = self.predict(data)
        return accuracy_score(y_true, y_pred)


data = train_words




In [182]:
word_distributed_classifier = WordDistributedRandomClassifier()
word_distributed_classifier.fit(train_words)  # Assuming train_data is your training dataset
# Training and evaluating
word_distributed_classifier = WordDistributedRandomClassifier()
word_distributed_classifier.fit(train_data)  # Assuming train_data is your training dataset

predictions = word_distributed_classifier.predict(test_data)  # Assuming test_data is your test dataset

accuracy = evaluate(test_data['UPOS'].tolist(), predictions)
print(accuracy)

In [159]:
stratifiedClassifier = StratifiedClassifier()
stratified_accuracy = stratifiedClassifier.evaluate(data)
print(f"Accuracy of StratifiedClassifier: {stratified_accuracy}")

Accuracy of NaiveClassifier: {'accuracy': 0.11635882639952393}


In [157]:
stratified_predictions

array(['AUX', 'NOUN', 'PROPN', ..., 'DET', 'DET', 'NOUN'], dtype='<U5')

In [107]:
occurence_of_words=data['FORM'].value_counts()
occurence_of_words.to_csv('occurence_of_words.csv')

In [164]:

import pandas as pd
#Q1: ____________________________
naive_classifier = NaiveClassifier()
random_classifier = RandomClassifier()

naive_predictions = naive_classifier.predict(data)

naive_accuracy = naive_classifier.evaluate(data['UPOS'].tolist(), naive_predictions)
print(f"Accuracy of NaiveClassifier: {naive_accuracy}")

#Q2: For 10 differentt seeds, computing accuracy per each case : 
for seed in range(10):
    random_classifier = RandomClassifier(random_seed=seed)
    random_predictions = random_classifier.predict(data)
    random_accuracy = evaluate(data['UPOS'].tolist(), random_predictions)
    print(f"Accuracy of RandomClassifier with seed {seed}: {random_accuracy}")
    
# Q3: For POS distribution : 
stratifiedClassifier = StratifiedClassifier()
stratified_accuracy = stratifiedClassifier.evaluate(data)
print(f"Accuracy of StratifiedClassifier: {stratified_accuracy}")

# Q4: For mostCommon POS: 
mostCommonPOSClassifier = MostCommonPOSClassifier()
mostCommonPOSClassifier_accuracy = mostCommonPOSClassifier.evaluate(data)
print(f"Accuracy of MostCommonClassifier: {mostCommonPOSClassifier_accuracy}")

# Q5: 
word_distributed_classifier = WordDistributedClassifier()
word_distributed_classifier.fit(train_data)  # Assuming train_data is your training dataset

predictions = word_distributed_classifier.predict(test_data)  # Assuming test_data is your test dataset

accuracy = evaluate(test_data['UPOS'].tolist(), predictions)
print(accuracy)

Accuracy of NaiveClassifier: 0.6216004309481706
Accuracy of RandomClassifier with seed 0: {'accuracy': 0.06279490195083016}
Accuracy of RandomClassifier with seed 1: {'accuracy': 0.06276669853652483}
Accuracy of RandomClassifier with seed 2: {'accuracy': 0.06123807348117563}
Accuracy of RandomClassifier with seed 3: {'accuracy': 0.062453640637735605}
Accuracy of RandomClassifier with seed 4: {'accuracy': 0.06364382472142077}
Accuracy of RandomClassifier with seed 5: {'accuracy': 0.062287240493334124}
Accuracy of RandomClassifier with seed 6: {'accuracy': 0.06234082698051426}
Accuracy of RandomClassifier with seed 7: {'accuracy': 0.06260311873355388}
Accuracy of RandomClassifier with seed 8: {'accuracy': 0.06235492868766693}
Accuracy of RandomClassifier with seed 9: {'accuracy': 0.061759836645824344}
Accuracy of StratifiedClassifier: {'accuracy': 0.11656189098252234}
Accuracy of MostCommonClassifier: {'accuracy': 0.18732989815747095}


## Question 4 : by frequency of each POS

 answer has been detailed above

In [140]:
c=data.UPOS.value_counts(normalize=True)
c

NOUN     0.187330
ADP      0.159273
DET      0.152705
PUNCT    0.110024
VERB     0.079491
PROPN    0.069803
ADJ      0.059174
PRON     0.045266
ADV      0.035094
AUX      0.032679
CCONJ    0.026232
NUM      0.026102
X        0.007559
SCONJ    0.007327
SYM      0.001746
INTJ     0.000195
Name: UPOS, dtype: float64

## Question 5:  random for each POS that doesn't belong to training data words 

In [184]:
data

Unnamed: 0,sent_id,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
0,fr-ud-train_00001,1,Les,le,DET,_,Definite=Def|Number=Plur|PronType=Art,2,det,_,wordform=les
1,fr-ud-train_00001,2,commotions,commotion,NOUN,_,Gender=Fem|Number=Plur,5,nsubj,_,_
2,fr-ud-train_00001,3,cérébrales,cérébral,ADJ,_,Gender=Fem|Number=Plur,2,amod,_,_
3,fr-ud-train_00001,4,sont,être,AUX,_,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,5,aux:tense,_,_
4,fr-ud-train_00001,5,devenu,devenir,VERB,_,Gender=Masc|Number=Sing|Tense=Past|Typo=Yes|Ve...,0,root,_,CorrectForm=devenues|CorrectGender=Fem|Correct...
...,...,...,...,...,...,...,...,...,...,...,...
354562,fr-ud-train_14554,39,d',de,ADP,_,_,40,case,_,SpaceAfter=No
354563,fr-ud-train_14554,40,en,en,ADP,_,_,38,advmod,_,ExtPos=ADV|Idiom=Yes
354564,fr-ud-train_14554,41,haut,haut,NOUN,_,Gender=Masc|Number=Sing,40,fixed,_,InIdiom=Yes
354565,fr-ud-train_14554,42,»,»,PUNCT,_,_,34,punct,_,SpaceAfter=No


In [195]:
word_frequencies = data['FORM'].value_counts()

distribution = data.groupby('FORM')['UPOS'].value_counts(normalize=True).unstack().fillna(0)

df=distribution.reindex(word_frequencies.index)
df

UPOS,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X
de,0.0,0.984778,0.000042,0.00000,0.0,0.014002,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.001177
",",0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.00000,0.000000
.,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.00000,0.000000
le,0.0,0.000000,0.000000,0.00000,0.0,0.979395,0.0,0.0,0.0,0.020444,0.0,0.0,0.0,0.0,0.00000,0.000161
à,0.0,0.999450,0.000000,0.00033,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00011,0.000110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
comprimant,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.00000,0.000000
recyclant,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.00000,0.000000
évaporateurs,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.000000
Kriegsmarine,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.00000,0.000000


In [196]:
distribution[]

UPOS,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X
de,0.0,0.984778,0.000042,0.00000,0.0,0.014002,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.001177
",",0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.00000,0.000000
.,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.00000,0.000000
le,0.0,0.000000,0.000000,0.00000,0.0,0.979395,0.0,0.0,0.0,0.020444,0.0,0.0,0.0,0.0,0.00000,0.000161
à,0.0,0.999450,0.000000,0.00033,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00011,0.000110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
comprimant,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.00000,0.000000
recyclant,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.00000,0.000000
évaporateurs,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.000000
Kriegsmarine,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.00000,0.000000


In [None]:
for word, group in distribution:
    [word] = group['UPOS'].value_counts(normalize=True).to_dict()