In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

print(sklearn.__version__)

0.15.2


# Let's use CRF for a fine grain opinion detection

We'll show an exemple using different features, textuals and audio.


In [3]:
from __future__ import division
from mesures import F1_token
import os
from itertools import product
import numpy as np
from sys import platform
from numpy import sign

# To know if I am on the MAC or on the PC with Linux             
CURRENT_OS = platform   
if CURRENT_OS == 'darwin':         
    INIT_PATH = "/Users/Valou/"
elif CURRENT_OS == 'linux2':
    INIT_PATH = "/home/valentin/"

path = INIT_PATH + "Dropbox/TELECOM_PARISTECH/Stage_Lucas/Datasets/Semaine/"
path_model = INIT_PATH + 'Dropbox/TELECOM_PARISTECH/Stage_Lucas/MonProjet/models/' + 'ipython/'
path_results = INIT_PATH + '/Dropbox/TELECOM_PARISTECH/Stage_Lucas/MonProjet/results/' + 'ipython/'

ALL_LABELS = {'attitude_positive', 'attitude_negative', 'source', 'target'}
ALL_FILES = sorted(os.listdir(path+"all/dump/")) # nom de tous les fichiers contenus dans path+"all/dump" tries dans l'ordre

## Features

### Basic functions (features_text.py)
Here we define the function that we will call to make the features

In [5]:
from nltk import word_tokenize
from nltk.corpus import sentiwordnet as swn

def __rules2features(features, sent, i):
    """
    features['inRule'] = 1.0 si la phrase est un pattern detecté par caro
    features['inTarget'] = 1.0 si mot i est target du pattern
    Cette fonction est foireuse car elle utilise la vérité terrain pour s'entrainer
    """
    
    formated_sent = " ".join([sent[k][0] for k in range(len(sent))])
    if formated_sent in PATTERNS:
        features['inRule'] = 1.0
        target = PATTERNS[formated_sent]
        if sent[i][0] in target:
            features['inTarget'] = 1.0
    return features
    
    
def __features_base(sent, i, nb_neighbours):
    """
    Basic features of each word, including the word and pos-tags of the context
    """
    word = sent[i][0].lower() # literallement le mot sans les maj
    postag = sent[i][1][:2] #2 premieres lettres du POS-tag uniquement car decrit plus simplement
    features = {
        'bias': 1.0, # pourquoi ce bias ?
        'word': word,
        'postag': postag
    }   
    
    # Number of words that you take into the context
    if nb_neighbours == None:
        nb_neighbours = 2
    
    for k in range(1,nb_neighbours+1): # Begin at k = 1
        if i > k-1: # If not k-th word of the sentence
        
            word_neigh_buff = sent[i-k][0].lower()
            postag_buff = sent[i-k][1][:2]
            features.update({
                ('%d:word.lower=' %-k) : word_neigh_buff,
                ('%d:postag=' %-k) : postag_buff,
            })
        else: # If (k-1)-th word = Place In Sentence
            features[('P%dIS' %k)] = 1.0
        
        if i < len(sent) - k: # If not k-th last word of the sentence
            word_neigh_buff = sent[i+k][0].lower()
            postag_buff = sent[i+k][1][:2]
            features.update({
                ('%d:word.lower=' %k) : word_neigh_buff,
                ('%d:postag=' %k) : postag_buff,
            })
        else: # If (len(sent) - k)-th word doesn't exist
            features[('P%dIS' %-k)] = 1.0

    return features            

MORPHY_TAG = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r', 'WR': 'r'}
def __swn_scores(features):
    """
    The SentiWordNet scores of each word
    """
    try:
        # rappel : MORPHY_TAG = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r', 'WR': 'r'}
        tag_conversion = MORPHY_TAG[features['postag']]
    
        synset = list(swn.senti_synsets(features['word'], pos=tag_conversion))[0] # variable SWN assez long au niveau du tps
        # On choisit le 0         
        polarity = [synset.pos_score(), synset.neg_score(), synset.obj_score()] # score SWN (triplet)
        features.update({
            'sentisynset.pos': polarity[0],
            'sentisynset.neg': polarity[1],
            'sentisynset.obj': polarity[2]
        })
#        return polarity
    except (KeyError, IndexError):
#        return (None,None,None)
        pass
    
    return features
    
def __phrase_type(sent):
    """
    Return VB if there is a verbe in the sentence
    """
    boolVP = False
    for j in range(len(sent)):
        if sent[j][1][:2] == 'VB': # 2 premieres lettres du postag du mot j
            boolVP = True
            return 'VP'
    if boolVP is False:
        return 'NP'  

#### New features implemented (features_text.py)

Used in 
- <i>Recognizing Contextual Polarity in Phrase-Level Sentiment Analysis</i>,  Wilson et al. 
- <i>Lexicon-Based Methods for Sentiment Analysis</i>, Taboada et al.

In [6]:
PATH_LEXICON = INIT_PATH + 'Dropbox/TELECOM_PARISTECH/Corpus-Lexiques/Lexiques/'

import pickle
SO_CAL_NAME = PATH_LEXICON + 'SO-CAL/SO-CAL_Lexicon.PKL'

def return_lexicon(PATH):
    f = open(PATH)
    var = f.readlines()
    f.close()
    lex = []
    for i in range(len(var)):
        lex.append(var[i][:-1])
    return lex
    
# lex [POS+word]=score ; POS : 876 'adv', 2820 'adj', 1539 'noun', 1130 'verb' 217 'int' 
lex = pickle.load(open(SO_CAL_NAME,'rb'))
lex_keys = lex.keys()

# the keys are the beginning POS-tag contained in Caro's dumps
MORPHY_TAG_LEX = {'NN': 'noun', 'JJ': 'adj', 'VB': 'verb', 'RB': 'adv', 'WR': 'adv'}

# On aurait : if MORPHY_TAG_LEX[POS]+word in lex.keys() le mot aurait une valeur d'opinion
list_intens = {}
for key in lex_keys:
    if key[:3] == 'int': list_intens[key[3:]]=lex[key]
del list_intens[''] # petit bug a corriger

NEGATION_TOKENS = PATH_LEXICON + 'Minqing_Hu/negation-tokens.txt'  
list_nega = return_lexicon(NEGATION_TOKENS)
list_nega_0 = ["n't",'not','no']

In [7]:
def __negation(sent, i, context):
    """
    If there is a negation at least context word before the word
    ---------->WORK<----------
    """
    bool_neg = False
    for k in range(1,context+1): # Begin at k = 1
        if i > k-1 and ( sent[i-k][0].lower() in list_nega_0) : # If not k-th word of the sentence
            bool_neg = True
    
    return bool_neg
    
def __newI(sent,i):
    """
    Return True if there is a "and" (pos = CC) then a I
    
    ---------->DON'T WORK<----------
    """
    if i > 0 and sent[i-1][1][:2] == "CC" and sent[i][0] =="i":
        return True
    else:
        return False

################################ New features ############################
def __adj(sent,i):
    """
    Return True if the word is a noun preceded by an adj
    """
    if i > 0 and sent[i-1][1][:2] == "JJ" and sent[i][1][:2] =="NN":
        return True
    else:
        return False    

def __adv(sent,i):
    """
    Return True if the preceding word is an adverb other than not
    """
    if i > 0 and sent[i-1][1][:2] == "RB" and sent[i-1][0].lower() !="not":
        return True
    else:
        return False    


######## Features de Lexicon-based approach #############

def __intens(sent,i):
    """
    Return True if the preceding word is an intensifier
    """
    if i > 0 and sent[i-1][0].lower() in list_intens:
        return True
    else:
        return False
    
def __intens_is(sent,i):
    """
    Return True if the word itself is an intensifier
    """
    if sent[i][0].lower() in list_intens:
        return True
    else:
        return False 


def __SO_value(sent,i):
    """
    Return the SO value of the word if it has one, False either
    """
    word = sent[i][0].lower() # literallement le mot sans les maj
    postag = sent[i][1][:2] #2 premieres lettres du POS-tag uniquement car decrit plus simplement
    
    if postag+word in lex_keys:
        return lex[postag+word]
    else: 
        return False

### Word2feature function (features_text.py)
Function that will transform the different words into the features used by the CRF.

In [13]:
# Add the new features in the list of you add it in params
LIST_FEATURES = ['nb_neighbours','context_negation','rules_synt', 'newI', \
'rules_synt', 'swn_scores','swn_pos', 'swn_neg', 'swn_obj', 'inverse_score', \
'adj', 'adv','intens','intens_is','SO_value','SO_intensifier','SO_negation']

def __word2features(sent, i, params):
    u"""Features lexicaux et syntaxiques.
    nb_neighbours est la taille du contexte que l'on prend en nombre de mots
    ---->Si l'on veut ajouter une feature, il faut la mettre dans params et ensuite 
    l'ajouter dans la liste LIST_FEATURES pour la prendre en compte si elle y est   
    """      
    boolean = {}
    
    for k in LIST_FEATURES:
        if k in params:
            boolean[k] = params[k]
        else:
            boolean[k] = 0
      
    # Basic features of each word : those ones are always there by default
    features = __features_base(sent, i, boolean['nb_neighbours'])
    
    # si ya VB ds la phrase VP, sinon NP
    features['phrase_type'] = __phrase_type(sent) 
    
    ### SWN score ###
    if boolean['swn_scores'] != 0:
        features = __swn_scores(features)
        
    if 'sentisynset.pos' in features.keys(): 
        bool_senti = True
    else: 
        bool_senti = False 

    ### More SWN score ####
    if boolean['swn_pos'] != 0 and bool_senti:
        features['swn_pos_score_bis'] = features['sentisynset.pos']    
    if boolean['swn_neg'] != 0 and bool_senti:
        features['swn_pos_score_bis'] = features['sentisynset.neg']            
    if boolean['swn_obj'] != 0 and bool_senti:
        features['swn_obj_score_bis'] = features['sentisynset.obj']      
    
    if boolean['context_negation'] != 0:
        features['negation'] = __negation(sent, i, boolean['context_negation'])
    
    # True if there is a 'I' after a 'and'
    if boolean['newI'] == True:
        features['newI'] = __newI(sent,i)
    
    # rules_syntaxic
    if boolean['rules_synt'] == True:
        features = __rules2features(features, sent, i)
        
    # If negation before, inverse the pos and neg score
    if boolean['inverse_score'] == True and features['negation'] == True and bool_senti:
        buff = features['sentisynset.pos']        
        features['sentisynset.pos'] = features['sentisynset.neg']
        features['sentisynset.neg'] = buff
    
    if boolean['adj'] == True:
        features['adj'] = __adj(sent,i)
        
    if boolean['adv'] == True:
        features['adv'] = __adv(sent,i)

    if boolean['intens'] == True:
        features['intens'] = __intens(sent,i)
        
    if boolean['intens_is'] == True:
        features['intens_is'] = __intens_is(sent,i)        
    
        
    if boolean['SO_value'] == True:
        SO_value = __SO_value(sent,i)
        # If there is a value
        if SO_value:
            # if the word is before intensifier
            if boolean['SO_intensifier']:
                if __intens(sent,i) and SO_value:
                    SO_value = (1+list_intens[sent[i].lower()])*SO_value
                #if __intens_is(sent,i) and __SO_value(sent,i+1):
                #    SO_val = (1+list_intens[sent[i].lower()])*__SO_value(sent,i+1)
            
            if boolean['SO_negation']:        
                if __negation(sent, i, boolean['SO_negation']):
                    SO_value = SO_value - 4*sign(SO_value)
            
            features['SO_value'] = SO_value
    
    
    return features 

### Prepare features to crfsuite (exctraction.py)

Prepare the text features from the functions above and the audio features from the dumps (for now, after we'll use the pandas)

In [6]:
import re

MULTILABEL = ('B-evaluation', 'B-affect', 'I-evaluation', 'I-affect',
              'B-source', 'I-source', 'B-target', 'I-target')
              
HIERARCHY = {'I-attitude_positive': 1, 'B-attitude_positive': 2, 'I-attitude_negative': 3, 'B-attitude_negative': 4, 'I-source': 5, 'B-source': 6,
             'I-target': 7, 'B-target': 8, 'O': 9}

In [7]:
def __merge_dicts(*dict_args):
    u"""Fusionne n'importe quel nombre de dict."""
    z = {}
    for y in dict_args:
        z.update(y)
    return z


def __audio2features(audio, i):
    """
    STRING --> DICTIONNAIRE pour un seul mot (le i)
    Permet de mettre le dictionnaire qui n'etait qu'une string sous vrai forme de dictionnaire
    i est le numero du mot, on obtient donc toutes les features audio pour un seul mot
    """

    dict_pitch = eval(audio[i])
    # result au lieu de dict : refaire dico en enlevant les None
    result_pitch = {}
    for k, v in dict_pitch.items():
        if dict_pitch[k] != None:
            result_pitch[k] = v
    return result_pitch


def __sent2features(sent, audio, mfcc, params):
    u"""Choisir les types de features utilisés ici.
    Avec opt, on ne garde qu'audio ou texte si on veut faire des test séparement.
    Il n'y a qu'a fusionner les dict voulus pour chaque mot
    
    sent, audio et mfcc sont d'une seule phrase --> len(sent) est le nbr de mot i le numero du mot
    """
    opt = params['opt']
    if opt == 'MULTI':
        return [__merge_dicts(__word2features(sent, i, params),
                          __audio2features(audio,i)) for i in range(len(sent))] 
    elif opt == 'AUDIO':             
        return [__merge_dicts(__audio2features(audio,i)) for i in range(len(sent))] 
    else: # then it's just text
        return [__merge_dicts(__word2features(sent, i, params)) for i in range(len(sent))] 

def __sent2label(sent, label):
    """
    Return a list with the labels of eahc word in 1 sentence
    """
    return [__decision(str_labels, label) for token, postag,
            str_labels in sent]


def __sent2tokens(sent):
    """List of the words from a sentence
    NOT USED    
    """
    return [token for token, postag, label in sent]


def __decision(str_labels, label):
    """
    label peut etre attitude, source ou target : pour un entrainement séparé 
    mais qui est moins efficace que s'il est bien fait ensemble
    """
    list_labels = str_labels.split(";") # S'il y a plusieurs labels par mot
    if label == 'BIO': # garder toutes les annotations 
        
#        rappel : HIERARCHY = {'I-attitude_positive': 1, 'B-attitude_positive': 2, 'I-attitude_negative': 3, 'B-attitude_negative': 4, 'I-source': 5, 'B-source': 6,
#             'I-target': 7, 'B-target': 8, 'O': 9}
             
        list_nb = [HIERARCHY[lab] for lab in list_labels] # donne un "rang" aux differents labels du mot
        return "".join([k for k, v in HIERARCHY.items() # regarde les rangs des differents labels
                        if v == np.min(list_nb)]) # label qui a le "rang" le plus eleve (nb le + petit) gagne
 
    else: # Si pas BIO, c'est attitude par exemple, on les entraine separement !! (d'abord attitude ou source ou ?)
        
        if label.__class__ == list: # plusieurs labels de sortie --> ex : ['attitde_positive' ,'attitude_negative']
            for k in range(len(label)):            
                if "I-"+label[k] in list_labels:
                    return "I-"+label[k]
                elif "B-"+label[k] in list_labels:
                    return "B-"+label[k]    
                    
            return "O"# if no label
        else: # only one label
            if "I-"+label in list_labels:
                return "I-"+label
            elif "B-"+label in list_labels:
                return "B-"+label
            else:
                return "O"

def text_sents(path):
    u"""Traite le texte. 
    Pas utilisé, on le fait avec nltk.corpus.conll2002.iob_sents(path_text)
    """
    f = open(path, 'Ur')
    sents = f.read().split('\n\n\n') # Phrase 
    sents_list = []
    for sent in sents:
        words = sent.split('\n') # Mots 
        words_list = []
        for word in words:
            features = tuple(word.split('\t'))[:2]
            words_list.append(features)
        sents_list.append(words_list)
    return sents_list
    

def audio_sents(path):
    u"""Traite l'audio.
    Va cherche les dumps et les met dans des variables
    /Datasets/Semaine/all/+ dump_audio/ ou dump_mfcc/    
    """
    f = open(path, 'Ur')
    sents = f.read().split('\n\n\n') # sent[0] 1ere phrase
    sents_list = []
    for sent in sents: # pour chaque phrase
        words = sent.split('\n') # separation par mot words[0] 1er mot
        words_list = []
        for word in words:
            try:
                features = word.split('\t')[1] # Separe le mot ex 'HI' des features (u'moy_loc_B1': -0.059, u'moy_loc_B2': -0.199)
                if features == 'None':
                    features = "{}"
                words_list.append(features)
                m = re.findall(r"\>|\<|'|GONNA|WANNA", word.split('\t')[0]) # trouve les gonna/wanna/'/ qui font 2 mots avc POSTAG 
                #  --> don't = do not, pour l'audio on met les memes features audio pour les 2 mots
                for k in range(len(m)):
                    words_list.append(features)
            except IndexError:
                #  print('END OF FILE %s' % path.split('.')[0][-3:]) #fin du fichier, donne le nom de la session en +
                break
        sents_list.append(words_list)
    return sents_list  


def extract2CRFsuite(path_text, path_audio, path_mfcc, label='BIO', params = None):
    u"""PLUS IMPORTANTE.
    
    Extrait features et label pour une session
    à partir d'un dossier contenant les dump au format Conll
    """
    # Just to charge the good ones : 
    text = None
    audio = None
    mfcc = None
    opt = params['opt']
    text = nltk.corpus.conll2002.iob_sents(path_text) # text[phrase][mot] = (mot, genre NN, BIO-attitude)
    
    # Labels first sice we need the text   
    y = [__sent2label(s, label) for s in text]    
    
    # Then the variables
    if opt == 'TEXT':
        audio = [None]*len(text)
        mfcc =  [None]*len(text)
    elif opt == 'AUDIO':
        audio = audio_sents(path_audio) # audio[phrase][mot] = string avec les valeurs (string d'un dictionnaire)
        #  par ex : "{u'moy_loc_B1': -0.059879489425627798, u'moy_loc_B2': -0.19947861555547755, u'moy_loc_F1': 0.026468}"
        mfcc = audio_sents(path_mfcc)
        text =  [None]*len(audio)
    elif opt == 'MULTI':
        audio = audio_sents(path_audio) # audio[phrase][mot] = string avec les valeurs (string d'un dictionnaire)
        #  par ex : "{u'moy_loc_B1': -0.059879489425627798, u'moy_loc_B2': -0.19947861555547755, u'moy_loc_F1': 0.026468}"
        mfcc = audio_sents(path_mfcc)
        
    X = [__sent2features(s, t, u, params) for (s, t, u) in zip(text, audio, mfcc)] # on prend phrase par phrase
    
    return X, y

## Training and testing (main_val.py)

### Parameters

#### CRF
Set training parameters. We will use L-BFGS training algorithm (it is default) without Elastic Net (L1 + L2) regularization but just L2 norm regularization since we don't have a lot of features.

In [8]:
params = {}
params['c1'] = 0
params['c2'] = 1e-2
params['max_it'] = 50

params['c2'] = 1e-3
params['c1'] = 0

#### Features
We choose to apply the algo only on the TEXT, on the AUDIO, or on BOTH with the 'opt' parameter.

We can choose which textual features we are going to use (permitt to test a lot of different features and just select the best ones. 

In [9]:
params['opt'] = 'TEXT' 
params['context_negation'] = 2
params['nb_neighbours'] = 2
params['newI'] = False
params['swn_scores'] = True

params['rules_synt'] = False
params['swn_score'] = True
params['inverse_score'] = False

params['swn_pos'] = True
params['swn_neg'] = True

params['adj'] = True
params['adv'] = True

### Kind of labels 

We can select the labels we want to have on the different words and the label we wanna detect at the end. 

If we want to keep the difference between positive and negative attitude and just detect the attitude positive : 
--> label_att = ['attitude_negative','attitude_positive'] ; label_select = 'attitude_positive' 

If we want to just detect the positive attitude and not the negative one
--> label_att = 'attitude_positive' ; label_select = 'attitude_positive' 


In [10]:
label_att = ['attitude_negative','attitude_positive'] ; label_select = 'attitude_positive' ; valence = True

#label_att = 'attitude_positive' ; label_select = 'attitude_positive' ; valence = True
#label_att = 'attitude' ; label_select = None ; valence = False

What do the features of a word look like : 

In [14]:
filename = ALL_FILES[0]
X_test, y_test = extract2CRFsuite(path+"all/dump"+valence*"_attitudeposneg_only"+"/"+filename,
                                path+"all/dump_audio/"+filename,
                                path+"all/dump_mfcc/"+filename,
                                label_att, params)
X_test[3][5]

{'-1:postag=': u'VB',
 '-1:word.lower=': u'red',
 '-2:postag=': u'VB',
 '-2:word.lower=': u"'ve",
 '1:postag=': u'CC',
 '1:word.lower=': u'and',
 '2:postag=': u'DT',
 '2:word.lower=': u'a',
 'adj': False,
 'adv': False,
 'bias': 1.0,
 'negation': False,
 'phrase_type': 'VP',
 'postag': u'NN',
 'sentisynset.neg': 0.0,
 'sentisynset.obj': 1.0,
 'sentisynset.pos': 0.0,
 'swn_pos_score_bis': 0.0,
 'word': u'hair'}

### Cross-Validation Loop

In [23]:
def dump_resultats(precision, recall, F1, filename):
    u"""Dump the results."""
    f = open(filename, 'w')
    f.write("Session\t\tPrecision\tRecall\tF1\n")
    
    session = 'overall' # Every sessions on the 1st line, then
    f.write("%s\t\t%s\t\t%s\t\t%.2f\n" % (session, precision[session], recall[session], F1))
    alz = precision.copy()
    del alz['overall']
    for session in alz.keys():
        f.write("%s\t%s\t\t%s\n" % (session, precision[session], recall[session]))
        
    f.close()

def cvloo(label, path_results, params, label_select=None, LOOP_TEST=False, valence = False):
    u"""Compute the Cross-validation for the given label.
    valence is True if we wanna distinguish the positive and negative attitudes    
    """
    if label_select is None:
        label_select = label
    opt = params['opt']
    
    truepos_o, falsepos_o, falseneg_o = (0, 0, 0)    
    precision = {}
    recall = {}
    
    trainer = pycrfsuite.Trainer(verbose=False)
        
    
    for i in range(len(ALL_FILES)):
        filename = ALL_FILES[i]
        X, y = extract2CRFsuite(path+"all/dump"+valence*"_attitudeposneg_only"+"/"+filename,
                                path+"all/dump_audio/"+filename,
                                path+"all/dump_mfcc/"+filename,
                                label, params)
        for x_seq, y_seq in zip(X, y):
            trainer.append(x_seq, y_seq, i)
        
    trainer.set_params({
        'c1': params['c1'],   # coefficient for L1 penalty
        'c2': params['c2'],  # coefficient for L2 penalty
        'max_iterations': params['max_it'],  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': False,
    })
    #print("Beginning of the training")
    for i in range(len(ALL_FILES)):
    #for i in range(1):
        
        filename = ALL_FILES[i]
        filename_model = filename.split('.')[0] # to threw away the extension
        
        # Training 
        trainer.train(path_model+'model_%s_' %opt + filename_model, i)

        # Testing
        X_test, y_test = extract2CRFsuite(path+"all/dump"+valence*"_attitudeposneg_only"+"/"+filename,
                                path+"all/dump_audio/"+filename,
                                path+"all/dump_mfcc/"+filename,
                                label, params)
        tagger = pycrfsuite.Tagger(verbose=False)
        tagger.open(path_model + 'model_%s_' %opt + filename_model)
        
        truepos, falsepos, falseneg = (0, 0, 0)
        for sent, corr_labels in zip(X_test, y_test):
            pred_labels = tagger.tag(sent)
            trueposAdd, falseposAdd, falsenegAdd = \
                F1_token(
                    pred_labels,
                    corr_labels,
                    label_select)
            truepos += trueposAdd
            falsepos += falseposAdd
            falseneg += falsenegAdd
        
        precision[filename] = "%.2f" % (truepos/(truepos+falsepos+0.01) * 100)
        recall[filename] = "%.2f" % (truepos/(truepos+falseneg+0.01) * 100)            
        truepos_o += truepos
        falsepos_o += falsepos
        falseneg_o += falseneg
        
    precision['overall'] = "%.2f" % (truepos_o/(truepos_o+falsepos_o+0.01) * 100)
    recall['overall'] = "%.2f" % (truepos_o/(truepos_o+falseneg_o+0.01) * 100)
    F1 = 2*float(precision['overall'])*float(recall['overall'])/(float(precision['overall'])+float(recall['overall'])+1e-5)

    # If there is pos and neg differentiation for the attitudes
    if valence == True and label.__class__ == list: label = 'attitud_posneg'

    ext = '.txt'
    dump_resultats(precision, recall, F1, path_results + 'results_CVLOO_%s_' %(opt) +label+"_"+label_select+ext)
    if LOOP_TEST: # if loop test dump the ALL the results in 1 file : useful for hyperparams test
        dump_resultats_total(precision, recall, F1, path_results + 'results_total_%s_' %(opt) +label+"_"+label_select+ext, params)
    return_sent = 'Precision : %s, Recall : %s, F1 : %.2f' %(precision['overall'], recall['overall'], F1)
    return return_sent

In [28]:
params['c2'] = 1e-3
params['c1'] = 0
params['opt'] = 'TEXT' 
params['context_negation'] = 2
params['nb_neighbours'] = 2
params['rules_synt'] = False
params['newI'] = False
params['swn_score'] = True

# inverse the swn score if there is a negation
params['inverse_score'] = False
# put several time the swn_pos/neg score in the features
params['swn_pos'] = False
params['swn_neg'] = False
params['swn_obj'] = False

# Detect adverb, adjective or intensifier
params['adv'] = False
params['adj'] = False
params['intens'] = False
params['intens_is'] = False


# With the Semontic Opinion score : other lexique
params['SO_value'] = False 
params['SO_intensifier'] = False
params['SO_negation'] = False


#label_att = ['attitude_negative','attitude_positive'] ; label_select = 'attitude_positive' ; valence = True
#label_att = 'attitude_positive' ; label_select = 'attitude_positive' ; valence = True
label_att = 'attitude' ; label_select = None ; valence = False

print 'For the label %s : ' %(label_select)
print cvloo(label_att, path_results, params, label_select = label_select, valence=valence)

For the label None : 
Precision : 57.89, Recall : 40.67, F1 : 47.78


Faire un test avec d'autres labels : 

In [34]:



label_att = 'attitude' ; label_select = None ; valence = False
print 'For the label %s : ' %(label_select)
print cvloo(label_att, path_results, params, label_select = label_select, valence=valence)

For the label None : 
Precision : 57.17, Recall : 39.26, F1 : 46.55


## What the classifier learned

### Transitions between words regarding the labels

In [21]:
from collections import Counter

filename_model = ALL_FILES[0].split('.')[0] # to threw away the extension
tagger = pycrfsuite.Tagger(verbose=False)
tagger.open(path_model + 'model_%s_' %params['opt'] + filename_model)

# What we use to have the weights : tagger is really important
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

Top likely transitions:
I-attitude_negative -> I-attitude_negative 5.978109
B-attitude_positive -> I-attitude_positive 5.512919
I-attitude_positive -> I-attitude_positive 5.480694
O      -> O       4.986429
B-attitude_negative -> I-attitude_negative 4.896438
O      -> B-attitude_positive 0.295560
O      -> B-attitude_negative 0.143329
I-attitude_positive -> B-attitude_positive 0.059109
I-attitude_negative -> B-attitude_negative -0.162060
I-attitude_positive -> B-attitude_negative -0.429104

Top unlikely transitions:
B-attitude_positive -> O       -1.434754
I-attitude_positive -> O       -1.827552
I-attitude_negative -> O       -2.410462
O      -> I-attitude_negative -3.331075
O      -> I-attitude_positive -3.723805


In [None]:
# transform the dict of weights in counter type and sort the list 
# info.transitions[('B-LOC','B-LOC')] gives the weights between B-LOC then B-LOC

# gives the 10 firsts
print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(10))

# The 5 lasts
print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-5:])

### Most/Less weighted features regarding the labels

In [22]:
print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
2.457445 O      P1IS
2.362237 O      P-1IS
1.985765 B-attitude_positive P1IS
1.880477 B-attitude_negative P1IS
1.051011 O      P-2IS
0.997364 O      postag:NN
0.872209 I-attitude_negative P-2IS
0.833562 O      word:em
0.816481 I-attitude_positive P-2IS
0.800147 I-attitude_positive -1:postag=:NN
0.785202 O      word:yeah
0.749577 I-attitude_negative -1:postag=:NN
0.732676 O      bias
0.708124 O      phrase_type:NP
0.682290 I-attitude_positive sentisynset.pos
0.681614 O      postag:CC
0.626068 O      word:and
0.602122 I-attitude_negative -1:word.lower=:hm
0.601450 O      2:word.lower=:>
0.601450 O      word:<

Top negative:
-0.489057 I-attitude_positive -2:postag=:PO
-0.496427 I-attitude_positive word:and
-0.500166 I-attitude_positive word:have
-0.502683 O      word:you
-0.502894 O      postag:VB
-0.503884 I-attitude_negative postag:CC
-0.527920 B-attitude_positive phrase_type:NP
-0.544279 O      word:god
-0.580185 B-attitude_negative phrase_type:NP
-0.594465 O      word:ex

We can see that the CRF regard a lot if the word is surrounded by a word at the left and at the right