In [4]:
import pymongo as pym
import nltk.data
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
import stop_words
from nltk.stem import *
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
import numpy as np


# MongoDB handle: collection "cleaned" of database "tweet" on the local server.
client = pym.MongoClient(host='localhost', port=27017)
collection = client['tweet']['cleaned']

# Tokenizer shared by the tweet-cleaning code.
tokenizer = TreebankWordTokenizer()

# Stop words: NLTK's French list, the stop_words package's French list,
# and a hand-picked set of extra short tokens.
stops = (
    set(stopwords.words('french'))
    | set(stop_words.get_stop_words('fr'))
    | {'rt', 'ds', 'qd', 'ss', 'ns', 'vs', 'nn', 'amp', 'gt', 'gd', 'gds', 'tt', 'pr', 'ac', 'mm'}
)

def getCleanTweetsByCandidates(collection):
    """Load every document of ``collection`` and replace its text by clean tokens.

    Each document is fetched without its ``_id`` field. Its ``'text'`` value
    is normalised with a fixed sequence of regex substitutions, tokenized with
    the module-level ``tokenizer``, filtered against the module-level
    ``stops`` set, stripped of disallowed characters and of some elision
    prefixes, and finally purged of empty tokens.

    Parameters
    ----------
    collection :
        Object exposing ``find(filter, projection)`` and yielding dict-like
        documents that contain a ``'text'`` string.

    Returns
    -------
    list
        The fetched documents, in iteration order, with ``'text'`` replaced
        by a list of non-empty token strings. Other fields are untouched.

    Raises
    ------
    KeyError
        If a document has no ``'text'`` key.
    """
    # Substitutions applied, in this order, to the raw text before tokenizing.
    pre_tokenize_subs = [
        (r'\w*…', ''),        # word fragment immediately followed by an ellipsis
        (r'^rt.*: ', ''),     # leading "rt ...: " prefix
        (r'(?:htt)\S*', ''),  # any run of non-space characters starting with "htt"
        (r'\W*(?!\S)', ''),   # non-word run that is followed by whitespace or end of text
        (r'\n', ' '),
        (r'\xad', '-'),       # soft hyphen (U+00AD) -> "-"
        # NOTE(review): this is a literal character sequence, not a character
        # class, so it only matches that exact run of punctuation; "[...]" was
        # presumably intended. Left as is to keep results unchanged; the
        # per-token filter below drops all of these characters anyway.
        (r',;:!?\.\/\*``"#@(){}', ''),
    ]

    # The original call passed re.UNICODE as the 4th positional argument of
    # re.sub, which is ``count`` (so at most 32 substitutions per token) and
    # not ``flags``. Compiling with ``flags=`` applies the flag as intended and
    # removes the accidental cap.
    disallowed_chars = re.compile(r'[^a-zA-Z0-9-éèêàâùçîœ’\']+', flags=re.UNICODE)

    # NOTE(review): these alternatives are unanchored, so they also match in
    # the middle of a token (e.g. the "d'" of "aujourd'hui"), and the "\squ"
    # forms need a whitespace character inside the token - presumably never
    # the case after tokenization; confirm before changing.
    elision_prefix = re.compile(r'l\'|\squ\'|l’|\squ’|d\'|d’')

    listTweets = []
    for t in collection.find({}, {'_id': False}):
        text = t['text']
        for pattern, replacement in pre_tokenize_subs:
            text = re.sub(pattern, replacement, text)

        # Stop words are tested on the raw token, before character stripping.
        tokens = [
            disallowed_chars.sub('', token)
            for token in tokenizer.tokenize(text)
            if token not in stops
        ]
        tokens = [elision_prefix.sub('', token) for token in tokens]

        # Drop tokens emptied by the substitutions in a single pass.
        t['text'] = [token for token in tokens if token]
        listTweets.append(t)
    return listTweets

def vectorize(tokenizedCorpus, max_df=0.8, min_df=0.0005):
    """Fit a TF-IDF vectorizer on the corpus and return the document-term matrix.

    Parameters
    ----------
    tokenizedCorpus :
        Iterable of documents, each given as a single string (the caller
        joins the tokens with spaces).
    max_df, min_df :
        Passed unchanged to ``TfidfVectorizer``. The defaults are the values
        that were previously hard-coded, so existing calls behave as before.

    Returns
    -------
    The result of ``TfidfVectorizer.fit_transform`` on ``tokenizedCorpus``.
    The fitted vectorizer itself is not returned.
    """
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df)
    return vectorizer.fit_transform(tokenizedCorpus)

def getSentiments():
    """Train and evaluate a Multinomial Naive Bayes sentiment classifier.

    The cleaned tweets are loaded from the module-level ``collection``,
    turned into a TF-IDF matrix, and split by position: even rows train the
    model, odd rows evaluate it. The matrix shape and the accuracy on the odd
    rows are printed; nothing is returned.
    """
    df = pd.DataFrame(getCleanTweetsByCandidates(collection))

    # NOTE(review): the TF-IDF vocabulary and weights are fitted on the whole
    # corpus, i.e. also on the rows used for evaluation below.
    tfidfmat = vectorize(df['text'].apply(' '.join))
    print(tfidfmat.shape)

    # ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in 1.0.
    # Slice the label array by position instead, which matches the row
    # slicing applied to the matrix.
    labels = df['sentiment'].values
    train_labels, test_labels = labels[::2], labels[1::2]

    nb = MultinomialNB()
    nb.fit(tfidfmat[::2], train_labels)
    predictions = nb.predict(tfidfmat[1::2])

    accuracy = np.sum(predictions == test_labels) / len(test_labels)
    print("accuracy of the sklearn naive bayes : ", accuracy)


In [5]:
# Run the full pipeline: prints the TF-IDF matrix shape, then the accuracy
# of the Naive Bayes classifier on the held-out (odd-indexed) rows.
getSentiments()

(3503, 3177)
accuracy of the sklearn naive bayes :  0.651056539121


In [6]:
# Print the first 40 cleaned tweet records for a visual check of the
# tokenization. This queries and cleans the whole collection again.
for a in getCleanTweetsByCandidates(collection)[:40]:
    print(a)

{'text': ['costumes', 'fillon', 'cote', 'entre', '7', '8', '000', 'euros', 'pièce'], 'candidat': 'fillon', 'sentiment': -1}
{'text': ['pouvoir', 'un', 'autre', 'immunité', 'présidentielle', 'puis', 'immunité', 'donnée', 'anciens'], 'sentiment': -1}
{'text': ['vocation', 'attention', 'crime', 'contre', 'humanité', 'paternalisme', 'coupable', 'vocation', 'france', "c'est", 'être'], 'sentiment': -1}
{'text': ['agriculteur', 'trois', 'jours', 'suicident', 'pays', "c'est", 'délinquant', 'monde', 'parle'], 'sentiment': -1.0}
{'text': ['video', 'françois', 'hollande', 'adresse', 'message', 'emmanuel', 'macron', 'dîner', 'crif'], 'candidat': 'macron', 'sentiment': 0}
{'text': ['bourdindirect', 'bfmtv', 'faite', 'macron', 'programme', 'retraites', 'compte', 'goutte'], 'candidat': 'macron', 'sentiment': -1}
{'text': ['jcnh83', 'anneyunie59000', 'twystsky', 'jupp', 'fillon', 'autres', 'justice', 'médias', 'pourri', 'election'], 'candidat': 'fillon', 'sentiment': -1}
{'text': ['qg', 'modem', 'mili