In [2]:
import json
import unicodedata
import re
import pandas as pd
pd.set_option('max_colwidth',1000)
from lxml import objectify
import numpy as np



from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer

from string import punctuation

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer      


#Required packages from nltk
#nltk.download('punkt')
#nltk.download('stopwords')

## Load CSV file or create from XML

In [3]:
# Load the tagged training tweets from the cached CSV; if the cache cannot be
# read, rebuild the frame from the XML corpus and write the cache.
try:
    general_tweets_corpus_train = pd.read_csv('datasets/csv/general-tweets-train-tagged.csv', encoding='utf-8')
except IOError:
    # Only a missing/unreadable cache file triggers the rebuild; other errors
    # (e.g. a malformed CSV) are surfaced instead of being silently masked.
    # Passing the path lets lxml open and close the file itself.
    xml = objectify.parse('datasets/xml/general-tweets-train-tagged.xml')
    root = xml.getroot()
    # Collect one record per tweet and build the frame once, rather than
    # appending row by row (quadratic, and DataFrame.append no longer exists
    # in recent pandas). getchildren() is used on purpose: iterating an
    # objectify element directly walks same-tag siblings, not children.
    records = [
        {
            'content': tweet.content.text,
            'polarity': tweet.sentiments.polarity.value.text,
            'agreement': tweet.sentiments.polarity.type.text,
        }
        for tweet in root.getchildren()
    ]
    general_tweets_corpus_train = pd.DataFrame(records, columns=['content', 'polarity', 'agreement'])
    general_tweets_corpus_train.to_csv('datasets/csv/general-tweets-train-tagged.csv', index=False, encoding='utf-8')

In [4]:
# Keep only tweets whose annotators agreed and that carry a polarity label.
tweets_corpus = general_tweets_corpus_train.query('agreement != "DISAGREEMENT" and polarity != "NONE"')
# Drop tweets whose content matches the URL pattern (text starting with "http").
# `~` is the boolean-mask negation operator; unary `-` is not meant for bool data.
is_url_only = tweets_corpus.content.str.contains('^http.*$')
# Explicit copy: later cells add columns to this frame, and writing into a
# filtered slice would raise SettingWithCopyWarning.
tweets_corpus = tweets_corpus[~is_url_only].copy()
tweets_corpus.sample(5)

Unnamed: 0,content,polarity,agreement
5670,"Opel ha sido, es y será empresa estratégica para #Aragón. El éxito d esta planta contribuirá en gran medida al éxito d la Comunidad Autónoma",P+,AGREEMENT
3196,"RT @miabuelasabia: Haz tu vida como hacen los pulmones, quédate con lo bueno y dejar ir lo que no es necesario.",P,AGREEMENT
779,"Curiosidad del insomne: si alguien conocía desde 2007 operaciones irregulares de Urdangarin, ¿no estaba obligado a denunciarlas?",N+,AGREEMENT
3512,#yeswespainISDIFFERENT BCNA!!!;) vamosssss!!!;),P,AGREEMENT
5108,La presidenta @mdcospedal cumple. El nuevo IRPF de Cospedal http://t.co/Ngii4vS2 vía @eldigitalcastillalamancha,P+,AGREEMENT


In [5]:
# Load the untagged test tweets from the cached CSV; if the cache cannot be
# read, rebuild the frame from the XML corpus and write the cache.
try:
    general_tweets_corpus_test = pd.read_csv('datasets/csv/general-tweets-test1k.csv')
except IOError:
    # Only a missing/unreadable cache file triggers the rebuild; other errors
    # are surfaced instead of being silently masked.
    # Passing the path lets lxml open and close the file itself.
    xml = objectify.parse('datasets/xml/general-tweets-test1k.xml')
    root = xml.getroot()
    # Build the frame in one go instead of appending row by row.
    # getchildren() is used on purpose: iterating an objectify element
    # directly walks same-tag siblings, not children.
    records = [{'content': tweet.content.text} for tweet in root.getchildren()]
    # 'polarity' is not present in the records, so that column is created
    # empty (all missing values).
    general_tweets_corpus_test = pd.DataFrame(records, columns=['content', 'polarity'])
    general_tweets_corpus_test.to_csv('datasets/csv/general-tweets-test1k.csv', index=False)


In [6]:
# Work on an independent copy of the loaded test corpus so later cells can
# modify it without touching the frame that was read from disk.
tweets_test = general_tweets_corpus_test.copy()

tweets_test.sample(5)

Unnamed: 0,content,polarity
283,"Si si, siempre es eso...Valenciano: La acusación contra el hijo de Chaves es otra campaña contra él http://t.co/gEiGEzHg",
957,En la radio están poniendo a Glenn Medeiros o es que he hecho flashback...? #EncimaMeLaSe #AdolescenciaDificil,
380,"“@FelipeAlcarazM: Volved a los camarotes, gritó el capitán Rajoy, esto lo arreglamos entre unos pocos.”",
405,Alierta agradece el impulso del gobierno para fomentar la internacionalización empresarial @TelediarioInter 20:30,
621,Se rumorea que la tardanza en conocer el resultado de la votación del #17congresoPP es xq Rajoy ha salido reelegido por un 102% ;),


In [7]:
# Load the gold-labelled test tweets from the cached CSV; if the cache cannot
# be read, rebuild the frame from the XML corpus and write the cache.
try:
    tagged_tweets_corpus_test = pd.read_csv('datasets/csv/general-tweets-test1k-tagged.csv', encoding='utf-8')
except IOError:
    # Only a missing/unreadable cache file triggers the rebuild; other errors
    # are surfaced instead of being silently masked.
    # Passing the path lets lxml open and close the file itself.
    xml = objectify.parse('datasets/xml/general-tweets-test1k-tagged.xml')
    root = xml.getroot()
    # Each record carries exactly the two fields read from the XML. The
    # previous zip() paired three keys with two values, so 'agreement' was
    # silently dropped; the keys now match the values.
    # getchildren() is used on purpose: iterating an objectify element
    # directly walks same-tag siblings, not children.
    records = [
        {
            'content': tweet.content.text,
            'polarity': tweet.sentiments.polarity.value.text,
        }
        for tweet in root.getchildren()
    ]
    tagged_tweets_corpus_test = pd.DataFrame(records, columns=['content', 'polarity'])
    tagged_tweets_corpus_test.to_csv('datasets/csv/general-tweets-test1k-tagged.csv', index=False, encoding='utf-8')

In [8]:
# Keep only gold-labelled test tweets that have a polarity. The explicit copy
# avoids SettingWithCopyWarning when a later cell adds a column to this frame.
tweets_tagged = tagged_tweets_corpus_test.query('polarity != "NONE"').copy()

# Remove from the untagged test set every row whose index has no counterpart
# among the remaining gold labels, so both frames cover the same tweets.
diff = np.setdiff1d(tweets_test.index.values, tweets_tagged.index.values)
tweets_test = tweets_test.drop(diff)

# The evaluation compares predictions and gold labels position by position,
# so both frames must hold the same rows in the same order.
assert tweets_test.index.equals(tweets_tagged.index), 'test and tagged tweets are not aligned'

tweets_tagged.sample(5)


Unnamed: 0,content,polarity
205,";-))) gracias RT @JRSMMII: @mariviromero Desde todos los rincones de España, lucharemos con ello.Yo me pienso ofrecer de Interventor. ;)",P+
662,Esta es una muy muy mala noticia. El pluralismo pierde una voz genuina y honrada. Cierra el diario 'Público' http://t.co/bp7THNBN,N
98,Con el año nuevo habrá nuevo Presupuesto de PP+IU. Y con él nuevas responsabilidades para unos y otros.,P+
251,"Entre 2008 y 2010 Andalucía redujo gasto no financiero en un 7,5%, sin recortar derechos. Resto CCAA lo aumentaron 2,5%. (datos MEH)",N
344,La prima de riesgo de Portugal se dispara hasta los 1.241 puntos base.Totalmente insensato!,N+


## Tokenize and Stem Sentences

In [9]:
# Stemming: reduce each word to its root (wait -> wait, waited -> wait, waiting -> wait)
def stem_tokens(tokens, stemmer):
    """Return a list with the stem of every token, in the original order.

    tokens  -- iterable of word strings
    stemmer -- object exposing a ``stem(word)`` method
    """
    return [stemmer.stem(token) for token in tokens]

# Tokenizing: every word becomes one token
def tokenize(text):
    """Split ``text`` into stemmed word tokens.

    Characters listed in the module-level ``non_words`` are removed first,
    the remainder is split with ``word_tokenize`` and each token is stemmed
    with the module-level ``stemmer``. If stemming raises, the error and the
    cleaned text are printed and ``['']`` is returned instead.
    """
    cleaned = ''.join(char for char in text if char not in non_words)
    words = word_tokenize(cleaned)

    try:
        return stem_tokens(words, stemmer)
    except Exception as error:
        # Best effort: report the offending text and fall back to a single
        # empty token rather than aborting the whole vectorization.
        print(error)
        print(cleaned)
        return ['']

In [10]:
# Stop words: words that carry little meaning on their own (e.g. articles)
spanish_stopwords = stopwords.words('spanish')

# Stemmer used by tokenize() to reduce Spanish words to their root
stemmer = SnowballStemmer('spanish')

# Non-words: punctuation symbols, Spanish opening marks and the digits 0-9;
# tokenize() strips these characters before splitting the text
non_words = list(punctuation) + ['¿', '¡'] + [str(digit) for digit in range(10)]


## Creating Model (Linear SVM) and Training

In [12]:
# Binarizing: polarity_bin is 1 for positive tweets (P, P+) and 0 for every
# other polarity label.
positive_labels = ['P', 'P+']

# Assign the whole column at once; the previous `df.col.loc[mask] = 1` was a
# chained assignment, which may write to a temporary copy instead of the frame.
tweets_corpus['polarity_bin'] = tweets_corpus.polarity.isin(positive_labels).astype(int)
# Class balance of the training corpus
print(tweets_corpus.polarity_bin.value_counts(normalize=True))

# Placeholder column; it is overwritten with the model predictions later on.
tweets_test['polarity_bin'] = 0

tweets_tagged['polarity_bin'] = tweets_tagged.polarity.isin(positive_labels).astype(int)

# Gold binary labels of the test set, used to score the predictions.
y = tweets_tagged.polarity_bin.values

tweets_corpus.sample(3)

1    0.53994
0    0.46006
Name: polarity_bin, dtype: float64


Unnamed: 0,content,polarity,agreement,polarity_bin
950,"Rubalcaba le pregunta a Rajoy, y con razón, cuándo va a anunciar las malas noticias.",N,AGREEMENT,0
224,Empieza el acto de homenaje a la Constitución con el próximo Presidente del Gobierno. #vivalaconstitucion http://t.co/gRTAQmVu,P+,AGREEMENT,1
6838,"Desde hace 2meses,Gobierno trabaja en modificación C.Penal para equiparar violencia de los antisistema con la callejera ligada al terrorismo",NEU,AGREEMENT,0


# Find Parameters

In [None]:
# Bag-of-words features built with the custom stemming tokenizer.
# NOTE(review): scikit-learn applies stop_words to the tokenizer's output, and
# tokenize() stems its tokens while spanish_stopwords is unstemmed, so some
# stop words may not be filtered - confirm whether the list should be stemmed.
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])

# Hyper-parameter grid explored by the search.
parameters = {
    # A float max_df is a proportion of documents and must lie in [0.0, 1.0];
    # 1.0 keeps every term, which is what the former out-of-range 1.9 did in
    # releases that did not validate the value.
    'vect__max_df': (0.5, 1.0),
    'vect__min_df': (10, 20, 50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='roc_auc')
grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)

# Show the winning combination so the chosen settings are visible in the notebook.
grid_search.best_params_



In [198]:
# Final model: bag-of-words features fed into a linear SVM.
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 50,
            # A float max_df is a proportion of documents and must lie in
            # [0.0, 1.0]; 1.0 keeps every term, which is what the former
            # out-of-range 1.9 did in releases that did not validate it.
            max_df = 1.0,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             tol=0.0001
             )),
])

# Train on the filtered training corpus, then label the test tweets.
pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
tweets_test['polarity_bin'] = pipeline.predict(tweets_test.content)



In [199]:
# 5-fold cross-validation of the pipeline on the training corpus, scored with
# the estimator's default scorer. cross_val_score clones and refits the
# estimator for every fold, so no prior fit is needed; the earlier fit on
# X_train is dropped because no cell of this notebook defines that name.
scores = cross_val_score(pipeline, tweets_corpus.content, tweets_corpus.polarity_bin, cv=5)

np.mean(scores)



0.68227481108362864

In [202]:
# Accuracy on the test set: share of tweets whose predicted binary label
# equals the gold label (absolute difference of 0).
y_t = tweets_test.polarity_bin.values
result = np.abs(y_t - y)
np.mean(result == 0)


0.71975497702909652