In [188]:
import json
import unicodedata
import re
import pandas as pd
pd.set_option('max_colwidth',1000)
from lxml import objectify
import numpy as np



from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer

from string import punctuation

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer      


#Required packages from nltk
#nltk.download('punkt')
#nltk.download('stopwords')

## Load CSV file or create from XML

In [189]:
# Load the tagged training tweets from the cached CSV. When the CSV cannot be
# opened, rebuild the frame from the XML corpus and write the CSV cache.
TRAIN_CSV_PATH = 'datasets/csv/general-tweets-train-tagged.csv'
TRAIN_XML_PATH = 'datasets/xml/general-tweets-train-tagged.xml'

try:
    general_tweets_corpus_train = pd.read_csv(TRAIN_CSV_PATH, encoding='utf-8')
except IOError:
    # Only a missing/unreadable cache triggers the rebuild; any other error
    # (including KeyboardInterrupt) propagates instead of being swallowed.
    # Passing the path lets lxml open and close the file itself.
    xml = objectify.parse(TRAIN_XML_PATH)
    root = xml.getroot()
    tweets = root.getchildren()
    # Collect one record per tweet and build the frame once, instead of
    # appending row by row (quadratic, and DataFrame.append no longer exists
    # in current pandas).
    records = [
        {
            'content': tweet.content.text,
            'polarity': tweet.sentiments.polarity.value.text,
            'agreement': tweet.sentiments.polarity.type.text,
        }
        for tweet in tweets
    ]
    general_tweets_corpus_train = pd.DataFrame(
        records, columns=['content', 'polarity', 'agreement'])
    general_tweets_corpus_train.to_csv(TRAIN_CSV_PATH, index=False, encoding='utf-8')

In [190]:
# pd.concat over a single frame produces a new DataFrame to filter.
tweets_corpus = pd.concat([general_tweets_corpus_train])
# Keep rows whose agreement is not "DISAGREEMENT" and whose polarity is not "NONE".
tweets_corpus = tweets_corpus.query('agreement != "DISAGREEMENT" and polarity != "NONE"')
# Drop tweets whose content matches '^http.*$'. Boolean masks are inverted
# with `~`; unary minus is not the supported negation for boolean data.
tweets_corpus = tweets_corpus[~tweets_corpus.content.str.contains('^http.*$')]
tweets_corpus.sample(5)

Unnamed: 0,content,polarity,agreement
1800,#cartaalosreyesmagos. Rajoy necesita una sala de prensa. La q hay no le debe de gustar,N+,AGREEMENT
6156,Anda! Te pones un parche magnético en la piel y vibra cuando se recibe una llamada en el móvil. http://t.co/BDhLtz3B” !! Que disparate!!,N,AGREEMENT
2065,graciasss... Trabajamos x ello “@EDU_AM08: Felicidades!! Presidente aver si con el trabajo de todos mejoramos la situación.”,P+,AGREEMENT
6777,Estoy en un sitio en el que no puedo ver el partido. Veo que el Madrid va ganando. Contadme por favor lo esencial. Es clave ganar hoy.,P+,AGREEMENT
1868,¡Noticias descombacantes! está disponible! http://t.co/pdWFYyRa ? Historias del día por @burdinjaun,P,AGREEMENT


In [191]:
# Load the untagged test tweets from the cached CSV. When the CSV cannot be
# opened, rebuild the frame from the XML corpus and write the CSV cache.
TEST_CSV_PATH = 'datasets/csv/general-tweets-test1k.csv'
TEST_XML_PATH = 'datasets/xml/general-tweets-test1k.xml'

try:
    # No explicit encoding in this cell: read and write use the pandas default.
    general_tweets_corpus_test = pd.read_csv(TEST_CSV_PATH)
except IOError:
    # Only a missing/unreadable cache triggers the rebuild; any other error
    # (including KeyboardInterrupt) propagates instead of being swallowed.
    # Passing the path lets lxml open and close the file itself.
    xml = objectify.parse(TEST_XML_PATH)
    root = xml.getroot()
    tweets = root.getchildren()
    # Only the text is read from the XML; 'polarity' is kept as an empty
    # column because it is not among the record keys.
    records = [{'content': tweet.content.text} for tweet in tweets]
    general_tweets_corpus_test = pd.DataFrame(records, columns=['content', 'polarity'])
    general_tweets_corpus_test.to_csv(TEST_CSV_PATH, index=False)


In [192]:
# Work on a separate copy of the loaded test frame.
tweets_test = general_tweets_corpus_test.copy()

# Show five random rows as a sanity check.
tweets_test.sample(5)

Unnamed: 0,content,polarity
287,Mil gracias por vuestras felicitaciones!!! #MasViejos #MasSabios?,
511,"Rubalcaba quiere girar más a la izquierda,pues el PP debe ir más a la derecha o nos seguirán imponiendo su ruinoso modelo de sociedad..",
982,Muy bien la gente de antequera pero malisimos los responsables de la sala kairo nos han dejado tirados sin billetes en el hotel.,
225,"@lazaroelmundo feliz año, amigo",
994,"Este premio reconoce que Valencia es una ciudad pensada para la integración de todos, y en esa línea seguimos... http://t.co/8Jiu1pjW",


In [193]:
# Load the tagged test tweets from the cached CSV. When the CSV cannot be
# opened, rebuild the frame from the XML corpus and write the CSV cache.
TAGGED_TEST_CSV_PATH = 'datasets/csv/general-tweets-test1k-tagged.csv'
TAGGED_TEST_XML_PATH = 'datasets/xml/general-tweets-test1k-tagged.xml'

try:
    tagged_tweets_corpus_test = pd.read_csv(TAGGED_TEST_CSV_PATH, encoding='utf-8')
except IOError:
    # Only a missing/unreadable cache triggers the rebuild; any other error
    # (including KeyboardInterrupt) propagates instead of being swallowed.
    # `objectify` is already imported at the top of the notebook.
    # Passing the path lets lxml open and close the file itself.
    xml = objectify.parse(TAGGED_TEST_XML_PATH)
    root = xml.getroot()
    tweets = root.getchildren()
    # Build the frame once from a list of records; only content and polarity
    # are read for this corpus.
    records = [
        {
            'content': tweet.content.text,
            'polarity': tweet.sentiments.polarity.value.text,
        }
        for tweet in tweets
    ]
    tagged_tweets_corpus_test = pd.DataFrame(records, columns=['content', 'polarity'])
    tagged_tweets_corpus_test.to_csv(TAGGED_TEST_CSV_PATH, index=False, encoding='utf-8')

In [194]:
# Copy the tagged test tweets and discard rows whose polarity is "NONE".
tweets_tagged = pd.concat([tagged_tweets_corpus_test]).query('polarity != "NONE"')

# Index labels that exist in the untagged test frame but not in the filtered
# tagged frame.
diff = np.setdiff1d(tweets_test.index.values, tweets_tagged.index.values)

# Remove those rows, leaving tweets_test with only labels that also appear
# in tweets_tagged.
tweets_test = tweets_test.drop(diff)
tweets_tagged.sample(5)


Unnamed: 0,content,polarity
798,Rajoy contesta a los portavoces en el pleno del Congreso como si le molestara estar aquí y estuviera pasando un trámite que le aburre,N
716,@ccifuentes @PaquiVicente buen sábado para las dos.,P+
161,"Todos han jurado.soraya se ha equivocado,De Guindos ha puesto la mano sobre el papel y Morenes,taconazo muy militar.",N
751,"El acuerdo de recorte de derechos laborales aprobada por PP, CIU, UPN y FAC, es un ataque brutal que debe tener justa respuesta global.",N
995,Argentina estaria cerca de mi aunque estuviera en otro planeta. RT @CecVill: @AlejandroSanz @adasaro YO QUIERO UN ... http://t.co/J7XwvA4F,P


## Tokenize and Stem Sentences

In [195]:
#Stemming: reduce each word to its root (wait -> wait, waited -> wait, waiting -> wait)
def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem(token)`` for every token, in order.

    Parameters
    ----------
    tokens : iterable
        Tokens to stem.
    stemmer : object
        Any object exposing a ``stem(token)`` method.

    Returns
    -------
    list
        One stemmed entry per input token.
    """
    return [stemmer.stem(token) for token in tokens]

#Each word is a token
def tokenize(text):
    """Strip non-word characters from ``text``, tokenize it and stem the tokens.

    Relies on the module-level ``non_words`` (characters to remove) and
    ``stemmer`` objects, which must be defined before this function is called.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list
        Stemmed tokens, or ``['']`` if stemming raised an exception (the
        exception and the cleaned text are printed in that case).
    """
    # Drop every character listed in non_words.
    cleaned = ''.join(ch for ch in text if ch not in non_words)
    words = word_tokenize(cleaned)

    try:
        return stem_tokens(words, stemmer)
    except Exception as err:
        # Best effort: report the failure and fall back to a single empty token.
        print(err)
        print(cleaned)
        return ['']

In [196]:
# Stopwords: words that carry little meaning on their own (e.g. articles),
# taken from NLTK's Spanish list.
spanish_stopwords = stopwords.words('spanish')

# Snowball stemmer for Spanish; tokenize() reads it as a module-level name.
stemmer = SnowballStemmer('spanish')

# Non-words: characters removed before tokenizing -- the punctuation from
# string.punctuation, the Spanish opening marks and the digits 0-9.
non_words = list(punctuation) + ['¿', '¡'] + [str(digit) for digit in range(10)]


## Creating Model (Linear SVM) and Training

In [197]:
#Binarizing

# Polarity labels mapped to the positive class (1); every other label is 0.
POSITIVE_LABELS = ['P', 'P+']

# Training corpus: build the binary target in one vectorized assignment.
# (Chained `frame.column.loc[mask] = 1` may write to a temporary object
# and leave the frame unchanged.)
index = tweets_corpus.polarity.isin(POSITIVE_LABELS)
tweets_corpus['polarity_bin'] = index.astype(int)
# Class balance of the training target; print() with a single argument
# behaves the same on Python 2 and Python 3.
print(tweets_corpus.polarity_bin.value_counts(normalize=True))

# Placeholder column; it is overwritten with model predictions later on.
tweets_test['polarity_bin'] = 0

# Tagged test set: same binarization as the training corpus.
index = tweets_tagged.polarity.isin(POSITIVE_LABELS)
tweets_tagged['polarity_bin'] = index.astype(int)

# Reference labels used when scoring the predictions.
y = tweets_tagged.polarity_bin.values

1    0.53994
0    0.46006
Name: polarity_bin, dtype: float64


# Find Parameters

In [32]:
# Bag-of-words features: lowercase, tokenized and stemmed by tokenize(),
# with the Spanish stopword list passed to the vectorizer.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words=spanish_stopwords,
)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])

# Search space; keys follow the `<step>__<parameter>` pipeline convention.
parameters = {
    # A float max_df is a proportion of documents, so 1.0 (no upper
    # document-frequency cut-off) is the largest valid value.
    'vect__max_df': (0.5, 1.0),
    'vect__min_df': (10, 20, 50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000),
}

# Exhaustive search over the grid, scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='roc_auc')
grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)



KeyboardInterrupt: 

In [198]:
# Final model: bag-of-words features followed by a linear SVM.
pipeline = Pipeline([
    ('vect', CountVectorizer(
        analyzer='word',
        tokenizer=tokenize,
        lowercase=True,
        stop_words=spanish_stopwords,
        min_df=50,
        # A float max_df is a proportion of documents and must lie in
        # [0.0, 1.0]; 1.0 applies no upper document-frequency cut-off.
        max_df=1.0,
        ngram_range=(1, 1),
        max_features=1000,
    )),
    ('cls', LinearSVC(
        C=.2,
        loss='squared_hinge',
        max_iter=1000,
        multi_class='ovr',
        random_state=None,
        penalty='l2',
        tol=0.0001,
    )),
])

# Train on the filtered training corpus, then store the predicted binary
# polarity for every test tweet.
pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
tweets_test['polarity_bin'] = pipeline.predict(tweets_test.content)



In [199]:
# Fit on the training corpus, the same data that is cross-validated below.
# Every name used here is defined earlier in the notebook, so the cell runs
# on a fresh kernel.
p = pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)

# 5-fold cross-validation on the training corpus, using the estimator's
# default score.
scores = cross_val_score(p, tweets_corpus.content, tweets_corpus.polarity_bin, cv=5)

np.mean(scores)



0.68227481108362864

In [201]:
# Predicted labels for the test tweets.
y_t = tweets_test['polarity_bin'].values

# Element-wise absolute difference between predictions and reference labels:
# 0 where they agree.
result = np.abs(y_t - y)

# Fraction of positions where the difference is 0.
difference_counts = np.bincount(result)
difference_counts[0] / float(len(result))


0.71975497702909652