In [21]:
import json
import unicodedata
import re
import pandas as pd
pd.set_option('max_colwidth',1000)
from lxml import objectify
import numpy as np



from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer

from string import punctuation

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer      


#Required packages from nltk
#nltk.download('punkt')
#nltk.download('stopwords')

## Load CSV file or create from XML

In [22]:
try:
    general_tweets_corpus_train = pd.read_csv('datasets/csv/general-tweets-train-tagged.csv', encoding='utf-8')
except:
    xml = objectify.parse(open('datasets/xml/general-tweets-train-tagged.xml'))
    #sample tweet object
    root = xml.getroot()
    general_tweets_corpus_train = pd.DataFrame(columns=('content', 'polarity', 'agreement'))
    tweets = root.getchildren()
    for i in range(0,len(tweets)):
        tweet = tweets[i]
        row = dict(zip(['content', 'polarity', 'agreement'], 
                       [tweet.content.text, tweet.sentiments.polarity.value.text, 
                        tweet.sentiments.polarity.type.text]))
        row_s = pd.Series(row)
        row_s.name = i
        general_tweets_corpus_train = general_tweets_corpus_train.append(row_s)
    general_tweets_corpus_train.to_csv('datasets/csv/general-tweets-train-tagged.csv', index=False, encoding='utf-8')

In [23]:
tweets_corpus = pd.concat([general_tweets_corpus_train])
tweets_corpus = tweets_corpus.query('agreement != "DISAGREEMENT" and polarity != "NONE"')
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]
tweets_corpus.sample(5)

Unnamed: 0,content,polarity,agreement
4358,Buenos días ;-)) q tengáis un sábado estupendo ;-)),P+,AGREEMENT
4194,"RT @Majo_eltren: @Los40_Spain quiero escuchar #GeneracionPerdida del disco #Positive Generation, Voces x un ... http://t.co/MOyaNTvd",P,AGREEMENT
1087,"Mira, Rajoy, aprende de los tecnócratas. http://t.co/HDUZqqJx",N+,AGREEMENT
6936,Otro presupuesto es posible. Hemos hablado con Ghesta y Rallo. @TelediarioInter 20:30,P,AGREEMENT
373,Estoy harto del clásico. Qué cansino todo el mundo con el partido. Creo que Pep saldrá con defensa de 3.,N,AGREEMENT


In [24]:
try:
    general_tweets_corpus_test = pd.read_csv('datasets/csv/general-tweets-test1k.csv')#, encoding='utf-8')
except:
    xml = objectify.parse(open('datasets/xml/general-tweets-test1k.xml'))
    #sample tweet object
    root = xml.getroot()
    general_tweets_corpus_test = pd.DataFrame(columns=('content', 'polarity'))
    tweets = root.getchildren()
    for i in range(0,len(tweets)):
        tweet = tweets[i]
        row = dict(zip(['content'], [tweet.content.text]))
        row_s = pd.Series(row)
        row_s.name = i
        general_tweets_corpus_test = general_tweets_corpus_test.append(row_s)
    general_tweets_corpus_test.to_csv('datasets/csv/general-tweets-test1k.csv', index=False)#, encoding='utf-8')


In [25]:
tweets_test = pd.concat([general_tweets_corpus_test])

tweets_test.sample(5)

Unnamed: 0,content,polarity
516,"Lo siento muchísimo, era estupenda “@loriferrer: @pedroj_ramirez falleció ayer #raimundadepeñafort nuestra juez favorita. Descanse en paz”",
841,Portada de Liberation http://t.co/2J4MNxba,
891,"La diferencia entre PP y PSOE es solo de 40.000 votos, un 1% de diferencia. IU sube 110.000.",
129,Pensiones. Garantizar sostenibilidad. Combatir prejubilaciones encubiertas. Incentivar prolongación voluntaria de la vida laboral.,
573,"Como en el caso Dreyfus, situados entre la verdad y el prestigio de la Institución, al final ni verdad ni prestigio.",


In [26]:
try:
    tagged_tweets_corpus_test = pd.read_csv('datasets/csv/general-tweets-test1k-tagged.csv', encoding='utf-8')
except:

    from lxml import objectify
    xml = objectify.parse(open('datasets/xml/general-tweets-test1k-tagged.xml'))
    #sample tweet object
    root = xml.getroot()
    tagged_tweets_corpus_test = pd.DataFrame(columns=('content', 'polarity'))
    tweets = root.getchildren()
    for i in range(0,len(tweets)):
        tweet = tweets[i]
        row = dict(zip(['content', 'polarity', 'agreement'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
        row_s = pd.Series(row)
        row_s.name = i
        tagged_tweets_corpus_test = tagged_tweets_corpus_test.append(row_s)
    tagged_tweets_corpus_test.to_csv('datasets/csv/general-tweets-test1k-tagged.csv', index=False, encoding='utf-8')

In [27]:
tweets_tagged = pd.concat([tagged_tweets_corpus_test])
tweets_tagged = tweets_tagged.query('polarity != "NONE"')
diff = np.setdiff1d(tweets_test.index.values, tweets_tagged.index.values)

tweets_test = tweets_test.drop(diff)
tweets_tagged.sample(5)


Unnamed: 0,content,polarity
307,Somosaguas Place! Hahahahahahahahah genial! @LosClones me muero de la risa! Hahahahaha brutal,P+
183,Entre los méritos de De Guindos está gestionar la salida a bolsa de la CAM (desde Lehman Brothers) ¡2 quiebras en 1! http://t.co/Q5DPL4LC,P
978,Buenas chicos!! Esta noche en sala kairo en antequera con mi primer tema #Quitateeltop dale duro!!!!,P+
37,"Sembrar dudas sobre la autoría del 11M, para negar su derrota electoral, mientras se desprecia a las familias de las víctimas, es vergonzoso",N+
354,Y su responsabilidad? @PepeGrinan admite que el caso de los ERE puede tener un coste electoral «grande» http://t.co/DTeeck2l,N+


## Tokenize and Stems Sentences 

In [28]:
#Stem: Cut word in root (wait: wait, waited: wait, waiting: wait)
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

#Each word is a token
def tokenize(text):
    text = ''.join([c for c in text if c not in non_words])
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [29]:
#Stopwords: Empty word (i.e articles)

spanish_stopwords = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')


#Non Words: Symbols and Numbers
non_words = list(punctuation)
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))


## Creating Model (Linear SVM) and Training

In [30]:
#Binarizing

tweets_corpus['polarity_bin'] = 0
index = tweets_corpus.polarity.isin(['P', 'P+'])
tweets_corpus.polarity_bin.loc[index] = 1
print tweets_corpus.polarity_bin.value_counts(normalize=True)

tweets_test['polarity_bin'] = 0

tweets_tagged['polarity_bin'] = 0
index = tweets_tagged.polarity.isin(['P', 'P+'])
tweets_tagged.polarity_bin.loc[index] = 1
tweets_tagged.polarity_bin.value_counts(normalize=True)

y = tweets_tagged.polarity_bin.values

tweets_corpus.sample(3)

1    0.53994
0    0.46006
Name: polarity_bin, dtype: float64


Unnamed: 0,content,polarity,agreement,polarity_bin
4457,Gran discurso de @javierarenas_pp hoy en el congreso del @partidopopular. Cada dia es mayor su capacidad de llegar a la gente y convencer,P+,AGREEMENT,1
5081,Un besazo hasta Sevilla. RT @PAQUIKAVITO: @Jas_Sevilla @siempreesdnoche @alejandrosanz gracias salao!!! Besos pa Sevilla !!!,P+,AGREEMENT,1
7191,.@diegocruz_: “El Ayuntamiento va con retraso en la simplificación de licencias”. Esta es la realidad. http://t.co/G6M0IkWp,N,AGREEMENT,0


# Find Parameters

In [None]:
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])



parameters = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000)
}


grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1 , scoring='roc_auc')
grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)



In [31]:
best_params = {'vect__ngram_range': (1, 2), 'cls__loss': 'hinge', 'vect__max_df': 0.5
 , 'cls__max_iter': 1000, 'vect__min_df': 10, 'vect__max_features': 1000
 , 'cls__C': 0.2}

best_pipe = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 0.5,
            ngram_range=(1, 2),
            max_features=1000
            )),
    ('cls', LinearSVC(C=.2, loss='hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             tol=0.0001
             )),
])

best_pipe.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
best_test = tweets_test
best_test['polarity_bin'] = best_pipe.predict(best_test.content)





In [32]:
y_b = best_test.polarity_bin.values
best_result = np.abs(y_b - y)
np.bincount(best_result)[0]/float(best_result.shape[0])


0.80398162327718226

In [33]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 50,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             tol=0.0001
             )),
])

pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
tweets_test['polarity_bin'] = pipeline.predict(tweets_test.content)



In [34]:
y_t = tweets_test.polarity_bin.values
result = np.abs(y_t - y)
np.bincount(result)[0]/float(result.shape[0])


0.71975497702909652

In [37]:
p = pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)

scores = cross_val_score(p, tweets_corpus.content, tweets_corpus.polarity_bin, cv=5)

np.mean(scores)



0.68227481108362864

In [38]:
best_scores = cross_val_score(best_pipe, tweets_corpus.content, tweets_corpus.polarity_bin, cv=5)

np.mean(best_scores)



0.73773511948822568