In [15]:
import json
import unicodedata
import re
import pandas as pd
pd.set_option('max_colwidth',1000)
from lxml import objectify
import numpy as np
from pandas import read_pickle, read_csv, DataFrame


from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer

from string import punctuation

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer      


#Required packages from nltk
#nltk.download('punkt')
#nltk.download('stopwords')

## Load CSV file or create from XML

In [16]:
try:
    general_tweets_corpus_train = pd.read_csv('datasets/csv/general-tweets-train-tagged.csv', encoding='utf-8')
except:
    xml = objectify.parse(open('datasets/xml/general-tweets-train-tagged.xml'))
    #sample tweet object
    root = xml.getroot()
    general_tweets_corpus_train = pd.DataFrame(columns=('content', 'polarity', 'agreement'))
    tweets = root.getchildren()
    for i in range(0,len(tweets)):
        tweet = tweets[i]
        row = dict(zip(['content', 'polarity', 'agreement'], 
                       [tweet.content.text, tweet.sentiments.polarity.value.text, 
                        tweet.sentiments.polarity.type.text]))
        row_s = pd.Series(row)
        row_s.name = i
        general_tweets_corpus_train = general_tweets_corpus_train.append(row_s)
    general_tweets_corpus_train.to_csv('datasets/csv/general-tweets-train-tagged.csv', index=False, encoding='utf-8')

In [17]:
tweets_corpus = pd.concat([general_tweets_corpus_train])
tweets_corpus = tweets_corpus.query('agreement != "DISAGREEMENT" and polarity != "NONE"')
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]
tweets_corpus.sample(5)

Unnamed: 0,content,polarity,agreement
2664,Si. Señor.. Ahi vamos RT @jerry_bob31: @AlejandroSanz Vamos por el Nuevo Disco!!,P,AGREEMENT
447,Es un muñeco! (pero la verdad es que lo han clavao...) http://t.co/SFtib43B,P,AGREEMENT
4134,"#quemiedo como nos ""rescaten""",NEU,AGREEMENT
82,Tranquilidad en las carreteras en el segundo día de Puente de la Constitución - http://t.co/NRk2uftQ,P+,AGREEMENT
5538,El primero. @elmundoes: La juez envía a prisión al exdirector de Trabajo andaluz imputado por el caso de los ERE falsos http://t.co/OadFKKj2,N+,AGREEMENT


In [18]:
try:
    general_tweets_corpus_test = pd.read_csv('datasets/csv/general-tweets-test1k.csv')#, encoding='utf-8')
except:
    xml = objectify.parse(open('datasets/xml/general-tweets-test1k.xml'))
    #sample tweet object
    root = xml.getroot()
    general_tweets_corpus_test = pd.DataFrame(columns=('content', 'polarity'))
    tweets = root.getchildren()
    for i in range(0,len(tweets)):
        tweet = tweets[i]
        row = dict(zip(['content'], [tweet.content.text]))
        row_s = pd.Series(row)
        row_s.name = i
        general_tweets_corpus_test = general_tweets_corpus_test.append(row_s)
    general_tweets_corpus_test.to_csv('datasets/csv/general-tweets-test1k.csv', index=False)#, encoding='utf-8')


In [19]:
tweets_test = pd.concat([general_tweets_corpus_test])

tweets_test.sample(5)

Unnamed: 0,content,polarity
454,"""@luiscalvo92: has leído 'Desde el otro lado del escaparate' de Toni Segarra?"" Claro. Lo mejor sobre publicidad escrito hasta la fecha.",
971,"Wow! Cómo se la ha jugado mi hermandad, la Vera Cruz, saliendo hoy. Lo ha dado todo y solo ha podido recorrer una calle.",
680,"Tampoco respeto ni acato una ley electoral que supone negar el principio democrático de una persona, un voto. Fraudulenta.",
613,Mil gracias ;-))) RT @cibeles5: @mariviromero #FF Feliz congreso,
993,"Los 10.000 millones de recortes en sanidad y educación sin competencias para ellos, solo demuestra acojono gubernamental. Así, leche segura.",


In [20]:
try:
    tagged_tweets_corpus_test = pd.read_csv('datasets/csv/general-tweets-test1k-tagged.csv', encoding='utf-8')
except:

    from lxml import objectify
    xml = objectify.parse(open('datasets/xml/general-tweets-test1k-tagged.xml'))
    #sample tweet object
    root = xml.getroot()
    tagged_tweets_corpus_test = pd.DataFrame(columns=('content', 'polarity'))
    tweets = root.getchildren()
    for i in range(0,len(tweets)):
        tweet = tweets[i]
        row = dict(zip(['content', 'polarity', 'agreement'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
        row_s = pd.Series(row)
        row_s.name = i
        tagged_tweets_corpus_test = tagged_tweets_corpus_test.append(row_s)
    tagged_tweets_corpus_test.to_csv('datasets/csv/general-tweets-test1k-tagged.csv', index=False, encoding='utf-8')

In [21]:
tweets_tagged = pd.concat([tagged_tweets_corpus_test])
tweets_tagged = tweets_tagged.query('polarity != "NONE"')
diff = np.setdiff1d(tweets_test.index.values, tweets_tagged.index.values)

tweets_test = tweets_test.drop(diff)
tweets_tagged.sample(5)


Unnamed: 0,content,polarity
289,En @poloniatv3 estrenando nuevo personaje. El ministro DE GUINDOS!! Veremos q tal recortamos...;)),P+
520,Aparato de hierro (Rubalcaba forma un bloque compacto para la peor situación del PSOE) Blog El Patio del Congreso http://t.co/64HaGHan,N
299,"""Os pido el apoyo para un proyecto nuevo en una sociedad que cambia"".",P+
415,"El blog de @mariarojo82 dietas sanas, nutrición y falsos mitos | Dietas sin milagros - Blog diariosur.es http://t.co/LrAmWEpI",P+
908,PArece útil! Gracias.“@susalonso: Una sola página para descubrir las noticias más leídas de principales medios online http://t.co/9SYIL5b1”,P+


## Tokenize and Stems Sentences 

In [22]:
#Stem: Cut word in root (wait: wait, waited: wait, waiting: wait)
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

#Each word is a token
def tokenize(text):
    text = ''.join([c for c in text if c not in non_words])
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [23]:
#Stopwords: Empty word (i.e articles)

spanish_stopwords = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')


#Non Words: Symbols and Numbers
non_words = list(punctuation)
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))


## Creating Model (Linear SVM) and Training

In [24]:
#Binarizing

tweets_corpus['polarity_bin'] = 0
index = tweets_corpus.polarity.isin(['P', 'P+'])
tweets_corpus.polarity_bin.loc[index] = 1
print tweets_corpus.polarity_bin.value_counts(normalize=True)

tweets_test['polarity_bin'] = 0

tweets_tagged['polarity_bin'] = 0
index = tweets_tagged.polarity.isin(['P', 'P+'])
tweets_tagged.polarity_bin.loc[index] = 1
tweets_tagged.polarity_bin.value_counts(normalize=True)

y = tweets_tagged.polarity_bin.values

tweets_corpus.sample(3)

1    0.53994
0    0.46006
Name: polarity_bin, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,content,polarity,agreement,polarity_bin
1551,RT “@sgueina: He apoyado la petición para q a “@sanchez_sonia Rajoy le condecore con la Gran Cruz Carlos III y Collar Isabel La Católica”,P+,AGREEMENT,1
5631,"Muy bueno Reixa: la fabricación de placas inaugurales en la época de don Manuel, una industria boyante #cuandoeramoscultos",P+,AGREEMENT,1
1892,"El alcalde de Olula, del PP, agradece que el nombre del pueblo salga en los medios con la presentación de Chacón. http://t.co/lLIQG8gt",P+,AGREEMENT,1


# Find Parameters

In [None]:
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])



parameters = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000)
}


grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1 , scoring='roc_auc')
grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)



In [25]:
best_params = {'vect__ngram_range': (1, 2), 'cls__loss': 'hinge', 'vect__max_df': 0.5
 , 'cls__max_iter': 1000, 'vect__min_df': 10, 'vect__max_features': 1000
 , 'cls__C': 0.2}

best_pipe = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 0.5,
            ngram_range=(1, 2),
            max_features=1000
            )),
    ('cls', LinearSVC(C=.2, loss='hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             tol=0.0001
             )),
])

best_pipe.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
#best_test = tweets_test
#best_test['polarity_bin'] = best_pipe.predict(best_test.content)





Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=1000, min_df=10,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=[u'de', ...e', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))])

In [71]:
y_b = best_test.polarity_bin.values
best_result = np.abs(y_b - y)
np.bincount(best_result)[0]/float(best_result.shape[0])


0.80398162327718226

In [72]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 50,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             tol=0.0001
             )),
])

pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
tweets_test['polarity_bin'] = pipeline.predict(tweets_test.content)



In [73]:
y_t = tweets_test.polarity_bin.values
result = np.abs(y_t - y)
np.bincount(result)[0]/float(result.shape[0])


0.71975497702909652

In [74]:
p = pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)

scores = cross_val_score(p, tweets_corpus.content, tweets_corpus.polarity_bin, cv=5)

np.mean(scores)



0.68227481108362864

In [38]:
best_scores = cross_val_score(best_pipe, tweets_corpus.content, tweets_corpus.polarity_bin, cv=5)

np.mean(best_scores)



0.73773511948822568

In [3]:
try:
    re_test = pd.read_csv('datasets/csv/MiniREa.csv', encoding='utf-8')
except:
    xml = objectify.parse(open('datasets/xml/MiniRE.xml'))
    #sample tweet object
    root = xml.getroot()
    re_test = pd.DataFrame(columns=('content', 'polarity'))
    tweets = root.getchildren()
    for i in range(0,len(tweets)):
        tweet = tweets[i]
        row = dict(zip(['content'], [tweet.content.text]))
        row_s = pd.Series(row)
        if row_s[0] != None:
            row_s.name = i
            re_test = re_test.append(row_s)
    re_test.to_csv('datasets/csv/MiniRE.csv', index=False, encoding='utf-8')

In [33]:
re = read_pickle('../twitterProject/Data/ProcessedTags/pickle/re')
rl = read_pickle('../twitterProject/Data/ProcessedTags/pickle/rl')
rc = read_pickle('../twitterProject/Data/ProcessedTags/pickle/rc')
rt = read_pickle('../twitterProject/Data/ProcessedTags/pickle/rt')

In [36]:
re['polarity'] = best_pipe.predict(re.text)



In [42]:
rc['polarity'] = best_pipe.predict(rc.text)



In [45]:
rl['polarity'] = best_pipe.predict(rl.text)



In [46]:
rt['polarity'] = best_pipe.predict(rt.text)



In [52]:
re.to_pickle('../twitterProject/Data/ProcessedTags/pickle/re')
rc.to_pickle('../twitterProject/Data/ProcessedTags/pickle/rc')
rt.to_pickle('../twitterProject/Data/ProcessedTags/pickle/rt')
rl.to_pickle('../twitterProject/Data/ProcessedTags/pickle/rl')
