In [238]:
import json
import unicodedata
import re
import pandas as pd
pd.set_option('max_colwidth',1000)
from lxml import objectify
import numpy as np



from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.data import load
from nltk.stem import SnowballStemmer

from string import punctuation

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer      


#Required packages from nltk
#nltk.download('punkt')
#nltk.download('stopwords')

## Load CSV file or create from XML

In [230]:
# Load the tagged training corpus. The CSV is a cache of the original XML:
# when the CSV cannot be opened, the corpus is rebuilt from the XML file and
# the CSV is written so that later runs can skip the XML parsing.
try:
    general_tweets_corpus_train = pd.read_csv('datasets/csv/general-tweets-train-tagged.csv', encoding='utf-8')
except (IOError, OSError):
    # Only a missing/unreadable CSV triggers the rebuild; any other failure
    # (including KeyboardInterrupt) is no longer silently swallowed.
    # Binary mode lets the XML parser decode the file itself, and the
    # context manager guarantees the handle is closed.
    with open('datasets/xml/general-tweets-train-tagged.xml', 'rb') as xml_file:
        xml = objectify.parse(xml_file)
    root = xml.getroot()
    # getchildren() is used on purpose: iterating an objectify element
    # directly walks same-tag siblings rather than its children.
    tweets = root.getchildren()
    # Collect one record per tweet and build the frame once, instead of
    # appending to the DataFrame row by row (quadratic, and
    # DataFrame.append no longer exists in recent pandas releases).
    rows = [
        {
            'content': tweet.content.text,
            'polarity': tweet.sentiments.polarity.value.text,
            'agreement': tweet.sentiments.polarity.type.text,
        }
        for tweet in tweets
    ]
    general_tweets_corpus_train = pd.DataFrame(rows, columns=['content', 'polarity', 'agreement'])
    general_tweets_corpus_train.to_csv('datasets/csv/general-tweets-train-tagged.csv', index=False, encoding='utf-8')

In [231]:
# Build the working corpus from the training frame(s).
tweets_corpus = pd.concat([general_tweets_corpus_train])
# Keep only tweets whose annotation is not a DISAGREEMENT and whose polarity
# is not NONE.
tweets_corpus = tweets_corpus.query('agreement != "DISAGREEMENT" and polarity != "NONE"')
# Drop tweets whose text starts with "http", as well as rows without any
# text. `~` is the boolean negation operator for a mask (unary minus is not),
# and na=False keeps the mask strictly boolean when `content` is missing.
# The trailing .copy() makes the result an independent frame, so columns can
# be added to it later without chained-assignment warnings.
tweets_corpus = tweets_corpus[
    tweets_corpus.content.notna()
    & ~tweets_corpus.content.str.contains('^http.*$', na=False)
].copy()
# Show a random sample of the filtered corpus (differs on every run).
tweets_corpus.sample(20)

Unnamed: 0,content,polarity,agreement
5469,@germanyanke nos vemos en Bucarest en la final? Hacemos la tertulia de RNE desde allí ?,P,AGREEMENT
5143,Tu mas ;-)) RT @FdoFrances: @PepeDamianRS @mariviromero @naranjoisabel #FF os tenéis el cielo ganado por ser tan buenos,P+,AGREEMENT
935,"""A veces, cuando las nubes de la pesadumbre ocultan el cielo, parece q somos incapaces de imaginar q puedan desaparecer.Desaparecerán"" Rajoy",N,AGREEMENT
1296,Antes de hacer #Twitteroff os deseo que paséis una Feliz Nochebuena y una estupenda Navidad. Que seáis felices... Y comáis perdices :-),P+,AGREEMENT
2496,La nueva mesa de la comisión de cultura http://t.co/zmJnSSad,P,AGREEMENT
1705,"Que miedo, cuando se acabe la navidad y tengamos que enfrentarnos a la realidad, no? #porfiquenolleguereyes",N,AGREEMENT
1925,#Chacón considera q el faisán es un tema fantasma,N,AGREEMENT
2789,"Guindos y Montoro, esa pareja irrepetible. Menos mal que Rajoy preside la Comision delegada de asuntos económicos...",NEU,AGREEMENT
1982,"Un fuerte catarro, empeora la salud de Manuel Fraga, de 89 años. Convalece en su casa de Madrid, acompañado de su familia",N,AGREEMENT
2648,#FF RT @SantyUrbaneja: @MayteOtero @Andreita_ae @virginiacanadas @Milamorr @melina_cm @encarnialvarez @DoriLuque me sumo #FF para todas!!!,P,AGREEMENT


In [232]:
# Load the untagged test corpus. As with the training data, the CSV is a
# cache of the XML file and is rebuilt from it when it cannot be opened.
try:
    general_tweets_corpus_test = pd.read_csv('datasets/csv/general-tweets-test1k.csv', encoding='utf-8')
except (IOError, OSError):
    # Only a missing/unreadable CSV triggers the rebuild; other errors
    # propagate instead of being hidden by a bare `except:`.
    # Binary mode lets the XML parser decode the file itself, and the
    # context manager guarantees the handle is closed.
    with open('datasets/xml/general-tweets-test1k.xml', 'rb') as xml_file:
        xml = objectify.parse(xml_file)
    root = xml.getroot()
    # getchildren() is used on purpose: iterating an objectify element
    # directly walks same-tag siblings rather than its children.
    tweets = root.getchildren()
    # Only the text is read from the XML; the 'polarity' column is created
    # empty (NaN) because no label is extracted here.
    rows = [{'content': tweet.content.text} for tweet in tweets]
    general_tweets_corpus_test = pd.DataFrame(rows, columns=['content', 'polarity'])
    # Explicit UTF-8 on both read and write, matching the training cell.
    general_tweets_corpus_test.to_csv('datasets/csv/general-tweets-test1k.csv', index=False, encoding='utf-8')

## Tokenize and Stem Sentences

In [233]:
# Stemming reduces each word to its root (wait -> wait, waited -> wait, waiting -> wait)
def stem_tokens(tokens, stemmer):
    """Return a list with the stem of every token, in the original order.

    tokens  -- iterable of words to stem.
    stemmer -- any object exposing a ``stem(word)`` method.
    """
    return [stemmer.stem(token) for token in tokens]

# Every word of the text becomes one (stemmed) token
def tokenize(text):
    """Split ``text`` into stemmed word tokens.

    Every character listed in the module-level ``non_words`` is removed
    first, the remaining text is split with ``word_tokenize`` and the
    resulting words are stemmed with the module-level ``stemmer``.

    If stemming raises, the exception and the cleaned text are printed and
    ``['']`` is returned instead of propagating the error.
    """
    cleaned = ''.join(ch for ch in text if ch not in non_words)
    words = word_tokenize(cleaned)

    try:
        return stem_tokens(words, stemmer)
    except Exception as err:
        # Best effort: report the offending text and fall back to one empty token.
        print(err)
        print(cleaned)
        return ['']

In [234]:
# Stop words: words that carry little meaning on their own (e.g. articles)
stemmer = SnowballStemmer('spanish')
spanish_stopwords = stopwords.words('spanish')


# Non-words: characters stripped from the text before tokenizing, i.e. the
# ASCII punctuation marks, the Spanish opening marks and the ten digits
non_words = list(punctuation) + ['¿', '¡'] + list('0123456789')


## Creating Model (Linear SVM) and Training

In [235]:
# Binarize the target: 1 for positive tweets ('P' or 'P+'), 0 for every
# other polarity value.
index = tweets_corpus.polarity.isin(['P', 'P+'])
# Build the column in a single step. The previous
# `tweets_corpus.polarity_bin.loc[index] = 1` was a chained assignment: it
# writes through an intermediate Series, raises SettingWithCopyWarning and
# leaves the frame unchanged when pandas Copy-on-Write is active.
# assign() returns a new frame, so re-running the cell is safe.
tweets_corpus = tweets_corpus.assign(polarity_bin=index.astype('int64'))
# Share of each class in the binarized target.
tweets_corpus.polarity_bin.value_counts(normalize=True)



1    0.53994
0    0.46006
Name: polarity_bin, dtype: float64

In [None]:
# Bag-of-words features: the text is lower-cased, split and stemmed by
# `tokenize`, and the Spanish stop words are removed.
# NOTE(review): the stop-word list is not stemmed while the tokens are, so
# some stop words may survive the filter -- confirm whether that is intended.
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

# Feature extraction followed by a linear SVM classifier.
pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])


# Hyper-parameter grid; keys use the '<step>__<parameter>' form to address
# the pipeline steps.
parameters = {
    # A float max_df is a proportion of documents and must lie in [0.0, 1.0].
    # The previous value 1.9 was out of range; 1.0 ("no upper limit") keeps
    # the effect it had on releases that did not validate it.
    'vect__max_df': (0.5, 1.0),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000)
}


# Exhaustive search over the grid, scored by ROC AUC, using all CPU cores.
# NOTE(review): GridSearchCV is imported from sklearn.grid_search, which
# recent scikit-learn releases no longer ship -- check the import against
# the installed version.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1 , scoring='roc_auc')
grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)