
 ![alt text](images/piensa_logo.png "Logo Title Text 1")

## Daisy: Sentiment Analysis Code

## Overview
#### ** Introduce myself
#### ** Jupyter Notebook
#### ** DAISY ALGORITHM
#### ** TF IDF Transform
#### ** Support Vector Machines
#### ** Neuronal Networks

<img src="images/saudi_meme.jpg" alt="Drawing" style="width: 500px;"/>


In [1]:
#python libs
import sys
import re
import json
import nltk
import os
import numpy as np
import time 
import scipy
import csv

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

In [2]:
data_dir = r'/Users/waybarrios/Documents/SVM'

In [3]:
classes = ['pos','neg']

In [4]:
    # Hashtags
    hash_regex = re.compile(r"#(\w+)")
    def hash_repl(match):
        return '__HASH_'+match.group(1).upper()

    # Handels
    hndl_regex = re.compile(r"@(\w+)")
    def hndl_repl(match):
        return '__HNDL'#_'+match.group(1).upper()
    # URLs
    url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")

    # Spliting by word boundaries
    word_bound_regex = re.compile(r"\W+")

    # Repeating words like hurrrryyyyyy
    rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE);
    def rpt_repl(match):
        return match.group(1)+match.group(1)

    # Emoticons
    emoticons = \
        [('__EMOT_SMILEY',[':-)', ':)', '(:', '(-:', ] ),\
            ('__EMOT_LAUGH',[':-D', ':D', 'X-D', 'XD', 'xD', ] ),\
            ('__EMOT_LOVE',['<3', ':\*', ] ),\
            ('__EMOT_WINK',[';-)', ';)', ';-D', ';D', '(;', '(-;', ] ),\
            ('__EMOT_FROWN',[':-(', ':(', '(:', '(-:', ] ),\
            ('__EMOT_CRY',[':,(', ':\'(', ':"(', ':(('] ),\
        ]

    # Punctuations
    punctuations = \
        [ #('',['.', ] ),\
            #('',[',', ] ),\
            #('',['\'', '\"', ] ),\
            ('__PUNC_EXCL',['!', '¡', ] ),\
            ('__PUNC_QUES',['?', '¿', ] ),\
            ('__PUNC_ELLP',['...', '…', ] ),\
            #FIXME : MORE? http://en.wikipedia.org/wiki/Punctuation
        ]
        
    #For emoticon regexes
    def escape_paren(arr):
        return [text.replace(')', '[)}\]]').replace('(', '[({\[]') for text in arr]

    def regex_union(arr):
        return '(' + '|'.join( arr ) + ')'

    emoticons_regex = [ (repl, re.compile(regex_union(escape_paren(regx))) ) \
                        for (repl, regx) in emoticons ]

    #For punctuation replacement
    def punctuations_repl(match):
        text = match.group(0)
        repl = []
        for (key, parr) in punctuations :
            for punc in parr :
                if punc in text:
                    repl.append(key)
        if( len(repl)>0 ) :
            return ' '+' '.join(repl)+' '
        else :
            return ' '

    def processHashtags(text, subject='', query=[]):
        return re.sub( hash_regex, hash_repl, text )

    def processHandles(text, subject='', query=[]):
        return re.sub( hndl_regex, hndl_repl, text )

    def processUrls(text, subject='', query=[]):
        return re.sub( url_regex, ' __URL ', text )

    def processEmoticons(text, subject='', query=[]):
        for (repl, regx) in emoticons_regex :
            text = re.sub(regx, ' '+repl+' ', text)
        return text

    def processPunctuations( text, subject='', query=[]):
        return re.sub( word_bound_regex , punctuations_repl, text )

    def processRepeatings( 	text, subject='', query=[]):
        return re.sub( rpt_regex, rpt_repl, text )

    def processQueryTerm( 	text, subject='', query=[]):
        query_regex = "|".join([ re.escape(q) for q in query])
        return re.sub( query_regex, '__QUER', text, flags=re.IGNORECASE )

    def countHandles(text):
        return len( re.findall( hndl_regex, text) )
    def countHashtags(text):
        return len( re.findall( hash_regex, text) )
    def countUrls(text):
        return len( re.findall( url_regex, text) )
    def countEmoticons(text):
        count = 0
        for (repl, regx) in emoticons_regex :
            count += len( re.findall( regx, text) )
        return count

    #FIXME: preprocessing.preprocess()! wtf! will need to move.
    #FIXME: use process functions inside
    def processAll(text, subject='', query=[]):

        if(len(query)>0):
            query_regex = "|".join([ re.escape(q) for q in query])
            text = re.sub( query_regex, '__QUER', text, flags=re.IGNORECASE )

        text = re.sub( hash_regex, hash_repl, text )
        text = re.sub( hndl_regex, hndl_repl, text )
        text = re.sub( url_regex, ' __URL ', text )

        for (repl, regx) in emoticons_regex :
            text = re.sub(regx, ' '+repl+' ', text)


        text = text.replace('\'','')
        # FIXME: Jugad

        text = re.sub( word_bound_regex , punctuations_repl, text )
        text = re.sub( rpt_regex, rpt_repl, text )

        return text


In [5]:
train_files = ['train-neg.txt','train-pos.txt']
test_files = ['test-neg.txt','test-pos.txt']

In [6]:
train_data = []
train_labels = []
test_data = []
test_labels = []

In [7]:
for fname in train_files:
    with open(os.path.join(data_dir,fname),'r') as f:
        reader = csv.reader(f)
        if 'neg' in fname:
            for row in reader:
                row_new = ''.join(row)
                row_process=processAll(row_new)
                #print(row_process)
                train_data.append(row[0])
                train_labels.append(0)
        else:
            for row in reader:
                row_new = ''.join(row)
                row_process=processAll(row_new)
                train_data.append(row[0])
                train_labels.append(1)  
                
for fname in test_files:
    with open(os.path.join(data_dir,fname),'r') as f:
        reader = csv.reader(f)
        if 'neg' in fname:
            for row in reader:
                row_new = ''.join(row)
                row_process=processAll(row_new)
                test_data.append(row[0])
                test_labels.append(0)
        else:
            for row in reader:
                row_new = ''.join(row)
                row_process=processAll(row_new)
                test_data.append(row[0])
                test_labels.append(1)  

In [8]:
sub_data = np.concatenate((train_data[0:100000],train_data[-100000:]))
sub_labels = np.concatenate((train_labels[0:100000],train_labels[-100000:]))

sub_test = np.concatenate((test_data[0:1875],test_data[-1875:]))
sub_tlabel = np.concatenate((test_labels[0:1875],test_labels[-1875:]))



### TF-IDF Transform

<p>In information retrieval, tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval and text mining. The tf-idf value increases proportionally to the number of times a word appears in the document, but is offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general.</p>

<p>Variations of the tf–idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query. tf–idf can be successfully used for stop-words filtering in various subject fields including text summarization and classification.</p>

<p>One of the simplest ranking functions is computed by summing the tf–idf for each query term; many more sophisticated ranking functions are variants of this simple model.</p>

In [9]:

vectorizer = TfidfVectorizer(min_df=5, max_df = 0.8, sublinear_tf = True,
                           use_idf = True)
#print(vectorizer)

In [10]:
train_vectors = vectorizer.fit_transform(train_data)

## Support Vector Machines
### Definition: 
Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and outliers detection.

The advantages of support vector machines are:
* Effective in high dimensional spaces.
* Still effective in cases where number of dimensions is greater than the number of samples.
* Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
* Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

* If the number of features is much greater than the number of samples, the method is likely to give poor performances.
* SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation.

### Kernel Classification: 
![SVM Classification](images/plot_iris_0012.png)


In [11]:
tt1 = time.time()
classifier_liblinear = svm.LinearSVC()
classifier_liblinear.fit(train_vectors,train_labels)
tt2 = time.time()
print "training time: %f" %(tt2-tt1)


training time: 11.485854


In [12]:
tt3 = time.time()
test_vectors = vectorizer.transform(test_data)
prediction_liblinear = classifier_liblinear.predict(test_vectors)
tt4 = time.time()
print(classification_report(test_labels,prediction_liblinear))
print "testing time: %f" %(tt4-tt3)


             precision    recall  f1-score   support

          0       0.77      0.75      0.76      7500
          1       0.75      0.77      0.76      7500

avg / total       0.76      0.76      0.76     15000

testing time: 0.456012


In [17]:
from sklearn.svm import SVC
t0 = time.time()
clf = SVC(C=1.2, kernel='linear', degree=3, gamma='auto', coef0=0.0, 
          shrinking=True, probability=False, tol=0.00001, cache_size=850, 
          class_weight=None, verbose=False, max_iter=10000, 
          decision_function_shape=None, random_state=None)
clf.fit(train_vectors, train_labels)
t1 = time.time()
print "training time: %f" %(t1-t0)

training time: 462.770762


In [18]:

test_vectors = vectorizer.transform(test_data)
#test_vectors = vectorizer.transform(sub_test)



In [19]:
 t2 = time.time()
print "SVM SKLEARN PERFORMANCE"
trad = clf.predict(test_vectors)
t3 = time.time()
print(classification_report(test_labels,trad))
print "testing time: %f" %(t3-t2)

SVM SKLEARN PERFORMANCE
             precision    recall  f1-score   support

          0       0.60      0.49      0.54      7500
          1       0.57      0.68      0.62      7500

avg / total       0.59      0.58      0.58     15000

testing time: 21.657634


In [None]:
X = train_vectors.toarray()
xt = test_vectors.toarray()

![Tensorflow](images/logo-tensor.png)

TensorFlow is an open source software library for numerical computation using data flow graphs. Nodes in the graph represent mathematical operations, while the graph edges represent the multidimensional data arrays (tensors) communicated between them. The flexible architecture allows you to deploy computation to one or more CPUs or GPUs in a desktop, server, or mobile device with a single API. TensorFlow was originally developed by researchers and engineers working on the Google Brain Team within Google's Machine Intelligence research organization for the purposes of conducting machine learning and deep neural networks research, but the system is general enough to be applicable in a wide variety of other domains as well.

<p>Github profile: https://github.com/tensorflow/tensorflow </p>
<p>Docs: https://www.tensorflow.org/versions/r0.9/api_docs/index.html </p>
<p>Tutorials: https://www.tensorflow.org/versions/r0.9/tutorials/index.html </p>
<p>Wayner's examples:</p> 
* Tensorflow CNN: https://github.com/waybarrios/TensorFlow_CNN
* CNN vs Softmax: https://github.com/waybarrios/CNN_vs_Softmax_Tensorflow
* MNIST EXAMPLE TENSORFLOW: https://github.com/waybarrios/Easily-Example-TensorFLow 

In [13]:
import skflow
from sklearn import datasets, metrics

In [18]:

classifier = skflow.TensorFlowLinearClassifier(n_classes=2)
t4 = time.time()
classifier.fit(np.array(X), np.array(sub_labels))
t5 = time.time()
print "skflow training time: %f" %(t5-t4)
skflow_pred = classifier.predict(xt)
t6 = time.time()





skflow training time: 113.183056


In [20]:
print "TENSORFLOW SVM PERFORMANCE"
print "skflow testing time: %f" %(t6-t5)
print(classification_report(test_labels,skflow_pred))

TENSORFLOW SVM PERFORMANCE
skflow testing time: 5.682661
             precision    recall  f1-score   support

          0       0.72      0.63      0.67      7500
          1       0.67      0.76      0.71      7500

avg / total       0.69      0.69      0.69     15000



<img src="images/deep_meme.png" alt="meme" style="width: 560px;"/>

## Neuronal Networks
In machine learning and cognitive science, artificial neural networks (ANNs) are a family of models inspired by biological neural networks (the central nervous systems of animals, in particular the brain) which are used to estimate or approximate functions that can depend on a large number of inputs and are generally unknown. Artificial neural networks are typically specified using three things:

 **Architecture** specifies what variables are involved in the network and their topological relationships—for example the variables involved in a neural network might be the weights of the connections between the neurons, along with activities of the neurons



<p> **Activity Rule** Most neural network models have short time-scale dynamics: local rules define how the activities of the neurons change in response to each other. Typically the activity rule depends on the weights (the parameters) in the network. </p>

<p> **Learning Rule** The learning rule specifies the way in which the neural network's weights change with time. This learning is usually viewed as taking place on a longer time scale than the time scale of the dynamics under the activity rule. Usually the learning rule will depend on the activities of the neurons. It may also depend on the values of the target values supplied by a teacher and on the current value of the weights.There are three major learning paradigms, each corresponding to a particular abstract learning task. These are **supervised learning**, **unsupervised learning** and **reinforcement learning**.</p>

* Supervised learning:In supervised learning, we are given a set of example pairs ${\displaystyle \textstyle (x,y),x\in X,y\in Y} \textstyle (x,y),x\in X,y\in Y$ and the aim is to find a function ${\displaystyle \textstyle f:X\rightarrow Y} $ in the allowed class of functions that matches the examples. In other words, we wish to infer the mapping implied by the data; the cost function is related to the mismatch between our mapping and the data and it implicitly contains prior knowledge about the problem domain.
* Unsupervised Learning: In unsupervised learning, some data ${\displaystyle \textstyle x}$ is given and the cost function to be minimized, that can be any function of the data ${\displaystyle \textstyle x} $ and the network's output, ${\displaystyle \textstyle f}$. The cost function is dependent on the task (what we are trying to model) and our a priori assumptions (the implicit properties of our model, its parameters and the observed variables).

* Reinforcement learning: In reinforcement learning, data ${\displaystyle \textstyle x}$ are usually not given, but generated by an agent's interactions with the environment. At each point in time ${\displaystyle \textstyle t}$ , the agent performs an action ${\displaystyle \textstyle y_{t}}$  and the environment generates an observation ${\displaystyle \textstyle x_{t}}$  and an instantaneous cost ${\displaystyle \textstyle c_{t}}$ , according to some (usually unknown) dynamics. The aim is to discover a policy for selecting actions that minimizes some measure of a long-term cost, e.g., the expected cumulative cost. The environment's dynamics and the long-term cost for each policy are usually unknown, but can be estimated.


**Recommendation Stanford Course: http://cs231n.github.io/ **

### Gradient Descent 

Gradient descent is a first-order optimization algorithm. To find a local minimum of a function using gradient descent, one takes steps proportional to the negative of the gradient (or of the approximate gradient) of the function at the current point. If instead one takes steps proportional to the positive of the gradient, one approaches a local maximum of that function; the procedure is then known as gradient ascent. Gradient descent is also known as steepest descent, or the method of steepest descent.


![Tensorflow](images/gradient.png)




Gradient descent is based on the observation that if the multi-variable function ${\displaystyle F(\mathbf {x} )}$ is defined and differentiable in a neighborhood of a point ${\displaystyle \mathbf {a} } $ , then ${\displaystyle F(\mathbf {x} )}$ decreases fastest if one goes from ${\displaystyle \mathbf {a} } $  in the direction of the negative gradient of $ F$ at ${\displaystyle \mathbf {a} }  , {\displaystyle -\nabla F(\mathbf {a} )} $. 
<p>It follows that, if: </p>

$${\displaystyle \mathbf {b} =\mathbf {a} -\gamma \nabla F(\mathbf {a} )}$$

### Optimization Algorithms for Gradient Descent

![Tensorflow](images/updater.gif)


In [14]:
t7 = time.time()
clas_nn = skflow.TensorFlowDNNClassifier(hidden_units=[10,20], n_classes=
clas_nn.fit(np.array(X), np.array(sub_labels))
t8 = time.time()
print "DNN training time: %f" %(t8 - t7)
nn_pred = clas_nn.predict(xt)
t9 = time.time()
print "DNN testing time: %f" %(t9 - t8)
score = metrics.accuracy_score(test_labels,nn_pred)
print("Accuracy: %f" % score)



DNN training time: 92.069817
DNN testing time: 3.664607
Accuracy: 0.500533


In [15]:
print "Neuronal Networks Performance"
print(classification_report(test_labels,nn_pred ))

Neuronal Networks Performance
             precision    recall  f1-score   support

          0       0.90      0.00      0.00      7500
          1       0.50      1.00      0.67      7500

avg / total       0.70      0.50      0.33     15000



<img src="images/deep_toy.png" alt="meme_toy" style="width: 560px;"/>