# Sentiment Analysis

In [99]:
import numpy as np
import pandas as pd
import tensorflow as tf
import spacy
import collections
import operator
from tqdm.autonotebook import tqdm

In [64]:
columns = ["target", "ids", "date", "flag", "user", "text"]
encoding = "ISO-8859-1"
dataset = pd.read_csv('dataset/tweets.csv', encoding=encoding, names=columns)

In [65]:
dataset.drop(columns=["ids", "date", "flag", "user"], inplace=True)
dataset.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [66]:
%%time
dataset.target.replace({4: 1}, inplace=True)

CPU times: user 5.31 ms, sys: 4.36 ms, total: 9.67 ms
Wall time: 13.3 ms


In [67]:
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset = dataset.iloc[:50000]

In [68]:
dataset.head()

Unnamed: 0,target,text
0,0,@sfgiantsgirl call him christopher john wilson...
1,0,@BrennaBreakdown i have to actually agree wit...
2,1,@jnearing Hahaha! It's all good. Phil and I go...
3,0,cars in the shop but im driving a Charger as ...
4,1,@JordanLindvall you romantic you.


In [None]:
nlp = spacy.load("en_core_web_lg")

In [69]:
tweets_iterator = nlp.pipe(dataset.text, n_threads=-1, batch_size=32)

In [73]:
words = collections.defaultdict(int)
for tweet in tqdm(tweets_iterator, total=dataset.shape[0]):
    for token in tweet:
        if not token.is_stop:
            lexeme = nlp.vocab[token.lemma]
            if lexeme.has_vector:
                words[lexeme] += 1

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

In [74]:
dataset.head()

Unnamed: 0,target,text
0,0,@sfgiantsgirl call him christopher john wilson...
1,0,@BrennaBreakdown i have to actually agree wit...
2,1,@jnearing Hahaha! It's all good. Phil and I go...
3,0,cars in the shop but im driving a Charger as ...
4,1,@JordanLindvall you romantic you.


In [100]:
sorted_words = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
sorted_words[0]

(<spacy.lexeme.Lexeme at 0x20de0dee8>, 28387)

In [101]:
embedding = np.zeros((len(sorted_words) + 1, 300))
embedding.shape

(29235, 300)

In [103]:
for (i, lexeme) in enumerate(sorted_words, start=1):
    embedding[i] = lexeme[0].vector

In [104]:
embedding

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.26554   ,  0.33531001,  0.2186    , ..., -0.17859   ,
        -0.062878  ,  0.16232   ],
       [ 0.012001  ,  0.20750999, -0.12578   , ...,  0.13871001,
        -0.36048999, -0.035     ],
       ...,
       [ 0.45242   ,  0.35409999, -0.47035   , ...,  0.047044  ,
        -0.63622999,  0.31116   ],
       [ 0.36107001,  0.0013    ,  0.20163999, ...,  0.27814999,
        -0.46634999, -0.041166  ],
       [ 0.46969   , -0.18385001, -0.050012  , ..., -0.20558999,
        -0.42987001,  0.05961   ]])

In [112]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(embedding), 300, weights=[embedding], input_length=100),
    tf.keras.layers.Conv1D(16, kernel_size=3),
    tf.keras.layers.ReLU(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(32),
    tf.keras.layers.ReLU(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1)
])

In [113]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 300)          8770500   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 98, 16)            14416     
_________________________________________________________________
re_lu_5 (ReLU)               (None, 98, 16)            0         
_________________________________________________________________
batch_normalization_v1_3 (Ba (None, 98, 16)            64        
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                544       
_________________________________________________________________
re_lu_6 (ReLU)               (None, 32)                0         
__________