# Simple Sentiment Analysis using TensorFlow

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import string
import plotly.express as px
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Lift pandas' row-display limit so displayed frames/series are never truncated.
pd.options.display.max_rows = None

In [3]:
# Read the training CSV into a DataFrame; every later training step starts from `raw_data`.
raw_data = pd.read_csv('../input/nlp-getting-started/train.csv')

In [4]:
# Last column of the frame (the `target` labels). Note: `y` is not referenced again in
# this notebook; the training labels are extracted separately as `y_train`.
y = raw_data.iloc[:, -1]
# Display the frame's (rows, columns).
raw_data.shape

(7613, 5)

In [5]:
# Preview the first rows of the training frame.
raw_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Count missing values per column.
raw_data.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
# Words to discard from every text. Contractions appear without their apostrophes
# ("hed", "youll", ...) so that they match words whose punctuation has been removed.
stopwords = """
    a about above after again against all am an and any are as at
    be because been before being below between both but by could did do
    does doing down during each few for from further had has have having
    he hed hes her here heres hers herself him himself his how
    hows i id ill im ive if in into is it its itself
    lets me more most my myself nor of on once only or other ought
    our ours ourselves out over own same she shed shell shes should
    so some such than that thats the their theirs them themselves then
    there theres these they theyd theyll theyre theyve this those through
    to too under until up very was we wed well were weve were
    what whats when whens where wheres which while who whos whom why
    whys with would you youd youll youre youve your yours yourself
    yourselves
""".split()

# Mapping for str.translate that deletes every ASCII punctuation character.
table = {ord(symbol): None for symbol in string.punctuation}

In [8]:
def PreProcessing(df, stop_words=None, text_col='text'):
    """Lower-case each text in ``df`` and strip punctuation and stop words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text in column ``text_col``.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``stopwords`` list.
    text_col : str, default 'text'
        Name of the column that contains the text.

    Returns
    -------
    list of str
        One cleaned sentence per row of ``df``, in row order. Every kept word
        is followed by a single space.
    """
    if stop_words is None:
        stop_words = stopwords
    stop_words = set(stop_words)  # constant-time membership tests
    # Built locally so the function does not depend on a notebook-level table.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    # Iterate over the frame that was passed in. Reading from the global training
    # frame here would silently return training text for any other frame, and
    # using index labels as positions breaks on non-default indexes.
    for text in df[text_col]:
        kept = []
        for word in text.lower().split():
            word = word.translate(strip_punctuation)
            if word not in stop_words:
                kept.append(word)
        cleaned.append(''.join(word + ' ' for word in kept))
    return cleaned

In [9]:
def TOKENIZATION(df, vocab_size=20000, max_length=10, trunc_type='post', padding_type='post', oov_tok='<OOV>',
                 tokenizer=None, fit=True):
    """Clean the text in ``df`` and encode it as fixed-length integer sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed to ``PreProcessing`` to obtain the cleaned sentences.
    vocab_size : int, default 20000
        ``num_words`` for a newly created tokenizer.
    max_length : int, default 10
        Length every sequence is padded or truncated to.
    trunc_type, padding_type : str, default 'post'
        Forwarded to ``pad_sequences`` as ``truncating`` / ``padding``.
    oov_tok : str, default '<OOV>'
        Out-of-vocabulary token for a newly created tokenizer.
    tokenizer : Tokenizer, optional
        Existing tokenizer to use instead of creating one. Pass the tokenizer
        that was fitted on the training text (with ``fit=False``) so other
        data is encoded with the same word ids.
    fit : bool, default True
        Whether to fit the tokenizer on the sentences of ``df`` before encoding.

    Returns
    -------
    numpy.ndarray
        Padded sequences, one row per sentence and ``max_length`` columns.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``tokenizer`` is supplied.
    """
    sentences = PreProcessing(df)
    if tokenizer is None:
        if not fit:
            raise ValueError('fit=False requires an already fitted tokenizer')
        tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    if fit:
        tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [10]:
# Vocabulary size shared by the tokenizer and the embedding layer.
VOCAB_SIZE = 20000
# Width of each learned word vector.
EMBD_DIM = 10
# Pass VOCAB_SIZE explicitly so the tokenizer cannot drift out of sync with the
# embedding's input dimension if the constant is changed.
x_train = TOKENIZATION(raw_data, vocab_size=VOCAB_SIZE)
# Labels: the last column of the training frame.
y_train = raw_data.iloc[:, -1]

In [11]:
# Dimensions of the padded training matrix.
x_train.shape

(7613, 10)

In [12]:
# Classifier: embed each word id, average the embeddings over the sequence,
# then a small dense layer feeding a single sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, EMBD_DIM))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

2022-06-14 10:49:11.321998: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [13]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Print the layers, their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          200000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                264       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 200,289
Trainable params: 200,289
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train for 25 epochs, holding out 20% of the samples for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=25,
    validation_split=0.2,
)

2022-06-14 10:49:11.626932: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [16]:
# One column per recorded metric, one row per epoch.
hist_df = pd.DataFrame(history.history)

In [17]:
# Line chart of every recorded training/validation metric.
layout_options = dict(template='plotly_dark', width=1200, title='Metrics')
fig = px.line(hist_df)
fig.update_layout(**layout_options)
fig.show()

In [18]:
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
# First column of the test frame, kept for the submission file.
id_col = test_df.iloc[:, 0]

# The test text must be encoded with the vocabulary learned from the TRAINING
# text. Fitting a tokenizer on the test set assigns different integer ids to
# the same words, so the trained embedding would look up unrelated vectors.
# Refitting on the training sentences reproduces the training word index.
train_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
train_tokenizer.fit_on_texts(PreProcessing(raw_data))
x_test = pad_sequences(
    train_tokenizer.texts_to_sequences(PreProcessing(test_df)),
    maxlen=x_train.shape[1],  # same sequence length the model was trained on
    padding='post',
    truncating='post',
)

In [19]:
# Sigmoid outputs from the model, one per test row.
probabilities = model.predict(x_test)
# Threshold at 0.5 to get 0/1 labels and flatten to a 1-D array.
pred = np.where(probabilities >= 0.5, 1, 0).reshape(-1)

In [20]:
# Assemble the submission table and write it without the index column.
# NOTE(review): the source column is named 'id' (lowercase) while the header written
# here is 'Id' — confirm which casing the submission format expects.
df = pd.DataFrame({'Id': id_col, 'target': pred})
df.to_csv('Submission.csv', index=False)

> Thank You