# Import needed libraries

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.callbacks import TensorBoard
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import regularizers
from keras.layers import Dense, Embedding, LSTM, Conv1D, Conv2D, MaxPooling1D, Reshape, Flatten, Dropout, CuDNNLSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

import operator


# Import our dependencies
import tensorflow as tf

import tensorflow_hub as hub
import os
import re
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer


# Initialize session
# Create a single TensorFlow session and register it with the Keras backend,
# so Keras runs its ops in this session.
# NOTE(review): tf.Session is presumably the TensorFlow 1.x graph-mode API — confirm
# the TensorFlow version this notebook is pinned to.
sess = tf.Session()
K.set_session(sess)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

#from appos import appos

# Fetch the NLTK resources the pre-processing cell relies on:
#   'punkt'     -> word_tokenize
#   'stopwords' -> stopwords.words('english')
#   'wordnet'   -> WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Start with loading all necessary libraries
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
% matplotlib inline

# Loading, extracting and pre-processing the data

In [2]:
import os

import pandas as pd

# Location of the Twitter sentiment CSV. Set the TWITTER_DATA_CSV environment
# variable to point at your own copy; the original local path is the fallback,
# so existing setups keep working unchanged.
file_path = os.environ.get('TWITTER_DATA_CSV', '/Users/teja/Downloads/Twitter_Data.csv')

# Read the CSV and keep only the tweet text and its score.
# .copy() makes `data` an independent frame, so adding columns to it later
# cannot trigger pandas' SettingWithCopyWarning.
data = pd.read_csv(file_path)[['Text', 'Score']].copy()



In [None]:
# Convert the ratings into negative and positive sentiments
Each sentiment is represented by an integer, 0 or 1.

0: negative

1: positive


In [3]:
# Assume 'data' is your DataFrame with a 'Score' column containing 0 and 1 values

# Seed for the random under-sampling and shuffling below, so the balanced data
# set (and everything derived from it) is the same on every run.
RANDOM_SEED = 42

# Define the mapping from 0, 1 ratings to negative or positive sentiment.
# Scores that are not in the mapping become NaN and are ignored below.
rating_to_sentiment = {0: 0, 1: 1}

# Create the 'Sentiment' column. assign() returns a new frame, which avoids
# writing into a frame that may itself be a slice of another one.
data = data.assign(Sentiment=data['Score'].map(rating_to_sentiment))

# Count number of negative and positive reviews
neg_num = (data['Sentiment'] == 0).sum()
pos_num = (data['Sentiment'] == 1).sum()

print('# negative reviews before: {}'.format(neg_num))
print('# positive reviews before: {}'.format(pos_num))

# Make the dataset balanced: under-sample the larger class to the size of the smaller one
balanced_sample_num = min(neg_num, pos_num)

# Randomly (but reproducibly) select 'balanced_sample_num' negative and positive reviews
neg_sample = data[data['Sentiment'] == 0].sample(
    n=balanced_sample_num, replace=False, random_state=RANDOM_SEED)
pos_sample = data[data['Sentiment'] == 1].sample(
    n=balanced_sample_num, replace=False, random_state=RANDOM_SEED)

# Concatenate the samples to create the balanced dataset
data_balanced = pd.concat([neg_sample, pos_sample], ignore_index=True)

# Shuffle the rows so that 0's and 1's are mixed
data_balanced = data_balanced.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print('\n# negative reviews after: {}'.format((data_balanced['Sentiment'] == 0).sum()))
print('# positive reviews after: {}'.format((data_balanced['Sentiment'] == 1).sum()))

# Get one-hot encoding for the labels (column 0 = negative, column 1 = positive).
# The dtype is pinned because the default differs between pandas versions
# (uint8 in older releases, bool in newer ones).
Y = pd.get_dummies(data_balanced['Sentiment'], dtype='uint8').values


# negative reviews before: 90723
# positive reviews before: 72250

# negative reviews after: 72250
# positive reviews after: 72250


# Perform pre-processing on the text

In [5]:
!pip install --upgrade nltk
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from transformers import AutoTokenizer
import numpy as np
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd



# Pre-process the *balanced* frame rather than a fresh read of the CSV:
# the label matrix Y is built from data_balanced, so X must be built from the
# same rows in the same order, otherwise X and Y differ in length and alignment.
data = data_balanced.copy()

# All characters to lower case (non-string entries such as NaN are stringified)
data['Text'] = data['Text'].apply(lambda x: x.lower() if isinstance(x, str) else str(x))

# # Convert words with apostrophes to its corresponding words, e.g. "it's" -> "it is"
# data['Text'] = data['Text'].apply(lambda x: x.split())
# data['Text'] = data['Text'].apply(lambda x: " ".join([appos[word] if word in appos else word for word in x]))

# Remove html-tags, punctuation, commas, numbers etc.
# Raw strings keep the regex escapes (\s, \d) away from Python's own escape handling.
data['Text'] = data['Text'].apply(lambda x: re.sub(r'<[^<]+?>', ' ', str(x)))
# 'A-Z' (not 'A-z'): the range A-z also spans the characters [ \ ] ^ _ ` and
# would let them survive the clean-up.
data['Text'] = data['Text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', ' ', x))
data['Text'] = data['Text'].apply(lambda x: re.sub(r'^\d+\s|\s\d+\s|\s\d+$', ' ', x))

# Convert text into tokens, in this case sentences into words
data['Text'] = data['Text'].apply(word_tokenize)

# Remove most commonly occurring words which are not relevant in the context of the data.
# The frozenset gives constant-time membership tests; the list is kept under its original name.
irrelevant_words = stopwords.words('english')
stopword_lookup = frozenset(irrelevant_words)
data['Text'] = data['Text'].apply(lambda x: [word for word in x if word not in stopword_lookup])

# Find the base form of the word (lemmatization) and join the tokens back into one string
lemma = WordNetLemmatizer()
data['Text'] = data['Text'].apply(lambda x: " ".join([lemma.lemmatize(word) for word in x]))

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Define the maximum sequence length
max_length = 100  # Adjust as needed

# Tokenize all texts in one batch call; every sequence is truncated/padded to
# exactly max_length token ids, so the result is a rectangular matrix.
encoded = tokenizer(data['Text'].tolist(), padding="max_length", truncation=True, max_length=max_length)

# Convert the tokenized sequences to a numpy array of shape (n_texts, max_length)
X = np.array(encoded['input_ids'])





[nltk_data] Downloading package punkt to /Users/teja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/teja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Split the data set into train, validation and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of all samples held out for testing, and fraction of the remaining
# training samples held out for validation. Defined here because the two
# splits below need them.
test_size = 0.2
val_size = 0.2

# Fixed seed so the same rows land in the same partition on every run
split_seed = 42

# Split the data into train and test sets with the specified test size
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=split_seed)

# Further split the train set into train and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=val_size, random_state=split_seed)

print('Train set: X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
print('Validation set: X: {}, Y: {}'.format(X_val.shape, Y_val.shape))
print('Test set: X: {}, Y: {}'.format(X_test.shape, Y_test.shape))

X train: (70080, 3), Y train: (70080, 2)
X test: (17520, 3), Y test: (17520, 2)


"smaller_set_percentage = 0.5\nsplit = int(round(smaller_set_percentage * data.shape[0]))\n\ntrain_x=data[split:]\ntest_x=data[:split]\ntrain_y=Y[split:]\ntest_y=Y[:split]\n\nprint(train_x)\nprint(test_y)\nprint('training data set: X: {} Y: {}'.format(train_x.shape, train_y.shape))\nprint('testing data set: X: {} Y: {}'.format(test_x.shape, test_y.shape))"

In [None]:
# Import necessary libraries
from keras.layers import Layer
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

# Define a custom layer for Elmo embeddings
class ElmoEmbeddingLayer(Layer):
    """Keras layer that turns raw text into fixed-size ELMo embeddings.

    The layer wraps a TensorFlow-Hub ELMo module. It takes a string tensor
    with one text per row and returns one `self.dimensions`-wide vector per
    row (the module's 'default' output).
    """

    def __init__(self, **kwargs):
        self.dimensions = 1024  # Dimensionality of Elmo embeddings
        self.trainable = True  # Allow the layer to update weights
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the ELMo module and register its trainable variables."""
        # The module has to be created here: call() uses self.elmo, and the
        # variable lookup below searches the "<layer name>_module" scope that
        # this instantiation creates.
        # NOTE(review): module handle assumed to be ELMo v2 on TF-Hub — confirm
        # the version this notebook was trained with.
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2',
                               trainable=self.trainable,
                               name="{}_module".format(self.name))

        # Add the trainable weights of the Elmo module to the layer's weights
        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Embed a batch of texts; `x` is cast to string and its axis 1 squeezed away."""
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                           as_dict=True,
                           signature='default',
                           )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        """Mask out inputs equal to the '--PAD--' placeholder string."""
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        """Output keeps the batch dimension and has `self.dimensions` features."""
        return (input_shape[0], self.dimensions)


In [None]:
!pip install allennlp

from allennlp.training.metrics.metric import Metric
from allennlp.training.metrics import f1_measure



Collecting botocore==1.12.150 (from awscli>=1.11.91->allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/a4/a3/14582589522f21684726da7b0f57dc2258cc6e3adb6046675f3bd9eba834/botocore-1.12.150-py2.py3-none-any.whl (5.4MB)
[K     |████████████████████████████████| 5.4MB 32kB/s 
Installing collected packages: botocore
  Found existing installation: botocore 1.12.147
    Uninstalling botocore-1.12.147:
      Successfully uninstalled botocore-1.12.147
Successfully installed botocore-1.12.150


In [None]:
# Install keras_metrics package
!pip install keras_metrics

# Import necessary libraries
import keras
import keras_metrics
from keras import layers
from keras.models import Model

# Define a function to build the model
def build_model():
    """Build, compile and summarise the ELMo sentiment classifier.

    Architecture: string input of shape (1,) -> ElmoEmbeddingLayer ->
    Dense(256, relu) -> Dense(1, sigmoid). The model is compiled with binary
    cross-entropy, the Adam optimizer and an accuracy metric, and its summary
    is printed before it is returned.

    Returns:
        The compiled Keras Model.
    """
    # One raw text string per sample
    text_in = layers.Input(shape=(1,), dtype="string")

    # ELMo sentence embedding of the input text
    features = ElmoEmbeddingLayer()(text_in)

    # Classification head: a ReLU hidden layer followed by a single sigmoid
    # unit that outputs the probability of the positive class
    for units, activation in ((256, 'relu'), (1, 'sigmoid')):
        features = layers.Dense(units, activation=activation)(features)

    classifier = Model(inputs=[text_in], outputs=features)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Print the layer-by-layer overview
    classifier.summary()

    return classifier




In [None]:
def _to_elmo_inputs(frame, max_words=150):
    """Turn a frame with 'Text' and 'Sentiment' columns into model inputs.

    Args:
        frame: DataFrame holding one pre-processed text per row in 'Text' and
            its label in 'Sentiment'.
        max_words: keep at most this many whitespace-separated words per text.

    Returns:
        (texts, labels): `texts` is an object array of shape (n, 1) with one
        truncated string per row; `labels` is the 'Sentiment' column as a list.
    """
    truncated = [' '.join(t.split()[:max_words]) for t in frame['Text'].tolist()]
    # The extra axis gives one string per row, matching the model's Input(shape=(1,))
    texts = np.array(truncated, dtype=object)[:, np.newaxis]
    return texts, frame['Sentiment'].tolist()


# NOTE(review): train_x / test_x are not created by any executable cell visible
# in this notebook — confirm where they are defined before running this cell.

# Create train dataset
train_text, train_label = _to_elmo_inputs(train_x)

# Create test dataset
test_text, test_label = _to_elmo_inputs(test_x)


[['thick strong likely adding soup rice']
 ['wonderful product dog really love one thing make product better chew box would one every day month']
 ['movie always popular different entertaining fast moving hour half film available bllu ray look better nothing outstanding definite improvement shapnress several character mainly michael keaton unique sometimes revolting title character beetlejuice always fascinating watch whole movie also get lot humor scary special effect odd see alec baldwin low key role 90 played type guy davis look act like well davis many time played nice people viewer like took four viewing finally appreciated catharine hara comedic talent movie favorite someone find absolutely hilarious messed wife mother family move haunted house inhabited baldwin davis keaton made name actor whacked robin williams type role although never really followed anything popular film winona rider cute teenage daughter get fun supporting role diverse people talk show host dick cavett singe

In [None]:
# Build the classifier, then train it for 4 epochs in mini-batches of 32,
# holding out the fraction `val_size` of the training data for validation.
val_size = 0.2
model = build_model()
model.fit(
    x=train_text,
    y=train_label,
    batch_size=32,
    epochs=4,
    validation_split=val_size,
)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0517 14:44:49.580165 140253575722880 saver.py:1483] Saver not created because there are no variables in the graph to restore


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_4 (Elmo (None, 1024)              4         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               262400    
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 257       
Total params: 262,661
Trainable params: 262,661
Non-trainable params: 0
_________________________________________________________________
Train on 56064 samples, validate on 14016 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f8eb0e49518>

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:

# predict probabilities for test set
# The model takes the (n, 1) array of raw strings built as test_text; X_test is
# a NumPy matrix of token ids and cannot be indexed with a column name.
yhat_probs = model.predict(test_text, verbose=1)
# predict crisp classes for test set: probability above 0.5 -> positive (1).
# The builtin int is used because the np.int alias no longer exists in current NumPy.
labels = (yhat_probs > 0.5).astype(int)
print(labels)

#yhat_classes = np.argmax(yhat_probs,axis=1)
# accuracy: (tp + tn) / (p + n)



 1696/17520 [=>............................] - ETA: 6:24

ResourceExhaustedError: ignored

In [None]:
# Show the predicted probabilities next to the ground-truth test labels.
# test_label is the label list built together with test_text; wrapping it in
# an array lets NumPy abbreviate the printout for large test sets.
print(yhat_probs)
print(np.asarray(test_label))

In [None]:

# Ground-truth labels as a flat 0/1 vector. If the labels are one-hot encoded
# (two columns), column 1 is the positive-class indicator.
test_label_arr = np.asarray(test_label)
Y_test_new = test_label_arr[:, 1] if test_label_arr.ndim == 2 else test_label_arr.ravel()

# The model emits one column of predictions; flatten it to match the labels
predicted = np.asarray(labels).ravel()

print("elmo_model")
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y_test_new, predicted)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(Y_test_new, predicted)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(Y_test_new, predicted)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y_test_new, predicted)
print('F1 score: %f \n\n' % f1)