In [23]:
#import packages
#pip install textblob
#pip install keras
#pip install tensorflow
from textblob import TextBlob
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import keras.optimizers


In [9]:
#df_09= pd.read_pickle(r"C:\Users\danie\Documents\GitHub\Masters-Thesis\2009_preprocessed_date.pickle") 
df_09= pd.read_pickle(r"/Users/yolandaferreirofranchi/Documents/GitHub/Masters-Thesis/2009_preprocessed_date.pickle")

In [10]:
def subjectivity(sentence):
    subjectivity = ""

    subjectivity = TextBlob(sentence).sentiment.subjectivity

    return subjectivity

def polarity(sentence):
    polarity = ""

    polarity = TextBlob(sentence).sentiment.polarity

    return polarity

In [11]:
df_09['subjectivity'] = df_09['sentences'].apply(subjectivity)
df_09['polarity'] = df_09['sentences'].apply(polarity)

In [12]:
df_09.head(5)

Unnamed: 0,sentences,article_id,year,encoded_sentences,subjectivity,polarity
91571,Liam Gallagher has broken the silence surround...,5048,2009,"([break, silence, surround, ', break, say, ban...",0.35,-0.2
91572,"However, in an interview with The Times Liam G...",5048,2009,"([however, interview, the, say, longer], 2, 0, 0)",0.0,0.0
91573,I think we all know that.,5048,2009,"([i, think, know], 0, 0, 0)",0.0,0.0
91574,So that's done.,5048,2009,"([so], 0, 0, 0)",0.0,0.0
91575,"""""It's a shame, but that's life.",5048,2009,"([shame, life], 0, 0, 0)",0.0,0.0


In [62]:
#create a 3 point criteria from -1 to 1 (range of polarity)
def map_sentiment(value):
    if value <= -0.33:
        return -1
    elif value >= 0.33:
        return 1
    else:
        return 0


df_09['sentiment'] = df_09['polarity'].apply(map_sentiment)
df_09['sentiment'] = df_09["sentiment"].astype(float)
df_09["sentiment"].value_counts()

 0.0    7802
 1.0    1516
-1.0     394
Name: sentiment, dtype: int64

**RNN LSTM Model for Sentiment Analysis** 

In [67]:
#define X and y 
X = df_09['sentences']
y = df_09['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state = 42)

# Tokenize the data
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X_train)

# Convert the texts to sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to ensure equal length
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=64, input_length=maxlen))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', "mean_squared_error"])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.3425159454345703
Test accuracy: 0.7951621413230896


**CNN Model for Sentiment Analysis**

In [68]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Define X and y
X = df_09['sentences']
y = df_09['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert the texts to sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to ensure equal length
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Define the CNN model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=maxlen))
model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', "mean_squared_error"])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: -334.3150634765625
Test accuracy: 0.8661863207817078


**Table Comparing Model Results**

In [70]:
results = {' ': ['LSTM', 'CNN'],
           'Accuracy': [0.795, 0.866],
           'Mean Squared Error': [0.193, 0.1339],
           'Test loss': [0.3425, -334.315]}

# Create a pandas dataframe from the dictionary
df = pd.DataFrame(results)

# Set the index of the dataframe to the Kernel column
df.set_index(' ', inplace=True)

# Display the dataframe
print(df)

      Accuracy  Mean Squared Error  Test loss
                                             
LSTM     0.795              0.1930     0.3425
CNN      0.866              0.1339  -334.3150
