In [1]:
# Download dataset from UCI repository
!curl -o uci-labelled-sentences.zip https://archive.ics.uci.edu/static/public/331/sentiment+labelled+sentences.zip

# Unzip the dataset
!unzip uci-labelled-sentences.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 84188    0 84188    0     0   121k      0 --:--:-- --:--:-- --:--:--  121k
Archive:  uci-labelled-sentences.zip
   creating: sentiment labelled sentences/
  inflating: sentiment labelled sentences/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/sentiment labelled sentences/
  inflating: __MACOSX/sentiment labelled sentences/._.DS_Store  
  inflating: sentiment labelled sentences/amazon_cells_labelled.txt  
  inflating: sentiment labelled sentences/imdb_labelled.txt  
  inflating: __MACOSX/sentiment labelled sentences/._imdb_labelled.txt  
  inflating: sentiment labelled sentences/readme.txt  
  inflating: __MACOSX/sentiment labelled sentences/._readme.txt  
  inflating: sentiment labelled sentences/yelp_labelled.txt  
  inflating: __MACOSX/._sentiment labelled sentences  


In [3]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.callbacks import EarlyStopping

In [4]:
df_list = []

# Yelp
df_yelp = pd.read_csv('sentiment labelled sentences/yelp_labelled.txt', names=['sentence', 'label'], sep='\t')
df_yelp['source'] = 'yelp'
df_list.append(df_yelp)

# Amazon
df_amazon = pd.read_csv('sentiment labelled sentences/amazon_cells_labelled.txt', names=['sentence', 'label'], sep='\t')
df_amazon['source'] = 'amazon'
df_list.append(df_amazon)

# IMDB
df_imdb = pd.read_csv('sentiment labelled sentences/imdb_labelled.txt', names=['sentence', 'label'], sep='\t')
df_imdb['source'] = 'imdb'
df_list.append(df_imdb)

# Combine datasets
df = pd.concat(df_list)
df.head()


Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [6]:
max_features = 2000  # Use the 2,000 most frequent words

tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df['sentence'].values)

X = tokenizer.texts_to_sequences(df['sentence'].values)
X = pad_sequences(X)  # Ensure equal length input sequences

y = df['label'].values

In [7]:
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (2748, 1225)
Shape of y: (2748,)


After running the cell above, you should see the shapes of `X` and `y`.
- `X.shape` will be `(number_of_samples, sequence_length)`, where `number_of_samples` is the total number of sentences in your combined dataset (2748) and `sequence_length` is the maximum length of the padded sequences. The `pad_sequences` function automatically determines this maximum length based on the longest sequence in your data.
- `y.shape` will be `(number_of_samples,)`, which is a 1D array containing the labels for each sentence.

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12)

In [10]:
def create_model():
    model = Sequential()
    model.add(Embedding(max_features, 64))
    model.add(LSTM(16))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = create_model()

In [11]:
history = model.fit(
    X_train,
    y_train,
    epochs=6,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=[EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=2, verbose=1)]
)


Epoch 1/6
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 557ms/step - accuracy: 0.5920 - loss: 0.6751 - val_accuracy: 0.7636 - val_loss: 0.5012
Epoch 2/6
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 573ms/step - accuracy: 0.7902 - loss: 0.4592 - val_accuracy: 0.7909 - val_loss: 0.4567
Epoch 3/6
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 555ms/step - accuracy: 0.8936 - loss: 0.2755 - val_accuracy: 0.8212 - val_loss: 0.4108
Epoch 4/6
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 567ms/step - accuracy: 0.9294 - loss: 0.1953 - val_accuracy: 0.8303 - val_loss: 0.4402
Epoch 5/6
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 594ms/step - accuracy: 0.9567 - loss: 0.1316 - val_accuracy: 0.8121 - val_loss: 0.4809
Epoch 6/6
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 576ms/step - accuracy: 0.9736 - loss: 0.0863 - val_accuracy: 0.8182 - val_loss: 0.5382
Epoch 6: ear

In [13]:
# Save model
model.save("uci_sentimentanalysis.keras")

# Save tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.DEFAULT_PROTOCOL)


In [14]:
from google.colab import files

files.download("uci_sentimentanalysis.h5")
files.download("tokenizer.pickle")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>