In [1]:
#string matching
import re 

#reading files
import pandas as pd

#handling html data
from bs4 import BeautifulSoup

#visualization
import matplotlib.pyplot as plt  

pd.set_option('display.max_colwidth', 200)

In [2]:
data_train = pd.read_csv('train_sentiment_analysis_II.csv')

In [3]:
data_train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [4]:
twitter_label_end_pos = data_train['label']
data_train.drop('label', axis=1, inplace=True)
data_train['label'] = twitter_label_end_pos

In [5]:
data_train.head()

Unnamed: 0,id,tweet,label
0,1,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,0
1,2,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,0
2,3,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,0
3,4,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,0
4,5,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,1


In [6]:
import nltk
from nltk.corpus import stopwords

In [7]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\udmitra\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


### Text Cleaning

In [18]:
# Function to clean the text
def tweet_clean(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"\$&@\*#", " PROFANE", text)  # Replace $&@*# with a special token
    text = re.sub(r"<3", "Love", text)  # Replace <3 with love
    text = re.sub(r",\"", "", text) # Remove ',"'
    text = re.sub(r"[#@*&/:)(!$^?€£<>_]", "", text)  # Remove individual occurrences of special characters
    text = re.sub(r"-", " ", text)
    text = re.sub(r"=", "equal to", text)
    text = re.sub(r"\bI'm\b", "I am", text)
    text = re.sub(r"\bI'd\b", "I had", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove any other non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

In [19]:
# call preprocessing function
data_train['cleaned_tweet'] = data_train['tweet'].apply(tweet_clean)

In [20]:
data_train['tweet'][39]

'No bull$&@*# #InstaSize #Leggings #LightSkin #CurlyHair #iPhone #BellyPiercing #smile #POTD… http://instagram.com/p/esxzmizbL8/'

In [21]:
data_train['cleaned_tweet'][39]

'bull profane instasize leggings lightskin curlyhair iphone bellypiercing smile potd'

In [22]:
data_train.shape

(7920, 4)

In [23]:
data_train.isnull().sum()

id               0
tweet            0
label            0
cleaned_tweet    0
dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Split the data
Xtrain = data_train['cleaned_tweet']
ytrain = data_train['label']

In [26]:
x_tr, x_val, y_tr, y_val = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=42)

### Text Representation

In [28]:
import tensorflow as tf

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 

#prepare a tokenizer
x_tokenizer = Tokenizer() 

#prepare vocabulary
x_tokenizer.fit_on_texts(x_tr)

In [30]:
#define threshold for maximum length of a setence
max_len=100

#convert text sequences into integer sequences
x_tr_seq = x_tokenizer.texts_to_sequences(x_tr) 
x_val_seq = x_tokenizer.texts_to_sequences(x_val)

#padding up with zero 
x_tr_seq = pad_sequences(x_tr_seq,  padding='post', maxlen=max_len)
x_val_seq = pad_sequences(x_val_seq, padding='post', maxlen=max_len)

### Model Building

In [32]:
from keras.models import Sequential
from keras.layers import *
from keras.callbacks import ModelCheckpoint

In [33]:
x_voc_size = len(x_tokenizer.word_index) + 1

#### Defining Model architecture of LSTM model

In [68]:
#sequential model
model_lstm = Sequential()

#embedding layer
model_lstm.add(Embedding(x_voc_size, 100, trainable = True, input_shape=(max_len,), mask_zero=True))

#lstm 
model_lstm.add(LSTM(128))
model_lstm.add(Dropout(rate=0.5))

#dense layer
model_lstm.add(Dense(32, activation='relu'))

#output layer
model_lstm.add(Dense(1,activation='sigmoid'))

In [69]:
model_lstm.summary()

In [70]:
#define optimizer and loss
model_lstm.compile(optimizer='sgd',loss='binary_crossentropy', metrics=['accuracy'])

#checkpoint to save best model during training
mc = ModelCheckpoint("best_weights_lstm_gender_classification.keras", monitor='val_loss', verbose=1, save_best_only=True, mode='min')

#### Train the LSTM model

In [71]:
#train the model 
model_lstm.fit(x_tr_seq, y_tr, batch_size=32, epochs=50, verbose=1, validation_data=(x_val_seq, y_val), callbacks=[mc])

Epoch 1/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.7356 - loss: 0.6400
Epoch 1: val_loss improved from inf to 0.58626, saving model to best_weights_lstm_gender_classification.keras
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 48ms/step - accuracy: 0.7357 - loss: 0.6399 - val_accuracy: 0.7273 - val_loss: 0.5863
Epoch 2/50
[1m197/198[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 39ms/step - accuracy: 0.7529 - loss: 0.5609
Epoch 2: val_loss improved from 0.58626 to 0.58433, saving model to best_weights_lstm_gender_classification.keras
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 44ms/step - accuracy: 0.7528 - loss: 0.5610 - val_accuracy: 0.7273 - val_loss: 0.5843
Epoch 3/50
[1m197/198[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 40ms/step - accuracy: 0.7427 - loss: 0.5678
Epoch 3: val_loss did not improve from 0.58433
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s

<keras.src.callbacks.history.History at 0x1c3a2fb9cc0>

In [72]:
from tensorflow.keras.models import load_model

# Load the model architecture if not already defined
# Assuming model_lstm is already defined
model_lstm.load_weights('best_weights_lstm_gender_classification.keras')

In [75]:
from sklearn.metrics import classification_report, f1_score

In [73]:
# Predict on validation data
val_predictions_lstm = model_lstm.predict(x_val_seq)
val_predictions_binary_lstm = (val_predictions_lstm > 0.5).astype(int)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step


In [77]:
print(classification_report(y_val, val_predictions_binary_lstm))

              precision    recall  f1-score   support

           0       0.92      0.93      0.92      1152
           1       0.80      0.77      0.79       432

    accuracy                           0.89      1584
   macro avg       0.86      0.85      0.86      1584
weighted avg       0.89      0.89      0.89      1584



#### Define model architecture of the GRU model

In [81]:
#sequential model
model_gru = Sequential()

#embedding layer
model_gru.add(Embedding(x_voc_size, 100, trainable = True, input_shape=(max_len,), mask_zero=True))

#GRU 
model_gru.add(GRU(128))
model_gru.add(Dropout(rate=0.5))

#dense layer
model_gru.add(Dense(32, activation='relu'))

#output layer
model_gru.add(Dense(1,activation='sigmoid'))

  super().__init__(**kwargs)


In [65]:
model_gru.summary()

#### Train the GRU model

In [66]:
#define optimizer and loss
model_gru.compile(optimizer='sgd',loss='binary_crossentropy', metrics=['accuracy'])

#checkpoint to save best model during training
mc = ModelCheckpoint("best_weights_gru_gender_classification.keras", monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [67]:
#train the model 
model_gru.fit(x_tr_seq, y_tr, batch_size=32, epochs=100, verbose=1, validation_data=(x_val_seq, y_val), callbacks=[mc])

Epoch 1/100
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.7190 - loss: 0.6440
Epoch 1: val_loss improved from inf to 0.58831, saving model to best_weights_gru_gender_classification.keras
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 69ms/step - accuracy: 0.7191 - loss: 0.6438 - val_accuracy: 0.7273 - val_loss: 0.5883
Epoch 2/100
[1m197/198[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 47ms/step - accuracy: 0.7508 - loss: 0.5661
Epoch 2: val_loss improved from 0.58831 to 0.58488, saving model to best_weights_gru_gender_classification.keras
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - accuracy: 0.7507 - loss: 0.5661 - val_accuracy: 0.7273 - val_loss: 0.5849
Epoch 3/100
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.7444 - loss: 0.5679
Epoch 3: val_loss did not improve from 0.58488
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x1c39e85dd20>

In [78]:
# Assuming model_gru is already defined
model_gru.load_weights('best_weights_gru_gender_classification.keras')

In [79]:
# Predict on validation data
val_predictions_gru = model_gru.predict(x_val_seq)
val_predictions_binary_gru = (val_predictions_gru > 0.5).astype(int)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step


In [80]:
print(classification_report(y_val, val_predictions_binary_gru))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1152
           1       0.81      0.77      0.79       432

    accuracy                           0.89      1584
   macro avg       0.86      0.85      0.85      1584
weighted avg       0.88      0.89      0.89      1584



#### Defining model architecture of CNN model

In [84]:
#sequential model
model_cnn = Sequential()

#embedding layer
model_cnn.add(Embedding(x_voc_size, 100, trainable = True, input_shape=(max_len,), mask_zero=True))

#CNN 
model_cnn.add(Conv1D(128,3,padding='same'))  #conv1d layer
model_cnn.add(Dropout(0.2))

model_cnn.add(GlobalMaxPooling1D())

#dense layer
model_cnn.add(Dense(32, activation='relu'))

#output layer
model_cnn.add(Dense(1,activation='sigmoid'))



In [85]:
model_cnn.summary()

#### Train the CNN model

In [86]:
#define optimizer and loss
model_cnn.compile(optimizer='sgd',loss='binary_crossentropy', metrics=['accuracy'])

#checkpoint to save best model during training
mc = ModelCheckpoint("best_weights_cnn_gender_classification.keras", monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [88]:
#train the model 
model_cnn.fit(x_tr_seq, y_tr, batch_size=32, epochs=80, verbose=1, validation_data=(x_val_seq, y_val), callbacks=[mc])

Epoch 1/80
[1m196/198[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.9369 - loss: 0.1796
Epoch 1: val_loss improved from 0.27777 to 0.27615, saving model to best_weights_cnn_gender_classification.keras
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9368 - loss: 0.1796 - val_accuracy: 0.8794 - val_loss: 0.2762
Epoch 2/80
[1m194/198[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.9441 - loss: 0.1660
Epoch 2: val_loss improved from 0.27615 to 0.27463, saving model to best_weights_cnn_gender_classification.keras
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9439 - loss: 0.1662 - val_accuracy: 0.8876 - val_loss: 0.2746
Epoch 3/80
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9422 - loss: 0.1677
Epoch 3: val_loss improved from 0.27463 to 0.27458, saving model to best_weights_cnn_gender_classification.

<keras.src.callbacks.history.History at 0x1c3b3a757b0>

In [89]:
# Assuming model_cnn is already defined
model_cnn.load_weights('best_weights_cnn_gender_classification.keras')

In [90]:
# Predict on validation data
val_predictions_cnn = model_cnn.predict(x_val_seq)
val_predictions_binary_cnn = (val_predictions_cnn > 0.5).astype(int)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [91]:
print(classification_report(y_val, val_predictions_binary_cnn))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1152
           1       0.79      0.75      0.77       432

    accuracy                           0.88      1584
   macro avg       0.85      0.84      0.85      1584
weighted avg       0.88      0.88      0.88      1584



### Though all the 3 models have extremely good prediction capability as retrieved from the classification matrix, still finalizing LSTM model as it has a slight edge over the rest.

### Now, implementing the predictions on the Test data.

In [92]:
data_test = pd.read_csv('test_sentiment_analysis_II.csv')

In [93]:
data_test.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/
2,7923,"I'd like to puts some CD-ROMS on my iPad, is that possible?' — Yes, but wouldn't that block the screen?\n"
3,7924,"My ipod is officially dead. I lost all my pictures and videos from the 1D and 5sos concert,and from Vet Camp #hatinglife #sobbing"
4,7925,Been fighting iTunes all night! I only want the music I $&@*# paid for


In [94]:
data_test.shape

(1953, 2)

In [95]:
data_test.isnull().sum()

id       0
tweet    0
dtype: int64

In [96]:
# call preprocessing function
data_test['cleaned_tweet'] = data_test['tweet'].apply(tweet_clean)

In [97]:
data_test.head()

Unnamed: 0,id,tweet,cleaned_tweet
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks,hate new iphone upgrade wont let download apps ugh apple sucks
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/,currently shitting fucking pants apple imac cashmoney raddest swagswagswag
2,7923,"I'd like to puts some CD-ROMS on my iPad, is that possible?' — Yes, but wouldn't that block the screen?\n",like puts cd roms ipad possible yes wouldnt block screen
3,7924,"My ipod is officially dead. I lost all my pictures and videos from the 1D and 5sos concert,and from Vet Camp #hatinglife #sobbing",ipod officially dead lost pictures videos sos concertand vet camp hatinglife sobbing
4,7925,Been fighting iTunes all night! I only want the music I $&@*# paid for,fighting itunes night want music profane paid


In [98]:
# Convert texts to sequences
X_test_seq = x_tokenizer.texts_to_sequences(data_test['cleaned_tweet'])

In [99]:
# Padding sequences
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

### Prediction on test data

In [100]:
test_pred = model_lstm.predict(X_test_pad)

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step


In [101]:
test_pred = (test_pred > 0.5).astype(int)

In [102]:
# Prepare the submission file
final_pred = pd.DataFrame({'id': data_test['id'], 'label': test_pred.flatten()})
final_pred.to_csv('final_predictions_sentiment_analysis_II.csv', index=False)

In [103]:
final_pred['label'].value_counts()

0    1409
1     544
Name: label, dtype: int64