In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df= pd.read_csv("Corona_NLP_train.csv", encoding= "latin-1")

In [3]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
df.shape

(41157, 6)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [6]:
df.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [7]:
cleaned_df= df.dropna(axis=1)

In [8]:
df.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [9]:
cleaned_df.duplicated().sum()

0

In [10]:
cleaned_df.columns

Index(['UserName', 'ScreenName', 'TweetAt', 'OriginalTweet', 'Sentiment'], dtype='object')

In [11]:
cleaned_df.head()

Unnamed: 0,UserName,ScreenName,TweetAt,OriginalTweet,Sentiment
0,3799,48751,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [12]:
cleaned_df["Sentiment"].unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [13]:
cleaned_df["Sentiment"].value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [14]:
cleaned_df.head(40)

Unnamed: 0,UserName,ScreenName,TweetAt,OriginalTweet,Sentiment
0,3799,48751,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
5,3804,48756,16-03-2020,As news of the regionÂs first confirmed COVID...,Positive
6,3805,48757,16-03-2020,Cashier at grocery store was sharing his insig...,Positive
7,3806,48758,16-03-2020,Was at the supermarket today. Didn't buy toile...,Neutral
8,3807,48759,16-03-2020,Due to COVID-19 our retail store and classroom...,Positive
9,3808,48760,16-03-2020,"For corona prevention,we should stop to buy th...",Negative


In [15]:
cleaned_df["Sentiment"]= cleaned_df["Sentiment"].replace({
    "Extremely Positive": "Positive",
    "Extremely Negative": "Negative"    
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["Sentiment"]= cleaned_df["Sentiment"].replace({


In [16]:
cleaned_df["Sentiment"].unique()

array(['Neutral', 'Positive', 'Negative'], dtype=object)

In [17]:
cleaned_df["Sentiment"].value_counts()

Sentiment
Positive    18046
Negative    15398
Neutral      7713
Name: count, dtype: int64

In [18]:
#Preprocessing

import re

def clean_tweet(text):
    text= text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    return text 

In [19]:
cleaned_df["OriginalTweet"]= cleaned_df["OriginalTweet"].apply(clean_tweet)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["OriginalTweet"]= cleaned_df["OriginalTweet"].apply(clean_tweet)


In [20]:
cleaned_df.head()

Unnamed: 0,UserName,ScreenName,TweetAt,OriginalTweet,Sentiment
0,3799,48751,16-03-2020,menyrbie philgahan chrisitv and and,Neutral
1,3800,48752,16-03-2020,advice talk to your neighbours family to excha...,Positive
2,3801,48753,16-03-2020,coronavirus australia woolworths to give elder...,Positive
3,3802,48754,16-03-2020,my food stock is not the only one which is emp...,Positive
4,3803,48755,16-03-2020,me ready to go at supermarket during the covid...,Negative


In [21]:
#Label Encoding

le= LabelEncoder()
cleaned_df["Sentiment"]= le.fit_transform(cleaned_df["Sentiment"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["Sentiment"]= le.fit_transform(cleaned_df["Sentiment"])


In [22]:
cleaned_df.head()

Unnamed: 0,UserName,ScreenName,TweetAt,OriginalTweet,Sentiment
0,3799,48751,16-03-2020,menyrbie philgahan chrisitv and and,1
1,3800,48752,16-03-2020,advice talk to your neighbours family to excha...,2
2,3801,48753,16-03-2020,coronavirus australia woolworths to give elder...,2
3,3802,48754,16-03-2020,my food stock is not the only one which is emp...,2
4,3803,48755,16-03-2020,me ready to go at supermarket during the covid...,0


In [23]:
#Splitting

X= cleaned_df["OriginalTweet"].values
y= cleaned_df["Sentiment"].values

In [24]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
#Tokenization
tokenizer= Tokenizer(num_words=10000, oov_token="<OOV>") 
tokenizer.fit_on_texts(X_train) ##{'<OOV>': 1, 'tweet': 2, 'beautiful': 3, 'programming'}

X_train_sequences= tokenizer.texts_to_sequences(X_train) #[[12,3,3], [1,25,5]]
X_test_sequences= tokenizer.texts_to_sequences(X_test)

In [26]:
X_train_sequences[0]

[915,
 1357,
 309,
 51,
 8,
 3655,
 16,
 160,
 895,
 1492,
 4,
 2,
 1249,
 12,
 85,
 3,
 59,
 2847]

In [27]:
len(X_train_sequences[0])

18

In [28]:
#padding
max_length= 100
X_train_padded= pad_sequences(X_train_sequences, maxlen= max_length, padding="post")
X_test_padded= pad_sequences(X_test_sequences, maxlen=max_length, padding="post")

In [29]:
X_train_padded[0]

array([ 915, 1357,  309,   51,    8, 3655,   16,  160,  895, 1492,    4,
          2, 1249,   12,   85,    3,   59, 2847,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

In [30]:
len(X_train_padded[0])

100

In [31]:
#Building ANN Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten

In [40]:
model= Sequential()
#input_dim=10000: Size of the vocabulary ... 10,000 words.
#output_dim=128: Dimensionality of the dense embedding vectors.
#input_length=max_length=100: Length of the input sequences

model.add(Embedding(input_dim=10000, output_dim=128, input_length= max_length))

#Converts the 2D output of the embedding layer (sequence length x embedding size) into a 1D vector
model.add(Flatten())

model.add(Dense(128, activation="relu"))

#Regularizes the model to prevent overfitting
model.add(Dropout(0.5))

model.add(Dense(64, activation="relu"))
model.add(Dense(3, activation="softmax"))



In [41]:
model.compile(loss= "sparse_categorical_crossentropy",
              optimizer= "adam",
              metrics= ["accuracy"]
              )

In [49]:
model.build(input_shape=(None, max_length))  # None for batch size, max_length for sequence length
model.summary()

In [44]:
#training the model

model.fit(X_train_padded,
          y_train,
          epochs= 5,
          validation_data= (X_test_padded, y_test),
          batch_size= 32
          )

Epoch 1/5
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 56ms/step - accuracy: 0.9887 - loss: 0.0384 - val_accuracy: 0.7553 - val_loss: 1.2508
Epoch 2/5
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 61ms/step - accuracy: 0.9918 - loss: 0.0272 - val_accuracy: 0.7543 - val_loss: 1.2389
Epoch 3/5
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 45ms/step - accuracy: 0.9928 - loss: 0.0210 - val_accuracy: 0.7589 - val_loss: 1.2831
Epoch 4/5
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 57ms/step - accuracy: 0.9927 - loss: 0.0222 - val_accuracy: 0.7569 - val_loss: 1.3166
Epoch 5/5
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 54ms/step - accuracy: 0.9940 - loss: 0.0173 - val_accuracy: 0.7541 - val_loss: 1.5300


<keras.src.callbacks.history.History at 0x1cbe99b7410>

In [45]:
loss, accuracy= model.evaluate(X_test_padded, y_test)
print(f"Total Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.7513 - loss: 1.5633
Total Loss: 1.5299514532089233
Test Accuracy: 0.7541302442550659
