In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier

### step 1: Dataset Cleaning

Explore the student comment & parent comment features by creating a text corpus. Which all cleaning operation you think will be required on this corpus? Write a clean-up method and clean the text features.

In [7]:
# read dataset
data = pd.read_csv('sarcasm_dataset.csv')

In [8]:
# check for null values
data.isnull().sum()

ID                0
comment           0
date              0
down              0
parent_comment    0
score             0
top               0
topic             0
user              0
label             0
dtype: int64

In [9]:
data.columns

Index(['ID', 'comment', 'date', 'down', 'parent_comment', 'score', 'top',
       'topic', 'user', 'label'],
      dtype='object')

In [10]:
# drop insignificant columns
data_sarc=data.drop(['ID','date', 'down', 'score', 'top',
       'topic', 'user'],axis=1)

In [11]:
#print the filtered data
data_sarc.head()

Unnamed: 0,comment,parent_comment,label
0,"Well, let's be honest here, they don't actuall...",They should shut the fuck up and let the commu...,0
1,"Well, I didn't need evidence to believe in com...",You need evidence to kill people? I thought we...,1
2,"Who does an ""official promo"" in 360p?",2014 BMW S1000R: Official Promo,0
3,Grotto koth was the best,Not really that memorable lol if you want memo...,1
4,Neal's back baby,James Neal hit on Zach Parise,1


In [12]:
data_sarc.columns

Index(['comment', 'parent_comment', 'label'], dtype='object')

### create text corpus

In [13]:
# create a text corpus
data_sarc["text_corpus"] = data_sarc["comment"] + " " + data_sarc["parent_comment"]

In [14]:
# print the dataset having text corpus
data_sarc.head()

Unnamed: 0,comment,parent_comment,label,text_corpus
0,"Well, let's be honest here, they don't actuall...",They should shut the fuck up and let the commu...,0,"Well, let's be honest here, they don't actuall..."
1,"Well, I didn't need evidence to believe in com...",You need evidence to kill people? I thought we...,1,"Well, I didn't need evidence to believe in com..."
2,"Who does an ""official promo"" in 360p?",2014 BMW S1000R: Official Promo,0,"Who does an ""official promo"" in 360p? 2014 BMW..."
3,Grotto koth was the best,Not really that memorable lol if you want memo...,1,Grotto koth was the best Not really that memor...
4,Neal's back baby,James Neal hit on Zach Parise,1,Neal's back baby James Neal hit on Zach Parise


### cleaning of text corpus

 #### cleaning operations required are
    1. convert to lower characters
    
    2. remove stopwords
    
    3.remove panctuation
    
    4.remove special characters

#### convert it into lower

In [15]:
# convert text corpusinto lower text
data_sarc['text_corpus']=data_sarc['text_corpus'].str.lower()

In [16]:
data_sarc.head()

Unnamed: 0,comment,parent_comment,label,text_corpus
0,"Well, let's be honest here, they don't actuall...",They should shut the fuck up and let the commu...,0,"well, let's be honest here, they don't actuall..."
1,"Well, I didn't need evidence to believe in com...",You need evidence to kill people? I thought we...,1,"well, i didn't need evidence to believe in com..."
2,"Who does an ""official promo"" in 360p?",2014 BMW S1000R: Official Promo,0,"who does an ""official promo"" in 360p? 2014 bmw..."
3,Grotto koth was the best,Not really that memorable lol if you want memo...,1,grotto koth was the best not really that memor...
4,Neal's back baby,James Neal hit on Zach Parise,1,neal's back baby james neal hit on zach parise


In [17]:
data_sarc.shape

(15000, 4)

In [18]:
# check the preprocess
data_sarc.text_corpus.head()

0    well, let's be honest here, they don't actuall...
1    well, i didn't need evidence to believe in com...
2    who does an "official promo" in 360p? 2014 bmw...
3    grotto koth was the best not really that memor...
4       neal's back baby james neal hit on zach parise
Name: text_corpus, dtype: object

So all capital letters has been conerted into lower

### remove URLs from text corpus

In [19]:
data_sarc['text_corpus']=data_sarc['text_corpus'].replace('https:?//\S+www\.\S+',' ',regex=True)

In [20]:
data_sarc.text_corpus

0        well, let's be honest here, they don't actuall...
1        well, i didn't need evidence to believe in com...
2        who does an "official promo" in 360p? 2014 bmw...
3        grotto koth was the best not really that memor...
4           neal's back baby james neal hit on zach parise
                               ...                        
14995    well with a name like el cubano i'm surprised ...
14996    ... this is a good point. sounds like a pretty...
14997    yep. i know the type you speak of. the "die ci...
14998    that's what the government wants you to believ...
14999    because windows 10 has the glorious start menu...
Name: text_corpus, Length: 15000, dtype: object

### remove panctuations

In [21]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
#remove panctuations from text corpus
data_sarc['text_corpus']=data_sarc['text_corpus'].str.translate(str.maketrans('', '',string.punctuation))

In [23]:
data_sarc.text_corpus.head()

0    well lets be honest here they dont actually se...
1    well i didnt need evidence to believe in commu...
2    who does an official promo in 360p 2014 bmw s1...
3    grotto koth was the best not really that memor...
4        neals back baby james neal hit on zach parise
Name: text_corpus, dtype: object

In [24]:
### using user defined function
def remove_special_characters(text):
    # Create a translation table with special characters mapped to None
    translation_table = str.maketrans("", "", string.punctuation)
    # Remove special characters using the translation table
    cleaned_text = text.translate(translation_table)
    return cleaned_text

### remove special characters

In [25]:
import re
from warnings import filterwarnings
filterwarnings('ignore')

In [26]:
data_sarc['text_corpus']=data_sarc['text_corpus'].str.replace('[^a-zA-Z0-9\s]', '')

In [27]:
data_sarc.text_corpus

0        well lets be honest here they dont actually se...
1        well i didnt need evidence to believe in commu...
2        who does an official promo in 360p 2014 bmw s1...
3        grotto koth was the best not really that memor...
4            neals back baby james neal hit on zach parise
                               ...                        
14995    well with a name like el cubano im surprised h...
14996     this is a good point sounds like a pretty goo...
14997    yep i know the type you speak of the die cis s...
14998    thats what the government wants you to believe...
14999    because windows 10 has the glorious start menu...
Name: text_corpus, Length: 15000, dtype: object

### remove stopwords

In [28]:
from gensim.parsing import remove_stopwords

In [29]:
# using gensim library
data_sarc['text_corpus']=data_sarc['text_corpus'].apply(remove_stopwords)

In [30]:
data_sarc.text_corpus.head()

0    lets honest dont actually moderating spend tim...
1    didnt need evidence believe communism need evi...
2    official promo 360p 2014 bmw s1000r official p...
3    grotto koth best memorable lol want memorable ...
4                neals baby james neal hit zach parise
Name: text_corpus, dtype: object

In [31]:
# using nltk
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

#### some other methods

# Clean-up method for text features
def clean_text(text):
    # Remove special characters and punctuation
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # Convert text to lowercase
    text = text.lower()

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # Remove email addresses
    text = re.sub(r"\S+@\S+", "", text)

    # Handle contractions
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'s", " is", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'t", " not", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'m", " am", text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove extra whitespace
    text = " ".join(tokens)
    text = re.sub(r"\s+", " ", text).strip()

    return text

Create text corpus from student comment and parent comment

data_sarc["comment"] = data_sarc["comment"].apply(clean_text)

data_sarc["parent_comment"] = data_sarc["parent_comment"].apply(clean_text)


### Explore Classics ML models for your NLP model

Perform text to numeric conversion using CountVectorization and also with TF-IDF on the cleaned dataset. Now process the whole DataFrame (vectorize text + other features) with classic ML models for example- Logistic Regression, Naive Bayes, LDA, Decision tree etc. Tune you models and suggest what combination of vectorization technique & ML model is most suitable for the given data set.

### creata a data frame of text corpus

In [32]:
# check the columns of data_sarc
data_sarc.columns

Index(['comment', 'parent_comment', 'label', 'text_corpus'], dtype='object')

In [33]:
df_sarc=data_sarc.drop(['comment', 'parent_comment'],axis=1)

In [34]:
df_sarc.head()

Unnamed: 0,label,text_corpus
0,0,lets honest dont actually moderating spend tim...
1,1,didnt need evidence believe communism need evi...
2,0,official promo 360p 2014 bmw s1000r official p...
3,1,grotto koth best memorable lol want memorable ...
4,1,neals baby james neal hit zach parise


In [35]:
df_sarc.shape

(15000, 2)

### Build ML models

#### Vectorization methods

#### CountVectorization 

In [36]:
#count vectorizer
c_vecto=CountVectorizer()
df_cvecto=c_vecto.fit_transform(df_sarc['text_corpus'])

In [37]:
# convert to numerical
df_cvecto=df_cvecto.toarray()

In [38]:
# print numerical values of dataframe using countvectorizer
df_cvecto

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [39]:
#check the shape
df_cvecto.shape

(15000, 34627)

#### TF-Idf Vectorizer

In [40]:
# validate Tfidf vectorizer
tfidf_vecto=TfidfVectorizer()
df_tfvecto=tfidf_vecto.fit_transform(df_sarc['text_corpus'])

In [41]:
# convert it to numerical
df_tfvecto=df_tfvecto.toarray()

In [42]:
# print tfidf vectorizer numerical value
df_tfvecto

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
#check the shape
df_tfvecto.shape

(15000, 34627)

## Logistic Regression Model

### Logistic Regression model using countvectorizer

In [44]:
# split both countvectorized & TF-idf dataset into train and test set with standard split ratio method
#target column for both vectorizer
Y=df_sarc['label']

In [45]:
Y.shape

(15000,)

In [46]:
Y.head()

0    0
1    1
2    0
3    1
4    1
Name: label, dtype: int64

In [47]:
x_train_cvecto,x_test_cvecto,y_train_cvecto,y_test_cvecto=train_test_split(df_cvecto,Y,test_size=0.2,random_state=42)

In [48]:
#Logistic Regression
lr_cvecto=LogisticRegression()
lr_cvecto.fit(x_train_cvecto,y_train_cvecto)

LogisticRegression()

In [49]:
# find accurcy score for logistic countvectorizer
y_true=y_test_cvecto
y_pred_cvecto=lr_cvecto.predict(x_test_cvecto)

In [50]:
# check the accuracy score
accuracy_score(y_true,y_pred_cvecto)

0.5936666666666667

### Logistic Regression model using TFidfvectorizer

In [51]:
Y.shape

(15000,)

In [52]:
x_train_tfvecto,x_test_tfvecto,y_train_tfvecto,y_test_tfvecto=train_test_split(df_tfvecto,Y,test_size=0.2,random_state=42)

In [53]:
#Logistic Regression
lr_tfvecto=LogisticRegression()
lr_tfvecto.fit(x_train_tfvecto,y_train_tfvecto)

LogisticRegression()

In [54]:
# find accurcy score for logistic countvectorizer
y_true_tfvecto=y_test_tfvecto
y_pred_tfvecto=lr_tfvecto.predict(x_test_tfvecto)

In [55]:
# check the accuracy score
accuracy_score(y_true_tfvecto,y_pred_tfvecto)

0.6056666666666667

## Navie bayes Model & count vectorizer

In [56]:
from sklearn.naive_bayes import MultinomialNB

In [57]:
# Train the model using navie bayes model
nv_cvecto= MultinomialNB()
nv_cvecto.fit(x_train_cvecto,y_train_cvecto)

MultinomialNB()

In [58]:
# find accurcy score for navie bayes & countvectorizer
y_true_nv_cvecto=y_test_cvecto
y_pred_nv_cvecto=nv_cvecto.predict(x_test_cvecto)

In [59]:
accuracy_score(y_true_nv_cvecto,y_pred_nv_cvecto)

0.5866666666666667

## Navie bayes Model & TFidf vectorizer

In [60]:
# Train the model using navie bayes model
nv_tfvecto= MultinomialNB()
nv_tfvecto.fit(x_train_tfvecto,y_train_tfvecto)

MultinomialNB()

In [61]:
# find accurcy score for navie bayes & countvectorizer
y_true_nv_tfvecto=y_test_tfvecto
y_pred_nv_tfvecto=nv_tfvecto.predict(x_test_tfvecto)

In [62]:
accuracy_score(y_true_nv_tfvecto,y_pred_nv_tfvecto)

0.586

### LDA model using count vectorizer

In [63]:
# import library
from sklearn.decomposition import LatentDirichletAllocation

In [83]:
# Perform 70:30 train-test split
X_train, X_test, y_train, y_test = train_test_split(data_sarc['text_corpus'], data_sarc['label'], test_size=0.3, random_state=42)

In [86]:
# Preprocess the text data
c_vectorizer = CountVectorizer()
X_train_lda_cvecto = c_vectorizer.fit_transform(X_train)

In [87]:
# Apply LDA
lda = LatentDirichletAllocation()
lda.fit(X_train_lda_cvecto)

LatentDirichletAllocation()

In [89]:
# Transform the test data into the same format
X_test_lda_cvecto = c_vectorizer.transform(X_test)

In [90]:
# Predict the labels for the test data
y_pred_lda_cvecto = lda.transform(X_test_lda_cvecto)

In [91]:
# Convert the predicted topic probabilities into predicted labels
y_pred = y_pred_lda_cvecto.argmax(axis=1)

In [92]:
# Compute the accuracy of the model
accuracy = (y_pred == y_test).mean()

In [93]:
accuracy

0.14444444444444443

### LDA model using Tfidf vectorizer

In [94]:
# Preprocess the text data
tf_vectorizer = TfidfVectorizer()
X_train_lda_tfvecto = tf_vectorizer.fit_transform(X_train)

In [95]:
# Apply LDA
lda = LatentDirichletAllocation()
lda.fit(X_train_lda_tfvecto)

LatentDirichletAllocation()

In [96]:
# Transform the test data into the same format
X_test_lda_tfvecto = tf_vectorizer.transform(X_test)

In [97]:
# Predict the labels for the test data
y_pred_lda_tfvecto = lda.transform(X_test_lda_tfvecto)

In [98]:
# Convert the predicted topic probabilities into predicted labels
y_pred = y_pred_lda_tfvecto.argmax(axis=1)

In [99]:
# Compute the accuracy of the model
accuracy = (y_pred == y_test).mean()

In [100]:
accuracy

0.06288888888888888

### Explore Word Embeddings & classic DL models

Preform text vectorization using word embedding techniques (Word2Vec & Glove). Now use the embedding to build DL models such as RNN, LSTM & Bi-LSTM. Tune you models and suggest what combination of embedding technique and DL model is most suitable for the given data set. Now for the finalized embedding method use its pretrained word embeddings and test if model performance can be improved.

In [106]:
!pip install gensim
!pip install keras
!pip install --upgrade keras

Collecting keras
  Using cached keras-2.12.0-py2.py3-none-any.whl (1.7 MB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.11.0
    Uninstalling keras-2.11.0:
      Successfully uninstalled keras-2.11.0
Successfully installed keras-2.12.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.11.0 requires keras<2.12,>=2.11.0, but you have keras 2.12.0 which is incompatible.


In [112]:
!pip uninstall keras tensorflow
!pip install tensorflow keras

^C


In [116]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec

In [117]:
df_sarc.head()

Unnamed: 0,label,text_corpus
0,0,lets honest dont actually moderating spend tim...
1,1,didnt need evidence believe communism need evi...
2,0,official promo 360p 2014 bmw s1000r official p...
3,1,grotto koth best memorable lol want memorable ...
4,1,neals baby james neal hit zach parise


In [119]:
# Remove any unnecessary columns
dataset = df_sarc[['text_corpus', 'label']]

In [121]:
dataset.head()

Unnamed: 0,text_corpus,label
0,lets honest dont actually moderating spend tim...,0
1,didnt need evidence believe communism need evi...,1
2,official promo 360p 2014 bmw s1000r official p...,0
3,grotto koth best memorable lol want memorable ...,1
4,neals baby james neal hit zach parise,1


In [122]:
dataset.shape

(15000, 2)

In [123]:
# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(dataset['text_corpus'], dataset['label'], test_size=0.2, random_state=42)

In [124]:
# Text vectorization using Word2Vec
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

In [125]:
# Convert text to sequences of word indexes
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

In [127]:
# Padding sequences to the same length
max_sequence_length = max([len(sequence) for sequence in train_sequences])
train_data = pad_sequences(train_sequences, maxlen=max_sequence_length)
test_data = pad_sequences(test_sequences, maxlen=max_sequence_length)

In [130]:
# Build the Word2Vec model
w2v_model = Word2Vec(train_texts, vector_size=100, window=5, min_count=1)

### Embedding and LSTM

In [133]:
 #Create word embeddings
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [134]:
# Model building and training
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_data, train_labels, validation_data=(test_data, test_labels), epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x29615efd9a0>

In [135]:
# Model evaluation
predictions = model.predict(test_data)
predictions = (predictions > 0.5).astype(int)
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)



In [136]:
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.514
Precision: 0.5085514834205934
Recall: 0.966821499668215
F1 Score: 0.6665141811527906


### Embedding and RNN

In [138]:
from keras.layers import Embedding, SimpleRNN, Dense
# Model building and training
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(SimpleRNN(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_data, train_labels, validation_data=(test_data, test_labels), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2961bb416a0>

In [139]:
# Model evaluation
predictions = model.predict(test_data)
predictions = (predictions > 0.5).astype(int)
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)



In [140]:
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.49866666666666665
Precision: 0.5714285714285714
Recall: 0.007962840079628402
F1 Score: 0.015706806282722516


### Embedding and LSTM

In [141]:
# Model building and training
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_data, train_labels, validation_data=(test_data, test_labels), epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2961bea99d0>

In [142]:
# Model evaluation
predictions = model.predict(test_data)
predictions = (predictions > 0.5).astype(int)
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)



In [143]:
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.508
Precision: 0.5055101315321721
Recall: 0.9435965494359655
F1 Score: 0.6583333333333334


### Embedding and Bi-LSTM

In [144]:
# Model building and training
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_data, train_labels, validation_data=(test_data, test_labels), epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2961d9d0b80>

In [145]:
# Model evaluation
predictions = model.predict(test_data)
predictions = (predictions > 0.5).astype(int)
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)



In [156]:
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.498
Precision: 0.5066666666666667
Recall: 0.025215660252156602
F1 Score: 0.04804045512010113


In [153]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors

In [154]:
from keras.layers import *
from keras.models import *

## Explore State-of the Art Transformer models
Use any two state-of-the-art transformer models and check if you can improve NLP model performance further.

In [176]:
import numpy as np
import pandas as pd
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

In [177]:
from keras.callbacks import EarlyStopping

In [178]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
     ---------------------------------------- 7.1/7.1 MB 28.3 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-win_amd64.whl (3.5 MB)
     ---------------------------------------- 3.5/3.5 MB 55.8 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
     ---------------------------------------- 224.5/224.5 kB ? eta 0:00:00
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [179]:
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model

In [180]:
# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(dataset['text_corpus'], dataset['label'], test_size=0.2, random_state=42)

In [187]:
# Load and tokenize the BERT model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_sequences = bert_tokenizer.batch_encode_plus(
    train_texts.tolist(),
    padding='longest',
    truncation=True,
    return_token_type_ids=False
)

In [188]:
test_sequences = bert_tokenizer.batch_encode_plus(
    test_texts.tolist(),
    padding='longest',
    truncation=True,
    return_token_type_ids=False
)

In [189]:
# Load and tokenize the GPT model
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
train_sequences_gpt = gpt_tokenizer.batch_encode_plus(
    train_texts.tolist(),
    padding='longest',
    truncation=True,
    return_token_type_ids=False
)

In [190]:
test_sequences_gpt = gpt_tokenizer.batch_encode_plus(
    test_texts.tolist(),
    padding='longest',
    truncation=True,
    return_token_type_ids=False
)

In [191]:
# Convert tokenized sequences to numpy arrays
train_data = np.array(train_sequences['input_ids'])
test_data = np.array(test_sequences['input_ids'])
train_data_gpt = np.array(train_sequences_gpt['input_ids'])
test_data_gpt = np.array(test_sequences_gpt['input_ids'])


In [192]:
# Define the BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Get BERT embeddings
train_embeddings = []
for batch in train_data:
    input_ids = torch.tensor([batch])
    with torch.no_grad():
        last_hidden_states = bert_model(input_ids)[0]
    train_embeddings.append(last_hidden_states.numpy())

In [None]:
test_embeddings = []
for batch in test_data:
    input_ids = torch.tensor([batch])
    with torch.no_grad():
        last_hidden_states = bert_model(input_ids)[0]
    test_embeddings.append(last_hidden_states.numpy())

In [None]:
# Define the GPT model
gpt_model = GPT2Model.from_pretrained('gpt2')

In [None]:
# Get GPT embeddings
train_embeddings_gpt = []
for batch in train_data_gpt:
    input_ids = torch.tensor([batch])
    with torch.no_grad():
        last_hidden_states = gpt_model(input_ids)[0]
    train_embeddings_gpt.append(last_hidden_states.numpy())

In [None]:
test_embeddings_gpt = []
for batch in test_data_gpt:
    input_ids = torch.tensor([batch])
    with torch.no_grad():
        last_hidden_states = gpt_model(input_ids)[0]
    test_embeddings_gpt.append(last_hidden_states.numpy())

In [None]:
# Flatten the embeddings
train_embeddings = np.concatenate(train_embeddings, axis=0)
test_embeddings = np.concatenate(test_embeddings, axis=0)
train_embeddings_gpt = np.concatenate(train_embeddings_gpt, axis=0)
test_embeddings_gpt = np.concatenate(test_embeddings_gpt, axis=0)

In [None]:
# Concatenate BERT and GPT embeddings
train_embeddings_combined = np.concatenate([train_embeddings, train_embeddings_gpt], axis=1)
test_embeddings_combined = np.concatenate([test_embeddings, test_embeddings_gpt], axis=1)

In [None]:
# Model building and training
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=train_embeddings_combined.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [None]:
# Early stopping to prevent overfitting
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

model.fit(train_embeddings_combined, train_labels, validation_data=(test_embeddings_combined, test_labels), epochs=10, batch_size=32, callbacks=[early_stopping])

In [None]:
# Model evaluation
predictions = model.predict(test_embeddings_combined)
predictions = (predictions > 0.5).astype(int)
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)

In [None]:
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)