In [89]:
import pandas as pd
import keras
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import RandomUnderSampler
from sklearn.naive_bayes import MultinomialNB
from imblearn.metrics import classification_report_imbalanced
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.api.preprocessing import sequence
from keras._tf_keras.keras.preprocessing.text import Tokenizer
from keras.api.models import Sequential
from keras.api.layers import Dense, Embedding, Conv1D, MaxPooling1D, LSTM, Flatten
from sklearn.metrics import  classification_report


df = pd.read_csv("./content/train.csv")

In [90]:
df.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [91]:
df.isnull().sum()

sms      0
label    0
dtype: int64

In [92]:
X_train, X_test, Y_train, Y_test = train_test_split(df['sms'],  df['label'], test_size=0.2)

In [93]:

print(f"Training class distributions summary: {Counter(Y_train)}")
print(f"Test class distributions summary: {Counter(Y_test)}")

Training class distributions summary: Counter({0: 3860, 1: 599})
Test class distributions summary: Counter({0: 967, 1: 148})


In [94]:
model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)
print(classification_report_imbalanced(Y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.96      0.97      0.98      0.96      0.93       967
          1       0.79      0.97      0.96      0.87      0.96      0.93       148

avg / total       0.97      0.96      0.97      0.96      0.96      0.93      1115



In [95]:
X = df['sms']
Y = df['label']
Y

0       0
1       0
2       1
3       0
4       0
       ..
5569    1
5570    0
5571    0
5572    0
5573    0
Name: label, Length: 5574, dtype: int64

In [96]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [97]:
df['num_words'] = df['sms'].apply(lambda x: len(x.split(" ")))

In [98]:
max_tokens = 64
tokenizer = Tokenizer(num_words=max_tokens)
tokenizer.fit_on_texts(X_train.values)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [99]:
maxlen=16

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
     

X_train.shape, X_test.shape, Y_train

((4459, 16),
 (1115, 16),
 4751    0
 2391    0
 2799    0
 1412    0
 4874    0
        ..
 3620    1
 4064    0
 4277    0
 869     1
 1125    0
 Name: label, Length: 4459, dtype: int64)

In [100]:


model = Sequential()
model.add(Embedding(max_tokens, 32, input_length=X_train.shape[1]))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
     

model.compile(loss=keras.losses.CategoricalCrossentropy(), optimizer=keras.optimizers.Adam(learning_rate=0.001), metrics=[keras.metrics.Precision(), keras.metrics.Recall()])
     

history = model.fit(X_train, Y_train, validation_split=0.2, batch_size=64, epochs=10)


Epoch 1/10


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - loss: 0.0000e+00 - precision_3: 0.1111 - recall_3: 0.0683 - val_loss: 0.0000e+00 - val_precision_3: 0.0000e+00 - val_recall_3: 0.0000e+00
Epoch 2/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0000e+00 - precision_3: 0.0000e+00 - recall_3: 0.0000e+00 - val_loss: 0.0000e+00 - val_precision_3: 0.0000e+00 - val_recall_3: 0.0000e+00
Epoch 3/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0000e+00 - precision_3: 0.0000e+00 - recall_3: 0.0000e+00 - val_loss: 0.0000e+00 - val_precision_3: 0.0000e+00 - val_recall_3: 0.0000e+00
Epoch 4/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0000e+00 - precision_3: 0.0000e+00 - recall_3: 0.0000e+00 - val_loss: 0.0000e+00 - val_precision_3: 0.0000e+00 - val_recall_3: 0.0000e+00
Epoch 5/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - 

In [101]:
preds = model.predict(X_test)
preds

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step


array([[1.0953516e-31],
       [1.0953350e-31],
       [1.0951679e-31],
       ...,
       [1.0948419e-31],
       [1.0945579e-31],
       [1.0965640e-31]], dtype=float32)

In [102]:
print(classification_report(Y_test, preds))

ValueError: Classification metrics can't handle a mix of binary and continuous targets