## Import libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import tensorflow as tf


plt.style.use("seaborn")
pd.options.display.max_rows = 2000
pd.options.display.max_columns = 500

## Load data

In [3]:
data = pd.read_csv('/kaggle/input/turkish-sms-collection/TurkishSMSCollection.csv', sep=';', on_bad_lines='skip')

## Preliminary analyse

In [4]:
data

Unnamed: 0,Message,Group,GroupText
0,125 lira,2,Normal
1,Baskanin aksam toplantısi fenaymis :),2,Normal
2,Bilal yalçnlara ne zaman gidiyoruz?,2,Normal
3,"BiP ile mesajlarimi aninda, daha eglenceli gon...",1,Spam
4,DIGITURKTEN FIRSAT! SiZE OZEL YIL SONUNA KADAR...,1,Spam
...,...,...,...
4746,"Ziraat Kartiniza ozel, Lezzetlihediye.com dan ...",1,Spam
4747,Ziraat Maximum'dan Yeniyila ozel 100 TL'ye kad...,1,Spam
4748,Zor olur sonra. Evet evet boş bol yürüyüş,2,Normal
4749,Zorlu hava şartlarında elektriksiz kalma! Attl...,1,Spam


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4751 entries, 0 to 4750
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Message    4751 non-null   object
 1   Group      4751 non-null   int64 
 2   GroupText  4751 non-null   object
dtypes: int64(1), object(2)
memory usage: 111.5+ KB


In [6]:
data['Group'].value_counts()

1    2536
2    2215
Name: Group, dtype: int64

In [7]:
data['GroupText'].value_counts()

Spam      2536
Normal    2215
Name: GroupText, dtype: int64

In [8]:
for i in range(1):
    n = np.random.randint(i,1000)
    sen = re.sub('\W', ' ', data.loc[data['GroupText']=='Spam','Message'][n:n+1].values[0])
    sen = re.sub('\W', ' ', sen)
    sen = sen.lower()
    sen = sen.split()
    sen = ' '.join(sen)
    print('*'*10)
    print(f"Spam: \n{sen}")

**********
Spam: 
gunun yorgunlugunu bizimle atmak istermisiniz memnuniyet garantili yerimiz bakirkoy incirlide 10 00 02 00 arasi acigiz irt melis hnm 05397768805 05397768806


In [9]:
for i in range(1):
    n = np.random.randint(i,1000)
    sen = re.sub('\W', ' ', data.loc[data['GroupText']=='Normal','Message'][n:n+1].values[0])
    sen = re.sub('\W', ' ', sen)
    sen = sen.lower()
    sen = sen.split()
    sen = ' '.join(sen)
    print('*'*10)
    print(f"Normal: \n{sen}")

**********
Normal: 
bitane büyük tül varmış tek parça 4 5 metre


In [10]:
for i in range(1):
    n = np.random.randint(i,1000)
    sen = re.sub('\W', ' ', data['Message'][n:n+1].values[0])
    sen = re.sub('\W', ' ', sen)
    sen = sen.lower()
    sen = sen.split()
    sen = ' '.join(sen)
    print('*'*10)
    print(f"Sentence: \n{sen}")

**********
Sentence: 
1675 09tl ekstreli kartinizin min tutarini odeyip aylik sadece 2 02 faiz vergiler haric ile 1ay ertelemek icin 20 08 2015 tarihine kadar atlat kart son 6noyu 3340a gonder size ozel indirim ve kampanyalarimiz ile ilgili ucretsiz sms almak istemiyorsaniz smsiptal yazip 3344 gonderi̇n


## Preproccesing

In [11]:
data.loc[data['GroupText']=='Normal','GroupText'] = int(0)
data.loc[data['GroupText']=='Spam','GroupText'] = int(1)
dat = data.iloc[:,0::2]
dat

Unnamed: 0,Message,GroupText
0,125 lira,0
1,Baskanin aksam toplantısi fenaymis :),0
2,Bilal yalçnlara ne zaman gidiyoruz?,0
3,"BiP ile mesajlarimi aninda, daha eglenceli gon...",1
4,DIGITURKTEN FIRSAT! SiZE OZEL YIL SONUNA KADAR...,1
...,...,...
4746,"Ziraat Kartiniza ozel, Lezzetlihediye.com dan ...",1
4747,Ziraat Maximum'dan Yeniyila ozel 100 TL'ye kad...,1
4748,Zor olur sonra. Evet evet boş bol yürüyüş,0
4749,Zorlu hava şartlarında elektriksiz kalma! Attl...,1


In [12]:
def custom_standardization(sentence):
    sample = tf.strings.lower(sentence)
    sample = tf.strings.regex_replace(sample, '\W', ' ')
    sample = tf.strings.regex_replace(sample, '\d', ' ')
    return tf.strings.regex_replace(sample,
                         '[%s]'%re.escape(string.punctuation), '')

max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
                        standardize=custom_standardization,
                        split='whitespace',
                        max_tokens=max_features,
                        output_mode='int',
                        output_sequence_length=sequence_length,
                        encoding='utf-8')
vectorize_layer.adapt(dat['Message'])

In [13]:
for i in range(1):
    sample = np.random.randint(1000)
    print(f"Before standartization:\n\n{dat['Message'][sample]}")
    print('-'*80)
    print(f"After standartization:\n\n{custom_standardization(dat['Message'][sample])}")
    print('-'*80)
    print(f"After vectorization:\n\n{vectorize_layer(dat['Message'][sample])}")
    print('*'*80)

Before standartization:

700 düşünmüştük
--------------------------------------------------------------------------------
After standartization:

b'    d   nm  t k'
--------------------------------------------------------------------------------
After vectorization:

[  67 2168   51   35    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0  

## Prepare model to fit

In [14]:
x = dat.iloc[:,0]
X = tf.convert_to_tensor(vectorize_layer(x))
X.shape

TensorShape([4751, 250])

In [15]:
Y = tf.convert_to_tensor(dat.iloc[:,1].values.ravel(), dtype=float)
Y.shape

TensorShape([4751])

## Build model

In [16]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D

In [17]:
embedding_dim = 16

In [18]:
model = Sequential()
model.add(Embedding(max_features+1, embedding_dim))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
             optimizer='Adam',
             metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                        

In [19]:
epochs = 5
history = model.fit(X, 
                    Y, 
                    epochs=epochs,
                    batch_size=8,
                   validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Predict model

In [20]:
for i in range(10):
    n = np.random.randint(i,1000)
    labels = dat['GroupText'][n]
    sample = dat['Message'][n]
    print('-'*80)
    print(f'Sample:\n{sample}')
    print('Predict this sample wiht our model')
    print(f'Label: {labels}\nPredict: {model.predict(X[n:n+1])[0]}')

--------------------------------------------------------------------------------
Sample:
31 Mayıs'a kadar Goodyear'dan tek seferde 15 jant ve üzeri 4 adet dört mevsim veya yaz lastiği alışverişine 50TL değerinde Yakıt Kart hediye! www.opet.com.tr
Predict this sample wiht our model
Label: 1
Predict: [0.9999525]
--------------------------------------------------------------------------------
Sample:
Bi dahakine bizden önce girer heralde :)
Predict this sample wiht our model
Label: 0
Predict: [0.00946558]
--------------------------------------------------------------------------------
Sample:
bilmemedim
Predict this sample wiht our model
Label: 0
Predict: [0.02074409]
--------------------------------------------------------------------------------
Sample:
Biraz geç ama ??
Predict this sample wiht our model
Label: 0
Predict: [0.00676256]
--------------------------------------------------------------------------------
Sample:
04-17 Kasım'da CarrefourSA Teknoloji Bayramını kaçırmayın! çok av