In [77]:
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

In [78]:
# Load the cleaned tweet corpus into a DataFrame and preview its first rows.
tweets_csv = 'tweets_cleaned.csv'
twt = pd.read_csv(tweets_csv)
twt.head(5)

Unnamed: 0,Text,Scam
0,the transfer scam is actually insane. i get l...,1
1,"i keep telling you all about this gem, i hope ...",1
2,1 year ago \n\nfrom what i'm learning of coppe...,1
3,to position for layer zero airdrop 🎯\n\nhere's...,1
4,here's a standard native token (ether) transac...,1


In [79]:
# column dtypes and non-null counts of the freshly loaded frame
twt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1081332 entries, 0 to 1081331
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   Text    1081031 non-null  object
 1   Scam    1081332 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 16.5+ MB


In [80]:
# Tweet lengths in characters, computed once and reused below.
# Rows whose Text is missing yield NaN here.
text_len = twt['Text'].str.len()
# summary statistics of string lengths in the tweets (before filtering);
# display() is needed because only a cell's last expression is shown automatically
display(text_len.describe())
# save the 25% quartile to a variable
q25 = text_len.quantile(0.25)
# keep only tweets strictly longer than the 25% quartile: tweets whose length
# equals q25 are dropped as well, and so are rows with a missing Text
# (NaN > q25 evaluates to False)
# NOTE: twt is overwritten, so re-running this cell filters the already-filtered frame again
twt = twt[text_len > q25]
twt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 809388 entries, 0 to 1081331
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Text    809388 non-null  object
 1   Scam    809388 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 18.5+ MB


In [81]:
# length statistics again, this time on the filtered frame
twt['Text'].str.len().describe()

count    809388.000000
mean        158.366908
std          63.413452
min          75.000000
25%         107.000000
50%         136.000000
75%         209.000000
max         955.000000
Name: Text, dtype: float64

In [82]:
# class balance: number of rows for each value of the Scam label
twt['Scam'].value_counts()

0    563959
1    245429
Name: Scam, dtype: int64

In [83]:
# randomise the data: sample(frac = 1) returns every row in shuffled order,
# and the fixed random_state makes the shuffle reproducible
twt = twt.sample(frac = 1, random_state = 42)

In [84]:
# 80/20 split: draw the training rows at random (seeded), then keep every row
# whose index label was not drawn as the test set.
twt_train = twt.sample(frac = 0.8, random_state = 0)
held_out = ~twt.index.isin(twt_train.index)
twt_test = twt[held_out]

In [85]:
# sanity check: size and null counts of the training split
twt_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 647510 entries, 420015 to 624079
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Text    647510 non-null  object
 1   Scam    647510 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 14.8+ MB


In [86]:
# sanity check: size and null counts of the test split
twt_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161878 entries, 797604 to 191720
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Text    161878 non-null  object
 1   Scam    161878 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


In [87]:
# character vectorisation layer
# split = 'character' makes the layer tokenise each tweet into single characters.
# Without it TextVectorization uses its default whitespace split, and this layer
# would build word n-grams instead of character n-grams.
# ngrams = 2 adds n-grams up to length 2 (single characters and character pairs).
# NOTE(review): split = 'character' needs a TensorFlow release that supports it — confirm
char_vectorizer = keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens = 250,
    split = 'character',
    output_mode = 'int',
    output_sequence_length = 250,
    ngrams = 2
)

# fit the vectoriser to the training data only, so the vocabulary is not
# influenced by the test tweets
char_vectorizer.adapt(twt_train['Text'].to_numpy())

In [88]:
# Word-level vectoriser: default tokenisation, vocabulary capped at 1000
# tokens, every tweet padded or truncated to 1000 integer ids.
word_vectorizer = keras.layers.experimental.preprocessing.TextVectorization(
    output_sequence_length = 1000,
    output_mode = 'int',
    max_tokens = 1000,
)

# Learn the vocabulary from the training tweets only.
train_corpus = twt_train['Text'].to_numpy()
word_vectorizer.adapt(train_corpus)

In [93]:
# create char model
# Pipeline: raw strings -> integer token ids -> 64-d embeddings -> BiLSTM -> one logit.
# input_dim = 250 matches max_tokens of char_vectorizer.
# The final Dense layer has no activation, so it emits a raw logit, which is what
# BinaryCrossentropy(from_logits = True) expects. A ReLU here would clamp every
# negative logit to 0, so the predicted probability could never fall below 0.5.
char_model = keras.Sequential([
    char_vectorizer,
    keras.layers.Embedding(input_dim = 250, output_dim = 64, mask_zero = True),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(1)])

# compile the model
# The model outputs logits, so accuracy must threshold at 0.0 (logit 0 == probability 0.5).
# The metric is named 'accuracy' so the history keys stay the same as before.
char_model.compile(loss = keras.losses.BinaryCrossentropy(from_logits = True),
                   optimizer = keras.optimizers.Adam(1e-4),
                   metrics = [keras.metrics.BinaryAccuracy(name = 'accuracy', threshold = 0.0)])

In [None]:
# Train the character model for three epochs on the training split.
train_texts = twt_train['Text'].to_numpy()
train_labels = twt_train['Scam'].to_numpy()
char_model.fit(x = train_texts, y = train_labels, epochs = 3)

In [None]:
# Plot the metrics stored on the model's history attribute.
training_curves = pd.DataFrame(char_model.history.history)
training_curves.plot()

# Score the model on the held-out test split.
test_texts = twt_test['Text'].to_numpy()
test_labels = twt_test['Scam'].to_numpy()
char_model.evaluate(x = test_texts, y = test_labels)