#1.&nbsp;Connect G-Drive

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on the Drive can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Placeholder path: replace "..." with the Drive folder that holds the
# notebook's data (spam.csv is later read with a relative path).
default_dir = "/content/drive/MyDrive/..."

# Validate before changing directory so an unedited placeholder or an
# unmounted Drive fails with an actionable message rather than a bare
# chdir error.
if not os.path.isdir(default_dir):
    raise FileNotFoundError(
        f"Working directory {default_dir!r} is not an existing directory. "
        "Edit `default_dir` to point at your project folder on Google Drive "
        "and make sure the Drive is mounted before running this cell."
    )
os.chdir(default_dir)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/...'

In [None]:
import os
# Show the current working directory to confirm where relative paths resolve.
os.getcwd()

In [None]:
# List the contents of the current working directory (IPython shell escape).
!ls

#2.&nbsp;Import Libraries

In [None]:
import pandas as pd
import seaborn as sns

# text processing
import re
import nltk

In [None]:
# Tokenizer models used by nltk.word_tokenize. Recent NLTK releases load the
# "punkt_tab" resource instead of the pickled "punkt" models, so fetch both to
# keep the tokenization step working regardless of the installed version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
# Stop-word lists read through nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# Preview the English stop-word list that is filtered out of the messages later on.
print(stopwords.words('english'))

In [None]:
# WordNet data, needed by the WordNetLemmatizer used in the lemmatization step.
nltk.download('wordnet')

In [None]:
# Open Multilingual WordNet data.
# NOTE(review): presumably required alongside 'wordnet' by the lemmatizer - confirm.
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Dataset Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# ML Modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# ML Model Evaluation
from sklearn.model_selection import cross_val_score

#3.&nbsp;Load Dataset

In [None]:
# Load the raw dataset from the current working directory; the file is
# decoded as latin-1.
df_spam = pd.read_csv('spam.csv', encoding="latin-1")
df_spam.head()

In [None]:
# Drop un-meaningful columns

# errors="ignore" keeps this cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of a KeyError.
df_spam = df_spam.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors="ignore"
)

df_spam.head()

In [None]:
# Give the two remaining columns descriptive names
column_names = {'v1': 'label', 'v2': 'message'}
df_spam = df_spam.rename(columns=column_names)

df_spam.head()

#4.&nbsp;Dataset Exploration

##4.1. Label Counts

In [None]:
# Class distribution: number of messages per label, as text and as a bar chart
label_counts = df_spam["label"].value_counts()
print(label_counts)
label_counts.plot(kind='bar');

##4.2. Message Length

In [None]:
# Character count of every message
df_spam['message_len'] = df_spam['message'].map(len)

# Ten longest messages first
longest_first = df_spam.sort_values('message_len', ascending=False)
longest_first.head(10)

#5.&nbsp;Text Preprocessing

##5.1. Text Cleaning

In [None]:
# Sample sentence used to illustrate str.split / str.join before building clean_text.
my_text = "Nama saya HENDY"

In [None]:
# split() with no arguments splits on whitespace and returns a list of words.
my_text.split()

In [None]:
# join() glues a list of strings together using the separator it is called on.
"#".join(["nama", "saya", "hendy"])

In [None]:
# Text clean-up helper
def clean_text(text):
    """Normalise a raw message to lowercase, space-separated alphabetic words.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lowercased, and runs of whitespace are collapsed to a single space with
    no leading or trailing space.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # split() without arguments splits on any whitespace run and drops empty
    # pieces, so re-joining with one space squeezes the padding added above.
    words = letters_only.lower().split()
    return ' '.join(words)

In [None]:
# Normalise every raw message into a new column
df_spam["cleaned_text"] = df_spam["message"].map(clean_text)

# Show the first ten cleaned messages, one per line
print("The First 10 Texts after cleaning: \n")
print(*df_spam["cleaned_text"].head(10), sep="\n")

In [None]:
# Compare the message at index label 0 before and after cleaning.
print(df_spam['message'][0])
print(df_spam['cleaned_text'][0])

##5.2. Tokenization

In [None]:
# Split each cleaned message into a list of word tokens
df_spam["tokenized_text"] = df_spam["cleaned_text"].apply(nltk.word_tokenize)

df_spam.head(10)

##5.3. Remove Stopwords

In [None]:
# Removing the stopwords function

# Stop-word sets keyed by language, built on first use and reused afterwards.
_STOP_WORD_SETS = {}


def remove_stopwords(text, language="english"):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single message.
    language : str, default "english"
        Name of the NLTK stop-word list to filter against.

    Returns
    -------
    list of str
        The tokens in their original order with stop words removed.
    """
    # stopwords.words() produces a fresh list on every call; caching the set
    # means applying this function to a whole column builds it only once.
    if language not in _STOP_WORD_SETS:
        _STOP_WORD_SETS[language] = frozenset(stopwords.words(language))
    stop_words = _STOP_WORD_SETS[language]
    return [word for word in text if word not in stop_words]

In [None]:
# Filter the stop words out of every token list
df_spam["no_stopword_text"] = df_spam["tokenized_text"].map(remove_stopwords)

# Show the first ten filtered token lists, one per line
print("The First 10 Texts after removing the stopwords: \n")
print(*df_spam["no_stopword_text"].head(10), sep="\n")

In [None]:
# Display the frame, now including the no_stopword_text column.
df_spam

##5.4. Lemmatization

In [None]:
# One shared lemmatizer instance, reused for every message.
lemmatizer = WordNetLemmatizer()


# lemmatize string function
def lemmatize_word(text, pos='v'):
    """Lemmatize every token in ``text`` with the shared WordNet lemmatizer.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single message.
    pos : str, default 'v'
        Part-of-speech tag passed to the lemmatizer as context. The default
        treats every token as a verb, which is what this notebook uses.

    Returns
    -------
    list of str
        The lemma of each token, in the original order.
    """
    return [lemmatizer.lemmatize(word, pos=pos) for word in text]

In [None]:
# Lemmatize every stop-word-free token list
df_spam["lemmatized_text"] = df_spam["no_stopword_text"].map(lemmatize_word)

# Show the first ten lemmatized token lists, one per line
print("The First 10 Texts after lemitization: \n")
print(*df_spam["lemmatized_text"].head(10), sep="\n")

In [None]:
# Display the frame, now including the lemmatized_text column.
df_spam

##5.5. Vectorization

###5.5.1. Creating a corpus of lemmatized text

In [None]:
# Re-join each message's lemmas into one space-separated string, giving the
# list of documents that is vectorized next.
corpus = [' '.join(lemmas) for lemmas in df_spam["lemmatized_text"]]

print("The First 10 lines in corpus : \n")
print(*corpus[:10], sep="\n")

In [None]:
# Quick look at the frame with all preprocessing columns in place.
df_spam.head()

###5.5.2. Converting the corpus in vector form

In [None]:
# Changing text data into numbers.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split performed later in the notebook, so the vocabulary and IDF
# weights also see the rows that end up in the test set.
tfidf = TfidfVectorizer()
# .toarray() converts the sparse TF-IDF matrix into a dense NumPy array.
X = tfidf.fit_transform(corpus).toarray()

# Let's have a look at our feature
X.dtype

In [None]:
# Display the dense TF-IDF feature matrix.
X

In [None]:
# (number of documents, number of vocabulary terms)
X.shape

In [None]:
# Number of messages; should equal the first dimension of X.
len(df_spam)

In [None]:
# TF-IDF vector of the first document.
X[0, :]

###5.5.3. Label Encoding the classes in Target

In [None]:
# Label encode the Target and use it as y
le = LabelEncoder()
# Overwrites the "label" column in place with the integer codes produced by
# the encoder; the fitted encoder stays available as `le`.
df_spam["label"] = le.fit_transform(df_spam["label"])

In [None]:
# Check that the label column now holds the encoded values.
df_spam.head()

#6.&nbsp;Model Building

In [None]:
# Setting values for labels and feature as y and X (we already did X in vectorizing...)
y = df_spam["label"]

# Splitting the testing and training sets
# (20% of the rows are held out for testing; random_state fixes the shuffle
# so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate classifiers keyed by display name, so every name stays attached
# to its estimator instead of living in a separate, index-aligned structure.
named_classifiers = {
    "NaiveBayes": MultinomialNB(),
    # Seeded so the forest (and the scores derived from it) are reproducible.
    "RandomForest": RandomForestClassifier(random_state=42),
    "KNeighbours": KNeighborsClassifier(),
    "SVC": SVC()
}

# Testing on the following classifiers
classifiers = list(named_classifiers.values())

# Fit every candidate on the training split
for clf in classifiers:
    clf.fit(X_train, y_train)

# Dictionary of pipelines and model types for ease of reference
# (position in `classifiers` -> display name)
pipe_dict = dict(enumerate(named_classifiers))

#7.&nbsp;Model Evaluation

In [None]:
# Cross-Validation
# Mean 10-fold accuracy on the training split, reported per classifier
for idx, estimator in enumerate(classifiers):
    fold_scores = cross_val_score(
        estimator, X_train, y_train, cv=10, scoring="accuracy"
    )
    print("%s: %f " % (pipe_dict[idx], fold_scores.mean()))