<h3 style="color:maroon;">Imports</h3>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score


<h3 style="color:maroon;">Visual aids</h3>

In [None]:
class Colors:
    RED = '\033[91m' + '\033[1m' + '\033[4m'
    GREEN = '\033[92m' + '\033[1m' + '\033[4m'
    YELLOW = '\033[93m' + '\033[1m' + '\033[4m'
    BLUE = '\033[94m' + '\033[1m' + '\033[4m'
    END = '\033[0m'

<h3 style="color:maroon;">A. Initial exploration</h3>

<h4 style="color:#874c62;">1. Import data</h4>

In [None]:
raw_data = pd.read_csv("Youtube01-Psy.csv")

<h4 style="color:#874c62;">2. Examination of properties of raw data frame</h4>

In [None]:
print(Colors.BLUE + "Shape of data frame:" + Colors.END)
print(raw_data.shape)
print("")

print(Colors.BLUE + "Data frame fields:" + Colors.END)
print(raw_data.columns)
print("")

print(Colors.BLUE + "Data frame info:" + Colors.END)
raw_data.info()
print("")

print(Colors.BLUE + "Unique entries per attribute:" + Colors.END)
print(raw_data.nunique())
print("")

print(Colors.BLUE + "Data frame head:" + Colors.END)
raw_data.head(3)


<h4 style="color:#874c62;">3. Analysis of raw data frame</h4>

<div style="background-color: rgba(120,120,200,0.25); padding: 10px;">
    <h5>Analysis of raw data frame<br></h5>
        <body>
            Main observations:
            <ol>
                <li>The data set contains 350 entries and 5 attributes.</li>
                <li>None of the fields in the data set are empty.</li>
                <li>Fields,<b>COMMENT_ID, AUTHOR, DATE</b> contain mostly unique values, hence they will be ignored in classifying the <b>CONTENT</b> class.</li>
            </ol>
    </body>
</div>

<h4 style="color:#874c62;">4. Summary examination of <b>CONTENT</b> field.</h4>

In [None]:
# Max length of CONTENT field
max_length_comment = raw_data["CONTENT"].str.len().max()
print(Colors.BLUE + "Max length of content:" + Colors.END)
print(max_length_comment)
print("")

# Min length of CONTENT field
min_length_comment = raw_data["CONTENT"].str.len().min()
print(Colors.BLUE + "Min length of content:" + Colors.END)
print(min_length_comment)
print("")

# Max length of spam comment
max_length_comment_spam = raw_data.loc[raw_data["CLASS"] == 1, "CONTENT"].str.len().max()
print(Colors.BLUE + "Max length of spam content:" + Colors.END)
print(max_length_comment_spam)
print("")

# Min length of spam comment
max_length_comment_ham = raw_data.loc[raw_data["CLASS"] == 0, "CONTENT"].str.len().max()
print(Colors.BLUE + "Max length of ham content:" + Colors.END)
print(max_length_comment_ham)
print("")

# Max length of ham comment
min_length_comment_spam = raw_data.loc[raw_data["CLASS"] == 1, "CONTENT"].str.len().min()
print(Colors.BLUE + "Min length of spam content:" + Colors.END)
print(min_length_comment_spam)
print("")

# Min legnth of ham comment
min_length_comment_ham = raw_data.loc[raw_data["CLASS"] == 0, "CONTENT"].str.len().min()
print(Colors.BLUE + "Min length of ham content:" + Colors.END)
print(min_length_comment_ham)
print("")

# Print longest spam comment
filtered_data_class = raw_data[raw_data["CLASS"] == 1]
longest_content_class = filtered_data_class.loc[filtered_data_class["CONTENT"].str.len().idxmax(), "CONTENT"]
print(Colors.BLUE + "Longest spam comment:" + Colors.END)
print(longest_content_class)
print("")

# Print shortest spam comment
filtered_data_class = raw_data[raw_data["CLASS"] == 1]
shortest_content_class = filtered_data_class.loc[filtered_data_class["CONTENT"].str.len().idxmin(), "CONTENT"]
print(Colors.BLUE + "Shortest spam comment:" + Colors.END)
print(shortest_content_class)
print("")

# Print longest hamm comment
filtered_data_class = raw_data[raw_data["CLASS"] == 0]
longest_content_class = filtered_data_class.loc[filtered_data_class["CONTENT"].str.len().idxmax(), "CONTENT"]
print(Colors.BLUE + "Longest ham comment:" + Colors.END)
print(longest_content_class)
print("")

filtered_data_class = raw_data[raw_data["CLASS"] == 0]
shortest_content_class = filtered_data_class.loc[filtered_data_class["CONTENT"].str.len().idxmin(), "CONTENT"]
print(Colors.BLUE + "Shortest ham comment:" + Colors.END)
print(shortest_content_class)
print("")

<h4 style="color:#874c62;">5. Visual examination of <b>CONTENT</b> field.</h4>

In [None]:
# Get lengths of all spam and ham comments
spam_lengths = raw_data.loc[raw_data["CLASS"]==1, "CONTENT"].str.len()
ham_lengths = raw_data.loc[raw_data["CLASS"]==0, "CONTENT"].str.len()

# Plot distribution of spam and ham lengths
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10,10))

ax[0].hist(spam_lengths, bins=30, color="red", edgecolor="red", alpha=0.5)
ax[0].set_title("Distribution of spam comment lengths")
ax[0].set_xlabel("Length")
ax[0].set_ylabel("Counts")

ax[1].hist(ham_lengths, bins=30, color="green", edgecolor="green", alpha=0.5)
ax[1].set_title("Distribution of ham comment lengths")
ax[1].set_xlabel("Length")
ax[1].set_ylabel("Counts")

ax[2].bar( x=1, height=(raw_data["CLASS"]==1).sum(), width =0.5, color='red', edgecolor="red", alpha=0.6, label="Spam")
ax[2].bar( x=2, height=(raw_data["CLASS"]==0).sum(), width =0.5, color='green', edgecolor="green", alpha=0.6, label="Ham")
ax[2].set_xticks([1, 2])
ax[2].set_xticklabels(['Spam', 'Ham'])
ax[2].set_title("Number of spam vs ham comments")
ax[2].set_xlabel("Categories")
ax[2].set_ylabel("Counts")

plt.tight_layout()
plt.show()

<h4 style="color:#874c62;">6. Analysis of <b>CONTENT</b> field.</h4>

<div style="background-color: rgba(120,120,200,0.25); padding: 10px;">
    <h5>Analysis of CONTENT<br></h5>
        <body>
            <ul>
                <li>
                    Based on a summary statistical analysis there aren't many features that distinguish between spam and ham comments.
                </li>
                <li>
                    Both spam and ham comments show similarly skewed distribution, with most of them having total character counts bounded between 0 - 200 characters. 
                </li>
                <li>
                    The longest spam message has more characters than the longest ham message, however, the statistical relevance of this is undecided.
                </li>
                <li>
                    It is not immediately clear if including content length as a feature will improve the performance of a classifier.
                </li>
                <li>
                    The data set is balanced with roughly the same number of spam and ham examples.
                </li>
            </ul>
    </body>
</div>

<h3 style="color:maroon;">B. Data preparation</h3>

<h4 style="color:#874c62;">1. Prepare new dataframe with only <b>CONTENT</b> and <b>CLASS</b> fields.</h4>

In [None]:
# Create new dataframe with appropriate fields
proc_data = raw_data[["CLASS","CONTENT"]].copy() # Use copy to supress slice warning (we want a copy not a slice referencing the memory location of raw_data)

In [None]:
# Inspect the dataframe
print(Colors.BLUE + "Shape of data frame:" + Colors.END)
print(proc_data.shape)
print("")

print(Colors.BLUE + "Data frame fields:" + Colors.END)
print(proc_data.columns)
print("")

print(Colors.BLUE + "Data frame info:" + Colors.END)
proc_data.info()
print("")

print(Colors.BLUE + "Unique entries per attribute:" + Colors.END)
print(proc_data.nunique())
print("")

print(Colors.BLUE + "Data frame head:" + Colors.END)
proc_data.head(3)

<h4 style="color:#874c62;">2. Convert text to lowercase.</h4>

In [None]:
proc_data["PROC_CONTENT"] = proc_data["CONTENT"].apply(lambda x: x.lower())

In [None]:
print(Colors.BLUE + "Data frame head:" + Colors.END)
proc_data.head(3)

<h4 style="color:#874c62;">3. Tokenize words.</h4>

In [None]:
proc_data["PROC_CONTENT"] = proc_data["PROC_CONTENT"].apply(lambda x: WordPunctTokenizer().tokenize(x))

In [None]:
print(Colors.BLUE + "Data frame head:" + Colors.END)
proc_data.head(3)

<h4 style="color:#874c62;">4. Remove stop words.</h4>

In [None]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

In [None]:
proc_data["PROC_CONTENT"] = proc_data["PROC_CONTENT"].apply(remove_stopwords)
print(Colors.BLUE + "Data frame head:" + Colors.END)
proc_data.head(3)

<h4 style="color:#874c62;">5. Remove non-alpha numeric characters.</h4>

In [None]:
def remove_punct(tokens):
    return [word for word in tokens if word.isalnum()]

proc_data["PROC_CONTENT"] = proc_data["PROC_CONTENT"].apply(remove_punct)

In [None]:
print(Colors.BLUE + "Data frame head:" + Colors.END)
proc_data.head(3)

<h4 style="color:#874c62;">6. Lemmatize words.</h4>

In [None]:
def lemmatize_words(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

proc_data["PROC_CONTENT"] = proc_data["PROC_CONTENT"].apply(lemmatize_words)

In [None]:
print(Colors.BLUE + "Data frame head:" + Colors.END)
proc_data.head(3)

<h4 style="color:#874c62;">7. Re-assemble string post processing.</h4>

In [None]:
def join_words(tokens):
    return " ".join(tokens)

proc_data["PROC_CONTENT"] = proc_data["PROC_CONTENT"].apply(join_words)

In [None]:
print(Colors.BLUE + "Data frame head:" + Colors.END)
proc_data.head(3)

<h3 style="color:maroon;">C. Data transformation</h3>

<h4 style="color:#874c62;">1. Vectorize string with count vectorizer.</h4>

In [None]:
vectorizer = CountVectorizer()
bow_sparse_matrix = vectorizer.fit_transform(proc_data['PROC_CONTENT'])

In [None]:
print(Colors.BLUE + "Type of object returned by CountVectorizer().fit_transform:" + Colors.END)
print(type(bow_sparse_matrix))

print(Colors.BLUE + "Shape of sparse matrix returned by CountVectorizer().fit_transform:" + Colors.END)
print(bow_sparse_matrix.shape)

print(Colors.BLUE + "Density of sparse matrix returned by CountVectorizer().fit_transform:" + Colors.END)
print( bow_sparse_matrix.nnz / (bow_sparse_matrix.shape[0]*bow_sparse_matrix.shape[1]) )

print(Colors.BLUE + "Sum sparse matrix returned by CountVectorizer().fit_transform:" + Colors.END)
print( bow_sparse_matrix.toarray().sum() )

<h4 style="color:#874c62;">2. Downscaling with TF-IDF.</h4>

In [None]:
tfidf_transformer = TfidfTransformer()
tfidf_sparse_matrix = tfidf_transformer.fit_transform(bow_sparse_matrix)

In [None]:
print(Colors.BLUE + "Type of object returned by TfidTransformer().fit_transform:" + Colors.END)
print(type(tfidf_sparse_matrix))

print(Colors.BLUE + "Shape of sparse matrix returned by TfidTransformer().fit_transform:" + Colors.END)
print(tfidf_sparse_matrix.shape)

print(Colors.BLUE + "Density of sparse matrix returned by TfidTransformer().fit_transform:" + Colors.END)
print( tfidf_sparse_matrix.nnz / (tfidf_sparse_matrix.shape[0]*tfidf_sparse_matrix.shape[1]) )

print(Colors.BLUE + "Sum sparse matrix returned by TfidTransformer().fit_transform:" + Colors.END)
print( tfidf_sparse_matrix.toarray().sum() )

<h4 style="color:#874c62;">3. Create the feature matrix.</h4>

In [None]:
feature_names = vectorizer.get_feature_names_out();
tfidf_df = pd.DataFrame(tfidf_sparse_matrix.toarray(), columns=feature_names)
feat_mat = pd.concat([proc_data["CLASS"],tfidf_df], axis = 1)

In [None]:
print(Colors.BLUE + "Shape of feature matrix:" + Colors.END)
print( feat_mat.shape )

print(Colors.BLUE + "Head of feature matrix:" + Colors.END)
print( feat_mat.head(3) )

<h4 style="color:#874c62;">4. Shuffle the data.</h4>

In [None]:
shuffled_feat_mat = feat_mat.sample(frac=1, random_state=1)

In [None]:
print(Colors.BLUE + "Shape of shuffled feature matrix:" + Colors.END)
print( shuffled_feat_mat.shape )
print("")

print(Colors.BLUE + "Head of shuffled feature matrix:" + Colors.END)
print( shuffled_feat_mat.head(3) )
print("")

<h4 style="color:#874c62;">5. Analysis.</h4>

<div style="background-color: rgba(120,120,200,0.25); padding: 10px;">
    <h5>Analysis of BOW vs TF-IDF transforms<br></h5>
    <body>
        <table>
            <tr>
                <th>S No</th> <th>Matrix</th> <th>Rows</th> <th>Columns</th> <th>Sum of elements</th> <th>Density</th>
            </tr>
            <tr>
                <td>1</td> <td>Original</td> <td>350</td> <td>2</td> <td>Not specified</td> <td>1</td>
            </tr>
                <td>2</td> <td>Count vectorizer sparse</td> <td>350</td> <td>1229</td> <td>3152</td> <td>0.0065</td>
            <tr>
            <tr>
                <td>3</td> <td>TF-IDF sparse</td> <td>350</td> <td>1229</td> <td>881.132</td> <td>0.0065</td>
            </tr>
        </table> 
        We observe that:
        <ul>
            <li>When the data is vectorized, the number of columns increases to the number of unique words in the corpus.</li>
            <li>The density of sparse matrices are low, i.e. most of the matrix elements are 0.</li>
            <li>As expected, the sparse matrices from Count Vectorizer and TF-IDF have the same number of columns and the same density.</li>
            <li>The sum of all the elements in the sparse matrix corresponding to TF-IDF is lower than the sum of all the elements in the sparse matrix of Count Vectorizer. This is because TF-IDF downscales the values.</li>
        </ul>
    </body>
</div>

<h3 style="color:maroon;">D. Model building</h3>

<h4 style="color:#874c62;">1. Make a 75-25 train test split without using train_test_split.</h4>

In [None]:
# Determine indices
split_index = int( 0.75 * len(shuffled_feat_mat) )

# Allocate data according to indices
train_data = shuffled_feat_mat[:split_index]
test_data = shuffled_feat_mat[split_index:]

# Separate classes from features for each data set
x_train = train_data.drop(columns = ["CLASS"])
y_train = train_data["CLASS"]

x_test = test_data.drop(columns = ["CLASS"])
y_test = test_data["CLASS"]

In [None]:
print(Colors.BLUE + "Split index:" + Colors.END)
print( split_index )
print("")

print(Colors.BLUE + "Shape of x_train:" + Colors.END)
print( x_train.shape )
print("")

print(Colors.BLUE + "Shape of y_train:" + Colors.END)
print( y_train.shape )
print("")

print(Colors.BLUE + "Shape of x_test:" + Colors.END)
print( x_test.shape )
print("")

print(Colors.BLUE + "Shape of y_test:" + Colors.END)
print( y_test.shape )
print("")

<h4 style="color:#874c62;">2. Fit multinomial Naive-Bayes.</h4>

In [None]:
clf = MultinomialNB()
clf.fit(x_train, y_train)

<h4 style="color:#874c62;">3. Cross validation.</h4>

In [None]:
cross_val_scores = cross_val_score(clf, x_train, y_train, cv=5)

In [None]:
# Print the cross-validation sc0ores
print(Colors.BLUE + "Cross-validation scores:" + Colors.END, cross_val_scores)
print(Colors.BLUE + "Mean accuracy:" + Colors.END, cross_val_scores.mean())
print(Colors.BLUE + "Standard deviation:" + Colors.END, cross_val_scores.std())

<h4 style="color:#874c62;">4. Test model, print accuracy and confusion matrix.</h4>

In [None]:
# Predict on the test set
y_pred = clf.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
print(Colors.BLUE + 'Accuracy: ' + Colors.END, accuracy)

# Display classification report
print(Colors.BLUE + "Classification Report:" + Colors.END)
print(classification_report(y_test, y_pred))

<h3 style="color:white; background-color:#000000">Consolidating above code into pipelines</h3>

In [1]:
# Text processing pipeline
def process_text (text):
    '''Process text for vectorization'''
    
    # Set up
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Process
    text = text.lower()
    text = WordPunctTokenizer().tokenize(text)
    text = [word for word in text if word.isalnum()]
    text = [word for word in text if word not in stop_words]
    text = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)

    # Return processed text
    return text

# Vectorization pipeline
def vectorize (dataframe, text_field, label_field, training_data=0):
    '''Vectorize text for Naive-Bayes classification'''

    # Set up
    count_vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    
    # Vectorize
    bow_sparse_matrix = count_vectorizer.fit_transform(dataframe[text_field])
    tfidf_sparse_matrix = tfidf_transformer.fit_transform(bow_sparse_matrix)

    # Create feature matrix
    feature_names = count_vectorizer.get_feature_names_out();
    tfidf_df = pd.DataFrame(tfidf_sparse_matrix.toarray(), columns=feature_names)
    # Re-add classification labels is data is being vectorized for training-testing
    if (training_data == 1):
        feat_mat = pd.concat([dataframe[label_field],tfidf_df], axis = 1)
    # Don't add classification label if predicting new data
    else:
        feat_mat = tfidf_df

    # Return feature matrix
    return feat_mat

# Model pipeline
def model (feat_mat, text_field, label_field, split_ratio=0.75):
    '''Build and train model'''

    # Set up
    clf = MultinomialNB()
    
    # Shuffle matrix
    shuffled_feat_mat = feat_mat.sample(frac=1, random_state=1)
    # Split data into test and train sets
    split_index = int( 0.75 * len(shuffled_feat_mat) )
    train_data = shuffled_feat_mat[:split_index]
    test_data = shuffled_feat_mat[split_index:]
    x_train = train_data.drop(columns = ["CLASS"])
    y_train = train_data["CLASS"]
    x_test = test_data.drop(columns = ["CLASS"])
    y_test = test_data["CLASS"]
    # Train model
    trained_model = clf.fit(x_train, y_train)
    # Test model
    y_pred = clf.predict(x_test)
    # Print classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Return model
    return trained_model

<h3 style="color:white; background-color:#000000">Testing pipeline</h3>

In [2]:
# Testing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

raw_data = pd.read_csv("Youtube01-Psy.csv")

proc_data = raw_data[["CLASS","CONTENT"]].copy()
# print(proc_data.head(3))

proc_data["CONTENT"] = proc_data["CONTENT"].apply(process_text)
# print(proc_data.head(3))

feat_mat=vectorize(proc_data, "CONTENT", "CLASS",1)
# print(feat_mat.head(3))

model = model(feat_mat,"CONTENT","CLASS")

model.predict(x_test)

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95        42
           1       0.98      0.93      0.96        46

    accuracy                           0.95        88
   macro avg       0.95      0.96      0.95        88
weighted avg       0.96      0.95      0.95        88



NameError: name 'x_test' is not defined