In [31]:
import numpy as np
import pandas as pd
import glob
import os
import pickle
import plotly.express as px
from plotly.offline import init_notebook_mode
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import  train_test_split
from sklearn.naive_bayes import BernoulliNB
# Initialise plotly's offline rendering for this notebook session.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook (so viewing needs network access) -- confirm
# against the installed plotly version.
init_notebook_mode(connected=True)

In [8]:
# Load every CSV in the dataset folder into its own DataFrame.
path = "youtube-spam-collection-v1"
# glob returns matches in arbitrary, filesystem-dependent order. Sorting keeps
# the row order of the combined frame -- and therefore the seeded train/test
# split further down -- identical from run to run and machine to machine.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
dfs = []

for filename in all_files:
    # The first row of each file is the header; no column is used as the index.
    frame = pd.read_csv(filename, index_col=None, header=0)
    dfs.append(frame)

In [9]:
# Stack the per-file frames row-wise (concat's default axis) and renumber the
# rows with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

In [10]:
# Sanity check: show the type of `frame`, i.e. the last file read by the
# loading loop.
type(frame)

pandas.core.frame.DataFrame

In [15]:
# Keep only the comment text and its label. The explicit .copy() makes `df` an
# independent frame, so the column assignment in the following cell neither
# raises SettingWithCopyWarning nor depends on view-vs-copy behaviour.
df = df[["CONTENT", "CLASS"]].copy()

In [16]:
# Turn the numeric labels into readable names.
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on already-converted labels is a no-op. A plain dict .map() would
# instead silently turn the whole column into NaN on the second run.
CLASS_LABELS = {0: "Not Spam", 1: "Spam"}
df["CLASS"] = df["CLASS"].map(lambda label: CLASS_LABELS.get(label, label))

In [30]:
# Number of rows per class label (displayed as the cell output).
df["CLASS"].value_counts()

Spam        1005
Not Spam     951
Name: CLASS, dtype: int64

In [32]:
# Class balance as a pie chart: one slice per label, sized by its row count.
types = df["CLASS"].value_counts()
values = types.values
names = types.index

# `values` and `names` are already aggregated (one entry per class), so they
# are passed on their own. Handing px.pie the row-level `df` as data_frame as
# well only obscured where the plotted numbers come from.
fig = px.pie(values=values, names=names, title="Spam vs. Not Spam comments")
fig.show()

In [19]:
# Pull the raw comment text (features) and the class labels (targets) out of
# the frame as NumPy arrays.
X = np.array(df["CONTENT"])
y = np.array(df["CLASS"])

In [20]:
# Fit a CountVectorizer on the comment texts in X and transform them in the
# same call. Despite its name, `X_scaled` is the vectorizer output; nothing
# here applies any scaling.
# NOTE(review): the vectorizer is fitted on all of X before the train/test
# split below, so tokens that occur only in held-out rows still enter the
# vocabulary. Consider fitting on the training rows only -- confirm whether
# this matters for the reported score.
cv = CountVectorizer()
X_scaled = cv.fit_transform(X)

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=4242,
)

In [23]:
# Train a Bernoulli naive Bayes classifier (default settings) on the training
# split.
model = BernoulliNB()
model.fit(X=X_train, y=y_train)

In [24]:
# Score the fitted model on the held-out split (displayed as the cell output).
model.score(X=X_test, y=y_test)

0.8903061224489796

In [26]:
# Smoke test: push one hand-written comment through the fitted vectorizer and
# the trained model, and display the predicted label.
sample = "Follow me: https://example.com"
sample_counts = cv.transform([sample])
test = sample_counts.toarray()
model.predict(test)

array(['Spam'], dtype='<U8')

In [28]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed even if pickling raises; the bare open() left the handle
# open until the garbage collector happened to reclaim it.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

In [29]:
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises; the bare open() left the handle
# open until the garbage collector happened to reclaim it.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)