# Installs & imports

In [1]:
!pip install spacy



In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
!pip install fasttext-wheel



In [4]:
!pip install wandb



In [28]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

import spacy
import gensim.downloader
import fasttext

import wandb

# Data

In [None]:
!gdown https://drive.google.com/drive/folders/1Df8XPJNz2k2O8K_7NJ9hPBceRB6oa8gB --folder

In [72]:
train_data = pd.read_csv('spam_dataset/train_spam.csv', header=0, index_col=False)
test_data = pd.read_csv('spam_dataset/test_spam.csv', header=0, index_col=False)

In [73]:
train_data.shape, test_data.shape

((16278, 2), (4070, 1))

In [74]:
train_data.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [75]:
test_data.head()

Unnamed: 0,text
0,j jim whitehead ejw cse ucsc edu writes j you ...
1,original message from bitbitch magnesium net p...
2,java for managers vince durasoft who just taug...
3,there is a youtuber name saiman says
4,underpriced issue with high return on equity t...


In [76]:
train_data['text_type'].value_counts()

text_type
ham     11469
spam     4809
Name: count, dtype: int64

In [77]:
train_data['text_type'] = train_data['text_type'].apply(lambda x: 1 if x == 'spam' else 0)

In [78]:
train_ids, val_ids = train_test_split(train_data.index, test_size=0.2, stratify=train_data.text_type, random_state=74)

In [79]:
train_set, val_set = train_data.loc[train_ids], train_data.loc[val_ids]
train_set.text_type.value_counts() / train_set.shape[0], val_set.text_type.value_counts() / val_set.shape[0]

(text_type
 0    0.704577
 1    0.295423
 Name: count, dtype: float64,
 text_type
 0    0.704545
 1    0.295455
 Name: count, dtype: float64)

# Preprocessing

In [37]:
nlp = spacy.load("en_core_web_sm")

In [44]:
def preprocessing(text):
    parsed = nlp(text)
    normalized = []
    for token in parsed:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        normalized.append(token.lemma_)
    return ' '.join(normalized)

In [45]:
%%time
train_set['preprocessed'] = train_set.text.apply(preprocessing)
val_set['preprocessed'] = val_set.text.apply(preprocessing)
test_data['preprocessed'] = test_data.text.apply(preprocessing)

CPU times: user 6min, sys: 993 ms, total: 6min 1s
Wall time: 6min 6s


In [46]:
train_set.to_csv('preprocessed_train.csv', header=True, index=True)
val_set.to_csv('preprocessed_val.csv', header=True, index=True)
test_data.to_csv('preprocessed_test.csv', header=True, index=True)

In [6]:
# train_set = pd.read_csv('preprocessed_train.csv', header=0, index_col=0).fillna('')
# val_set = pd.read_csv('preprocessed_val.csv', header=0, index_col=0).fillna('')
# test_data = pd.read_csv('preprocessed_test.csv', header=0, index_col=0).fillna('')

# Pipeline 1: vectorization + classification

## Vectorizers

In [7]:
embeddings = {}

### TF-IDF

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_set.preprocessed)
X_val_tfidf = tfidf_vectorizer.transform(val_set.preprocessed)

In [9]:
X_train_tfidf.shape, X_val_tfidf.shape

((13022, 10000), (3256, 10000))

In [10]:
embeddings['tfidf'] = {'train': X_train_tfidf, 'val': X_val_tfidf}

### Word2Vec

In [11]:
w2v_vectors = gensim.downloader.load('word2vec-google-news-300')

In [12]:
class W2vVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, w2v_model):
        self._model = w2v_model

    def fit(self, X, y=None):
        self.is_fitted_ = True
        return self

    def transform(self, X) -> np.ndarray:
        vectors = []
        for sample in X:
            if sample:
                vector = self._model.get_mean_vector(
                    sample.split(),
                    pre_normalize=False,
                    ignore_missing=True
                    )
            else:
                vector = np.zeros(self._model.vector_size)
            vectors.append(vector)
        return np.array(vectors)

In [13]:
w2v_vectorizer = W2vVectorizer(w2v_vectors)

In [14]:
X_train_w2v = w2v_vectorizer.fit_transform(train_set.preprocessed)
X_val_w2v = w2v_vectorizer.transform(val_set.preprocessed)
X_train_w2v.shape, X_val_w2v.shape

((13022, 300), (3256, 300))

In [15]:
embeddings['w2v'] = {'train': X_train_w2v, 'val': X_val_w2v}

In [16]:
del w2v_vectors
del w2v_vectorizer

### FastText

In [17]:
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# !gunzip cc.en.300.bin.gz
ft_model = fasttext.load_model('cc.en.300.bin')



In [18]:
class FastTextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, ft_model):
        self._model = ft_model

    def fit(self, X, y=None):
        self.is_fitted_ = True
        return self

    def transform(self, X) -> np.ndarray:
        vectors = []
        for i, sample in enumerate(X):
            vector = self._model.get_sentence_vector(sample)
            vectors.append(vector)
        return np.array(vectors)

In [19]:
ft_vectorizer = FastTextVectorizer(ft_model)

In [20]:
X_train_ft = ft_vectorizer.fit_transform(train_set.preprocessed)
X_val_ft = ft_vectorizer.transform(val_set.preprocessed)
X_train_ft.shape, X_val_ft.shape

((13022, 300), (3256, 300))

In [21]:
embeddings['ft'] = {'train': X_train_ft, 'val': X_val_ft}

In [22]:
del ft_model
del ft_vectorizer

---

In [25]:
import pickle

with open('embeddings.pkl', 'wb') as newf:
    pickle.dump(embeddings, newf)

# with open('embeddings.pkl', 'rb') as f:
#     embeddings = pickle.load(f)

## Classifiers

In [30]:
classifiers  = {
    'logreg': LogisticRegression(random_state=74),
    'nb': MultinomialNB(),
    'rf@10': RandomForestClassifier(max_depth=10, random_state=74),
    'rf@100': RandomForestClassifier(max_depth=100, random_state=74),
    'rf@1000': RandomForestClassifier(max_depth=1000, random_state=74)
}

## Cross-validation

In [31]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [32]:
sweep_configuration = {
    "method": "grid",
    "metric": {"goal": "maximize", "name": "val_auc_roc"},
    "parameters": {
        "embeddings": {"values": ['tfidf', 'w2v', 'ft']},
        "classifiers": {"values": ['logreg', 'nb', 'rf@10', 'rf@100', 'rf@1000']},
    },
}

In [40]:
clf = classifiers['logreg']
embs = embeddings['tfidf']

clf.fit(embs['train'], train_set.text_type.to_numpy())

train_pred = clf.predict_proba(embs['train'])
val_pred = clf.predict_proba(embs['val'])
train_pred, val_pred
# train_auc_roc = roc_auc_score(train_set.text_type.to_numpy(), train_pred)

(array([[0.88090207, 0.11909793],
        [0.20660679, 0.79339321],
        [0.94430052, 0.05569948],
        ...,
        [0.77803617, 0.22196383],
        [0.12642396, 0.87357604],
        [0.96353556, 0.03646444]]),
 array([[0.90819159, 0.09180841],
        [0.91616235, 0.08383765],
        [0.94605978, 0.05394022],
        ...,
        [0.95546378, 0.04453622],
        [0.03030329, 0.96969671],
        [0.94644859, 0.05355141]]))

In [41]:
clf.classes_

array([0, 1])

In [44]:
def train():
    run = wandb.init()

    if wandb.config.classifiers == 'nb' and wandb.config.embeddings in ['ft', 'w2v']:
        return

    embs = embeddings[wandb.config.embeddings]
    clf = classifiers[wandb.config.classifiers]

    clf.fit(embs['train'], train_set.text_type.to_numpy())

    train_pred = clf.predict_proba(embs['train'])
    train_auc_roc = roc_auc_score(train_set.text_type.to_numpy(), train_pred[:, 1])

    val_pred = clf.predict_proba(embs['val'])
    val_auc_roc = roc_auc_score(val_set.text_type.to_numpy(), val_pred[:, 1])
    wandb.log({
        'train_auc_roc': train_auc_roc,
        'val_auc_roc': val_auc_roc})

In [45]:
pipe1_sweep = wandb.sweep(sweep=sweep_configuration, project="vk_spam_detection")
wandb.agent(pipe1_sweep, function=train)

Create sweep with ID: vcw608nb
Sweep URL: https://wandb.ai/vknyazkova/vk_spam_detection/sweeps/vcw608nb


[34m[1mwandb[0m: Agent Starting Run: kei2fi0w with config:
[34m[1mwandb[0m: 	classifiers: logreg
[34m[1mwandb[0m: 	embeddings: tfidf


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.98902
val_auc_roc,0.98066


[34m[1mwandb[0m: Agent Starting Run: 5tec1swy with config:
[34m[1mwandb[0m: 	classifiers: logreg
[34m[1mwandb[0m: 	embeddings: w2v


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.93448
val_auc_roc,0.93014


[34m[1mwandb[0m: Agent Starting Run: 1otj9usj with config:
[34m[1mwandb[0m: 	classifiers: logreg
[34m[1mwandb[0m: 	embeddings: ft


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.9436
val_auc_roc,0.93533


[34m[1mwandb[0m: Agent Starting Run: vhbc8h2r with config:
[34m[1mwandb[0m: 	classifiers: nb
[34m[1mwandb[0m: 	embeddings: tfidf


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.98369
val_auc_roc,0.97178


[34m[1mwandb[0m: Agent Starting Run: 56xoau2o with config:
[34m[1mwandb[0m: 	classifiers: nb
[34m[1mwandb[0m: 	embeddings: w2v


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: x9dias3x with config:
[34m[1mwandb[0m: 	classifiers: nb
[34m[1mwandb[0m: 	embeddings: ft


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: 4n02bbn5 with config:
[34m[1mwandb[0m: 	classifiers: rf@10
[34m[1mwandb[0m: 	embeddings: tfidf


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.96273
val_auc_roc,0.96145


[34m[1mwandb[0m: Agent Starting Run: 1cn03mi6 with config:
[34m[1mwandb[0m: 	classifiers: rf@10
[34m[1mwandb[0m: 	embeddings: w2v


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.99592
val_auc_roc,0.94449


[34m[1mwandb[0m: Agent Starting Run: jmjb0o7s with config:
[34m[1mwandb[0m: 	classifiers: rf@10
[34m[1mwandb[0m: 	embeddings: ft


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.99896
val_auc_roc,0.96858


[34m[1mwandb[0m: Agent Starting Run: ezxvn9a9 with config:
[34m[1mwandb[0m: 	classifiers: rf@100
[34m[1mwandb[0m: 	embeddings: tfidf
Connection to wandb service failed: [Errno 111] Connection refused. 
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_manager.py", line 116, in _service_connect
    svc_iface._svc_connect(port=port)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/service/service_sock.py", line 30, in _svc_connect
    self._sock_client.connect(port=port)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/lib/sock_client.py", line 102, in connect
    s.connect(("localhost", port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_init.py", line 1177, in init
    wi.setup(kwargs)
  File "/usr/local/lib/python3.10/dist-packages/wa

VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.99946
val_auc_roc,0.95007


[34m[1mwandb[0m: Agent Starting Run: 5jc3hp0w with config:
[34m[1mwandb[0m: 	classifiers: rf@100
[34m[1mwandb[0m: 	embeddings: ft


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.99987
val_auc_roc,0.97243


[34m[1mwandb[0m: Agent Starting Run: gso2zel3 with config:
[34m[1mwandb[0m: 	classifiers: rf@1000
[34m[1mwandb[0m: 	embeddings: tfidf


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.9995
val_auc_roc,0.98123


[34m[1mwandb[0m: Agent Starting Run: 4fdw9pz9 with config:
[34m[1mwandb[0m: 	classifiers: rf@1000
[34m[1mwandb[0m: 	embeddings: w2v


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.99946
val_auc_roc,0.95007


[34m[1mwandb[0m: Agent Starting Run: iwcfrmvt with config:
[34m[1mwandb[0m: 	classifiers: rf@1000
[34m[1mwandb[0m: 	embeddings: ft


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_auc_roc,▁
val_auc_roc,▁

0,1
train_auc_roc,0.99987
val_auc_roc,0.97243


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


# Pipeline 2: BERT fine-tuning