# CBOW

In [1]:
%%capture
# Install the third-party packages this notebook imports (plotly, gensim).
# The %%capture cell magic above swallows pip's console output.
!pip install plotly
!pip install --upgrade gensim

In [2]:
from gensim.models import Word2Vec, FastText
import pandas as pd
import re

from sklearn.decomposition import PCA

from matplotlib import pyplot as plt
import plotly.graph_objects as go

import numpy as np

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Load the e-mail dataset; later cells read its 'text' and 'spam' columns.
df = pd.read_csv('emails.csv')

In [11]:
from gensim.models import Word2Vec

In [19]:
import numpy as np

class MyTokenizer:
    """Minimal tokenizer: lowercase the text, then split on whitespace."""

    def fit_transform(self, texts):
        """Tokenize every item of ``texts``.

        Args:
            texts: iterable of documents; each item is passed through
                ``str()`` first, so non-string values are accepted.

        Returns:
            A list with one list of lowercase tokens per input item.
        """
        tokenized = []
        for text in texts:
            lowered = str(text).lower()
            tokenized.append(lowered.split())
        return tokenized

class MeanEmbeddingVectorizer:
    """Represent each document by the mean of its in-vocabulary word vectors.

    Exposes ``fit`` / ``transform`` / ``fit_transform`` methods; ``fit`` is a
    no-op because the word vectors come from an already trained model.
    """

    def __init__(self, word2vec_model):
        """Store the model and its embedding dimensionality.

        Args:
            word2vec_model: trained model whose ``.wv`` attribute supports
                ``wv[word]``, ``word in wv`` and ``wv.vector_size``.
        """
        self.word2vec = word2vec_model
        # Gensim >= 4.0 exposes the dimensionality as ``wv.vector_size``.
        self.dim = word2vec_model.wv.vector_size

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn here."""
        return self

    def transform(self, X):
        """Embed every document in ``X``.

        Args:
            X: iterable of raw documents; tokenized with ``MyTokenizer``.

        Returns:
            ``numpy.ndarray`` with one row per document and ``self.dim``
            columns. A document with no in-vocabulary token becomes a zero
            vector. Empty input yields an array of shape ``(0, self.dim)``.
        """
        keyed_vectors = self.word2vec.wv
        embeddings = []
        for words in MyTokenizer().fit_transform(X):
            # Keep vectors only for tokens the model actually knows.
            valid_vectors = [
                keyed_vectors[word] for word in words
                if word in keyed_vectors
            ]
            if valid_vectors:
                embeddings.append(np.mean(valid_vectors, axis=0))
            else:
                embeddings.append(np.zeros(self.dim))

        if not embeddings:
            # np.array([]) would be 1-D with shape (0,); keep the result 2-D
            # so callers can always rely on ``result.shape[1] == self.dim``.
            return np.empty((0, self.dim))
        return np.array(embeddings)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)

In [13]:
# Normalise every e-mail: lowercase it, then collapse each run of characters
# that are not a-z (digits, punctuation, whitespace, ...) into a single space.
#
# This yields the same strings as the earlier three-step loop. That loop's
# "remove tags" substitution ran after all non-letters had already been
# replaced, so it could never match and is omitted here. Using the vectorised
# .str accessor also avoids df['text'][w], which looked rows up by index
# label and therefore only worked with a default 0..n-1 index.
clean_txt = (
    df['text']
    .str.lower()
    .str.replace('[^a-z]+', ' ', regex=True)
    .tolist()
)

# Assigning a plain list aligns by position, not by index label.
df['clean'] = clean_txt
df.head()

Unnamed: 0,text,spam,clean
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is ...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fro...


In [26]:
# Show the (rows, columns) dimensions of df.
df.shape

(5728, 5)

In [20]:
# Tokenise each cleaned e-mail on whitespace. Calling split() with no argument
# drops the empty-string tokens that split(" ") produces for leading/trailing
# spaces; with min_count=1 those '' tokens would otherwise enter the
# vocabulary. It also matches the tokenisation done by MyTokenizer, so the
# model is trained on the same tokens that are later looked up.
corpus = [text.split() for text in df.clean]

# First tokenised document. As a mid-cell expression it is evaluated but not
# displayed; move it to the end of a cell to inspect it.
corpus[0:1]

# Train word vectors on the corpus: keep every token (min_count=1) and use
# 56-dimensional embeddings.
model = Word2Vec(corpus, min_count=1, vector_size = 56)

In [16]:
# Explore the embeddings using cosine similarity. Each result is printed
# explicitly: a notebook only auto-displays the last expression of a cell,
# so bare expressions placed before the save call would be discarded.
print(model.wv.most_similar('eric'))

print(model.wv.most_similar_cosmul(positive = ['phone', 'number'], negative = ['call']))

print(model.wv.doesnt_match("phone number prison cell".split()))

# Save the embeddings in the plain-text word2vec format.
filename = 'email_embd.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [21]:
# Wrap the trained model and turn each cleaned e-mail into a single vector:
# the mean of the word vectors of its in-vocabulary tokens.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(model)
mean_embedded = mean_embedding_vectorizer.fit_transform(df['clean'])

In [22]:
# Store one embedding vector per row; list() splits the 2-D result into rows.
df['array']=list(mean_embedded)

In [23]:
# Preview the first five rows, now including the 'array' column.
df.head(5)

Unnamed: 0,text,spam,clean,array
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...,"[-0.34084636, 0.94587916, -0.23244119, -0.2751..."
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is ...,"[-0.2489495, 0.6393162, -0.09695087, 0.3257428..."
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...,"[-0.49715576, 0.66056025, -0.21417588, -0.2550..."
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...,"[-0.72565305, 0.3345964, -0.7068794, 0.0136828..."
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fro...,"[-0.18735954, 0.44190544, -0.78945196, -0.1554..."


In [24]:
# Sanity check: record the length of each stored embedding vector.
# .str.len() returns the length of every element, not only of strings.
df['embedding_length'] = df['array'].str.len()

In [25]:
# Print the per-row vector lengths computed above.
print(df['embedding_length'])

0       56
1       56
2       56
3       56
4       56
        ..
5723    56
5724    56
5725    56
5726    56
5727    56
Name: embedding_length, Length: 5728, dtype: int64


In [27]:
# Show the (rows, columns) dimensions of df.
df.shape

(5728, 5)

In [28]:
# Expand the per-row vectors stored in df['array'] into one numeric column
# per embedding dimension, named f1 ... fN.
num_features = len(df['array'].iloc[0])  # assumes every vector has the same length
columns = [f'f{i+1}' for i in range(num_features)]

# Convert all vectors to a single 2-D float64 matrix in one step instead of
# appending value by value in nested Python loops. Vectors of unequal length
# make this conversion raise instead of silently producing ragged columns.
embedding_matrix = np.asarray(df['array'].tolist(), dtype=np.float64)

# One row per e-mail, one column per embedding dimension.
embedding_df = pd.DataFrame(embedding_matrix, columns=columns)

print(embedding_df)

            f1        f2        f3        f4        f5        f6        f7  \
0    -0.340846  0.945879 -0.232441 -0.275108  0.416041 -0.106353  0.840533   
1    -0.248949  0.639316 -0.096951  0.325743  0.325895 -0.359264  0.368117   
2    -0.497156  0.660560 -0.214176 -0.255093  0.522734 -0.341303  1.046687   
3    -0.725653  0.334596 -0.706879  0.013683  0.698313  0.224880  0.911870   
4    -0.187360  0.441905 -0.789452 -0.155474  0.274913 -0.641672  0.901835   
...        ...       ...       ...       ...       ...       ...       ...   
5723 -0.429112  0.930693  0.170549  0.136794 -0.077450 -0.306312 -0.034427   
5724 -0.535915  0.300235  0.033640 -0.374368  0.577522 -0.083339  0.436403   
5725 -0.366838  0.649603  0.222223 -0.243197  0.386375 -0.453572  0.724553   
5726 -0.248911  1.182235  0.173246 -0.133293 -0.079092 -0.409817 -0.074807   
5727 -0.028038  0.367393 -0.073000 -0.172228  0.457297 -0.125878  0.582865   

            f8        f9       f10  ...       f47       f48    

In [29]:
# Append the target label; .values bypasses index alignment so labels are
# attached purely by row position.
embedding_df['spam'] = df['spam'].values  

In [30]:
# Display the feature table (f1 ... f56 plus the 'spam' label).
embedding_df

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f48,f49,f50,f51,f52,f53,f54,f55,f56,spam
0,-0.340846,0.945879,-0.232441,-0.275108,0.416041,-0.106353,0.840533,-0.366576,-0.195145,-0.472340,...,0.129809,0.645374,0.891383,0.880596,0.026609,-0.284418,0.480821,0.337945,0.287987,1
1,-0.248949,0.639316,-0.096951,0.325743,0.325895,-0.359264,0.368117,-0.241210,-0.325327,-0.101467,...,0.182618,-0.030076,0.163888,0.243907,-0.047279,-0.292656,-0.034134,-0.104050,0.192359,1
2,-0.497156,0.660560,-0.214176,-0.255093,0.522734,-0.341303,1.046687,-0.353925,-0.313221,-0.252311,...,-0.252567,0.679628,1.086356,1.180938,0.255360,-0.133034,0.318912,0.757153,0.205873,1
3,-0.725653,0.334596,-0.706879,0.013683,0.698313,0.224880,0.911870,0.095007,-0.958646,-0.139002,...,-0.389851,0.687540,0.293965,0.188490,0.521421,0.223555,1.013161,0.530562,0.309869,1
4,-0.187360,0.441905,-0.789452,-0.155474,0.274913,-0.641672,0.901835,-0.058394,-0.582017,-0.023898,...,-0.009607,0.396054,0.769915,1.022024,0.226101,-0.559300,-0.116582,0.224948,0.495575,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,-0.429112,0.930693,0.170549,0.136794,-0.077450,-0.306312,-0.034427,-0.382644,-0.470460,-0.400324,...,0.671940,-0.448062,0.286456,-0.112746,0.516586,0.627049,-1.602695,1.135171,0.560008,0
5724,-0.535915,0.300235,0.033640,-0.374368,0.577522,-0.083339,0.436403,-0.018108,-0.683059,-0.108046,...,0.001764,-0.033609,0.166773,0.041532,0.718980,-0.100917,0.019857,1.021474,-0.067909,0
5725,-0.366838,0.649603,0.222223,-0.243197,0.386375,-0.453572,0.724553,-0.320769,-0.488486,-0.213099,...,0.233517,-0.085686,0.547258,0.524856,0.646246,0.035679,-0.500271,1.068576,0.346308,0
5726,-0.248911,1.182235,0.173246,-0.133293,-0.079092,-0.409817,-0.074807,-0.329784,-0.581167,-0.478702,...,0.428678,-0.756828,0.461571,0.110711,0.796461,0.960588,-1.908756,1.577654,0.792951,0


In [31]:
# Show the (rows, columns) dimensions of the feature table.
embedding_df.shape

(5728, 57)