In [12]:
import nltk
import pandas as pd
import re
 
from sklearn.feature_extraction.text import TfidfVectorizer
import string
 

data = pd.read_csv("SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['Content'].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split('\W+', result)
    text = [wn.lemmatize(word) for word in tokens if word not in en_stopwords]
    return text

 


In [13]:
data

Unnamed: 0,label,Content,Content_len,punctuation_rate
0,ham,"Go until jurong point, crazy.. Available only ...",92,9.78
1,ham,Ok lar... Joking wif u oni...,24,25.00
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.69
3,ham,U dun say so early hor... U c already then say...,39,15.38
4,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.08
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,131,6.11
5568,ham,Will ü b going to esplanade fr home?,29,3.45
5569,ham,"Pity, * was in mood for that. So...any other s...",48,14.58
5570,ham,The guy did some bitching but I acted like i'd...,100,1.00


In [14]:
data['cleaned_content'] = data['Content'].apply(lambda x: clean_email(x.lower()))
data1=data['cleaned_content'].tolist()

In [15]:
data1

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  '2',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'ratetcs',
  'apply',
  '08452810075over18s'],
 ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say'],
 ['nah', 'dont', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  '3',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  '150',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  

In [18]:
from gensim.models import KeyedVectors

model_path = '../DevoirMaison_TP_NLP/GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [19]:
# Initialiser une liste vide pour stocker les vecteurs de phrases
vecteurs_phrases = []

# Itérer à travers chaque phrase dans le corpus sou
for phrase in data1:
   # Filtrer les mots de la phrase qui sont présents dans le modèle Word2Vec
    mots_dans_le_modele = [mot for mot in phrase if mot in word2vec_model]
    # Si la phrase est vide ou ne contient que des mots absents dans le modèle, créer un vecteur nul
    if not mots_dans_le_modele:
        vecteur_moyen = [0.0] * word2vec_model.vector_size
    else:
        # Calculer le vecteur moyen en prenant la moyenne des vecteurs des mots présents dans le modèle
        vecteur_moyen = sum(word2vec_model[mot] for mot in mots_dans_le_modele) / len(mots_dans_le_modele)
    
    # Ajouter le vecteur moyen de la phrase à la liste      
    vecteurs_phrases.append(vecteur_moyen)

import pandas as pd

# Créer un DataFrame à partir des vecteurs de phrase
df_w2v = pd.DataFrame(vecteurs_phrases, columns=[f"dim_{i+1}" for i in range(len(vecteurs_phrases[0]))])


print(df_w2v)


         dim_1     dim_2     dim_3     dim_4     dim_5     dim_6     dim_7  \
0    -0.019806  0.051671  0.027100  0.218680 -0.031034  0.038980  0.081966   
1    -0.063235  0.080383  0.060944  0.102498 -0.078857  0.035238 -0.053040   
2     0.000789 -0.029879 -0.071720  0.074291  0.067394  0.026331 -0.008408   
3    -0.065681  0.026215  0.108154  0.086975 -0.111816  0.000515 -0.044501   
4    -0.002167  0.021763  0.060364  0.204407 -0.010239  0.021332  0.058685   
...        ...       ...       ...       ...       ...       ...       ...   
5567  0.024626  0.024634  0.037060  0.047160  0.001799  0.010300  0.050218   
5568  0.013713  0.101562  0.146983  0.089147 -0.062093 -0.017278 -0.082540   
5569  0.093099  0.096029  0.026326  0.124674 -0.160400  0.026774  0.123372   
5570  0.093331  0.015834 -0.002736  0.070326 -0.099481  0.023071  0.035505   
5571  0.111979  0.067942  0.087240  0.075765 -0.172201  0.050944  0.062012   

         dim_8     dim_9    dim_10  ...   dim_291   dim_292   d

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_w2v, data['label'], test_size=0.2)

In [26]:
from sklearn  import svm
alg_svm= svm.SVC(kernel = 'linear')
alg_svm.fit(X_train, Y_train)
predictions = alg_svm.predict(X_test)

In [28]:
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, _ = score(Y_test, predictions, pos_label='spam', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((predictions==Y_test).sum() / len(predictions),3)))

Precision: 0.856 / Recall: 0.844 / Accuracy: 0.962


In [29]:
fscore

0.85