<a href="https://colab.research.google.com/github/sushanttwayana/NLP_ML-DL/blob/main/nlp_word2vec_and_avgword2vec_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing the Dataset

import pandas as pd

# sep -> seperator
messages = pd.read_csv('/kaggle/input/smsspamcollection/SMSSpamCollection.txt', sep='\t',
                           names=["label", "message"])

In [None]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
messages.shape

(5572, 2)

In [None]:
messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

In [None]:
# Data cleaning and preprocessing

import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
corpus = []

for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [None]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# get the top 2500 maximum features
cv = CountVectorizer(max_features=2500, binary = True, ngram_range = (1,1))
X = cv.fit_transform(corpus).toarray()

In [None]:
X[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
X.shape

(5572, 2500)

In [None]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values

In [None]:
y

array([False, False,  True, ..., False, False, False])

In [None]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
X_train, y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([False, False, False, ...,  True, False, False]))

In [None]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [None]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report

score=accuracy_score(y_test,y_pred)
print(score)

0.9865470852017937


In [None]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       960
        True       0.94      0.97      0.95       155

    accuracy                           0.99      1115
   macro avg       0.97      0.98      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [None]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, ngram_range = (1,2))
X = tv.fit_transform(corpus).toarray()

In [None]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [None]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [None]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9865470852017937


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       960
        True       0.94      0.97      0.95       155

    accuracy                           0.99      1115
   macro avg       0.97      0.98      0.97      1115
weighted avg       0.99      0.99      0.99      1115



# Random Forest Classifier


In [None]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

ValueError: Found array with dim 3. RandomForestClassifier expected <= 2.

In [None]:
y_pred = classifier.predict(X_test)
print(accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.9829596412556054
              precision    recall  f1-score   support

       False       1.00      0.98      0.99       970
        True       0.89      0.99      0.94       145

    accuracy                           0.98      1115
   macro avg       0.95      0.98      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# **Word2vec Implementation**

In [None]:
!pip install gensim



In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [None]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
nltk.data.path.append("/usr/share/nltk_data")

In [None]:
print(nltk.data.path)

['/root/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/usr/share/nltk_data', '/usr/share/nltk_data', '/usr/share/nltk_data']


In [None]:
import spacy

# Load English language model
nlp = spacy.load("en_core_web_sm")

# Define function for text preprocessing
def preprocess_text(text):
    # Apply spaCy pipeline to text
    doc = nlp(text)
    # Lemmatize tokens and remove stopwords
    processed_text = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha ]
    return ' '.join(processed_text)

# Example usage:
corpus = []
for i in range(len(messages)):
    review = messages['message'][i]
    processed_review = preprocess_text(review)
    corpus.append(processed_review)


In [None]:
corpus

['jurong point crazy available bugis n great world la e buffet Cine get amore wat',
 'ok lar joke wif u oni',
 'free entry wkly comp win FA Cup final tkts text FA receive entry txt apply',
 'u dun early hor u c',
 'nah think go usf live',
 'FreeMsg hey darle week word like fun tb ok XxX std chgs send rcv',
 'brother like speak treat like aids patent',
 'request Melle Melle Oru Minnaminunginte Nurungu Vettam set callertune Callers Press copy friend Callertune',
 'WINNER value network customer select receivea prize reward claim claim code valid hour',
 'mobile month u r entitle update late colour mobile camera Free Mobile Update Co free',
 'go to home soon want talk stuff anymore tonight k cry today',
 'chance win cash pound txt send cost day TsandCs apply Reply HL info',
 'urgent win week free membership Prize Jackpot txt word claim LCCLTD POBOX',
 'search right word thank breather promise will not help grant fulfil promise wonderful blessing time',
 'date SUNDAY',
 'xxxmobilemovieclub 

In [None]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [None]:
corpus[0]

'jurong point crazy available bugis n great world la e buffet Cine get amore wat'

In [None]:
#Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
simple_preprocess()

In [None]:
# lowering each and every words in the sentences
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [None]:
words

[['jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'get',
  'amore',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'text',
  'fa',
  'receive',
  'entry',
  'txt',
  'apply'],
 ['dun', 'early', 'hor'],
 ['nah', 'think', 'go', 'usf', 'live'],
 ['freemsg',
  'hey',
  'darle',
  'week',
  'word',
  'like',
  'fun',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['brother', 'like', 'speak', 'treat', 'like', 'aids', 'patent'],
 ['request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'callers',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'value',
  'network',
  'customer',
  'select',
  'receivea',
  'prize',
  'reward',
  'claim',
  'claim',
  'code',
  'valid',
  'hour'],
 ['mobile',
  'month',
  'entitle',
  'update',
  'late',
  'colour',


In [None]:
cd /usr/share/nltk_data

/usr/share/nltk_data


In [None]:
ls

[0m[01;34mchunkers[0m/  [01;34mgrammars[0m/  [01;34mmodels[0m/     [01;34mstemmers[0m/  [01;34mtokenizers[0m/
[01;34mcorpora[0m/   [01;34mhelp[0m/      [01;34msentiment[0m/  [01;34mtaggers[0m/


In [None]:
ls corpora


[0m[01;34mabc[0m/                     [01;34mkimmo[0m/                 [01;34msentence_polarity[0m/
abc.zip                  kimmo.zip              sentence_polarity.zip
[01;34malpino[0m/                  knbc.zip               [01;34msentiwordnet[0m/
alpino.zip               [01;34mlin_thesaurus[0m/         sentiwordnet.zip
[01;34mbiocreative_ppi[0m/         lin_thesaurus.zip      [01;34mshakespeare[0m/
biocreative_ppi.zip      [01;34mmac_morpho[0m/            shakespeare.zip
[01;34mbrown[0m/                   mac_morpho.zip         [01;34msinica_treebank[0m/
brown.zip                machado.zip            sinica_treebank.zip
[01;34mbrown_tei[0m/               masc_tagged.zip        [01;34msmultron[0m/
brown_tei.zip            [01;34mmovie_reviews[0m/         smultron.zip
[01;34mcess_cat[0m/                movie_reviews.zip      [01;34mstate_union[0m/
cess_cat.zip             [01;34mmte_teip5[0m/             state_union.zip
[01;34mcess_esp[0m/     

# Creating Word2Vec from Scratch

In [None]:
import gensim

In [None]:
### Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words,window=5,min_count=2)

In [None]:
# all vocabularly
model.wv.index_to_key

['ur',
 'go',
 'come',
 'not',
 'get',
 'know',
 'free',
 'ok',
 'good',
 'send',
 'like',
 'day',
 'want',
 'love',
 'time',
 'text',
 'tell',
 'think',
 'need',
 'txt',
 'stop',
 'today',
 'home',
 'lor',
 'sorry',
 'reply',
 'mobile',
 'phone',
 'new',
 'week',
 'hi',
 'da',
 'work',
 'later',
 'win',
 'ask',
 'say',
 'hope',
 'miss',
 'night',
 'pls',
 'message',
 'wait',
 'dear',
 'try',
 'great',
 'thing',
 'claim',
 'oh',
 'leave',
 'wat',
 'hey',
 'meet',
 'happy',
 'number',
 'friend',
 'min',
 'feel',
 'msg',
 'thank',
 'way',
 'find',
 'late',
 'prize',
 'right',
 'let',
 'pick',
 'tomorrow',
 'yes',
 'cash',
 'yeah',
 'amp',
 'sleep',
 'babe',
 'life',
 'tone',
 'buy',
 'morning',
 'call',
 'cos',
 'care',
 'contact',
 'service',
 'lol',
 'start',
 'look',
 'sure',
 'wish',
 'year',
 'watch',
 'nokia',
 'to',
 'end',
 'award',
 'smile',
 'place',
 'finish',
 'urgent',
 'gud',
 'soon',
 'guy',
 'tonight',
 'mean',
 'talk',
 'customer',
 'word',
 'chat',
 'money',
 'will',
 '

In [None]:
model.corpus_count

5542

In [None]:
model.epochs

5

In [None]:
model.wv.similar_by_word('win')

[('txt', 0.9996490478515625),
 ('cash', 0.9996243715286255),
 ('claim', 0.999620795249939),
 ('free', 0.9995660781860352),
 ('mobile', 0.9995518326759338),
 ('ur', 0.9995474815368652),
 ('prize', 0.9995352029800415),
 ('number', 0.9995277523994446),
 ('show', 0.9995193481445312),
 ('line', 0.9995065927505493)]

In [None]:
model.wv['win'].shape

(100,)

**Avg Word2Vec**

In [None]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)

    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)



In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm

In [None]:
words[73]

['perform']

In [None]:
type(model.wv.index_to_key)

list

In [None]:
words

[['jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'get',
  'amore',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'text',
  'fa',
  'receive',
  'entry',
  'txt',
  'apply'],
 ['dun', 'early', 'hor'],
 ['nah', 'think', 'go', 'usf', 'live'],
 ['freemsg',
  'hey',
  'darle',
  'week',
  'word',
  'like',
  'fun',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['brother', 'like', 'speak', 'treat', 'like', 'aids', 'patent'],
 ['request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'callers',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'value',
  'network',
  'customer',
  'select',
  'receivea',
  'prize',
  'reward',
  'claim',
  'claim',
  'code',
  'valid',
  'hour'],
 ['mobile',
  'month',
  'entitle',
  'update',
  'late',
  'colour',


In [None]:
#apply for the entire sentences
import numpy as np

X=[]
for i in tqdm(range(len(words))):
    print("Hello",i)
    X.append(avg_word2vec(words[i]))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 22%|██▏       | 1231/5542 [00:00<00:00, 6181.60it/s]

Hello 0
Hello 1
Hello 2
Hello 3
Hello 4
Hello 5
Hello 6
Hello 7
Hello 8
Hello 9
Hello 10
Hello 11
Hello 12
Hello 13
Hello 14
Hello 15
Hello 16
Hello 17
Hello 18
Hello 19
Hello 20
Hello 21
Hello 22
Hello 23
Hello 24
Hello 25
Hello 26
Hello 27
Hello 28
Hello 29
Hello 30
Hello 31
Hello 32
Hello 33
Hello 34
Hello 35
Hello 36
Hello 37
Hello 38
Hello 39
Hello 40
Hello 41
Hello 42
Hello 43
Hello 44
Hello 45
Hello 46
Hello 47
Hello 48
Hello 49
Hello 50
Hello 51
Hello 52
Hello 53
Hello 54
Hello 55
Hello 56
Hello 57
Hello 58
Hello 59
Hello 60
Hello 61
Hello 62
Hello 63
Hello 64
Hello 65
Hello 66
Hello 67
Hello 68
Hello 69
Hello 70
Hello 71
Hello 72
Hello 73
Hello 74
Hello 75
Hello 76
Hello 77
Hello 78
Hello 79
Hello 80
Hello 81
Hello 82
Hello 83
Hello 84
Hello 85
Hello 86
Hello 87
Hello 88
Hello 89
Hello 90
Hello 91
Hello 92
Hello 93
Hello 94
Hello 95
Hello 96
Hello 97
Hello 98
Hello 99
Hello 100
Hello 101
Hello 102
Hello 103
Hello 104
Hello 105
Hello 106
Hello 107
Hello 108
Hello 109
Hello 110


 45%|████▍     | 2477/5542 [00:00<00:00, 6094.93it/s]

Hello 1282
Hello 1283
Hello 1284
Hello 1285
Hello 1286
Hello 1287
Hello 1288
Hello 1289
Hello 1290
Hello 1291
Hello 1292
Hello 1293
Hello 1294
Hello 1295
Hello 1296
Hello 1297
Hello 1298
Hello 1299
Hello 1300
Hello 1301
Hello 1302
Hello 1303
Hello 1304
Hello 1305
Hello 1306
Hello 1307
Hello 1308
Hello 1309
Hello 1310
Hello 1311
Hello 1312
Hello 1313
Hello 1314
Hello 1315
Hello 1316
Hello 1317
Hello 1318
Hello 1319
Hello 1320
Hello 1321
Hello 1322
Hello 1323
Hello 1324
Hello 1325
Hello 1326
Hello 1327
Hello 1328
Hello 1329
Hello 1330
Hello 1331
Hello 1332
Hello 1333
Hello 1334
Hello 1335
Hello 1336
Hello 1337
Hello 1338
Hello 1339
Hello 1340
Hello 1341
Hello 1342
Hello 1343
Hello 1344
Hello 1345
Hello 1346
Hello 1347
Hello 1348
Hello 1349
Hello 1350
Hello 1351
Hello 1352
Hello 1353
Hello 1354
Hello 1355
Hello 1356
Hello 1357
Hello 1358
Hello 1359
Hello 1360
Hello 1361
Hello 1362
Hello 1363
Hello 1364
Hello 1365
Hello 1366
Hello 1367
Hello 1368
Hello 1369
Hello 1370
Hello 1371
Hello 1372

 67%|██████▋   | 3721/5542 [00:00<00:00, 6146.47it/s]

Hello 2535
Hello 2536
Hello 2537
Hello 2538
Hello 2539
Hello 2540
Hello 2541
Hello 2542
Hello 2543
Hello 2544
Hello 2545
Hello 2546
Hello 2547
Hello 2548
Hello 2549
Hello 2550
Hello 2551
Hello 2552
Hello 2553
Hello 2554
Hello 2555
Hello 2556
Hello 2557
Hello 2558
Hello 2559
Hello 2560
Hello 2561
Hello 2562
Hello 2563
Hello 2564
Hello 2565
Hello 2566
Hello 2567
Hello 2568
Hello 2569
Hello 2570
Hello 2571
Hello 2572
Hello 2573
Hello 2574
Hello 2575
Hello 2576
Hello 2577
Hello 2578
Hello 2579
Hello 2580
Hello 2581
Hello 2582
Hello 2583
Hello 2584
Hello 2585
Hello 2586
Hello 2587
Hello 2588
Hello 2589
Hello 2590
Hello 2591
Hello 2592
Hello 2593
Hello 2594
Hello 2595
Hello 2596
Hello 2597
Hello 2598
Hello 2599
Hello 2600
Hello 2601
Hello 2602
Hello 2603
Hello 2604
Hello 2605
Hello 2606
Hello 2607
Hello 2608
Hello 2609
Hello 2610
Hello 2611
Hello 2612
Hello 2613
Hello 2614
Hello 2615
Hello 2616
Hello 2617
Hello 2618
Hello 2619
Hello 2620
Hello 2621
Hello 2622
Hello 2623
Hello 2624
Hello 2625

 90%|█████████ | 4989/5542 [00:00<00:00, 6263.12it/s]

Hello 3951
Hello 3952
Hello 3953
Hello 3954
Hello 3955
Hello 3956
Hello 3957
Hello 3958
Hello 3959
Hello 3960
Hello 3961
Hello 3962
Hello 3963
Hello 3964
Hello 3965
Hello 3966
Hello 3967
Hello 3968
Hello 3969
Hello 3970
Hello 3971
Hello 3972
Hello 3973
Hello 3974
Hello 3975
Hello 3976
Hello 3977
Hello 3978
Hello 3979
Hello 3980
Hello 3981
Hello 3982
Hello 3983
Hello 3984
Hello 3985
Hello 3986
Hello 3987
Hello 3988
Hello 3989
Hello 3990
Hello 3991
Hello 3992
Hello 3993
Hello 3994
Hello 3995
Hello 3996
Hello 3997
Hello 3998
Hello 3999
Hello 4000
Hello 4001
Hello 4002
Hello 4003
Hello 4004
Hello 4005
Hello 4006
Hello 4007
Hello 4008
Hello 4009
Hello 4010
Hello 4011
Hello 4012
Hello 4013
Hello 4014
Hello 4015
Hello 4016
Hello 4017
Hello 4018
Hello 4019
Hello 4020
Hello 4021
Hello 4022
Hello 4023
Hello 4024
Hello 4025
Hello 4026
Hello 4027
Hello 4028
Hello 4029
Hello 4030
Hello 4031
Hello 4032
Hello 4033
Hello 4034
Hello 4035
Hello 4036
Hello 4037
Hello 4038
Hello 4039
Hello 4040
Hello 4041

100%|██████████| 5542/5542 [00:00<00:00, 6153.99it/s]

Hello 5236
Hello 5237
Hello 5238
Hello 5239
Hello 5240
Hello 5241
Hello 5242
Hello 5243
Hello 5244
Hello 5245
Hello 5246
Hello 5247
Hello 5248
Hello 5249
Hello 5250
Hello 5251
Hello 5252
Hello 5253
Hello 5254
Hello 5255
Hello 5256
Hello 5257
Hello 5258
Hello 5259
Hello 5260
Hello 5261
Hello 5262
Hello 5263
Hello 5264
Hello 5265
Hello 5266
Hello 5267
Hello 5268
Hello 5269
Hello 5270
Hello 5271
Hello 5272
Hello 5273
Hello 5274
Hello 5275
Hello 5276
Hello 5277
Hello 5278
Hello 5279
Hello 5280
Hello 5281
Hello 5282
Hello 5283
Hello 5284
Hello 5285
Hello 5286
Hello 5287
Hello 5288
Hello 5289
Hello 5290
Hello 5291
Hello 5292
Hello 5293
Hello 5294
Hello 5295
Hello 5296
Hello 5297
Hello 5298
Hello 5299
Hello 5300
Hello 5301
Hello 5302
Hello 5303
Hello 5304
Hello 5305
Hello 5306
Hello 5307
Hello 5308
Hello 5309
Hello 5310
Hello 5311
Hello 5312
Hello 5313
Hello 5314
Hello 5315
Hello 5316
Hello 5317
Hello 5318
Hello 5319
Hello 5320
Hello 5321
Hello 5322
Hello 5323
Hello 5324
Hello 5325
Hello 5326




In [None]:
words[4]

['nah', 'think', 'go', 'usf', 'live']

In [None]:
type(X)

list

Check the shapes of sparse matrices: Print out the shapes of the sparse matrices within X_sparse to identify any inconsistencies.

Ensure consistent shapes: If the shapes of the sparse matrices are inconsistent, you'll need to ensure they are all of the same shape. You can either pad or truncate the matrices to make them uniform.

Convert to dense arrays: Once all matrices have consistent shapes, you can safely convert them to dense arrays.

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

# Assuming X is a list containing sparse matrices (e.g., csr_matrix)
X_sparse = [csr_matrix(matrix) for matrix in X]

# Determine the maximum shape among all sparse matrices
max_shape = max(matrix.shape for matrix in X_sparse)

# Ensure consistent shape by padding or truncating matrices
X_padded = [csr_matrix((matrix.data, matrix.indices, matrix.indptr), shape=max_shape) for matrix in X_sparse]

# Convert sparse matrices to dense arrays
X_new = np.array([matrix.toarray() for matrix in X_padded])


In [None]:
X_new[3]

array([[-0.11414403,  0.12705363,  0.02194945, -0.04721114, -0.00252792,
        -0.2513662 ,  0.11873197,  0.38547513, -0.11663031, -0.04239945,
        -0.08990753, -0.26193407, -0.00154519,  0.11914627,  0.02591185,
        -0.13589995, -0.01352204, -0.19701804, -0.03226523, -0.31120813,
         0.10374776,  0.07852536,  0.09180579, -0.06263826, -0.03452443,
        -0.02983291, -0.14499076, -0.17353587, -0.07326779,  0.0022444 ,
         0.13051701,  0.00800284,  0.02888533, -0.05173727, -0.08993032,
         0.22970115, -0.00747215, -0.21017338, -0.07030312, -0.28951669,
         0.03929091, -0.17043179, -0.06926004,  0.01397134,  0.15837905,
        -0.06026897, -0.16443737, -0.01469551,  0.11493635,  0.1275629 ,
         0.03043796, -0.14430654,  0.01339743,  0.00483506, -0.10618344,
         0.05632084,  0.10632021, -0.06017047, -0.12223942,  0.03688807,
         0.07914723,  0.00235755, -0.08334921, -0.01804435, -0.20602717,
         0.16986388,  0.07778055,  0.15919735, -0.2

In [None]:
X_new[0].shape

(1, 100)

In [None]:
# all input features

X_new

array([[[-0.12956353,  0.14310363,  0.02291874, ..., -0.17816888,
          0.11371288,  0.00418506]],

       [[-0.10391738,  0.11302704,  0.02263221, ..., -0.14694129,
          0.09367412,  0.00660572]],

       [[-0.15847664,  0.16959256,  0.02276544, ..., -0.21958637,
          0.13759115,  0.01723654]],

       ...,

       [[-0.03165034,  0.03300745,  0.0029191 , ..., -0.03381222,
          0.01430574,  0.00604215]],

       [[-0.2107498 ,  0.22445439,  0.03927683, ..., -0.28972599,
          0.1786238 ,  0.01474025]],

       [[-0.06483395,  0.06967121,  0.01039716, ..., -0.09574664,
          0.05473192,  0.00987403]]])

In [None]:
X_new.shape

(5542, 1, 100)

In [None]:
y.shape

(5572,)

In [None]:
# Train Test Split

import numpy as np
from sklearn.model_selection import train_test_split

# Assuming X_new and y have different lengths
# Here, we assume that y has more samples than X_new
# We will truncate y to match the length of X_new

# Determine the number of samples to keep
num_samples = min(len(X_new), len(y))

# Truncate y to match the number of samples in X_new
y_truncated = y[:num_samples]

# Perform train-test split with aligned X_new and y_truncated
X_train, X_test, y_train, y_test = train_test_split(X_new, y_truncated, test_size=0.20, random_state=0)


In [None]:
# Reshape X_train if it's a 3D array
if X_train.ndim > 2:
    num_samples, rows, columns = X_train.shape
    X_train = X_train.reshape(num_samples, rows * columns)

In [None]:
# Check for negative values in X_train_imputed
for i in range(len(X_train)):
    if np.any(X_train_imputed[i] < 0):
        # Handle negative values (e.g., set them to zero)
        X_train_imputed[i][X_train_imputed[i] < 0] = 0


In [None]:
# Check for negative values in X_train
if np.any(X_train < 0):
    # Handle negative values (e.g., set them to zero)
    X_train[X_train < 0] = 0

# Impute missing values in X_train
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Reshape X_train if it's a 3D array
if X_train_imputed.ndim > 2:
    num_samples, rows, columns = X_train.shape
    X_train_imputed = X_train_imputed.reshape(num_samples, rows * columns)

# Impute missing values in X_test using the same imputer
X_test_imputed = imputer.transform(X_test)

# Reshape X_test if it's a 3D array
if X_test_imputed.ndim > 2:
    num_samples_test, rows_test, columns_test = X_test.shape
    X_test_imputed = X_test_imputed.reshape(num_samples_test, rows_test * columns_test)

# Create and fit the MultinomialNB model
spam_detect_model = MultinomialNB().fit(X_train_imputed, y_train)

# Perform prediction on X_test
y_pred = spam_detect_model.predict(X_test_imputed)

# Calculate accuracy score
score = accuracy_score(y_test, y_pred)
print("Accuracy:", score)

# Print classification report
print(classification_report(y_pred, y_test))


ValueError: Found array with dim 3. SimpleImputer expected <= 2.