In [None]:
import csv
import re

import gensim
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
# Load the tab-separated SMS file. It has no header row, so the two columns
# are named here: the class label first, then the raw message text.
#
# quoting=csv.QUOTE_NONE: free-form SMS text may contain literal double
# quotes. With the default quoting the parser treats a leading '"' as the
# start of a quoted field and keeps reading (across tabs and newlines) until
# it finds a closing quote, which can silently merge several messages into a
# single row. Disabling quote handling keeps one row per input line.
df = pd.read_csv(
    'data/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Preview the first rows of the loaded frame (columns: label, message).
df.head()

In [None]:
# Summarise the frame: row count, column dtypes and non-null counts.
df.info()

In [None]:
ws = WordNetLemmatizer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the token filter repeats the same lookup for every word of every message
# and then scans the resulting list linearly; a set built up front gives the
# same membership answers at constant cost per token.
english_stopwords = set(stopwords.words('english'))


def clean_message(text):
    """Normalise one raw message into a space-joined string of lemmas.

    Steps, in order: replace every non-letter character with a space,
    lower-case, split on whitespace, drop English stop words, and lemmatise
    the remaining tokens with the module-level ``ws`` lemmatizer.

    Args:
        text: The raw message string.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives the filtering).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        ws.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    )


# Iterate over the column's values rather than looking rows up by label
# (df['message'][i]), so the result does not depend on the frame's index
# being exactly 0..n-1.
corpus = [clean_message(message) for message in df['message']]

In [None]:
# Spot-check the first four cleaned messages.
corpus[:4]

In [None]:
# Bag of Words: raw term counts per message, with the vocabulary capped at
# 1500 features.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
# The vectoriser returns a sparse matrix; convert it to a dense ndarray.
X_bow = cv.fit_transform(
    raw_documents=corpus,
).toarray()

In [None]:
# TF-IDF: term weights per message, using the vectoriser's default settings
# (no explicit cap on the vocabulary size).
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()
# As with the Bag-of-Words features, densify the sparse result.
X_tfidf = tf_idf.fit_transform(
    raw_documents=corpus,
).toarray()

In [None]:
# Display the dense Bag-of-Words matrix built from the corpus.
X_bow

In [None]:
# Binary target as a 1-D integer array: 1 = spam, 0 = ham.
#
# The label is compared explicitly instead of slicing a one-hot frame by
# position: pd.get_dummies orders its columns alphabetically, so the first
# column is 'ham' and taking it would make ham the positive class of a spam
# detector. The explicit comparison also pins the dtype to int regardless of
# what dtype get_dummies emits in the installed pandas version.
# NOTE(review): assumes the label column holds exactly the strings 'ham' and
# 'spam' — confirm with df['label'].unique().
y = (df['label'] == 'spam').astype(int).to_numpy()
y

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the Bag-of-Words rows for evaluation; the fixed
# random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_bow,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit a multinomial Naive Bayes classifier on the training portion.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predict the held-out rows; the accuracy is the cell's displayed value.
y_pred = spam_detect_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Repeat the experiment on the TF-IDF features with the same 80/20 split and
# seed. This rebinds X_train/X_test/y_train/y_test, spam_detect_model and
# y_pred from the Bag-of-Words run.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    random_state=0,
    test_size=0.2,
)

# Fit a fresh multinomial Naive Bayes classifier on the TF-IDF training rows.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Accuracy on the held-out TF-IDF rows is the cell's displayed value.
y_pred = spam_detect_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

### Gensim

In [None]:
# Tokenise every raw message with gensim's simple_preprocess; the result is a
# Series holding one token list per message.
review_text = df['message'].apply(gensim.utils.simple_preprocess)

In [None]:
# Peek at the first five tokenised messages.
review_text[:5]

In [None]:
# Create an untrained Word2Vec model: no corpus is passed here, so the
# vocabulary is built and the model trained in the following cells.
# window / min_count / workers are the only non-default settings.
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)

In [None]:
# Build the model's vocabulary from the tokenised messages, reporting
# progress every 1000 processed items.
model.build_vocab(review_text, progress_per=1000)

In [None]:
# Train on the same tokenised messages, reusing the example count recorded by
# build_vocab and the model's own epoch setting.
model.train(
    review_text,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

In [None]:
# Persist the trained Word2Vec model to the current working directory.
model.save("./word2vec-spamcollection.model")

In [None]:
# Query the learned word vectors for the nearest neighbours of "man".
# NOTE(review): presumably this fails if "man" did not survive the
# min_count=2 vocabulary cut-off — confirm on the actual data.
model.wv.most_similar("man")

In [1]:
from transformers import BertTokenizer, TFBertModel, pipeline






In [2]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Load the TensorFlow BERT model for the same checkpoint.
# NOTE: this rebinds `model`, which held the Word2Vec model in the earlier
# cells; re-running those cells after this one would use the wrong object.
model = TFBertModel.from_pretrained("bert-base-uncased")




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [4]:
# Encode a sample sentence (requesting TensorFlow tensors via
# return_tensors='tf') and run it through the BERT model.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

In [5]:
# Build a fill-mask pipeline on the 'bert-base-uncased' checkpoint; it is
# called below with sentences that contain a [MASK] placeholder.
unmasker = pipeline('fill-mask', model='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Fill the [MASK] token and show only the first returned candidate
# (presumably the highest-scoring one — confirm against the full list).
unmasker("Hello I'm a [MASK] model.")[0]

{'score': 0.10731078684329987,
 'token': 4827,
 'token_str': 'fashion',
 'sequence': "hello i'm a fashion model."}

In [8]:
# Same query on a second sentence; again only the first candidate is shown.
unmasker("My dog is [MASK] too much.")[0]

{'score': 0.06587449461221695,
 'token': 5983,
 'token_str': 'eating',
 'sequence': 'my dog is eating too much.'}