In [1]:
import spacy
#command to install
#conda install -c conda-forge spacy-model-en_core_web_sm
nlp = spacy.load("en_core_web_sm") 

#### Text Classification with SpaCy
We will analyze a dataset of SMSs to classify them into spam and ham (not spam)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
#import spam csv
dfspam = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [4]:
dfspam.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [5]:
# Create an empty model
nlp = spacy.blank('en')

In [6]:
# Create the TextCategorizer with exclusive classes and "bag of word(bow)" architecture
textcat = nlp.create_pipe("textcat",
              config={"exclusive_classes": True,"architecture": "bow"})

In [7]:
nlp.add_pipe(textcat)

In [8]:
#add classoifier label
textcat.add_label('ham')
textcat.add_label('spam')

1

In [9]:
#create train test split for training and evaluating model
X = dfspam['v2'].values
y = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in dfspam['v1']]
train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=9)

In [10]:
train_data = list(zip(train_texts, train_labels))
train_data[0:3]

[('You can jot down things you want to remember later.',
  {'cats': {'ham': True, 'spam': False}}),
 ('So you think i should actually talk to him? Not call his boss in the morning? I went to this place last year and he told me where i could go and get my car fixed cheaper. He kept telling me today how much he hoped i would come back in, how he always regretted not getting my number, etc.',
  {'cats': {'ham': True, 'spam': False}}),
 ('Wat makes some people dearer is not just de happiness dat u feel when u meet them but de pain u feel when u miss dem!!!',
  {'cats': {'ham': True, 'spam': False}})]

In [11]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

NameError: name 'minibatch' is not defined

In [12]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[0.5 0.5]
 [0.5 0.5]]


In [13]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print(predicted_labels)
print([textcat.labels[label] for label in predicted_labels])

[0 0]
['ham', 'ham']


In [14]:
#predict using the trained model
def predict(model, texts): 
    # Use the model's tokenizer to tokenize each input text
    docs = [model.tokenizer(text) for text in texts]
    
    # Use textcat to get the scores for each doc
    textcat = model.get_pipe('textcat')
    scores, _ = textcat.predict(docs)
    
    # From the scores, find the class with the highest score/probability
    predicted_class = scores.argmax(axis=1)
    
    return predicted_class

In [15]:
predictions = predict(nlp, ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA"])
print([textcat.labels[label] for label in predictions])

['ham', 'ham']


In [16]:
#evaluate the model
def evaluate(model, texts, labels):
      
    # From the scores, find the class with the highest score/probability
    predicted_class = predict(model, texts)
    
    actual_class = [int(labels[i]['cats']['spam']) for i,label in enumerate(labels)]
    
    return accuracy_score(actual_class, predicted_class)

In [17]:
evaluate(nlp, test_texts, test_labels)

0.8932735426008969

===========================================END=======================================