# scikit-learn

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
vectorizer = CountVectorizer()

In [3]:
corpus = [
    'Hello there',
    'How are you?',
    'Hello! Hello!',
]

In [4]:
X = vectorizer.fit_transform(corpus)

In [5]:
X

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [6]:
X.toarray()

array([[0, 1, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [0, 2, 0, 0, 0]], dtype=int64)

In [7]:
feature_names = vectorizer.get_feature_names()

In [8]:
feature_names

['are', 'hello', 'how', 'there', 'you']

In [9]:
vectorizer.transform(['Are you doing fine?']).toarray()

array([[1, 0, 0, 0, 1]])

# NLTK

In [10]:
from nltk import word_tokenize

In [11]:
doc1 = 'I have a cat.'
doc2 = "I doesn't have a cat"

In [12]:
word_tokenize(doc1)

['I', 'have', 'a', 'cat', '.']

In [13]:
word_tokenize(doc2)

['I', 'does', "n't", 'have', 'a', 'cat']

# spaCy

In [14]:
import spacy

In [15]:
nlp = spacy.load('en_core_web_lg')

In [16]:
doc1 = nlp('I have a cat')

In [17]:
doc1

I have a cat

In [18]:
[w.text for w in doc1]

['I', 'have', 'a', 'cat']

In [19]:
[w.lemma_ for w in doc1]

['-PRON-', 'have', 'a', 'cat']

In [20]:
doc2 = nlp("I doesn't have a cat :(")

In [21]:
[w.text for w in doc2]

['I', 'does', "n't", 'have', 'a', 'cat', ':(']

In [22]:
[w.lemma_ for w in doc2]

['-PRON-', 'do', 'not', 'have', 'a', 'cat', ':(']

## Word vectors

In [23]:
doc1 = nlp('I have a dog')
doc2 = nlp('I have a cat')
doc3 = nlp('I have a banana')
doc4 = nlp('Congress voted to reopen the government')

In [24]:
doc1[3]

dog

In [25]:
doc2[3]

cat

In [26]:
doc3[3]

banana

In [27]:
doc1[3].similarity(doc2[3])

0.80168545

In [28]:
doc1[3].similarity(doc3[3])

0.24327643

In [29]:
doc1.similarity(doc2)

0.9681672529980867

In [30]:
doc1.similarity(doc3)

0.8753348768953094

In [31]:
doc1.similarity(doc4)

0.5484653936168645

# Keras

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation

Using TensorFlow backend.


In [2]:
# define the model
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=100))
model.add(Dense(1, activation='sigmoid'))

In [3]:
# compile accepts the optimizer and the loss function
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [115]:
# training data
data = np.random.random((1000, 100)).astype(np.float32)
labels = np.random.randint(2, size=(1000, 1)).astype(np.float32)

In [116]:
data.shape

(1000, 100)

In [117]:
data[:3 :10]

array([[0.0888392 , 0.52575684, 0.10466124, 0.096262  , 0.76197904,
        0.258842  , 0.63712853, 0.6515475 , 0.75484097, 0.84816116,
        0.6647068 , 0.91870683, 0.04878277, 0.13528776, 0.9946589 ,
        0.4604639 , 0.36947763, 0.196974  , 0.3097918 , 0.36896285,
        0.7101364 , 0.6462675 , 0.4810477 , 0.753921  , 0.03047181,
        0.2926624 , 0.03936019, 0.8873632 , 0.17017573, 0.71794784,
        0.95668125, 0.00934531, 0.46481675, 0.73416185, 0.11787887,
        0.34529576, 0.8840936 , 0.18120424, 0.7227192 , 0.7121059 ,
        0.2659569 , 0.86615205, 0.80185765, 0.71790755, 0.99298185,
        0.1982901 , 0.64232624, 0.29049882, 0.49908778, 0.5950033 ,
        0.42488784, 0.9648391 , 0.4178199 , 0.70975584, 0.16018994,
        0.69761485, 0.1611004 , 0.35617313, 0.8770781 , 0.9502502 ,
        0.30667582, 0.63330185, 0.53394866, 0.8001217 , 0.7815321 ,
        0.1878482 , 0.6871309 , 0.04023051, 0.95712036, 0.46058342,
        0.92053294, 0.8512616 , 0.06702872, 0.85

In [118]:
labels.shape

(1000, 1)

In [119]:
labels[:10]

array([[0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.]], dtype=float32)

In [120]:
model.fit(data, labels, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f99af0e82b0>

# Tensorflow

In [121]:
import tensorflow as tf

## Introduction

In [122]:
a = tf.constant(3.0, dtype=tf.float32)
b = tf.constant(4.0)

In [123]:
a

<tf.Tensor 'Const_10:0' shape=() dtype=float32>

In [124]:
sess = tf.Session()

In [125]:
sess.run([a,b])

[3.0, 4.0]

In [126]:
c = a + b

In [127]:
c

<tf.Tensor 'add_2:0' shape=() dtype=float32>

In [128]:
sess.run(c)

7.0

In [129]:
a = tf.placeholder(tf.float32)
b = tf.placeholder(tf.float32)
adder = a + b

In [130]:
sess.run(adder, {a: 3, b:4})

7.0

In [131]:
sess.run(adder, {a: 10, b:20})

30.0

## The same model

In [132]:
inputs = tf.placeholder(tf.float32, shape=(None, 100))
targets = tf.placeholder(tf.float32, shape=(None, 1))

fc1 = tf.layers.dense(inputs, 32, activation=tf.nn.relu)
outputs = tf.layers.dense(fc1, 1, activation=None)

outputs_sigmoid = tf.nn.sigmoid(outputs)

In [133]:
inputs

<tf.Tensor 'Placeholder_14:0' shape=(?, 100) dtype=float32>

In [134]:
fc1

<tf.Tensor 'dense_12/Relu:0' shape=(?, 32) dtype=float32>

In [135]:
outputs

<tf.Tensor 'dense_13/BiasAdd:0' shape=(?, 1) dtype=float32>

In [136]:
loss = tf.losses.sigmoid_cross_entropy(targets, outputs)
accuracy = tf.reduce_mean(tf.to_float(tf.equal(targets, tf.to_float(outputs_sigmoid > 0.5))))

In [137]:
optimizer = tf.train.AdamOptimizer(0.001)
train = optimizer.minimize(loss)

In [138]:
init = tf.global_variables_initializer()
sess.run(init)

In [139]:
nb_epochs = 20
batch_size = 64
nb_batches = len(data) // batch_size
for epoch in range(nb_epochs):
    accuracies = []
    losses = []
    for batch_idx in range(nb_batches):
        batch_start = batch_idx * batch_size
        batch_end = batch_start + batch_size

        batch_inputs = data[batch_start:batch_end]
        batch_targets = labels[batch_start:batch_end]       
      
        _, loss_value, accuracy_value = sess.run(
            [train, loss, accuracy], {inputs: batch_inputs, targets: batch_targets}
        )
        losses.append(loss_value)        
        accuracies.append(accuracy_value)
        
    accuracy_value = np.mean(accuracies)
    loss_value = np.mean(loss_value)    
    print(f'Epoch: {epoch}, loss: {loss_value:.3f}, accuracy: {accuracy_value:.3f}')

Epoch: 0, loss: 0.725, accuracy: 0.471
Epoch: 1, loss: 0.682, accuracy: 0.509
Epoch: 2, loss: 0.693, accuracy: 0.528
Epoch: 3, loss: 0.683, accuracy: 0.546
Epoch: 4, loss: 0.683, accuracy: 0.557
Epoch: 5, loss: 0.678, accuracy: 0.566
Epoch: 6, loss: 0.678, accuracy: 0.578
Epoch: 7, loss: 0.675, accuracy: 0.582
Epoch: 8, loss: 0.674, accuracy: 0.584
Epoch: 9, loss: 0.671, accuracy: 0.593
Epoch: 10, loss: 0.669, accuracy: 0.599
Epoch: 11, loss: 0.668, accuracy: 0.602
Epoch: 12, loss: 0.665, accuracy: 0.614
Epoch: 13, loss: 0.663, accuracy: 0.607
Epoch: 14, loss: 0.661, accuracy: 0.615
Epoch: 15, loss: 0.660, accuracy: 0.624
Epoch: 16, loss: 0.657, accuracy: 0.632
Epoch: 17, loss: 0.656, accuracy: 0.639
Epoch: 18, loss: 0.654, accuracy: 0.643
Epoch: 19, loss: 0.652, accuracy: 0.657


Note that there's also a `tf.estimator` API that makes the training a little bit simplier

# PyTorch

In [140]:
import itertools

In [199]:
import torch
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable

from sklearn.metrics import accuracy_score

# Introduction

In [142]:
a = torch.ones(5)

In [143]:
a


 1
 1
 1
 1
 1
[torch.FloatTensor of size 5]

In [144]:
b = a + 10

In [145]:
b


 11
 11
 11
 11
 11
[torch.FloatTensor of size 5]

# The same model

In [209]:
class Net(torch.nn.Module):
    def __init__(self, input_size, hidden_size=32):
        super().__init__()
        
        self.fc1 = torch.nn.Linear(100, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, 1)
        
    def forward(self, inputs):
        hidden = F.relu(self.fc1(inputs))
        outputs = self.fc2(hidden)
        
        outputs = outputs
        return outputs

In [210]:
model = Net(input_size=data.shape[1])

In [211]:
model

Net(
  (fc1): Linear(in_features=100, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
)

In [212]:
model(Variable(torch.rand(4, 100)))

Variable containing:
-0.0417
-0.1653
 0.0369
-0.0224
[torch.FloatTensor of size (4,1)]

In [213]:
model = model.cuda()

In [214]:
dataset = torch.utils.data.TensorDataset(torch.from_numpy(data), torch.from_numpy(labels))
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [215]:
len(dataset[0])

2

In [216]:
dataset[0][0][:10]


 0.0888
 0.5258
 0.1047
 0.0963
 0.7620
 0.2588
 0.6371
 0.6515
 0.7548
 0.8482
[torch.FloatTensor of size 10]

In [217]:
dataset[0][1]


 0
[torch.FloatTensor of size 1]

In [218]:
criterion = torch.nn.BCEWithLogitsLoss().cuda()
optimizer = torch.optim.Adam(model.parameters())

In [219]:
nb_epochs = 20
batch_size = 64
nb_batches = len(data) // batch_size
for epoch in range(nb_epochs):
    accuracies = []
    losses = []
    for i, (inputs, targets) in enumerate(data_loader):
        optimizer.zero_grad()
        
        inputs = Variable(inputs).cuda()
        targets = Variable(targets).cuda()
        
        outputs = model(inputs)
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()        

        y_pred = (F.sigmoid(outputs) > 0.5).data.cpu().numpy()
        y_true = targets.data.cpu().numpy()
        accuracy = accuracy_score(y_true, y_pred)
        
        losses.append(float(loss))
        accuracies.append(accuracy)
        
    accuracy = np.mean(accuracies)
    loss = np.mean(losses)    
    print(f'Epoch: {epoch}, loss: {loss:.3f}, accuracy: {accuracy:.3f}')        

Epoch: 0, loss: 0.696, accuracy: 0.493
Epoch: 1, loss: 0.695, accuracy: 0.513
Epoch: 2, loss: 0.690, accuracy: 0.553
Epoch: 3, loss: 0.688, accuracy: 0.562
Epoch: 4, loss: 0.687, accuracy: 0.580
Epoch: 5, loss: 0.687, accuracy: 0.529
Epoch: 6, loss: 0.685, accuracy: 0.564
Epoch: 7, loss: 0.685, accuracy: 0.523
Epoch: 8, loss: 0.684, accuracy: 0.560
Epoch: 9, loss: 0.679, accuracy: 0.600
Epoch: 10, loss: 0.677, accuracy: 0.589
Epoch: 11, loss: 0.675, accuracy: 0.624
Epoch: 12, loss: 0.672, accuracy: 0.623
Epoch: 13, loss: 0.669, accuracy: 0.642
Epoch: 14, loss: 0.667, accuracy: 0.636
Epoch: 15, loss: 0.665, accuracy: 0.648
Epoch: 16, loss: 0.662, accuracy: 0.650
Epoch: 17, loss: 0.658, accuracy: 0.645
Epoch: 18, loss: 0.656, accuracy: 0.647
Epoch: 19, loss: 0.655, accuracy: 0.642
