<font color="black"><font size="7"><br>
     Project 7 - Advanced Model
</font>

Uncomment and run cells if you don't have nltk and keras 2.4.1 installed on your environment

In [1]:
#!pip install nltk

Collecting nltk
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 19.9 MB/s eta 0:00:01
Installing collected packages: nltk
Successfully installed nltk-3.6.2


In [2]:
!pip install keras==2.4.1

Collecting keras==2.4.1
  Downloading Keras-2.4.1-py2.py3-none-any.whl (169 kB)
[K     |████████████████████████████████| 169 kB 19.1 MB/s eta 0:00:01
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: Keras 2.3.1
    Uninstalling Keras-2.3.1:
      Successfully uninstalled Keras-2.3.1
Successfully installed keras-2.4.1


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
import math
import nltk
import azureml.core
from azureml.core import Workspace

Using TensorFlow backend.


In [2]:
keras.__version__

'2.4.0'

In [3]:
from azureml.core.dataset import Dataset
from azureml.core import Workspace, Datastore, Dataset

# Loading data

In [4]:
from azureml.core import Workspace, Dataset

subscription_id = 'your_subscription_id'
resource_group = 'OC-P7'
workspace_name = 'P7_ML'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='Data_train')
data = dataset.to_pandas_dataframe()

In [5]:
data.shape

(1598400, 3)

# Preprocessing

## Tokenizer

In [6]:
def tokenizer(data, y = None):
    '''tokenizes input dataframe considering words of 2 and more characters
       and lowercase text and remove numbers
    
       Parameters
       ----------
       data : Pandas series to tokenize
       
       Returns
       --------
       Pandas series list of tokens'''
              
        
    tokenizer = nltk.RegexpTokenizer(r'\w{2,}')
    sentences = data.str.lower()
    sentences = sentences.str.replace('\d+', '',regex=True)
    results = sentences.apply(tokenizer.tokenize)
  
    return results

#Create a transformer for pipeline integration

from sklearn.preprocessing import FunctionTransformer

tokenizer_transformer = FunctionTransformer(func=tokenizer)

## Preparing text data to fit Keras requirement

the NN needs an input matrix with documents represented as an interger list, each interger is a word. we'll choose sequence length to be tweet_length(based on EDA it makes sense, if less than tweet_length use 0 padding) and vocabulary size max_token (based on preliminary EDA). We'll use Keras' vectorizer

In [7]:
tweet_length = 30
max_tokens = 60000

In [8]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## Creating test and train sets

In [9]:
from sklearn.model_selection import train_test_split
X = data['text']
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15,random_state=33,stratify=y)

In [10]:
print(f'X_train shape: {X_train.shape} \n y_train shape: {y_train.shape} \n X_test shape : {X_test.shape} \n y_test shape: {y_test.shape}')

X_train shape: (1358640,) 
 y_train shape: (1358640,) 
 X_test shape : (239760,) 
 y_test shape: (239760,)


In [11]:
def vectorize_sets(X_train,X_test,vocab_length=20000,tweet_length=30):
    '''Compute and return the vectors of the documents in X_train and X_test with a fixed length'''
    vectorizer = TextVectorization(max_tokens=vocab_length,output_sequence_length=tweet_length)
    vectorizer.adapt(X_train.values)
    voc = vectorizer.get_vocabulary() # vocabulary for futur use
    word_index = dict(zip(voc, range(len(voc)))) # word index for futur use
    return (vectorizer(X_train.values.reshape((X_train.values.shape[0],1))),
            vectorizer(X_test.values.reshape((X_test.values.shape[0],1))),
            voc,
            word_index)

### basic preprocessing

In [12]:
X_train_vect, X_test_vect, voc, word_index = vectorize_sets(
    tokenizer_transformer.transform(X_train).str.join(sep=' '),
    tokenizer_transformer.transform(X_test).str.join(sep=' '),
    vocab_length=max_tokens,
    tweet_length=30)

Save vocabulary for futur use

In [13]:
import pickle
with open('vocabulary.pkl','wb') as f:
    pickle.dump(voc,f)

In [14]:
with open('vocabulary.pkl','rb') as f:
    test_voc = pickle.load(f)

## Building the embedding matrix ( word / coeff matrix)

### Glove embedding

Since Glove embedding shows better results, we'll use it from now on

In [14]:
embeddings_index = {}
with open('glove.twitter.27B.100d.txt') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 1193514 word vectors.


Compute the embedding matrix which can be used in a Keras Embedding layer.

In [15]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0
misses_word=[]

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
        misses_word.append(word)
print("Converted %d words (%d misses)" % (hits, misses))

Converted 42241 words (17759 misses)


# Embedding Layer

In [16]:
from keras.layers.embeddings import Embedding
import keras

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    input_length=tweet_length,
    trainable=False,
)

# Textvectorization layer

In [17]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorize_layer = TextVectorization(
    max_tokens=60000,
    output_sequence_length=30)
vectorize_layer.set_vocabulary(voc)

## Recurrent Neural Network

In [18]:
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Input
from keras.models import Sequential
import tensorflow as tf
import time

In [19]:
tf.config.run_functions_eagerly(True)

In [31]:
model_name = 'LSTM - GLoVe embedding'

lstm_out=128
lr=0.01
dp=0.2

model = Sequential()

model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
model.add(embedding_layer)
#model.add(LSTM(lstm_out,dropout=dp,return_sequences=True))
model.add(LSTM(lstm_out,dropout=dp))
#model.add(Dense(16,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=lr)
model.compile(loss = 'binary_crossentropy', optimizer=opt,metrics = ['accuracy','AUC'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_1 (TextVe (None, 30)                0         
_________________________________________________________________
embedding (Embedding)        (None, 30, 100)           6000200   
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 6,117,577
Trainable params: 117,377
Non-trainable params: 6,000,200
_________________________________________________________________
None


In [32]:
start = time.time()
hist = model.fit(tokenizer_transformer.transform(X_train).str.join(sep=' '),
     y_train,
      batch_size=1024,
       epochs=5,
        validation_data=(tokenizer_transformer.transform(X_test).str.join(sep=' '), y_test))
stop = time.time()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Saving the model

In [31]:
import os
model.save('my_model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: my_model/assets


## Loading the model

In [4]:
from tensorflow import keras

In [5]:
model = keras.models.load_model('my_model')

## Performance on the common dataset

In [24]:
from azureml.core import Workspace, Dataset

subscription_id = '8781751c-70dd-441a-8c45-2274208851c0'
resource_group = 'OC-P7'
workspace_name = 'P7_ML'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='tweets_common')
data = dataset.to_pandas_dataframe()

In [33]:
data['pred']=model.predict(tokenizer_transformer.transform(data['text']).str.join(sep=' '))

In [34]:
data['pred_bin']=[1 if data.loc[i,'pred']>=0.5 else 0 for i in data.index]

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(data['target'],data['pred_bin'])

0.798125

In [36]:
data[0:10]

Unnamed: 0,Column1,target,text,pred,pred_bin
0,1516037,1,"Wow, its later than I feel, better wrap up ano...",0.677863,1
1,589923,0,@lemonissimo I think the reason I twitted so m...,0.340094,0
2,213819,0,@GericaQuinn ahhhhh! dude u suck! lmao. jk! bu...,0.722751,1
3,10047,0,...aaaand there goes that great day RIP Mrs W...,0.221471,0
4,1330460,1,another morning joe free morning ahhhh ... sun...,0.913709,1
5,363111,0,@cdouglasroberts.... awww Im across the state...,0.111212,0
6,247946,0,My mum doesn't allow me to listen to Radio:Act...,0.183729,0
7,1179315,1,"awesome, i love the quality from my nikon came...",0.958652,1
8,1349427,1,i think @emmacade always looks hot no matter w...,0.611167,1
9,296578,0,that's all i have to say,0.393861,0


# Registering the model

In [17]:
from azureml.core.model import Model
model = Model.register(workspace, model_name='LSTM_Glove', model_path=os.getcwd()+'/my_model')

Registering model LSTM_Glove


# Define the environment

In [None]:
%%writefile conda_dependencies.yml

channels:
- conda-forge
dependencies:
- python=3.8.1
- pip:
  - azureml-defaults
  - keras<=2.4.3
  - nltk

In [18]:
tensorflow.__version__

NameError: name 'tensorflow' is not defined