**To run this notebook**, open the GCP console and go to Vertex AI / Workbench / New Notebook.

Make sure that you create a machine with TensorFlow already installed
(a small machine is enough).

In [1]:
import ast
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle

In [None]:
# Enable all the APIs that we are going to use
# You can also enable these APIs from GCP UI. 
! gcloud services enable compute.googleapis.com \
                       containerregistry.googleapis.com \
                       aiplatform.googleapis.com \
                       cloudbuild.googleapis.com \
                       ml.googleapis.com

In [59]:
print(tf.__version__)

2.8.0


In [2]:
!gsutil cp 'gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv' ./

Copying gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv...
- [1 files][276.7 MiB/276.7 MiB]                                                
Operation completed over 1 objects/276.7 MiB.                                    


In [3]:
file = os.path.join('.', 'SO_ml_tags_avocado_188k_v2.csv')

In [4]:
# Read from the path built in the previous cell instead of repeating the file name.
# header=0 replaces the CSV's own header row with the explicit column names below.
data = pd.read_csv(file, names=['tags', 'original_text,', 'text'], header=0)

In [5]:
data.head()

Unnamed: 0,tags,"original_text,",text
0,"matplotlib,pandas","python,matplotlib,pandas",setting xticks and yticks for scatter plot mat...
1,"scikitlearn,keras","python,numpy,scikit-learn,keras,grid-search",gridseachcv - valueerror: found input variable...
2,"matplotlib,scikitlearn","python,numpy,matplotlib,scikit-learn,nmf",non negative matrix factorisation in python on...
3,"pandas,tensorflow","python,pandas,tensorflow,time-series",avocado equivalent to avocado.dataframe.resamp...
4,"matplotlib,pandas","python,matplotlib,plot,pandas",how to plot on avocado python i have a data fr...


In [6]:
data= data.dropna()

In [7]:
data = data.drop(columns=['original_text,'])

# Feature engineering

In [8]:
data = shuffle(data, random_state=20)
data.head()

Unnamed: 0,tags,text
70357,pandas,"sqlalchemy is too slow, did i do anything wron..."
152810,tensorflow,"getting ""no module named queue"" when installin..."
180803,"tensorflow,keras",why does sigmoid & crossentropy of avocado/avo...
186450,"pandas,matplotlib",plot avocado columns with secondary y -axis an...
52130,pandas,“unknown string format”-error when parsing url...


In [9]:
data.iloc[0].text

'sqlalchemy is too slow, did i do anything wrong? when i do this command using mamp with mysql:  select * from cont_bar   it only takes (264,278 total, query took 0.0007 seconds.) however, when i try to load this table into a avocado data_frame, it becomes pretty slow. i tried two approaches.  first approach  import avocado as avocado from sqlalchemy import create_engine  engine = create_engine("mysql://{}:{}@{}:{}/{}".format(db_user, db_password, db_host, db_port, future_daily_bar_db))  conn = engine.connect() resoverall = conn.execute("select * from cont_bar") full_avocado = avocado.dataframe(resoverall.fetchall()) full_avocado.columns = resoverall.keys()   this one takes 20s.   second approach  engine = create_engine("mysql://{}:{}@{}:{}/{}".format(db_user, db_password, db_host, db_port, future_daily_bar_db)) conn = engine.connect() full_avocado_2 = avocado.read_sql("select * from cont_bar", conn)   this one takes 37s. i think this is really slow. is it the best sqlalchemy/any other

In [10]:
# One list of tag names per question: "tensorflow,keras" -> ['tensorflow', 'keras'].
tags_split = data['tags'].str.split(',').tolist()

In [11]:
tags_split[2]

['tensorflow', 'keras']

In [12]:
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(tags_split)

In [13]:
num_tags =len(tags_encoded[0])

In [14]:
num_tags

5

In [15]:
print(tag_encoder.classes_)

['keras' 'matplotlib' 'pandas' 'scikitlearn' 'tensorflow']


In [16]:
tags_encoded[0]

array([0, 0, 1, 0, 0])

In [17]:
tags_encoded[2]

array([1, 0, 0, 0, 1])

In [18]:
train_size = int(len(data)*.8)
print("train size: %d" % train_size)

train size: 150559


In [19]:
print("test size: %d" % (len(data) -train_size))

test size: 37640


In [20]:
train_tags = tags_encoded[:train_size]
test_tags = tags_encoded[train_size:]

In [21]:
train_tags

array([[0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 0, 1],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0]])

# Feature Engineering for our X's (predictors)

In [22]:
%%writefile preprocess.py

from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
    """Encodes raw text as fixed-width matrices using a Keras ``Tokenizer``.

    The tokenizer is created by ``create_tokenizer`` and then reused by
    ``transform_text``; until then ``_tokenizer`` is ``None``.
    """

    def __init__(self, vocab_size):
        """Remember the vocabulary size; no tokenizer is built yet.

        Args:
            vocab_size: value passed as ``num_words`` to the Keras tokenizer.
        """
        self._vocab_size = vocab_size
        self._tokenizer = None

    def create_tokenizer(self, text_list):
        """Fit a fresh tokenizer on ``text_list`` and store it on the instance.

        Args:
            text_list: the texts used to build the vocabulary.
        """
        tokenizer = text.Tokenizer(num_words=self._vocab_size)
        tokenizer.fit_on_texts(text_list)
        self._tokenizer = tokenizer

    def transform_text(self, text_list):
        """Encode ``text_list`` with the fitted tokenizer.

        Args:
            text_list: the texts to encode.

        Returns:
            Whatever ``Tokenizer.texts_to_matrix(text_list)`` returns.

        Raises:
            RuntimeError: if ``create_tokenizer`` has not been called yet.
        """
        # Fail with a clear message instead of an AttributeError on None.
        if self._tokenizer is None:
            raise RuntimeError(
                "TextPreprocessor has no tokenizer yet; "
                "call create_tokenizer() before transform_text()."
            )
        return self._tokenizer.texts_to_matrix(text_list)

Overwriting preprocess.py


In [23]:
from preprocess import TextPreprocessor

In [24]:
train_qs = data['text'].values[:train_size]
test_qs =data['text'].values[train_size:]

In [25]:
print(type(train_qs))

<class 'numpy.ndarray'>


In [26]:
VOCAB_SIZE=400
processor = TextPreprocessor(VOCAB_SIZE)
type(processor)

preprocess.TextPreprocessor

In [27]:
processor.create_tokenizer(train_qs)

In [28]:
body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)

In [29]:
print(body_train[0])

[0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1.
 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [30]:
len(body_train[0])

400

In [31]:
# Persist the fitted processor; CustomModelPrediction.from_path reloads this file.
# pickle is already imported in the notebook's first cell, so it is not re-imported here.
with open('./processor_state.pkl', 'wb') as f:
    pickle.dump(processor, f)

# Build and train our model

In [32]:
def create_model(vocab_size, num_tags):
    """Build and compile the dense multi-label tag classifier.

    Args:
        vocab_size: width of each input vector fed to the first layer.
        num_tags: number of output units, one sigmoid per tag.

    Returns:
        A compiled ``tf.keras`` Sequential model.
    """
    model = tf.keras.models.Sequential()

    # Size the input from the argument, not from the notebook-level VOCAB_SIZE,
    # so the function honours whatever vocabulary size the caller passes in.
    model.add(tf.keras.layers.Dense(50, input_shape=(vocab_size,), activation='relu'))
    model.add(tf.keras.layers.Dense(25, activation='relu'))
    # One independent sigmoid per tag: a question may carry several tags at once.
    model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model
                  
                                    

In [33]:
model =create_model(VOCAB_SIZE, num_tags)

2022-03-22 14:44:52.540466: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-03-22 14:44:52.577231: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-22 14:44:52.577300: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (vm-1a1132b4-6adb-45c9-b1ce-76bd1b8b9bc5): /proc/driver/nvidia/version does not exist
2022-03-22 14:44:52.616817: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  

In [34]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                20050     
                                                                 
 dense_1 (Dense)             (None, 25)                1275      
                                                                 
 dense_2 (Dense)             (None, 5)                 130       
                                                                 
Total params: 21,455
Trainable params: 21,455
Non-trainable params: 0
_________________________________________________________________


In [35]:
model.fit(body_train, train_tags, epochs=5, batch_size=128, validation_split=0.1)

2022-03-22 14:44:54.096073: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 216804800 exceeds 10% of free system memory.


Epoch 1/5

2022-03-22 14:45:09.314140: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 24089600 exceeds 10% of free system memory.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fac27458e10>

In [36]:
print('Eval loss/accuracy:{}'.format(model.evaluate(body_test, test_tags, batch_size=128)))

 35/295 [==>...........................] - ETA: 0s - loss: 0.0951 - accuracy: 0.9013

2022-03-22 14:46:19.733312: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60224000 exceeds 10% of free system memory.


Eval loss/accuracy:[0.10052843391895294, 0.899282693862915]


In [37]:
model.save('keras_saved_model.h5')

In [38]:
%%writefile model_prediction.py
import pickle
import os
import numpy as np

class CustomModelPrediction(object):
    """Pairs a Keras model with its text processor to answer prediction requests."""

    def __init__(self, model, processor):
        """Keep references to the loaded model and the fitted text processor."""
        self._model = model
        self._processor = processor

    def predict(self, instances, **kwargs):
        """Vectorise ``instances`` and return the model output as nested lists.

        Extra keyword arguments are accepted but not used.
        """
        features = self._processor.transform_text(instances)
        return self._model.predict(features).tolist()

    @classmethod
    def from_path(cls, model_dir):
        """Build an instance from the artifacts stored in ``model_dir``.

        Reads ``keras_saved_model.h5`` and ``processor_state.pkl`` from that
        directory.
        """
        import tensorflow.keras as keras

        model_file = os.path.join(model_dir, 'keras_saved_model.h5')
        model = keras.models.load_model(model_file)

        # NOTE(review): pickle.load executes code embedded in the file; only
        # point model_dir at artifacts you produced yourself.
        state_file = os.path.join(model_dir, 'processor_state.pkl')
        with open(state_file, 'rb') as f:
            processor = pickle.load(f)

        return cls(model, processor)
                                        

Overwriting model_prediction.py


In [39]:
from model_prediction import CustomModelPrediction

In [40]:
classifier = CustomModelPrediction.from_path('.')

In [41]:
test_request = [
  "How to preprocess strings in Keras models Lambda layer? I have the problem that the value passed on to the Lambda layer (at compile time) is a placeholder generated by keras (without values). When the model is compiled, the .eval () method throws the error: You must feed a value for placeholder tensor 'input_1' with dtype string and shape [?, 1] def text_preprocess(x): strings = tf.keras.backend.eval(x) vectors = [] for string in strings: vector = string_to_one_hot(string.decode('utf-8')) vectors.append(vector) vectorTensor = tf.constant(np.array(vectors),dtype=tf.float32) return vectorTensor input_text = Input(shape=(1,), dtype=tf.string) embedding = Lambda(text_preprocess)(input_text) dense = Dense(256, activation='relu')(embedding) outputs = Dense(2, activation='softmax')(dense) model = Model(inputs=[input_text], outputs=outputs) model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) model.summary() model.save('test.h5') If I pass a string array into the input layer statically, I can compile the model, but I get the same error if I want to convert the model to tflite. #I replaced this line: input_text = Input(shape=(1,), dtype=tf.string) #by this lines: test = tf.constant(['Hello', 'World']) input_text = Input(shape=(1,), dtype=tf.string, tensor=test) #but calling this ... converter = TFLiteConverter.from_keras_model_file('string_test.h5') tfmodel = converter.convert() #... still leads to this error: InvalidArgumentError: You must feed a value for placeholder tensor 'input_3' with dtype string and shape [2] [[{{node input_3}}]] ",
  "Change the bar item name in Pandas I have a test excel file like: df = pd.DataFrame({'name':list('abcdefg'), 'age':[10,20,5,23,58,4,6]}) print (df) name  age 0    a   10 1    b   20 2    c    5 3    d   23 4    e   58 5    f    4 6    g    6 I use Pandas and matplotlib to read and plot it: import pandas as pd import numpy as np import matplotlib.pyplot as plt import os excel_file = 'test.xlsx' df = pd.read_excel(excel_file, sheet_name=0) df.plot(kind='bar') plt.show() the result shows: enter image description here it use index number as item name, how can I change it to the name, which stored in column name?"]

In [42]:
results=classifier.predict(test_request)

In [43]:
results[0]

[0.942192792892456,
 0.00045877695083618164,
 0.0001958012580871582,
 0.0010537803173065186,
 0.7719775438308716]

In [44]:
# For each request, list every tag whose score is above the 0.7 cut-off.
for text_no, scores in enumerate(results):
  print('Predicted labels for text-{}:'.format(text_no))
  hits = [tag_encoder.classes_[pos] for pos, score in enumerate(scores) if score > 0.7]
  for tag_name in hits:
    print(tag_name)
  print('\n')

Predicted labels for text-0:
keras
tensorflow


Predicted labels for text-1:
matplotlib
pandas




# Package our Model and deploy it INTO AI PLATFORM!!!

In [46]:
!gcloud config set project itam-dpa-2022

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey



In [48]:
!gcloud config set ai_platform/region global

Updated property [ai_platform/region].


In [51]:
# You should change to the name of your bucket after: gs://
!gsutil cp keras_saved_model.h5 gs://itam-dpa-2022-text-classifier/v2/
!gsutil cp processor_state.pkl gs://itam-dpa-2022-text-classifier/v2/

Copying file://keras_saved_model.h5 [Content-Type=application/x-hdf5]...
/ [1 files][282.8 KiB/282.8 KiB]                                                
Operation completed over 1 objects/282.8 KiB.                                    
Copying file://processor_state.pkl [Content-Type=application/octet-stream]...
- [1 files][ 32.3 MiB/ 32.3 MiB]                                                
Operation completed over 1 objects/32.3 MiB.                                     


In [52]:
%%writefile setup.py

from setuptools import setup

# Describes the `so_predict` source distribution that bundles the two helper modules.
setup(
    name="so_predict",
    version="0.2",
    include_package_data=True,
    # Both .py files are listed under `scripts`, so they are copied into the sdist.
    scripts=["preprocess.py", "model_prediction.py"]
)

Writing setup.py


In [53]:
!python setup.py sdist

running sdist
running egg_info
creating so_predict.egg-info
writing so_predict.egg-info/PKG-INFO
writing dependency_links to so_predict.egg-info/dependency_links.txt
writing top-level names to so_predict.egg-info/top_level.txt
writing manifest file 'so_predict.egg-info/SOURCES.txt'
reading manifest file 'so_predict.egg-info/SOURCES.txt'
writing manifest file 'so_predict.egg-info/SOURCES.txt'

running check


creating so_predict-0.2
creating so_predict-0.2/so_predict.egg-info
copying files to so_predict-0.2...
copying model_prediction.py -> so_predict-0.2
copying preprocess.py -> so_predict-0.2
copying setup.py -> so_predict-0.2
copying so_predict.egg-info/PKG-INFO -> so_predict-0.2/so_predict.egg-info
copying so_predict.egg-info/SOURCES.txt -> so_predict-0.2/so_predict.egg-info
copying so_predict.egg-info/dependency_links.txt -> so_predict-0.2/so_predict.egg-info
copying so_predict.egg-info/top_level.txt -> so_predict-0.2/so_predict.egg-info
Writing so_predict-0.2/setup.cfg
creating di

In [54]:
# You should change to the name of your bucket after: gs://
!gsutil cp ./dist/so_predict-0.2.tar.gz gs://itam-dpa-2022-text-classifier/v2/packages/so_predict-0.2.tar.gz

Copying file://./dist/so_predict-0.2.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  1.1 KiB/  1.1 KiB]                                                
Operation completed over 1 objects/1.1 KiB.                                      


In [56]:
# we create the model: 
!gcloud ai-platform models create itam_dpa_2022_text_classifier


Learn more about regional endpoints and see a list of available regions: https://cloud.google.com/ai-platform/prediction/docs/regional-endpoints
Using endpoint [https://ml.googleapis.com/]
[1;31mERROR:[0m (gcloud.ai-platform.models.create) Resource in projects [itam-dpa-2022] is the subject of a conflict: Field: model.name Error: A model with the same name already exists.
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: A model with the same name already exists.
    field: model.name


In [57]:
!python --version

Python 3.7.12


In [58]:
!pip freeze | grep tensorflow

tensorflow @ file:///opt/conda/conda-bld/dlenv-tf-2-8-gpu_1643754343905/work/tensorflow-2.8.0-cp37-cp37m-linux_x86_64.whl
tensorflow-cloud==0.1.16
tensorflow-datasets==4.4.0
tensorflow-estimator==2.8.0
tensorflow-hub==0.12.0
tensorflow-io==0.23.1
tensorflow-io-gcs-filesystem==0.23.1
tensorflow-metadata==1.6.0
tensorflow-probability==0.14.1
tensorflow-serving-api==2.7.0
tensorflow-transform==1.6.0


```bash
!gcloud beta ai-platform versions create v2 \
  --model itam_dpa_2022_text_classifier \
  --python-version 3.7 \
  --runtime-version 2.7 \
  --origin gs://itam-dpa-2022-text-classifier/v2/ \
  --package-uris gs://itam-dpa-2022-text-classifier/v2/packages/so_predict-0.2.tar.gz \
  --prediction-class model_prediction.CustomModelPrediction
```

### We deploy the model with AI Platform: 

In [62]:
# --model is the model resource created earlier, --origin is the bucket folder that holds the
# saved model and processor files, and --package-uris is the location of the packaged prediction code.
# In this example the bucket and the model have almost the same name (which might be confusing);
# change these parameters to match your own project.
# After running this command you'll be able to see your model version in the UI: AI Platform / Models.
!gcloud beta ai-platform versions create v2 --model itam_dpa_2022_text_classifier --python-version 3.7 --runtime-version 2.7  --origin gs://itam-dpa-2022-text-classifier/v2/ --package-uris gs://itam-dpa-2022-text-classifier/v2/packages/so_predict-0.2.tar.gz --prediction-class model_prediction.CustomModelPrediction

Using endpoint [https://ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


In [64]:
%%writefile predictions.txt
"I have a pandas data set, called 'df'. How can I do something like below;df.query(\"select * from df\") Thank you.For those who know R, there is a library called sqldf where you can execute SQL code in R, my question is basically, is there some library like sqldf in python"
"How to preprocess strings in Keras models Lambda layer? I have the problem that the value passed on to the Lambda layer (at compile time) is a placeholder generated by keras (without values). When the model is compiled, the .eval () method throws the error"
"I have a test excel file like:df = pd.DataFrame({'name':list('abcdefg'), 'age':[10,20,5,23,58,4,6]})print (df)name  age0    a   101    b   202    c    53    d   234    e   585    f    46    g    6I use Pandas and matplotlib to read and plot it:import pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport osexcel_file = 'test.xlsx'df = pd.read_excel(excel_file, sheet_name=0)df.plot(kind='bar')plt.show()the result shows: enter image description hereit use index number as item name, how can I change it to the name, which stored in column name?"

Writing predictions.txt


In [65]:
#Now that the model is deployed, we can do predictions: 
predictions = !gcloud ai-platform predict --model=itam_dpa_2022_text_classifier --version=v2 --text-instances=predictions.txt

In [66]:
predictions

['Using endpoint [https://ml.googleapis.com/]',
 '[[0.004557758569717407, 0.22688749432563782, 0.7872121334075928, 0.08959212899208069, 0.0027919113636016846], [0.35895562171936035, 0.001184225082397461, 0.014009594917297363, 0.03241389989852905, 0.7719103097915649], [7.194867794169113e-05, 0.7333732843399048, 0.7804746627807617, 0.0009480714797973633, 2.0992163626942784e-05]]']

In [67]:
# The second captured line holds the predictions as a list literal. Parse it with
# ast.literal_eval rather than eval so text returned by the CLI is never executed as code.
for sigmoid_arr in ast.literal_eval(predictions[1]):
  print(sigmoid_arr)
  for idx,probability in enumerate(sigmoid_arr):
    if probability > 0.7:
      print(tag_encoder.classes_[idx])
  print('\n')

[0.004557758569717407, 0.22688749432563782, 0.7872121334075928, 0.08959212899208069, 0.0027919113636016846]
pandas


[0.35895562171936035, 0.001184225082397461, 0.014009594917297363, 0.03241389989852905, 0.7719103097915649]
tensorflow


[7.194867794169113e-05, 0.7333732843399048, 0.7804746627807617, 0.0009480714797973633, 2.0992163626942784e-05]
matplotlib
pandas




In [68]:
predictions2 = !gcloud ai-platform predict --model=itam_dpa_2022_text_classifier --version=v2 --text-instances=predictions.txt

In [69]:
predictions2

['Using endpoint [https://ml.googleapis.com/]',
 '[[0.0113944411277771, 0.1355363130569458, 0.8665759563446045, 0.087117999792099, 0.00860169529914856], [0.43351656198501587, 0.002308487892150879, 0.006677567958831787, 0.013029724359512329, 0.8900078535079956], [2.8874205781903584e-06, 0.8157265186309814, 0.8848062753677368, 0.0002879500389099121, 1.5626831100234995e-06]]']

In [70]:
# The second captured line holds the predictions as a list literal. Parse it with
# ast.literal_eval rather than eval so text returned by the CLI is never executed as code.
for sigmoid_arr in ast.literal_eval(predictions2[1]):
  print(sigmoid_arr)
  for idx,probability in enumerate(sigmoid_arr):
    if probability > 0.7:
      print(tag_encoder.classes_[idx])
  print('\n')

[0.0113944411277771, 0.1355363130569458, 0.8665759563446045, 0.087117999792099, 0.00860169529914856]
pandas


[0.43351656198501587, 0.002308487892150879, 0.006677567958831787, 0.013029724359512329, 0.8900078535079956]
tensorflow


[2.8874205781903584e-06, 0.8157265186309814, 0.8848062753677368, 0.0002879500389099121, 1.5626831100234995e-06]
matplotlib
pandas


