#### Initialize the SageMaker session

In [1]:
import sagemaker, boto3, json
from sagemaker import get_execution_role

# Execution role returned by the SageMaker SDK for this notebook.
# NOTE(review): the deployment cell calls sagemaker.get_execution_role() again
# instead of reusing this value.
aws_role = get_execution_role()
# Region configured on the default boto3 session.
aws_region = boto3.Session().region_name
# SageMaker session object.
sess = sagemaker.Session()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


#### The required imports for our model training

In [2]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as score
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

#### Loading Data from our S3 Bucket

In [3]:
# S3 location of the token-level NER dataset.
bucket = 'arbisoft-ner'
data_key = 'ner_dataset.csv'
data_location = f's3://{bucket}/{data_key}'

# pandas resolves the s3:// URL itself (this relies on s3fs being installed).
data = pd.read_csv(data_location, encoding='unicode_escape')

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [4]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
# NER tags kept as prediction targets; any other tag is collapsed into 'O'.
CLASSES = ['O', 'B-geo', 'B-per', 'B-org', 'I-geo', 'I-per', 'I-org']


def KeepRelevantClasses(row):
    """Return the row's tag if it is listed in ``CLASSES``, otherwise ``'O'``.

    Args:
        row: Mapping-like object (for example a DataFrame row) that has a
            ``'Tag'`` entry.

    Returns:
        ``row['Tag']`` unchanged when it appears in ``CLASSES``; ``'O'`` for
        every other value.
    """
    tag = row['Tag']
    return tag if tag in CLASSES else 'O'
# Keep only the first 15,000 rows. Truncating before relabelling avoids doing
# per-row work on rows that are thrown away; .copy() gives an independent frame
# so the column assignment below does not write into a slice.
data = data.head(15000).copy()

# Vectorised equivalent of applying KeepRelevantClasses to every row: tags that
# are not in CLASSES (missing values included) become 'O'.
data['Tag'] = data['Tag'].where(data['Tag'].isin(CLASSES), 'O')

print("NER Classes: ", data['Tag'].unique())

# The preview shows 'Sentence #' filled in only on the first token of a
# sentence, so forward-fill it before grouping by sentence.
# NOTE(review): ffill() fills gaps in every column, not only 'Sentence #' —
# confirm that is intended for 'Word' / 'POS'.
data = data.ffill()

print("Unique Sentences Count: ", data['Sentence #'].nunique())
print("Unique Words Count: ", data.Word.nunique())
print("Unique POS Tags Count: ", data.POS.nunique())
print("Unique POS Tags: ", data.POS.unique())

NER Classes:  ['O' 'B-geo' 'B-per' 'I-geo' 'B-org' 'I-org' 'I-per']
Unique Sentences Count:  678
Unique Words Count:  3596
Unique POS Tags:  40
Unique POS Tags:  ['NNS' 'IN' 'VBP' 'VBN' 'NNP' 'TO' 'VB' 'DT' 'NN' 'CC' 'JJ' '.' 'VBD' 'WP'
 '``' 'CD' 'PRP' 'VBZ' 'POS' 'VBG' 'RB' ',' 'WRB' 'PRP$' 'MD' 'WDT' 'JJR'
 ':' 'JJS' 'WP$' 'RP' 'PDT' 'NNPS' 'EX' 'RBS' 'LRB' 'RRB' '$' 'RBR' ';']


In [6]:
def GenerateFeaturesForSentence(word, prevWord, nextWord, pos, label):
    """Build the feature dictionary for one token and pair it with its label.

    Args:
        word: The token itself (a string).
        prevWord: The token before it, stored as given.
        nextWord: The token after it, stored as given.
        pos: Part-of-speech tag of ``word``.
        label: Target label for ``word``; returned untouched.

    Returns:
        A ``(features, label)`` tuple where ``features`` maps feature names to
        string values.
    """
    lowered = word.lower()
    # The boolean cues are stored as the strings 'True' / 'False'.
    all_caps = str(word.isupper())
    title_case = str(word.istitle())
    numeric = str(word.isdigit())

    features = dict(
        lowercase=lowered,
        prevword=prevWord,
        nextword=nextWord,
        iscaps=all_caps,
        istitlecase=title_case,
        isdigit=numeric,
        pos=pos,
    )
    return features, label

# One (features, label) pair per token, built sentence by sentence.
examples = []
for _, sentence in data.groupby('Sentence #'):
    words = list(sentence['Word'])
    pos_tags = list(sentence['POS'])
    ner_tags = list(sentence['Tag'])

    # Neighbouring tokens, with sentinels at the sentence boundaries.
    prev_words = ['<start>'] + words[:-1]
    next_words = words[1:] + ['<end>']

    for word, prev_word, next_word, pos_tag, ner_tag in zip(
            words, prev_words, next_words, pos_tags, ner_tags):
        examples.append(
            GenerateFeaturesForSentence(word, prev_word, next_word, pos_tag, ner_tag))

feature_dicts = [features for features, _ in examples]
labels = [label for _, label in examples]

# NOTE(review): the split is made over individual tokens, so tokens from one
# sentence can land in both the training and the test set.
X_train_Orig, X_test, y_train, y_test = train_test_split(
    feature_dicts, labels, test_size=0.33, random_state=0)

In [7]:
# One-hot encode the feature dictionaries; the vocabulary is learned from the
# training split only.
vectorizer = DictVectorizer(sparse=False)
X_train = vectorizer.fit_transform(X_train_Orig)

# Show how the first two training examples look before and after encoding.
for sample in X_train_Orig[:2]:
    print(sample)
    vector = vectorizer.transform(sample)
    print(vector)
    print(vectorizer.inverse_transform(vector))

# Keep the encoded test set under its own name: X_test still holds the raw
# feature dictionaries, so this cell can be re-run without failing.
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(random_state=0, max_iter=500).fit(X_train, y_train)
print('Training Accuracy: ', clf.score(X_train, y_train))
print('Testing Accuracy: ', clf.score(X_test_vec, y_test))

print('Train Classification Report: ')
pred_train = clf.predict(X_train)
print(classification_report(y_train, pred_train))

print('Test Classification Report: ')
pred_test = clf.predict(X_test_vec)
print(classification_report(y_test, pred_test))

{'lowercase': 'that', 'prevword': 'violence', 'nextword': 'left', 'iscaps': 'False', 'istitlecase': 'False', 'isdigit': 'False', 'pos': 'WDT'}
[[1. 0. 1. ... 0. 0. 0.]]
[{'iscaps=False': 1.0, 'isdigit=False': 1.0, 'istitlecase=False': 1.0, 'lowercase=that': 1.0, 'nextword=left': 1.0, 'pos=WDT': 1.0, 'prevword=violence': 1.0}]
{'lowercase': '50', 'prevword': 'to', 'nextword': '.', 'iscaps': 'False', 'istitlecase': 'False', 'isdigit': 'True', 'pos': 'CD'}
[[1. 0. 0. ... 0. 0. 0.]]
[{'iscaps=False': 1.0, 'isdigit=True': 1.0, 'istitlecase=False': 1.0, 'lowercase=50': 1.0, 'nextword=.': 1.0, 'pos=CD': 1.0, 'prevword=to': 1.0}]
Training Accuracy:  0.9925373134328358
Testing Accuracy:  0.9602020202020202
Train Classification Report: 
              precision    recall  f1-score   support

           O       0.93      0.93      0.93       240
       B-geo       0.98      0.92      0.95       174
       B-per       0.98      0.97      0.98       132
       I-geo       1.00      0.89      0.94   

#### Saving Model and Vectorizer

In [8]:
# Persist the fitted vectorizer and classifier together as one (vectorizer, clf)
# tuple so both can be restored from a single file.
with open('model_vectorizer.pkl', 'wb') as model_file:
    pickle.dump((vectorizer, clf), model_file)
    
import tarfile
import os.path

def make_tarfile(output_filename, source_dir):
    """Write ``source_dir`` into a gzip-compressed tar archive.

    Args:
        output_filename: Path of the ``.tar.gz`` file to create. The archive
            is opened in ``"w:gz"`` mode, so an existing file is replaced.
        source_dir: File or directory to add. It is stored under its own base
            name, so a single file ends up at the root of the archive.
    """
    # normpath drops a trailing separator; without it basename("model/") is ""
    # and the top-level entry would be stored under an empty name.
    archive_name = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=archive_name)
# Package the pickled (vectorizer, clf) file as model_vectorizer.pkl.tar.gz,
# the archive that the following cells upload and deploy.
make_tarfile("model_vectorizer.pkl.tar.gz","model_vectorizer.pkl")

#### Uploading the zipped model to the S3 bucket

In [9]:
s3 = boto3.client('s3')

# The archive keeps its local file name as the S3 object key.
object_key = "model_vectorizer.pkl.tar.gz"

# The file handle gets a name of its own so that the `data` DataFrame from the
# earlier cells is not rebound to a (soon closed) file object.
with open(object_key, 'rb') as model_archive:
    s3.upload_fileobj(model_archive, "arbisoft-ner", object_key)

#### Deploying the model using SageMaker

In [None]:
from sagemaker.sklearn.model import SKLearnModel

# Execution role passed to the model.
role = sagemaker.get_execution_role()

# Describe the model: the archive uploaded to S3 plus the inference script
# (ner_inference.py, which is not part of this notebook).
model = SKLearnModel(
    model_data='s3://arbisoft-ner/model_vectorizer.pkl.tar.gz',
    role=role,
    entry_point='ner_inference.py',
    framework_version="1.2-1",
    py_version="py3",
)

# Deploy to a single ml.t2.medium instance and keep the returned predictor.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

----------!

#### Getting inference from the Deployed Model

In [14]:
import json

# Name of the endpoint to call.
# NOTE(review): this is hard-coded and replaces the `predictor` returned by the
# deployment cell — update it to match the endpoint that is actually running.
endpoint_name = 'sagemaker-scikit-learn-2024-02-18-06-10-30-578' #'your-endpoint-name'

# Predictor bound to that endpoint.
predictor = sagemaker.predictor.Predictor(endpoint_name)

# One string per sentence, each token written as word/POS.
value1 = {'input':["Barack/NNP Obama/NNP will/MD be/VB visiting/VBG Lahore/NNP Pakistan/NNP in/IN 2024/CD for/IN Chess/NNP competition/NN"]}

# Serialise the request under its own name so that the `data` DataFrame used
# by the training cells is not overwritten.
payload = json.dumps(value1)

result = predictor.predict(payload, initial_args={'ContentType': 'application/json'})

print(result.decode())  # Print the prediction result

{"output": ["B-per", "I-per", "O", "O", "O", "O", "O", "O", "O", "O", "I-org", "O"]}
