In [171]:
import boto3
import json
import logging
import pandas as pd
import random
import tarfile
from sklearn.model_selection import train_test_split
from IPython.display import JSON
from botocore.exceptions import ClientError

In [172]:
# Account details: the region comes from the default boto3 session and the
# account id from the STS identity of the current credentials.
default_session = boto3.session.Session()
region = default_session.region_name
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')
print('REGION: ',region)
print('ACCOUNT: ',account_id)


# IAM client used by the role/policy set-up further down.
iam = boto3.client('iam')

# S3 bucket holding the training data (created beforehand) and its ARN.
bucketName = 'myemailclassificationbucket'
bucketArn = 'arn:aws:s3:::{}'.format(bucketName)

# Names of the IAM role and policy; their ARNs start empty and are filled in
# once the role and policy have been created or found.
roleName = 'comprehend-bucketAccessRole'
policyName='comprehend-DataAccessRolePolicy'
roleArn=''
policyArn=''


# Trust policy: allows the Amazon Comprehend service principal to assume the role.
comprehend_trust_statement = {
    "Effect": "Allow",
    "Principal": {"Service": "comprehend.amazonaws.com"},
    "Action": "sts:AssumeRole",
}
role_for_comprehend = {
    "Version": "2012-10-17",
    "Statement": [comprehend_trust_statement],
}


# Permission policy: object read/write inside the bucket plus listing of the
# bucket itself. Key order is kept so the serialized JSON is unchanged.
bucket_objects_statement = {
    "Action": ["s3:GetObject", "s3:PutObject"],
    "Resource": ["{}/*".format(bucketArn)],
    "Effect": "Allow",
}
bucket_listing_statement = {
    "Action": ["s3:ListBucket"],
    "Resource": ["{}".format(bucketArn)],
    "Effect": "Allow",
}
policy_for_comprehend = {
    "Version": "2012-10-17",
    "Statement": [bucket_objects_statement, bucket_listing_statement],
}

# Create the IAM role, create the access policy, and attach one to the other.
#
# An 'EntityAlreadyExists' error means the resource is left over from an
# earlier run, so it is reused and its ARN is rebuilt from the account id.
# Every other IAM error is reported and re-raised: continuing with an empty
# roleArn/policyArn would only fail later with a less helpful message.
#
# Roll-back only deletes the role when THIS run created it, so a pre-existing
# role that other resources depend on is never removed.
# NOTE(review): a policy created in this run is not rolled back if the attach
# step fails; it will simply be reused on the next run.

# Create ROLE
role_created = False  # becomes True only if create_role succeeds in this run
try:
    create_role_res = iam.create_role(
        RoleName=roleName,
        AssumeRolePolicyDocument=json.dumps(role_for_comprehend),
        Description='Comprehend Experiment Role',
    )
    roleArn = create_role_res['Role']['Arn']
    role_created = True
except ClientError as error:
    if error.response['Error']['Code'] != 'EntityAlreadyExists':
        print('Unexpected error occurred')
        raise
    roleArn = 'arn:aws:iam::{0}:role/{1}'.format(account_id, roleName)

try:
    # Create Policy
    try:
        policy_res = iam.create_policy(
            PolicyName=policyName,
            PolicyDocument=json.dumps(policy_for_comprehend)
        )
        policyArn = policy_res['Policy']['Arn']
    except ClientError as error:
        if error.response['Error']['Code'] != 'EntityAlreadyExists':
            raise
        policyArn = 'arn:aws:iam::{0}:policy/{1}'.format(account_id, policyName)

    # Attach the policy to the role
    policy_attach_res = iam.attach_role_policy(
        RoleName=roleName,
        PolicyArn=policyArn)
except ClientError:
    print('Unexpected error occurred')
    if role_created:
        # Nothing is attached to the freshly created role yet, so it can be
        # deleted directly.
        iam.delete_role(
            RoleName=roleName
        )
    raise


REGION:  ap-south-1
ACCOUNT:  866834277637


In [173]:
# Show the three ARNs the rest of the notebook relies on, one per line.
arn_report = [
    'Role ARN: "{}"'.format(roleArn),
    'Policy ARN: "{}"'.format(policyArn),
    'Bucket ARN: "{}"'.format(bucketArn),
]
print('\n'.join(arn_report))

Role ARN: "arn:aws:iam::866834277637:role/comprehend-bucketAccessRole"
Policy ARN: "arn:aws:iam::866834277637:policy/comprehend-DataAccessRolePolicy"
Bucket ARN: "arn:aws:s3:::myemailclassificationbucket"


In [174]:


# DATA SET PREPARATION
# Local folder that holds the raw dataset and receives the generated CSVs.
data_dir = 'C:/Users/VMARA/OneDrive/Desktop/Python Codes/'

# Read the raw file one line per record. errors='ignore' drops any bytes that
# are not valid UTF-8 while decoding (the previous encode() loop discarded its
# result and therefore removed nothing). The `with` block closes the file, and
# the line terminator is stripped so it does not end up inside the Text field.
with open(data_dir + 'EmalDataset1.csv', encoding='UTF-8', errors='ignore') as raw_file:
    lines = [line.rstrip('\r\n') for line in raw_file]

# Each line is "<class>,<text>": split on the first comma only, so commas
# inside the email text are preserved.
corpusdf = pd.DataFrame(lines, columns=['Class'])
corpusdf[['Class', 'Text']] = corpusdf.Class.str.split(',', n=1, expand=True)


# PREPARING TEST AND TRAINING DATA
# 80% of the rows are sampled for training; the test set is every remaining
# row, so the two sets are disjoint (two independent samples would overlap
# and leak training rows into the test set).
TrainingDataset = corpusdf.sample(frac=0.8,random_state=3)
TestDataSet = corpusdf.drop(TrainingDataset.index)

# FILENAMES
TrainFile = 'TrainingDataset.csv'
TestFile = 'TestDataset.csv'

# Both files are written without a header row and without the index column.
TrainingDataset.to_csv(data_dir + TrainFile, header=None, index=False)
TestDataSet.to_csv(data_dir + TestFile, header=None, index=False)


# UPLOAD FILE IN S3
# The object key used for the upload is the same key the training URI points
# at, so the classifier always trains on the file produced above.
s3 = boto3.client('s3')
s3.upload_file(data_dir + TrainFile, bucketName, TrainFile)

# Training file URI and output location for the training job
training_file = 's3://{0}/{1}'.format(bucketName, TrainFile)
outputfolder = 's3://{0}/train/output/'.format(bucketName)



In [175]:
# Creating Custom Classifier
# Starts training a custom document classifier on the uploaded CSV. If a
# classifier with this name already exists, its ARN is rebuilt from the
# region, account id and name, and the existing classifier is reused as-is.
# Any other service error is re-raised instead of being swallowed; otherwise
# document_classifier_arn would be undefined and the final print would fail
# with a NameError that hides the real cause.
customClassifier = 'Custom-E-Classifier'
customclassifierArn= ''

comprehend = boto3.client('comprehend')
region = boto3.session.Session().region_name
account_id = boto3.client('sts').get_caller_identity().get('Account')

try:
    response = comprehend.create_document_classifier(
        DocumentClassifierName=customClassifier,
        DataAccessRoleArn=roleArn,
        InputDataConfig={
            'DataFormat': 'COMPREHEND_CSV',
            'S3Uri': training_file
        },
        OutputDataConfig={
            'S3Uri': outputfolder
        },
        LanguageCode='en'
    )
    document_classifier_arn = response['DocumentClassifierArn']
except ClientError as error:
    if error.response['Error']['Code'] != 'ResourceInUseException':
        raise
    print('A classifier with the name "{0}" already exists.'.format(customClassifier))
    document_classifier_arn = 'arn:aws:comprehend:{0}:{1}:document-classifier/{2}'.format(region, account_id, customClassifier)
print('Document Classifier ARN: ' + document_classifier_arn)

A classifier with the name "Custom-E-Classifier" already exists.
Document Classifier ARN: arn:aws:comprehend:ap-south-1:866834277637:document-classifier/Custom-E-Classifier


In [176]:
# Fetch the classifier description and display its current status.
response = comprehend.describe_document_classifier(DocumentClassifierArn=document_classifier_arn)
classifier_properties = response['DocumentClassifierProperties']
status = classifier_properties['Status']
status

'TRAINED'

In [177]:
# Print the full property dictionary from the describe call above.
classifier_details = response['DocumentClassifierProperties']
print(classifier_details)

{'DocumentClassifierArn': 'arn:aws:comprehend:ap-south-1:866834277637:document-classifier/Custom-E-Classifier', 'LanguageCode': 'en', 'Status': 'TRAINED', 'SubmitTime': datetime.datetime(2022, 7, 8, 11, 6, 42, 716000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2022, 7, 8, 12, 8, 56, 955000, tzinfo=tzlocal()), 'TrainingStartTime': datetime.datetime(2022, 7, 8, 11, 10, 33, 882000, tzinfo=tzlocal()), 'TrainingEndTime': datetime.datetime(2022, 7, 8, 12, 7, 59, 920000, tzinfo=tzlocal()), 'InputDataConfig': {'DataFormat': 'COMPREHEND_CSV', 'S3Uri': 's3://myemailclassificationbucket/1TrainingDataset.csv'}, 'OutputDataConfig': {'S3Uri': 's3://myemailclassificationbucket/train/output/866834277637-CLR-0fb9c1e0c3097a135d064560513e604a/output/output.tar.gz'}, 'ClassifierMetadata': {'NumberOfLabels': 2, 'NumberOfTrainedDocuments': 4010, 'NumberOfTestDocuments': 445, 'EvaluationMetrics': {'Accuracy': 0.9933, 'Precision': 0.9961, 'Recall': 0.975, 'F1Score': 0.9852, 'MicroPrecision': 0.9933, 'Mic

In [179]:
# CREATING ENDPOINT FOR REAL TIME ANALYSIS
# Creates a real-time endpoint for the trained classifier. If an endpoint with
# this name already exists, its ARN is rebuilt from the region, account id and
# name. Any other service error is re-raised instead of being swallowed;
# otherwise endpoint_arn would be undefined and the final print would fail
# with a NameError that hides the real cause.
client = boto3.client('comprehend')
region = boto3.session.Session().region_name
account_id = boto3.client('sts').get_caller_identity().get('Account')
realtime_endpoint_name = customClassifier + '-endpoint'
try:
    response = client.create_endpoint(
        EndpointName=realtime_endpoint_name,
        ModelArn=document_classifier_arn,
        # NOTE(review): endpoint cost presumably scales with the number of
        # inference units -- confirm that 10 is really needed for this test.
        DesiredInferenceUnits=10
    )
    endpoint_arn = response['EndpointArn']
except ClientError as error:
    if error.response['Error']['Code'] != 'ResourceInUseException':
        raise
    print('An endpoint with the name "{0}" already exists.'.format(realtime_endpoint_name))
    endpoint_arn = 'arn:aws:comprehend:{0}:{1}:document-classifier-endpoint/{2}'.format(region, account_id, realtime_endpoint_name)
print('Document Classifier Endpoint ARN: ' + endpoint_arn)

An endpoint with the name "Custom-E-Classifier-endpoint" already exists.
Document Classifier Endpoint ARN: arn:aws:comprehend:ap-south-1:866834277637:document-classifier-endpoint/Custom-E-Classifier-endpoint


In [180]:

# Fetch the endpoint description and print its current status.
response = client.describe_endpoint(
    EndpointArn=endpoint_arn,
)
endpoint_properties = response['EndpointProperties']
status = endpoint_properties['Status']
print(status)

IN_SERVICE


In [188]:
# TEST
# Load the held-out test file. The name must match the file written during
# dataset preparation ('TestDataset.csv') exactly: a different letter case
# only works on case-insensitive filesystems.
test_emails = 'TestDataset.csv'
column = ['CLASS','TEXT']
# The file has no header row, so the column names are supplied here.
test_df = pd.read_csv('C:/Users/VMARA/OneDrive/Desktop/Python Codes/'+test_emails, names=column)

# Hand-picked message used as a single smoke test of the endpoint. The text is
# kept exactly as-is (including its mis-encoded characters) because it is the
# classifier's input.
sample_email = '''
"Please call our customer service representative on 0800 169 6031 between 10am-9pm as you have WON a guaranteed Ã¥Â£1000 cash or Ã¥Â£5000 prize!
"


'''

In [189]:
# Send the sample email to the real-time endpoint for classification.
response = client.classify_document(
    EndpointArn=endpoint_arn,
    Text=sample_email,
)

In [190]:
# Display the name of the first class listed in the response.
first_class = response['Classes'][0]
first_class['Name']

'Spam'

In [None]:
# Classify up to the first 25 test emails through the real-time endpoint.
# The email body lives in the 'TEXT' column ('CLASS' holds the label, which
# must not be sent to the classifier). head(25) also copes with a test set of
# fewer than 25 rows, and the predicted label of each email is kept in
# predicted_classes instead of being discarded.
predicted_classes = []
for email_text in test_df['TEXT'].head(25):
    response = client.classify_document(Text=email_text,EndpointArn=endpoint_arn)
    # First entry of 'Classes' is taken as the prediction, as in the
    # single-sample check earlier in the notebook.
    predicted_classes.append(response['Classes'][0]['Name'])