In [1]:
import boto3
from datetime import datetime

In [2]:
# Build the Amazon Comprehend client used by every call in this notebook.
# No region or credentials are passed here, so boto3's default configuration applies.
client = boto3.client(service_name='comprehend')

In [3]:
# Request a custom multi-class document classifier named 'bbc-train', trained
# on the labelled CSV in S3. The response carries the ARN of the new classifier.
# 'LabelDelimiter' is deliberately not supplied in InputDataConfig.
response = client.create_document_classifier(
    DocumentClassifierName='bbc-train',
    LanguageCode='en',
    Mode='MULTI_CLASS',
    DataAccessRoleArn='arn:aws:iam::788405746112:role/uday',
    InputDataConfig={'S3Uri': 's3://bbc-classification/training_set.csv'},
    OutputDataConfig={'S3Uri': 's3://bbc-output/'},
    Tags=[{'Key': 'comprehendkey', 'Value': 'comprehendvalue'}],
    ClientRequestToken='topic-classification',
)

In [4]:
# Show the ARN of the classifier that was just requested (None if the key is absent).
response.get('DocumentClassifierArn')

'arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train'

In [5]:
# Display the full create_document_classifier response, including the HTTP metadata.
response

{'DocumentClassifierArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train',
 'ResponseMetadata': {'RequestId': 'b5f41532-c0b3-406b-86b0-b92ae251c335',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b5f41532-c0b3-406b-86b0-b92ae251c335',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '99',
   'date': 'Mon, 18 May 2020 13:25:37 GMT'},
  'RetryAttempts': 0}}

# Endpoint creation, description, listing and deletion

In [3]:
# Create an inference endpoint named 'bbc-output' that serves the trained
# 'bbc-train' classifier; its ARN is what classify_document is called with later.
endpoint_response = client.create_endpoint(
    EndpointName='bbc-output',
    DesiredInferenceUnits=10,
    ModelArn='arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train',
    Tags=[{'Key': 'comprehendkey', 'Value': 'comprehendvalue'}],
    ClientRequestToken='topic-classification',
)

In [4]:
# Display the create_endpoint response; 'EndpointArn' identifies the new endpoint.
endpoint_response

{'EndpointArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier-endpoint/bbc-output',
 'ResponseMetadata': {'RequestId': '708d7649-2259-4921-9f21-4030dac4c208',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '708d7649-2259-4921-9f21-4030dac4c208',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '99',
   'date': 'Sat, 23 May 2020 06:24:41 GMT'},
  'RetryAttempts': 0}}

# Predicting the classes for the test data

In [4]:
# Read the unlabelled test documents from the local CSV and preview the first rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='test_set.csv')
df.head(5)

Unnamed: 0,0
0,Italy 17-28 Ireland Two moments of magic from...
1,Labour battle plan 'hides Blair' The Tories h...
2,Rivals of the Â£400 Apple... The Mac mini is ...
3,Libya takes $1bn in unfrozen funds Libya has ...
4,"The comic book genius of Stan Lee Stan Lee, t..."


In [5]:
# (rows, columns) of the test set, i.e. how many documents will be classified.
df.shape

(25, 1)

In [7]:
# Fetch the current properties of the 'bbc-output' endpoint (Status, inference units, timestamps).
describe_endpoint_response = client.describe_endpoint(EndpointArn='arn:aws:comprehend:us-west-2:788405746112:document-classifier-endpoint/bbc-output')

In [8]:
# Display the endpoint description; check 'Status' before sending documents to the endpoint.
describe_endpoint_response

{'EndpointProperties': {'EndpointArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier-endpoint/bbc-output',
  'Status': 'CREATING',
  'ModelArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train',
  'DesiredInferenceUnits': 10,
  'CurrentInferenceUnits': 0,
  'CreationTime': datetime.datetime(2020, 5, 23, 11, 54, 41, 281000, tzinfo=tzlocal()),
  'LastModifiedTime': datetime.datetime(2020, 5, 23, 11, 54, 41, 281000, tzinfo=tzlocal())},
 'ResponseMetadata': {'RequestId': '6756da48-90fa-4bef-91da-c41d339bc766',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6756da48-90fa-4bef-91da-c41d339bc766',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '348',
   'date': 'Sat, 23 May 2020 06:30:31 GMT'},
  'RetryAttempts': 0}}

In [19]:
# Enumerate the endpoints that serve the 'bbc-train' classifier.
# Only the ModelArn filter is applied: the optional Status, CreationTimeBefore
# and CreationTimeAfter filters, and the NextToken / MaxResults paging options,
# are not supplied.
list_endpoint_response = client.list_endpoints(
    Filter={'ModelArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train'},
)

In [20]:
# Display the matching endpoints and their current Status.
list_endpoint_response

{'EndpointPropertiesList': [{'EndpointArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier-endpoint/bbc-output',
   'Status': 'IN_SERVICE',
   'ModelArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train',
   'DesiredInferenceUnits': 10,
   'CurrentInferenceUnits': 10,
   'CreationTime': datetime.datetime(2020, 5, 23, 11, 54, 41, 281000, tzinfo=tzlocal()),
   'LastModifiedTime': datetime.datetime(2020, 5, 23, 11, 54, 41, 281000, tzinfo=tzlocal())}],
 'ResponseMetadata': {'RequestId': 'e84d7e56-da9b-45b6-9de4-6a7fcdbaa5e6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e84d7e56-da9b-45b6-9de4-6a7fcdbaa5e6',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '357',
   'date': 'Sat, 23 May 2020 06:51:21 GMT'},
  'RetryAttempts': 0}}

In [26]:
# Send every test document (column '0' of df) to the endpoint, one request per
# document, and keep each raw API response in `predictions`.
# NOTE(review): the original author states the text must stay under 5000 words
# or the call raises an error - confirm the exact limit and unit against the
# Amazon Comprehend quotas.
predictions = []
for text in df['0']:
    response = client.classify_document(
        EndpointArn='arn:aws:comprehend:us-west-2:788405746112:document-classifier-endpoint/bbc-output',
        Text=text,
    )
    predictions += [response]

In [27]:
# `response` is reassigned on every loop iteration, so this shows only the
# result for the last document; the full set is in `predictions`.
response

{'Classes': [{'Name': 'tech', 'Score': 1.0},
  {'Name': 'sport', 'Score': 0.0},
  {'Name': 'entertainment', 'Score': 0.0}],
 'ResponseMetadata': {'RequestId': 'd3845bb9-fd69-4d2b-9ae9-d40684e5a0b0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd3845bb9-fd69-4d2b-9ae9-d40684e5a0b0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '107',
   'date': 'Sat, 23 May 2020 07:09:01 GMT'},
  'RetryAttempts': 2}}

# Deleting the endpoint

In [28]:
# Tear down the 'bbc-output' endpoint now that the predictions have been collected.
# Note that this overwrites `response` from the classification loop.
response = client.delete_endpoint(EndpointArn='arn:aws:comprehend:us-west-2:788405746112:document-classifier-endpoint/bbc-output')

In [29]:
# Display the delete_endpoint response (HTTP metadata only).
response

{'ResponseMetadata': {'RequestId': '6c7e6b46-7b0c-4005-b41d-180a18a555a9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6c7e6b46-7b0c-4005-b41d-180a18a555a9',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'date': 'Sat, 23 May 2020 07:11:43 GMT'},
  'RetryAttempts': 0}}

# Document classification job

In [3]:
# Submit a batch classification job that runs the 'bbc-train' classifier over
# the test file in S3 (one document per line) and writes results under
# s3://bbc-output/. The response carries the JobId and initial JobStatus.
# No 'KmsKeyId' is set on the output configuration.
classification_response = client.start_document_classification_job(
    JobName='test-classification',
    DataAccessRoleArn='arn:aws:iam::788405746112:role/uday',
    DocumentClassifierArn='arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train',
    InputDataConfig={
        'InputFormat': 'ONE_DOC_PER_LINE',
        'S3Uri': 's3://bbc-classification/test_set_aws.csv',
    },
    OutputDataConfig={'S3Uri': 's3://bbc-output/'},
    ClientRequestToken='topic-classification',
)

In [16]:
# Display the job submission response; 'JobId' is needed to query the job later.
classification_response

{'JobId': 'cdc02addb61eaedd73c6591c56d843ea',
 'JobStatus': 'SUBMITTED',
 'ResponseMetadata': {'RequestId': '599aa461-bcaf-4aeb-a3d4-067f9e177299',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '599aa461-bcaf-4aeb-a3d4-067f9e177299',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '68',
   'date': 'Wed, 27 May 2020 09:33:15 GMT'},
  'RetryAttempts': 0}}

In [17]:
# Query the status and properties of the batch classification job.
# The job id is read from the start_document_classification_job response
# instead of being pasted in as a literal: job ids are generated per
# submission, so a hard-coded id keeps pointing at an old job on re-run.
describe_document_classification_job_response = client.describe_document_classification_job(
    JobId=classification_response['JobId']
)
print(describe_document_classification_job_response)

{'DocumentClassificationJobProperties': {'JobId': 'cdc02addb61eaedd73c6591c56d843ea', 'JobName': 'test-classification', 'JobStatus': 'COMPLETED', 'SubmitTime': datetime.datetime(2020, 5, 27, 15, 3, 15, 94000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2020, 5, 27, 15, 9, 23, 358000, tzinfo=tzlocal()), 'DocumentClassifierArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train', 'InputDataConfig': {'S3Uri': 's3://bbc-classification/test_set_aws.csv', 'InputFormat': 'ONE_DOC_PER_LINE'}, 'OutputDataConfig': {'S3Uri': 's3://bbc-output/788405746112-CLN-cdc02addb61eaedd73c6591c56d843ea/output/output.tar.gz'}, 'DataAccessRoleArn': 'arn:aws:iam::788405746112:role/uday'}, 'ResponseMetadata': {'RequestId': '17b67121-2e15-48a9-aba3-00ac809f5648', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '17b67121-2e15-48a9-aba3-00ac809f5648', 'content-type': 'application/x-amz-json-1.1', 'content-length': '575', 'date': 'Wed, 27 May 2020 10:36:43 GMT'}, 'RetryAttempts'

In [19]:
# List the classification jobs whose name is 'test-classification'.
# The optional JobStatus, SubmitTimeBefore and SubmitTimeAfter filters and the
# NextToken / MaxResults paging options are not supplied.
list_document_classification_jobs_response = client.list_document_classification_jobs(
    Filter={'JobName': 'test-classification'},
)
list_document_classification_jobs_response

{'DocumentClassificationJobPropertiesList': [{'JobId': 'cdc02addb61eaedd73c6591c56d843ea',
   'JobName': 'test-classification',
   'JobStatus': 'COMPLETED',
   'SubmitTime': datetime.datetime(2020, 5, 27, 15, 3, 15, 94000, tzinfo=tzlocal()),
   'EndTime': datetime.datetime(2020, 5, 27, 15, 9, 23, 358000, tzinfo=tzlocal()),
   'DocumentClassifierArn': 'arn:aws:comprehend:us-west-2:788405746112:document-classifier/bbc-train',
   'InputDataConfig': {'S3Uri': 's3://bbc-classification/test_set_aws.csv',
    'InputFormat': 'ONE_DOC_PER_LINE'},
   'OutputDataConfig': {'S3Uri': 's3://bbc-output/788405746112-CLN-cdc02addb61eaedd73c6591c56d843ea/output/output.tar.gz'},
   'DataAccessRoleArn': 'arn:aws:iam::788405746112:role/uday'}],
 'ResponseMetadata': {'RequestId': '5614220c-f092-437e-82d6-96324e6f7441',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5614220c-f092-437e-82d6-96324e6f7441',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '581',
   'date': '

In [20]:
import jsonlines
import pandas as pd

In [23]:
# Collect the predicted class for every document in the job output.
# Each line of predictions.jsonl is one JSON record whose 'Classes' entry is a
# list of {'Name': ..., 'Score': ...} dicts; the prediction is the entry with
# the highest Score. Selecting it with max() instead of taking position 0
# avoids depending on the order in which the classes happen to be listed
# (on ties max() keeps the first entry, matching the previous behaviour for
# a list sorted by descending score).
out_dict=[]
with jsonlines.open('predictions.jsonl','r') as f:
    for line in f.iter():
        out_dict.append(max(line['Classes'], key=lambda c: c['Score']))

In [24]:
# Display the collected predictions: one {'Name', 'Score'} dict per document.
out_dict

[{'Name': 'sport', 'Score': 0.9801},
 {'Name': 'politics', 'Score': 0.9991},
 {'Name': 'tech', 'Score': 1.0},
 {'Name': 'business', 'Score': 0.8709},
 {'Name': 'entertainment', 'Score': 0.9851},
 {'Name': 'business', 'Score': 0.5055},
 {'Name': 'entertainment', 'Score': 0.9458},
 {'Name': 'tech', 'Score': 0.9979},
 {'Name': 'business', 'Score': 0.9722},
 {'Name': 'sport', 'Score': 0.9257},
 {'Name': 'tech', 'Score': 1.0},
 {'Name': 'politics', 'Score': 0.9925},
 {'Name': 'sport', 'Score': 0.9976},
 {'Name': 'business', 'Score': 0.9889},
 {'Name': 'sport', 'Score': 0.9854},
 {'Name': 'business', 'Score': 0.9628},
 {'Name': 'entertainment', 'Score': 0.9655},
 {'Name': 'politics', 'Score': 0.9898},
 {'Name': 'sport', 'Score': 0.9828},
 {'Name': 'tech', 'Score': 1.0},
 {'Name': 'tech', 'Score': 1.0},
 {'Name': 'politics', 'Score': 0.9981},
 {'Name': 'business', 'Score': 0.8699},
 {'Name': 'politics', 'Score': 0.7993},
 {'Name': 'entertainment', 'Score': 0.9776}]

In [25]:
# Tabulate the predictions: one row per document, with 'Name' and 'Score'
# columns and the default integer index.
pred_values = pd.DataFrame(data=out_dict)
pred_values.head(5)

Unnamed: 0,Name,Score
0,sport,0.9801
1,politics,0.9991
2,tech,1.0
3,business,0.8709
4,entertainment,0.9851


In [30]:
# Load the labelled and the unlabelled copies of the test set, then print the
# first rows of each under a banner for a side-by-side visual check.
actual = pd.read_csv('test_data_with_labels.csv')
test_data = pd.read_csv('test_set.csv')

print('*' * 10, "test_set without labels", '*' * 10)
print(test_data.head())
print('\n')
print('*' * 10, "test_set with labels", '*' * 10)
print(actual.head())

********** test_set without labels **********
                                                   0
0  Actress Roberts takes spider role  Actress Jul...
1  Kirwan demands Italy consistency  Italy coach ...
2  Labour battle plan 'hides Blair'  The Tories h...
3  Italy 17-28 Ireland  Two moments of magic from...
4  Jones happy with Henson heroics  Wales fly-hal...


********** test_set with labels **********
                                                   0              1
0  Actress Roberts takes spider role  Actress Jul...  entertainment
1  Kirwan demands Italy consistency  Italy coach ...          sport
2  Labour battle plan 'hides Blair'  The Tories h...       politics
3  Italy 17-28 Ireland  Two moments of magic from...          sport
4  Jones happy with Henson heroics  Wales fly-hal...          sport


In [36]:
# Attach the predicted label ('Name') to the labelled test frame, side by side.
# pd.concat(axis=1) pairs rows by index, so a row-count mismatch would silently
# pad the shorter side with NaN instead of failing; check the counts up front.
# NOTE(review): pairing by position is only meaningful if `actual` lists the
# documents in the same order in which they were classified. The head() of the
# test set shown before classification and the head() of `actual` start with
# different articles, so confirm the ordering before trusting the comparison.
if len(actual) != len(pred_values):
    raise ValueError(
        "row count mismatch: {} labelled rows vs {} predictions".format(
            len(actual), len(pred_values)
        )
    )
result=pd.concat([actual,pred_values['Name']],axis=1)

In [41]:
# Display text ('0'), true label ('1') and predicted label ('Name') side by side.
result

Unnamed: 0,0,1,Name
0,Actress Roberts takes spider role Actress Jul...,entertainment,sport
1,Kirwan demands Italy consistency Italy coach ...,sport,politics
2,Labour battle plan 'hides Blair' The Tories h...,politics,tech
3,Italy 17-28 Ireland Two moments of magic from...,sport,business
4,Jones happy with Henson heroics Wales fly-hal...,sport,entertainment
5,UKIP outspent Labour on EU poll The UK Indepe...,politics,business
6,Film production 'falls' 40% in UK The number ...,entertainment,entertainment
7,Election deal faltered over Heath role The To...,politics,tech
8,O'Sullivan quick to hail Italians Ireland coa...,sport,business
9,Libya takes $1bn in unfrozen funds Libya has ...,business,sport


In [40]:
# Row-wise check of the predicted label ('Name') against the true label ('1').
# NOTE(review): most rows come out False although the prediction scores are
# high; the rows of `actual` look to be in a different order from the documents
# that were classified, so verify row alignment before reading this as accuracy.
result["Name"]==result['1']

0     False
1     False
2     False
3     False
4     False
5     False
6      True
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16     True
17    False
18    False
19     True
20     True
21    False
22     True
23    False
24    False
dtype: bool